In [3]:
import numpy as np
import glob
from gensim.models import word2vec
import gensim
import re
import logging

In [4]:
def cleanText(text):
	#assume all reveiws written in english, delete all non-ascii char
	text = text.encode('ascii','ignore').decode()
	
	#delete HTML tag
	text = re.sub(r'</?\w+[^>]*>','',text)
	
	#delete punctuation except char'char case(e.g. "haven't","can't","macy's")
	text = re.sub(" '|'\W|[-(),.\"!?#*$~`\{\}\[\]/+&*=:^]", " ", text)
		
	#transform several space into one space
	text = re.sub("\s+", " ", text)
		
	#transform all letters to lowercase
	text = text.lower().split()

	return text

In [5]:
def createInput(neg_path, pos_path):
	#
	#Gensim's word2vec input format is a list of lists, each list inside the list indicates a review. 
	#[['word1', 'word2', 'word3', '...'],['word1', 'word2', '...'], ..., ['...','...']]
	#

	print("Loading the imdb reviews data and clean the data")
	neg_files = glob.glob(neg_path + "/*.txt")
	pos_files = glob.glob(pos_path + "/*.txt")
	
	sentences = []
	
	for tnf in neg_files:
		f = open(tnf, 'r', errors='replace')

		line = f.read()

		#clean the data by delete punctuations and transform all uppercase to lowercase
		clean_line = cleanText(line)
		
		sentences.append(clean_line)
		
		f.close()
	
	for tpf in pos_files:
		f = open(tpf, 'r', errors='replace')
		line = f.read()
		clean_line = cleanText(line)
		sentences.append(clean_line)
		f.close()
	
	print("Data loaded and cleaned.")

	return sentences

In [6]:
def trainModel(sentences):
	#train word vector
	print("train word vector")
	logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
		level=logging.INFO)

	#set values for the parameters in Word2Vec
	print("parameters of training the model")
	print("dimension: word vector dimensionality")
	dimension = 200  
	print("min_count: any word that does not occur at least this many times across all documents is ignored")
	min_count = 5
	print("num_worders: number of threads to run in parallel")
	num_workers = 4
	print("window size")  
	window_size = 5
	#downsample setting for frequent words  
	downsampling = 1e-3  

	print("Training model")
	model = word2vec.Word2Vec(sentences, workers=num_workers,
							  size=dimension, min_count=min_count,
							  window=window_size, sample=downsampling, sg = 1)

	#
	#If finished training a model (no more updates, only querying), 
	#could do:
	# model.init_sims(replace=True)
	#to trim unneeded model memory = use (much) less RAM.
	#

	print("save the word vector model to disk")
	#specify path and model's name
	path = "/Users/ziluguo/Desktop/try/"
	fname = "wordVectorModel"
	model.save(path+fname)

	#train phrases model
	dimension = 200
	print("train phrases model, word vector's size is %d" %dimension)
	bigram_transformer = gensim.models.Phrases(sentences)
	model = word2vec.Word2Vec(bigram_transformer[sentences], workers=num_workers,
							  size=dimension, min_count=min_count,
							  window=window_size, sample=downsampling, sg = 1)

	print("save the phrases vector model to disk")
	#specify path and model's name
	path = "/Users/ziluguo/Desktop/try/"
	fname = "phrasesVectorModel"
	model.save(path+fname)

In [15]:
def explore(path):
	#load word vector model
	print("Load Word Vector Model")
	fname = "wordVectorModel"
	model = word2vec.Word2Vec.load(path+fname)
	
	#number of words, number of features
	print("show words vector's shape")
	print(model.syn0.shape)

	print("access individual word vector")
	print("for example, the word vector of 'best'")
	print(model["best"])

	print("show the word that is most dissimilar from the others")
	word_set = "best worst fine london"
	most_dissimilar = model.doesnt_match(word_set.split()) + '\n'
	print("for example, words are %s, the most dissimilar word is %s" %(word_set, most_dissimilar))

	print("show the word that is most dissimilar from the others")
	word_set = "best finest refreshing poorest"
	most_dissimilar = model.doesnt_match(word_set.split()) + '\n'
	print("for example, words are %s, the most dissimilar word is %s" %(word_set, most_dissimilar))

	print("show the word that is most dissimilar from the others")
	word_set = "enjoy apple juice orange"
	most_dissimilar = model.doesnt_match(word_set.split()) + '\n'
	print("for example, words are %s, the most dissimilar word is %s" %(word_set, most_dissimilar))	
    
	#show similarity
	print("Show most similar words and corresponding distance to the given words")
	#indicate positive words
	pos = ['best']
	#doesn't indicate nagative word indicate negative words
	print("for example, the given word is %s" %(pos))
	most_similar = model.most_similar(positive=pos)
	print(most_similar)

	#show similarity
	print("Show most similar words and corresponding distance to the given words")
	#indicate positive words
	pos = ['worst']
	#doesn't indicate nagative word indicate negative words
	print("for example, the given word is %s" %(pos))
	most_similar = model.most_similar(positive=pos)
	print(most_similar)
    
	#do more complex queries like analogies such as: king - man + woman = queen 
	print("Show most similar words and corresponding distance to the given analogies")
	#indicate positive words
	pos = ['best', 'refreshing']
	#indicate negative words
	neg = ['worst']
	print("for example, the given positive words are %s, the given negative word is %s" %(pos, neg))
	most_similar = model.most_similar(positive=pos, negative=neg)
	print(most_similar)
    
	#do more complex queries like analogies such as: king - man + woman = queen 
	print("Show most similar words and corresponding distance to the given analogies")
	#indicate positive words
	pos = ['worst', 'poorest']
	#indicate negative words
	neg = ['best']
	print("for example, the given positive words are %s, the given negative word is %s" %(pos, neg))
	most_similar = model.most_similar(positive=pos, negative=neg)
	print(most_similar)
    
	print("test model's accuracy here")
	#test the quality of the word vectors
	#The accuracy depends heavily on the amount of the training data.
	#read the evaluation file, get it at:
	#https://code.google.com/archive/p/word2vec/source/default/source
	questions = "questions-words.txt"
	evals = open(path+questions, 'r').readlines()
	num_sections = len([l for l in evals if l.startswith(':')])
	print('total test sentences: {} '.format(len(evals) - num_sections))
	#test accuracy of model
	accuracy = model.accuracy(path+questions)
	sum_corr = len(accuracy[-1]['correct'])
	sum_incorr = len(accuracy[-1]['incorrect'])
	total = sum_corr + sum_incorr
	percent = lambda a: a / total * 100
	print('Total evaluation sentences: {}, Correct: {:.2f}%, Incorrect: {:.2f}%'.format(total, percent(sum_corr), percent(sum_incorr)))


In [16]:
#main
#specify the path of datasets
neg_path = "/Users/ziluguo/Desktop/try/aclImdb/train/neg"
pos_path = "/Users/ziluguo/Desktop/try/aclImdb/train/pos"

#use training data create input of word2vector
sentences = createInput(neg_path, pos_path)

#
#What word2vec does is to represent each word as a vector in a form 
#that reflects how close words occur. 
#The closer the words tend to occur in a text the more similar the vectors will be.
#

#use word2vec train model
trainModel(sentences)

#display what model can do
print("explore the model")
#model's path
path = "/Users/ziluguo/Desktop/try/"
explore(path)

explore the model
Load Word Vector Model
show words vector's shape
(30321, 200)
access individual word vector
for example, the word vector of 'best'
[ 0.13880603  0.0178613  -0.02573948 -0.14831917 -0.15835425  0.16883919
  0.11967301  0.17873153  0.12080888  0.18553962 -0.03115853 -0.0314556
 -0.15939252 -0.07899953 -0.12812638  0.05657555 -0.32958868 -0.0049634
  0.54324406  0.26949298 -0.42546371  0.49630183  0.01495651 -0.05225568
 -0.09862857  0.21544912 -0.28073025 -0.10606798 -0.23924465  0.04499689
  0.28453296  0.0578398  -0.52040863 -0.500314    0.26657689 -0.36671656
 -0.02926834  0.53173608  0.43211129 -0.10846587  0.10721705  0.18494959
 -0.04721029 -0.143896   -0.18683045  0.07056804  0.08171466  0.271229
 -0.008992   -0.02410933 -0.41038242  0.29807949 -0.26044142 -0.17478347
  0.18051535 -0.3792313  -0.14406045  0.26673809  0.26555544 -0.05673116
 -0.17148805 -0.06623498 -0.18849255 -0.11391533 -0.17411296  0.39583892
  0.36459151 -0.13401714 -0.10806058  0.19026239 -0.