In [2]:
import numpy as np
import glob
from gensim.models import word2vec
import gensim
import re
import logging

In [3]:
def cleanText(text):
	"""Turn one raw review string into a list of lowercase tokens.

	The steps run in this order:
	  1. drop every non-ASCII character,
	  2. remove HTML tags,
	  3. replace punctuation with a space (an apostrophe is replaced only
	     when a space precedes it or a non-word character follows it, so
	     "haven't", "can't" and "macy's" stay intact),
	  4. collapse runs of whitespace,
	  5. lowercase and split on whitespace.

	Args:
		text: the raw review as a str.

	Returns:
		A list of lowercase token strings; an empty list when nothing
		survives the cleaning.
	"""
	#assume all reviews are written in English, delete all non-ascii chars
	text = text.encode('ascii', 'ignore').decode()

	#delete HTML tags such as <br />
	text = re.sub(r'</?\w+[^>]*>', '', text)

	#delete punctuation except the char'char case
	#The patterns are raw strings so that \W, \{, \[ and \s reach the regex
	#engine untouched instead of being parsed (with a warning) as invalid
	#Python string escapes. The compiled patterns are the same as before.
	text = re.sub(r" '|'\W|[-(),.\"!?#*$~`\{\}\[\]/+&*=:^]", " ", text)

	#transform several whitespace characters into one space
	text = re.sub(r"\s+", " ", text)

	#lowercase everything, then split into tokens
	tokens = text.lower().split()

	return tokens

In [4]:
def createInput(neg_path, pos_path):
	"""Read and tokenize every ``*.txt`` file found in two directories.

	Gensim's word2vec input format is a list of lists, where each inner list
	holds the tokens of one review:
	[['word1', 'word2', 'word3', '...'], ['word1', 'word2', '...'], ...]

	Args:
		neg_path: directory whose ``*.txt`` files are the negative reviews.
		pos_path: directory whose ``*.txt`` files are the positive reviews.

	Returns:
		A list with one token list (as produced by cleanText) per file:
		all files from neg_path first, then all files from pos_path.
		Empty when neither directory contains a matching file.
	"""
	print("Loading the imdb reviews data and clean the data")

	sentences = []

	#negative reviews are processed before positive ones
	for directory in (neg_path, pos_path):
		for review_file in glob.glob(directory + "/*.txt"):
			#the with-block closes the file even when reading or cleaning raises;
			#undecodable bytes are replaced instead of raising
			with open(review_file, 'r', errors='replace') as f:
				#delete punctuation and lowercase the text
				sentences.append(cleanText(f.read()))

	print("Data loaded and cleaned.")

	return sentences

In [5]:
def trainModel(sentences, path="/Users/pguo/Desktop/try/"):
	"""Train and save a word-vector model and a phrase-vector model.

	Two Word2Vec models are trained with identical hyper-parameters: one on
	the tokenized sentences as given, one on the same sentences passed
	through a gensim Phrases transformer. They are saved as
	``path + "wordVectorModel"`` and ``path + "phrasesVectorModel"``.

	Args:
		sentences: list of token lists, one per review.
		path: directory prefix for the saved models. It is concatenated
			directly with the file name, so it must end with a path
			separator. Defaults to the location this notebook used before.

	Returns:
		None. The models are written to disk.
	"""
	#train word vector
	print("train word vector")
	logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
		level=logging.INFO)

	#set values for the parameters in Word2Vec
	print("parameters of training the model")
	print("dimension: word vector dimensionality")
	dimension = 200
	print("min_count: any word that does not occur at least this many times across all documents is ignored")
	min_count = 5
	print("num_workers: number of threads to run in parallel")
	num_workers = 4
	print("window size")
	window_size = 5
	#downsample setting for frequent words
	downsampling = 1e-3

	#keyword arguments shared by both models so the two runs cannot drift apart
	#NOTE(review): `size` is the keyword this notebook was written against;
	#newer gensim releases may expect a different name - confirm for the
	#installed version
	w2v_params = dict(workers=num_workers,
					  size=dimension, min_count=min_count,
					  window=window_size, sample=downsampling, sg=1)

	print("Training model")
	model = word2vec.Word2Vec(sentences, **w2v_params)

	#
	#If finished training a model (no more updates, only querying),
	#could do:
	# model.init_sims(replace=True)
	#to trim unneeded model memory = use (much) less RAM.
	#

	print("save the word vector model to disk")
	fname = "wordVectorModel"
	model.save(path+fname)

	#train phrases model
	print("train phrases model, word vector's size is %d" %dimension)
	bigram_transformer = gensim.models.Phrases(sentences)
	model = word2vec.Word2Vec(bigram_transformer[sentences], **w2v_params)

	print("save the phrases vector model to disk")
	fname = "phrasesVectorModel"
	model.save(path+fname)

In [15]:
def explore(path):
	"""Load the saved models from ``path`` and print a few example queries.

	For the word-vector model it prints the shape of the vector matrix, one
	word vector, an odd-one-out query, an analogy query, a pairwise
	similarity, and the accuracy on ``questions-words.txt``. For the phrase
	model it prints the nearest neighbours of one phrase.

	Args:
		path: directory prefix containing "wordVectorModel",
			"phrasesVectorModel" and "questions-words.txt". It is
			concatenated directly with each file name, so it must end
			with a path separator.

	Returns:
		None. All results are printed.
	"""
	#load word vector model
	print("Load Word Vector Model")
	fname = "wordVectorModel"
	model = word2vec.Word2Vec.load(path+fname)

	#number of words, number of features
	print("show words vector's shape")
	print(model.syn0.shape)

	print("access individual word vector")
	print("for example, the word vector of 'best'")
	print(model["best"])

	print("show the word that is most dissimilar from the others")
	word_set = "best worst fine London"
	most_dissimilar = model.doesnt_match(word_set.split()) + '\n'
	print("for example, words are %s, the most dissimilar word is %s" %(word_set, most_dissimilar))

	#do more complex queries like analogies such as: king - man + woman = queen
	print("Show most similar words and corresponding distance to the given analogies")
	#indicate positive words
	pos = ['best', 'refreshing']
	#indicate negative words
	neg = ['worst']
	print("for example, the given positive words are %s, the given negative word is %s" %(pos, neg))
	most_similar = model.most_similar(positive=pos, negative=neg)
	print(most_similar)

	print("show the similarity between two words")
	word1 = "worst"
	word2 = "best"
	similar = model.similarity(word1, word2)
	#%f instead of %d: the similarity is a float and %d would truncate it
	#to an integer, hiding the actual value
	print("for example, the similarity between %s and %s is %f" %(word1, word2, similar))

	print("test model's accuracy here")
	#test the quality of the word vectors
	#The accuracy depends heavily on the amount of the training data.
	#read the evaluation file, get it at:
	#https://code.google.com/archive/p/word2vec/source/default/source
	questions = "questions-words.txt"
	#the with-block closes the evaluation file once its lines are read
	with open(path+questions, 'r') as f:
		evals = f.readlines()
	#lines starting with ':' are section headers, not evaluation sentences
	num_sections = sum(1 for l in evals if l.startswith(':'))
	print('total evaluation sentences: {} '.format(len(evals) - num_sections))
	#test accuracy of model
	accuracy = model.accuracy(path+questions)
	#the last entry of the result is used as the overall summary
	sum_corr = len(accuracy[-1]['correct'])
	sum_incorr = len(accuracy[-1]['incorrect'])
	total = sum_corr + sum_incorr

	def percent(count):
		#guard against ZeroDivisionError when no question could be evaluated
		return count / total * 100 if total else 0.0

	print('Total sentences: {}, Correct: {:.2f}%, Incorrect: {:.2f}%'.format(total, percent(sum_corr), percent(sum_incorr)))

	#phrases mode
	print("Load phrases model")
	fname = "phrasesVectorModel"
	model = word2vec.Word2Vec.load(path+fname)

	#indicate a given word
	given = 'los_angeles'
	print("Show most similar words and corresponding distance to the given phrase")
	print("for example, the given phrase is %s" %given)
	similar_word = model.similar_by_word(given)
	print(similar_word)

In [16]:
#main
#directory the saved models are loaded from when exploring
path = "/Users/pguo/Desktop/try/"

#directories holding the negative and positive training reviews
neg_path = "/Users/pguo/Desktop/try/aclImdb/train/neg"
pos_path = "/Users/pguo/Desktop/try/aclImdb/train/pos"

#turn the training reviews into the list-of-token-lists that word2vec consumes
sentences = createInput(neg_path, pos_path)

#
#Background: word2vec represents each word as a vector that reflects
#how close words occur to one another.
#The closer two words tend to occur in a text, the more similar their vectors.
#

#train the models on the tokenized reviews and save them
trainModel(sentences)

#show what the saved models can do
print("explore the model")
explore(path)

Loading the imdb reviews data and clean the data
Data loaded and cleaned.
train word vector
dimension: word vector dimensionality
min_count: any word that does not occur at least this many times across all documents is ignored
number of threads to run in parallel
window size
Training model
save the word vector model to disk
train phrases, wordvector's size is 200
save the phrases vector model to disk
explore the model
Load Word Vector Model
show words vector's shape
(30321, 200)
access individual word vector
for example, the word vector of 'best'
[ -2.62199044e-01  -5.76151550e-01   4.25246619e-02  -2.76462585e-01
   1.17840350e-01   1.97694078e-01  -3.49061370e-01  -3.37192357e-01
   4.45775926e-01  -6.84153214e-02  -1.86107799e-01  -3.09626818e-01
   4.64470506e-01  -1.61453173e-01  -7.11624138e-03   2.54027963e-01
  -1.90099418e-01  -3.91215533e-02  -5.66844583e-01  -3.25414926e-01
  -2.38946527e-01  -1.70620799e-01   3.40528935e-02  -2.18186274e-01
  -1.54683143e-02   4.58015144e-0