In [1]:
import numpy as np
import glob
from gensim.models import word2vec
import re
import logging
import copy
from sklearn.linear_model import LogisticRegression

In [2]:
def cleanText(text):
	"""Normalize a raw review string and split it into lowercase word tokens.

	Steps, in order:
	1. drop every non-ASCII character (reviews are assumed to be English);
	2. replace each HTML tag with a space, so that words separated only by
	a tag such as "<br />" are not glued together;
	3. replace punctuation with a space, keeping an apostrophe that sits
	between two word characters (e.g. "haven't", "can't", "macy's");
	4. lowercase the text and split it on whitespace.

	Parameters
	----------
	text : str
		Raw review text.

	Returns
	-------
	list of str
		The lowercased tokens in their original order (empty list for
		empty or punctuation-only input).
	"""
	#assume all reviews are written in english, delete all non-ascii chars
	text = text.encode('ascii', 'ignore').decode()

	#replace HTML tags by a space (not by '') so neighbouring words stay apart
	text = re.sub(r'</?\w+[^>]*>', ' ', text)

	#delete punctuation except the char'char case (e.g. "haven't", "can't", "macy's")
	#raw string: "\W", "\{" ... are regex escapes, not python string escapes
	text = re.sub(r" '|'\W|[-(),.\"!?#*$~`\{\}\[\]/+&*=:^]", " ", text)

	#lowercase, then tokenize; split() already collapses runs of whitespace
	return text.lower().split()

In [4]:
def createInput(neg_path, pos_path):
	"""Load and clean every ``*.txt`` review under the two directories.

	The result has the layout gensim's word2vec expects as input: a list
	of lists, one inner list of tokens per review, e.g.
	[['word1', 'word2', 'word3', '...'], ['word1', 'word2', '...'], ...]

	Parameters
	----------
	neg_path : str
		Directory holding the negative reviews (labelled 0).
	pos_path : str
		Directory holding the positive reviews (labelled 1).

	Returns
	-------
	sentences : list of list of str
		Tokenized reviews, all negative ones first, then all positive ones.
	y : list of int
		Label of each review, aligned with ``sentences``.
	"""
	print("Loading the imdb reviews data and clean the data")

	sentences = []
	y = []

	#negative reviews first (label 0), then positive reviews (label 1)
	for label, path in ((0, neg_path), (1, pos_path)):
		#glob returns files in an OS-dependent order; sort for reproducible runs
		for review_file in sorted(glob.glob(path + "/*.txt")):
			#the with-block closes the file even if reading or cleaning fails
			with open(review_file, 'r', errors='replace') as f:
				#clean the data: delete punctuation, lowercase, tokenize
				sentences.append(cleanText(f.read()))
			#also generate the corresponding y label
			y.append(label)

	print("Data loaded and cleaned.")

	return sentences, y

In [5]:
def trainModel(sentences, dimension=200, min_count=5, num_workers=4,
		window_size=5, downsampling=1e-3,
		path="/Users/pguo/Desktop/try/", fname="trained_model"):
	"""Train a skip-gram Word2Vec model on ``sentences`` and save it to disk.

	Every keyword argument defaults to the value that used to be hard-coded,
	so ``trainModel(sentences)`` behaves as before.

	Parameters
	----------
	sentences : list of list of str
		Tokenized reviews, one inner list per review.
	dimension : int
		Word vector dimensionality.
	min_count : int
		Any word that does not occur at least this many times across all
		documents is ignored.
	num_workers : int
		Number of threads to run in parallel.
	window_size : int
		Context window size.
	downsampling : float
		Downsample setting for frequent words.
	path : str
		Directory prefix the model is saved under; it is concatenated with
		``fname`` as-is, so it must end with a path separator.
	fname : str
		File name of the saved model.

	Returns
	-------
	The trained Word2Vec model (it is also saved to ``path + fname``).
	"""
	logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
		level=logging.INFO)

	print("Training model")
	#sg=1 is passed as in the original call (skip-gram per the gensim docs)
	#NOTE(review): `size` is the keyword of the gensim version this file was
	#written for; newer gensim releases name it `vector_size` -- confirm the
	#installed version before changing it.
	model = word2vec.Word2Vec(sentences, workers=num_workers,
		size=dimension, min_count=min_count,
		window=window_size, sample=downsampling, sg=1)

	#
	#If finished training a model (no more updates, only querying),
	#could do:
	# model.init_sims(replace=True)
	#to trim unneeded model memory = use (much) less RAM.
	#

	#save the model to disk
	model.save(path + fname)

	return model

In [6]:
def createFeature(x, model):
	"""Turn each review into one vector by averaging its word vectors.

	For example,
	review: "Amy is beautiful"
	"Amy" = [0.3, 0.6, 0.8]
	"is" = [1.2, 3.5, 4.6]
	"beautiful" = [0.9, 1.2, 8.7]
	then the vector of the review
	= [(0.3 + 1.2 + 0.9)/3, (0.6 + 3.5 + 1.2)/3, (0.8 + 4.6 + 8.7)/3]
	= [0.8, 1.77, 4.7]

	Words that are not in the model's vocabulary are skipped. A review
	without any known word (including an empty review) keeps an all-zero
	vector instead of becoming NaN through a 0/0 division.

	Parameters
	----------
	x : list of list of str
		Tokenized reviews.
	model :
		Trained word-vector model exposing ``syn0`` (2-D array whose second
		dimension is the vector size), ``index2word`` (list of vocabulary
		words) and ``model[word]`` lookup.

	Returns
	-------
	numpy.ndarray
		float32 array of shape ``(len(x), vector size)``, one row per review.
	"""
	#number of features, equals the dimension (column count) of the word vectors
	num_features = model.syn0.shape[1]
	features = np.zeros((len(x), num_features), dtype=np.float32)

	#model.index2word is a list of the words in the model's vocabulary.
	#convert it to a set to speed up the membership test
	vocab = set(model.index2word)

	for review_index, review in enumerate(x):
		known_words = [word for word in review if word in vocab]
		if not known_words:
			#nothing to average: leave the zero row, avoid dividing by zero
			continue
		for word in known_words:
			features[review_index] += model[word]
		features[review_index] /= len(known_words)
	return features

In [7]:
#main
#specify the path of datasets
neg_path = "/Users/pguo/Desktop/try/aclImdb/train/neg"
pos_path = "/Users/pguo/Desktop/try/aclImdb/train/pos"

#first use the training data to create the input of word2vec
#also create the corresponding training labels
sentences, y_train = createInput(neg_path, pos_path)

#keep x_train for later use
#in gensim, the x_train format is the same as the word2vec training input;
#a plain reference is enough (no deep copy) because the token lists are only
#read afterwards, never modified in place
x_train = sentences

#first use word2vec to generate a vector for each word
#train the word2vec model (it is saved to disk by trainModel)
trainModel(sentences)

#classification
#model's path
path = "/Users/pguo/Desktop/try/"
#model's name
fname = "trained_model"
#load model
print("Load Model")
model = word2vec.Word2Vec.load(path+fname)

#use the word vectors to transform the reviews into features
train_features = createFeature(x_train, model)

#create input of test data
print("create test data")
neg_path = "/Users/pguo/Desktop/try/aclImdb/test/neg"
pos_path = "/Users/pguo/Desktop/try/aclImdb/test/pos"
x_test, y_test = createInput(neg_path, pos_path)

#use the word vectors to transform the reviews into features
test_features = createFeature(x_test, model)

#classification
print("classification")
#the l1 penalty needs a solver that supports it; name liblinear explicitly so
#the call does not depend on the library's default solver
clf = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
clf.fit(train_features, y_train)
print("Accuracy on test: %0.4f" % clf.score(test_features, y_test))


Loading the imdb reviews data and clean the data
Data loaded and cleaned.
Training model
Load Model
create test data
Loading the imdb reviews data and clean the data
Data loaded and cleaned.
classification
Accuracy on test: 0.8245
