In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
import logging
from gensim.models import word2vec 

In [2]:
# Read the three tab-separated review files into DataFrames.
# header=0 takes the column names from the first row; quoting=3 is the
# value of csv.QUOTE_NONE, so quote characters are kept as literal text.
train = pd.read_csv("./dataset/labeledTrainData.tsv", header=0, delimiter='\t', quoting=3)
test = pd.read_csv("./dataset/testData.tsv", header=0, delimiter='\t', quoting=3)
unlabeled_train = pd.read_csv("./dataset/unlabeledTrainData.tsv", header=0, delimiter='\t', quoting=3)

# Report how many reviews each file contributes, plus the grand total.
n_train = train["review"].size
n_test = test["review"].size
n_unlabeled = unlabeled_train["review"].size
print("train:{} \ntest:{}\nunlabeled:{}\ntotal_size:{}".format(
    n_train, n_test, n_unlabeled, n_train + n_test + n_unlabeled))

train:25000 
test:25000
unlabeled:50000
total_size:100000


In [3]:
#clean review with option : whether remove stop word
def review_to_wordlist(review,remove_stopwords=False):
    """Convert one raw review (or sentence) into a list of lowercase words.

    The text is stripped of HTML markup, every character outside a-z/A-Z is
    replaced by a space, and the result is lowercased and split on whitespace.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML tags.
    remove_stopwords : bool, optional
        When true, words present in NLTK's English stop-word list are
        dropped from the result. Defaults to False.

    Returns
    -------
    list of str
        The cleaned words, in their original order.
    """
    #1.remove HTML. The parser is named explicitly so Beautiful Soup does not
    #  have to guess one (and warn on every call); "lxml" is the parser the
    #  recorded warning reported it was picking on this system.
    raw_review = BeautifulSoup(review, "lxml").get_text()
    #2.replace every non-letter character with a space
    cleaned_words = re.sub("[^a-zA-Z]"," ",raw_review)
    #3.lower case and split review into word list
    words = cleaned_words.lower().split()
    #4.remove stopwords if requested
    if remove_stopwords:
        # A set gives constant-time membership tests in the filter below.
        stop_words = set(stopwords.words("english"))
        words = [word for word in words if word not in stop_words]
    return words

In [4]:
# word2vec expects its input as single sentences, each one a list of words,
# so every review has to be split into sentences first.
# Load NLTK's English punkt tokenizer; it is handed to review_to_sentence
# to do that splitting.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

#split review into parsed sentences
def review_to_sentence(review,tokenizer,remove_stopwords=False):
    """Split a review into sentences and turn each sentence into a word list.

    Parameters
    ----------
    review : str
        Raw review text; surrounding whitespace is stripped before splitting.
    tokenizer : object
        Sentence splitter exposing ``tokenize(text)``.
    remove_stopwords : bool, optional
        Forwarded unchanged to ``review_to_wordlist``. Defaults to False.

    Returns
    -------
    list of list of str
        One word list per non-empty sentence, in document order.
    """
    # Empty sentence strings are skipped; every other sentence is cleaned
    # by review_to_wordlist.
    return [
        review_to_wordlist(chunk, remove_stopwords)
        for chunk in tokenizer.tokenize(review.strip())
        if len(chunk) > 0
    ]

In [5]:
# Try the sentence splitter on the first training review only.
sentences = review_to_sentence(train["review"][0],tokenizer)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [6]:
# Build the word2vec input: one list of words per sentence, taken first
# from the labeled training reviews and then from the unlabeled ones.
sentences = []
corpora = (
    ("parsing sentence from training set", train),
    ("parsing sentence from unlabeled training set", unlabeled_train),
)
for banner, frame in corpora:
    print(banner)
    for review in frame["review"]:
        sentences.extend(review_to_sentence(review, tokenizer))



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


parsing sentence from training set


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


parsing sentence from unlabeled training set


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [7]:
# Number of parsed sentences (each a list of words) gathered for training.
len(sentences)

795538

In [8]:
# Show INFO-level log messages so training progress is visible in the output.
logging.basicConfig(level=logging.INFO,
                    format='%(levelname)s :%(message)s')

# Values handed to Word2Vec below.
num_features = 300      # passed as `size`
min_word_count = 40     # passed as `min_count`
num_workers = 4         # passed as `workers`: num of threads to run in parallel
context = 10            # passed as `window`
downsampling = 1e-3     # passed as `sample`: downsample setting for frequent words

print("training model")
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# NOTE(review): init_sims(replace=True) presumably normalizes the vectors in
# place, after which the model cannot be trained further - confirm against
# the gensim documentation for the installed version.
model.init_sims(replace=True)

# Persist the trained model under a name that encodes the settings above.
model_name = "300features_40minwords_10contex"
model.save(model_name)

INFO :collecting all words and their counts
INFO :PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO :PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
INFO :PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
INFO :PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
INFO :PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
INFO :PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
INFO :PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types


training model


INFO :PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
INFO :PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
INFO :PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
INFO :PROGRESS: at sentence #100000, processed 2226966 words, keeping 50207 word types
INFO :PROGRESS: at sentence #110000, processed 2446580 words, keeping 52081 word types
INFO :PROGRESS: at sentence #120000, processed 2668775 words, keeping 54119 word types
INFO :PROGRESS: at sentence #130000, processed 2894303 words, keeping 55847 word types
INFO :PROGRESS: at sentence #140000, processed 3107005 words, keeping 57346 word types
INFO :PROGRESS: at sentence #150000, processed 3332627 words, keeping 59055 word types
INFO :PROGRESS: at sentence #160000, processed 3555315 words, keeping 60617 word types
INFO :PROGRESS: at sentence #170000, processed 3778655 words, keeping 62077 word types
INFO :PROGRESS: at sentence #180000, processed

INFO :worker thread finished; awaiting finish of 1 more threads
INFO :worker thread finished; awaiting finish of 0 more threads
INFO :EPOCH - 1 : training on 17798082 raw words (12749834 effective words) took 11.9s, 1071624 effective words/s
INFO :EPOCH 2 - PROGRESS: at 8.39% examples, 1060802 words/s, in_qsize 7, out_qsize 0
INFO :EPOCH 2 - PROGRESS: at 17.00% examples, 1067577 words/s, in_qsize 7, out_qsize 0
INFO :EPOCH 2 - PROGRESS: at 25.57% examples, 1072399 words/s, in_qsize 7, out_qsize 0
INFO :EPOCH 2 - PROGRESS: at 34.16% examples, 1075682 words/s, in_qsize 7, out_qsize 0
INFO :EPOCH 2 - PROGRESS: at 42.63% examples, 1077234 words/s, in_qsize 7, out_qsize 0
INFO :EPOCH 2 - PROGRESS: at 51.13% examples, 1078955 words/s, in_qsize 7, out_qsize 0
INFO :EPOCH 2 - PROGRESS: at 59.45% examples, 1077372 words/s, in_qsize 8, out_qsize 0
INFO :EPOCH 2 - PROGRESS: at 67.92% examples, 1077712 words/s, in_qsize 7, out_qsize 0
INFO :EPOCH 2 - PROGRESS: at 76.43% examples, 1078173 words/s, 

In [9]:
# Query the word vectors through `model.wv`; calling this directly on the
# model is the deprecated gensim access path.
model.wv.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.


'kitchen'

In [10]:
# Query the word vectors through `model.wv`; calling this directly on the
# model is the deprecated gensim access path.
model.wv.doesnt_match("france england germany berlin".split())

  """Entry point for launching an IPython kernel.


'berlin'

In [11]:
# Query the word vectors through `model.wv`; calling this directly on the
# model is the deprecated gensim access path.
model.wv.most_similar("man")

  """Entry point for launching an IPython kernel.


[('woman', 0.6122967004776001),
 ('lady', 0.5903916358947754),
 ('lad', 0.582253634929657),
 ('monk', 0.5394635200500488),
 ('men', 0.5153583288192749),
 ('farmer', 0.5147213935852051),
 ('soldier', 0.5044882893562317),
 ('person', 0.4987906217575073),
 ('guy', 0.49783599376678467),
 ('millionaire', 0.49595922231674194)]

In [12]:
# Query the word vectors through `model.wv`; calling this directly on the
# model is the deprecated gensim access path.
model.wv.most_similar("awful")

  """Entry point for launching an IPython kernel.


[('terrible', 0.7694653272628784),
 ('horrible', 0.726334810256958),
 ('atrocious', 0.7161608338356018),
 ('abysmal', 0.7082933187484741),
 ('dreadful', 0.7078591585159302),
 ('appalling', 0.6694698929786682),
 ('horrendous', 0.6606138944625854),
 ('horrid', 0.6556227207183838),
 ('lousy', 0.609402596950531),
 ('amateurish', 0.6081972122192383)]