In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
import logging
from gensim.models import word2vec 

In [22]:
#load train/test data with pandas
train = pd.read_csv("./dataset/labeledTrainData.tsv",
                    header=0,
                    delimiter='\t',
                    quoting=3)

test = pd.read_csv("./dataset/testData.tsv",
                   header=0,delimiter='\t',
                   quoting=3)

unlabeled_train = pd.read_csv("./dataset/unlabeledTrainData.tsv",
                              header=0,
                              delimiter='\t',
                              quoting=3)
print("train:{} \ntest:{}\nunlabeled:{}\ntotal_size:{}".format(train["review"].size,
                                                                    test["review"].size,
                                                                    unlabeled_train["review"].size,
                                                                    train["review"].size+test["review"].size+unlabeled_train["review"].size))

train:25000 
test:25000
unlabeled:50000
total_size:100000


In [29]:
#clean review with option : whether remove stop word
def review_to_wordlist(review,remove_stopwords=False):
    #Function to convert a document to a sequence of words
    #1.remove Html
    raw_review = BeautifulSoup(review).get_text()
    #2.remove NON-charaters
    cleaned_words = re.sub("[^a-zA-Z]"," ",raw_review)
    #3.lower case and split review into word list
    words = cleaned_words.lower().split()
    #4.remove stopwords if said
    if remove_stopwords == True:
        stop_words = set(stopwords.words("english"))
        words = [word for word in wors if word not in stop_words]
    return words

In [36]:
#word2vec expects single sentence,each one as a list of words
#in other words ,we have to split review into sentence
#load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

#split review into parsed sentences
def review_to_sentence(review,tokenizer,remove_stopwords=False):
    #1.use the NLTK tokenizer to split review into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    
    sentences = []
    #loop over each sentence
    for raw_sentence in raw_sentences:
        if len(raw_sentence)>0:
            words = review_to_wordlist(raw_sentence,remove_stopwords)
            sentences.append(words)
    return sentences

In [38]:
sentences = review_to_sentence(train["review"][0],tokenizer)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [44]:
#prepare the data for input to word2vec
sentences = []
#from training set
print("parsing sentence from training set")
for review in train["review"]:
    sentences += review_to_sentence(review,tokenizer)
    
#from unlabeled training set
print("parsing sentence from unlabeled training set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentence(review,tokenizer)

parsing sentence from training set




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


parsing sentence from unlabeled training set


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [47]:
len(sentences)

795538

In [51]:
logging.basicConfig(format='%(levelname)s :%(message)s',
                   level=logging.INFO)
#set values for various parameters
num_features = 300
min_word_count = 40
num_workers = 4       #num of threads to run in parallel
context = 10
downsampling = 1e-3   #downsample setting for frequent words


print("training model")
model = word2vec.Word2Vec(sentences,
                          workers=num_workers,
                          size = num_features,
                          min_count = min_word_count,
                          window = context,
                          sample = downsampling
                         )
model.init_sims(replace=True)
model_name = "300features_40minwords_10contex"
model.save(model_name)

training model


INFO :collecting all words and their counts
INFO :PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO :PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
INFO :PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
INFO :PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
INFO :PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
INFO :PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
INFO :PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
INFO :PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
INFO :PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
INFO :PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
INFO :PROGRESS: at sentence #100000, processed 2226966 words, keeping 50207 word types
INFO :PROGRESS: at sentence

INFO :EPOCH 1 - PROGRESS: at 45.66% examples, 964171 words/s, in_qsize 7, out_qsize 0
INFO :EPOCH 1 - PROGRESS: at 53.27% examples, 965166 words/s, in_qsize 7, out_qsize 0
INFO :EPOCH 1 - PROGRESS: at 59.61% examples, 946532 words/s, in_qsize 8, out_qsize 0
INFO :EPOCH 1 - PROGRESS: at 67.18% examples, 947932 words/s, in_qsize 7, out_qsize 0
INFO :EPOCH 1 - PROGRESS: at 75.48% examples, 958852 words/s, in_qsize 7, out_qsize 0
INFO :EPOCH 1 - PROGRESS: at 83.11% examples, 959648 words/s, in_qsize 7, out_qsize 0
INFO :EPOCH 1 - PROGRESS: at 90.93% examples, 962767 words/s, in_qsize 7, out_qsize 0
INFO :EPOCH 1 - PROGRESS: at 98.66% examples, 964526 words/s, in_qsize 7, out_qsize 0
INFO :worker thread finished; awaiting finish of 3 more threads
INFO :worker thread finished; awaiting finish of 2 more threads
INFO :worker thread finished; awaiting finish of 1 more threads
INFO :worker thread finished; awaiting finish of 0 more threads
INFO :EPOCH - 1 : training on 17798082 raw words (127499

In [58]:
model.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.


'kitchen'

In [59]:
model.doesnt_match("france england germany berlin".split())

  """Entry point for launching an IPython kernel.


'berlin'

In [60]:
model.most_similar("man")

  """Entry point for launching an IPython kernel.


[('woman', 0.6220847368240356),
 ('lady', 0.5995252132415771),
 ('lad', 0.5814304351806641),
 ('monk', 0.5453358888626099),
 ('men', 0.5275775790214539),
 ('guy', 0.5240290760993958),
 ('soldier', 0.522733747959137),
 ('millionaire', 0.5168834924697876),
 ('businessman', 0.5168582797050476),
 ('doctor', 0.5096631050109863)]

In [61]:
model.most_similar("awful")

  """Entry point for launching an IPython kernel.


[('terrible', 0.7869566679000854),
 ('atrocious', 0.7530655860900879),
 ('horrible', 0.7484368085861206),
 ('abysmal', 0.7052768468856812),
 ('dreadful', 0.7043105959892273),
 ('horrendous', 0.6878989934921265),
 ('horrid', 0.6714105606079102),
 ('appalling', 0.6688593626022339),
 ('lousy', 0.6357158422470093),
 ('amateurish', 0.6351268291473389)]