## Import required libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords

## Read data

In [2]:
# All three files are read with identical parsing options: tab-separated,
# first row as header, and quoting=3 (csv.QUOTE_NONE) so that quote
# characters in the reviews are kept as ordinary text.
tsv_options = dict(header=0, delimiter='\t', quoting=3)

train = pd.read_csv('labeledTrainData.tsv', **tsv_options)
test = pd.read_csv('testData.tsv', **tsv_options)
unlabeled_train = pd.read_csv('unlabeledTrainData.tsv', **tsv_options)

In [3]:
# Total number of reviews across the labeled, test and unlabeled sets.
frames = (train, test, unlabeled_train)
print(sum(frame.review.size for frame in frames))

100000


## String cleaning

In [7]:
from bs4 import BeautifulSoup
def review_to_wordlist(review, remove_stopwords=False):
    """Convert one piece of raw review text into a list of lowercase words.

    The text is first stripped of HTML markup with BeautifulSoup (html5lib
    parser). Every character that is not an ASCII letter is then replaced by
    a space, and the result is lower-cased and split on whitespace.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML markup.
    remove_stopwords : bool, optional
        When true, words present in NLTK's English stop-word list are
        dropped from the result. Defaults to False.

    Returns
    -------
    list of str
        The remaining words, in their original order.
    """
    plain_text = BeautifulSoup(review, 'html5lib').get_text()
    letters_only = re.sub("[^A-Za-z]", " ", plain_text)
    tokens = letters_only.lower().split()

    if not remove_stopwords:
        return tokens

    # A set gives constant-time membership checks during filtering.
    english_stops = set(stopwords.words('english'))
    return [token for token in tokens if token not in english_stops]

## Convert paragraphs to single sentences

In [11]:
import nltk.data
# Sentence tokenizer loaded from NLTK's English Punkt pickle; it is passed
# explicitly to review_to_sentences below.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and return each one as a word list.

    Parameters
    ----------
    review : str
        Raw review text; leading and trailing whitespace is stripped before
        tokenizing.
    tokenizer : object
        Sentence splitter exposing a ``tokenize(text)`` method that returns
        the individual sentences.
    remove_stopwords : bool, optional
        Forwarded unchanged to ``review_to_wordlist``. Defaults to False.

    Returns
    -------
    list of list of str
        One word list per non-empty sentence, in sentence order.
    """
    return [
        review_to_wordlist(sentence_text, remove_stopwords)
        for sentence_text in tokenizer.tokenize(review.strip())
        if len(sentence_text) > 0  # skip empty sentences
    ]

## Prepare data for word2vec

In [12]:
# Each entry pairs the progress message with the review column to parse.
corpora = [
    ('parsing sentences from training data set..', train.review),
    ('parsing sentences from unlabeled data set..', unlabeled_train.review),
]

# Flat list of tokenized sentences (one word list per sentence) drawn from
# both corpora; this is the training input for Word2Vec.
sentences = []
for banner, reviews in corpora:
    print(banner)
    for review in reviews:
        # extend (not append) so each sentence becomes its own element.
        sentences.extend(review_to_sentences(review, tokenizer))

parsing sentences from training data set..


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


parsing sentences from unlabeled data set..


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


#### A minor detail to note is the difference between "+=" and "append" when it comes to Python lists. In many applications the two are interchangeable, but here they are not. `review_to_sentences` returns a list of sentences (a list of lists); "append" would add that whole list to `sentences` as a single nested element, whereas "+=" extends `sentences` with each sentence individually, which is what we need here.

## Train and save model

In [27]:
import logging

# Configure logging first so that gensim's INFO-level progress messages
# appear in the cell output.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Word2Vec hyper-parameters
num_features = 300     # word vector dimensionality
min_word_count = 40    # minimum word count
num_workers = 4        # number of threads to run in parallel
context = 10           # context window size
downsampling = 1e-3    # downsample setting for frequent words

# Initialize and train the model on the parsed sentences.
from gensim.models import word2vec

print('training model..')
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# NOTE(review): per the gensim documentation, init_sims(replace=True)
# normalizes the vectors in place to save memory, after which the model
# should not be trained further -- confirm against the installed version.
model.init_sims(replace=True)

# Persist the trained model under a name that records its settings.
model_name = "300features_40minwords_10context"
model.save(model_name)

2017-09-16 19:56:19,560 : INFO : collecting all words and their counts
2017-09-16 19:56:19,567 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-09-16 19:56:19,670 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2017-09-16 19:56:19,755 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types


training model..


2017-09-16 19:56:19,855 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
2017-09-16 19:56:19,952 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2017-09-16 19:56:20,038 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2017-09-16 19:56:20,127 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2017-09-16 19:56:20,226 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2017-09-16 19:56:20,317 : INFO : PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2017-09-16 19:56:20,404 : INFO : PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
2017-09-16 19:56:20,495 : INFO : PROGRESS: at sentence #100000, processed 2226967 words, keeping 50207 word types
2017-09-16 19:56:20,582 : INFO : PROGRESS: at sentence #110000, processed 2446581 words, keeping 

2017-09-16 19:56:26,577 : INFO : PROGRESS: at sentence #750000, processed 16771406 words, keeping 120295 word types
2017-09-16 19:56:26,671 : INFO : PROGRESS: at sentence #760000, processed 16990810 words, keeping 120930 word types
2017-09-16 19:56:26,768 : INFO : PROGRESS: at sentence #770000, processed 17217947 words, keeping 121703 word types
2017-09-16 19:56:26,865 : INFO : PROGRESS: at sentence #780000, processed 17448093 words, keeping 122402 word types
2017-09-16 19:56:26,962 : INFO : PROGRESS: at sentence #790000, processed 17675169 words, keeping 123066 word types
2017-09-16 19:56:27,023 : INFO : collected 123504 word types from a corpus of 17798270 raw words and 795538 sentences
2017-09-16 19:56:27,025 : INFO : Loading a fresh vocabulary
2017-09-16 19:56:27,166 : INFO : min_count=40 retains 16490 unique words (13% of original 123504, drops 107014)
2017-09-16 19:56:27,167 : INFO : min_count=40 leaves 17239125 word corpus (96% of original 17798270, drops 559145)
2017-09-16 19:5

2017-09-16 19:57:35,121 : INFO : PROGRESS: at 31.73% examples, 300966 words/s, in_qsize 6, out_qsize 1
2017-09-16 19:57:36,167 : INFO : PROGRESS: at 32.24% examples, 301174 words/s, in_qsize 6, out_qsize 1
2017-09-16 19:57:37,175 : INFO : PROGRESS: at 32.72% examples, 301229 words/s, in_qsize 7, out_qsize 0
2017-09-16 19:57:38,199 : INFO : PROGRESS: at 33.22% examples, 301424 words/s, in_qsize 7, out_qsize 0
2017-09-16 19:57:39,224 : INFO : PROGRESS: at 33.72% examples, 301506 words/s, in_qsize 8, out_qsize 1
2017-09-16 19:57:40,275 : INFO : PROGRESS: at 34.23% examples, 301674 words/s, in_qsize 7, out_qsize 0
2017-09-16 19:57:41,299 : INFO : PROGRESS: at 34.72% examples, 301761 words/s, in_qsize 7, out_qsize 0
2017-09-16 19:57:42,315 : INFO : PROGRESS: at 35.23% examples, 301963 words/s, in_qsize 8, out_qsize 0
2017-09-16 19:57:43,329 : INFO : PROGRESS: at 35.71% examples, 301983 words/s, in_qsize 8, out_qsize 0
2017-09-16 19:57:44,335 : INFO : PROGRESS: at 36.21% examples, 302117 wor

2017-09-16 19:58:56,463 : INFO : PROGRESS: at 70.52% examples, 302592 words/s, in_qsize 7, out_qsize 0
2017-09-16 19:58:57,487 : INFO : PROGRESS: at 71.01% examples, 302622 words/s, in_qsize 8, out_qsize 0
2017-09-16 19:58:58,492 : INFO : PROGRESS: at 71.48% examples, 302641 words/s, in_qsize 7, out_qsize 0
2017-09-16 19:58:59,503 : INFO : PROGRESS: at 71.96% examples, 302654 words/s, in_qsize 7, out_qsize 0
2017-09-16 19:59:00,524 : INFO : PROGRESS: at 72.46% examples, 302734 words/s, in_qsize 7, out_qsize 0
2017-09-16 19:59:01,533 : INFO : PROGRESS: at 72.91% examples, 302609 words/s, in_qsize 8, out_qsize 0
2017-09-16 19:59:02,577 : INFO : PROGRESS: at 73.39% examples, 302553 words/s, in_qsize 8, out_qsize 0
2017-09-16 19:59:03,595 : INFO : PROGRESS: at 73.86% examples, 302503 words/s, in_qsize 8, out_qsize 0
2017-09-16 19:59:04,621 : INFO : PROGRESS: at 74.33% examples, 302441 words/s, in_qsize 7, out_qsize 0
2017-09-16 19:59:05,645 : INFO : PROGRESS: at 74.79% examples, 302335 wor

In [14]:
# Sanity check: ask the model which of these words fits least with the rest.
candidate_words = "pizza pasta cheese gym".split()
model.doesnt_match(candidate_words)



'cheese'

In [28]:
from gensim.models import Word2Vec

# Reload the model saved by the training cell, using the same file name.
saved_model_path = "300features_40minwords_10context"
Model = Word2Vec.load(saved_model_path)

2017-09-16 20:00:16,032 : INFO : loading Word2Vec object from 300features_40minwords_10context
2017-09-16 20:00:25,460 : INFO : loading wv recursively from 300features_40minwords_10context.wv.* with mmap=None
2017-09-16 20:00:25,462 : INFO : setting ignored attribute syn0norm to None
2017-09-16 20:00:25,466 : INFO : setting ignored attribute cum_table to None
2017-09-16 20:00:25,468 : INFO : loaded 300features_40minwords_10context


In [29]:
# Inspect the type of the reloaded model's syn0_lockf attribute.
type(Model.syn0_lockf)

numpy.ndarray

In [33]:
# Display the syn0_lockf array itself.
# NOTE(review): presumably gensim's per-word lock factors for the input
# vectors -- confirm against the gensim documentation for this version.
Model.syn0_lockf

array([ 1.,  1.,  1., ...,  1.,  1.,  1.], dtype=float32)