In [1]:
# Word2vec, published by Google in 2013, is a neural network implementation that 
# learns distributed representations for words. Other deep or recurrent neural
# network architectures had been proposed for learning word representations prior to this,
# but the major problem with these was the long time required to train the models. 
# Word2vec learns quickly relative to other models.

# Both Google's version and the Python version rely on multi-threading.
# In order to train your model in a reasonable amount of time, you will
# need to install cython.
import pandas as pd

# Load the three tab-separated review files into DataFrames.
def _load_reviews(path):
    """Read one review file: first row is the header, fields are
    tab-separated, and quoting=3 (csv.QUOTE_NONE) leaves quote characters
    inside the text untouched."""
    return pd.read_csv(path, header=0, delimiter="\t", quoting=3)

train = _load_reviews("labeledTrainData.tsv")
test = _load_reviews("testData.tsv")
unlabeled_train = _load_reviews("unlabeledTrainData.tsv")

# Sanity check: report how many reviews each file contributed
# (the three counts should add up to 100,000).
review_counts = (
    train["review"].size,
    test["review"].size,
    unlabeled_train["review"].size,
)
print("Read %d labeled train reviews, %d labeled test reviews, "
      "and %d unlabeled reviews\n" % review_counts)

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



In [10]:
# To train Word2Vec it is better not to remove stop words because 
# the algorithm relies on the broader context of the sentence in 
# order to produce high-quality word vectors.

# Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist(review, remove_stopwords=False, remove_nums=False):
    """Convert a document into a list of lower-cased words.

    Args:
        review: Raw text, possibly containing HTML markup.
        remove_stopwords: When True, drop NLTK's English stop words from
            the result (off by default).
        remove_nums: When True, replace every character that is not an
            ASCII letter (digits, punctuation, ...) with a space before
            splitting.

    Returns:
        The words of the document, in order, as a list of strings. With
        remove_nums=False the text is split on whitespace only, so
        punctuation stays attached to its word (e.g. "man,").
    """
    # 1. Remove HTML. The parser is named explicitly so that bs4 does not
    #    pick whichever parser happens to be installed (and warn about it),
    #    which would make the extracted text environment-dependent.
    review_text = BeautifulSoup(review, "html.parser").get_text()

    # 2. Optionally blank out everything that is not a letter.
    if remove_nums:
        review_text = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Lower-case and split on whitespace.
    words = review_text.lower().split()

    # 4. Optionally remove stop words; a set gives O(1) membership tests.
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # 5. Return the list of words.
    return words

In [11]:
# Word2Vec expects its input as a list of sentences, where each sentence
# is itself a list of words (a list of lists).
# Load NLTK's English punkt sentence tokenizer; it is handed to
# review_to_sentences to split each review into sentences.
# NOTE(review): presumably requires the punkt data to have been fetched
# with nltk.download() beforehand - confirm for your environment.
import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [12]:
# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a review into sentences and tokenize each one.

    The whitespace-trimmed review is handed to tokenizer.tokenize() to get
    the raw sentences; every non-empty raw sentence is then converted to a
    word list by review_to_wordlist (forwarding remove_stopwords).

    Returns a list of sentences, each of which is a list of words.
    """
    return [
        review_to_wordlist(raw, remove_stopwords)
        for raw in tokenizer.tokenize(review.strip())
        if raw  # skip empty sentences
    ]


In [13]:
# Collect the tokenized sentences of every review, first from the labeled
# training set and then from the unlabeled set, into one flat list.
sentences = []
for banner, corpus in (
    ("Parsing sentences from training set", train),
    ("Parsing sentences from unlabeled set", unlabeled_train),
):
    print(banner)
    for review in corpus["review"]:
        sentences.extend(review_to_sentences(review, tokenizer))

Parsing sentences from training set


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Parsing sentences from unlabeled set


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [14]:
# Total number of tokenized sentences gathered from both review sets
len(sentences)

795538

In [15]:
# Configure the standard logging module first so that gensim's INFO
# messages (vocabulary scan, per-epoch progress) are shown while training.
import logging
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Training hyper-parameters
num_features = 300     # dimensionality of each word vector
min_word_count = 40    # words seen fewer times than this are dropped
num_workers = 4        # number of parallel training threads
context = 10           # context window size
downsampling = 1e-3    # downsampling setting for very frequent words

# Build the vocabulary and train the model (this takes a while).
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# No further training is planned, so let init_sims make the model
# much more memory-efficient.
model.init_sims(replace=True)

# Save under a name that encodes the settings; the model can be read back
# later with Word2Vec.load().
model_name = "300features_40minwords_10context"
model.save(model_name)

2019-03-03 16:00:41,307 : INFO : 'pattern' package not found; tag filters are not available for English
2019-03-03 16:00:41,313 : INFO : collecting all words and their counts
2019-03-03 16:00:41,314 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-03-03 16:00:41,370 : INFO : PROGRESS: at sentence #10000, processed 219242 words, keeping 32665 word types
2019-03-03 16:00:41,427 : INFO : PROGRESS: at sentence #20000, processed 438623 words, keeping 51663 word types
2019-03-03 16:00:41,482 : INFO : PROGRESS: at sentence #30000, processed 651476 words, keeping 66881 word types


Training model...


2019-03-03 16:00:41,544 : INFO : PROGRESS: at sentence #40000, processed 871114 words, keeping 80990 word types
2019-03-03 16:00:41,603 : INFO : PROGRESS: at sentence #50000, processed 1083691 words, keeping 93535 word types
2019-03-03 16:00:41,659 : INFO : PROGRESS: at sentence #60000, processed 1298869 words, keeping 104807 word types
2019-03-03 16:00:41,717 : INFO : PROGRESS: at sentence #70000, processed 1515513 words, keeping 115640 word types
2019-03-03 16:00:41,773 : INFO : PROGRESS: at sentence #80000, processed 1728384 words, keeping 125785 word types
2019-03-03 16:00:41,833 : INFO : PROGRESS: at sentence #90000, processed 1945448 words, keeping 136196 word types
2019-03-03 16:00:41,893 : INFO : PROGRESS: at sentence #100000, processed 2160633 words, keeping 145760 word types
2019-03-03 16:00:41,949 : INFO : PROGRESS: at sentence #110000, processed 2373736 words, keeping 154951 word types
2019-03-03 16:00:42,005 : INFO : PROGRESS: at sentence #120000, processed 2589502 words, 

2019-03-03 16:00:46,114 : INFO : PROGRESS: at sentence #760000, processed 16481533 words, keeping 563563 word types
2019-03-03 16:00:46,176 : INFO : PROGRESS: at sentence #770000, processed 16701810 words, keeping 568626 word types
2019-03-03 16:00:46,239 : INFO : PROGRESS: at sentence #780000, processed 16924889 words, keeping 573568 word types
2019-03-03 16:00:46,299 : INFO : PROGRESS: at sentence #790000, processed 17144985 words, keeping 578535 word types
2019-03-03 16:00:46,335 : INFO : collected 581308 word types from a corpus of 17264346 raw words and 795538 sentences
2019-03-03 16:00:46,336 : INFO : Loading a fresh vocabulary
2019-03-03 16:00:46,582 : INFO : min_count=40 retains 20587 unique words (3% of original 581308, drops 560721)
2019-03-03 16:00:46,582 : INFO : min_count=40 leaves 15706923 word corpus (90% of original 17264346, drops 1557423)
2019-03-03 16:00:46,654 : INFO : deleting the raw counts dictionary of 581308 items
2019-03-03 16:00:46,668 : INFO : sample=0.001 d

2019-03-03 16:01:31,778 : INFO : EPOCH - 4 : training on 17264346 raw words (11695116 effective words) took 11.5s, 1016121 effective words/s
2019-03-03 16:01:32,790 : INFO : EPOCH 5 - PROGRESS: at 8.01% examples, 929917 words/s, in_qsize 7, out_qsize 0
2019-03-03 16:01:33,794 : INFO : EPOCH 5 - PROGRESS: at 16.76% examples, 970145 words/s, in_qsize 7, out_qsize 0
2019-03-03 16:01:34,799 : INFO : EPOCH 5 - PROGRESS: at 25.41% examples, 980939 words/s, in_qsize 7, out_qsize 0
2019-03-03 16:01:35,805 : INFO : EPOCH 5 - PROGRESS: at 33.98% examples, 982345 words/s, in_qsize 7, out_qsize 0
2019-03-03 16:01:36,806 : INFO : EPOCH 5 - PROGRESS: at 42.37% examples, 983214 words/s, in_qsize 7, out_qsize 0
2019-03-03 16:01:37,818 : INFO : EPOCH 5 - PROGRESS: at 50.73% examples, 980856 words/s, in_qsize 7, out_qsize 0
2019-03-03 16:01:38,818 : INFO : EPOCH 5 - PROGRESS: at 59.20% examples, 983614 words/s, in_qsize 7, out_qsize 0
2019-03-03 16:01:39,820 : INFO : EPOCH 5 - PROGRESS: at 67.75% exampl

In [16]:
# Which word is the odd one out? Query the KeyedVectors (model.wv);
# calling doesnt_match on the Word2Vec object itself is deprecated.
model.wv.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.


'kitchen'

In [17]:
# Odd-one-out among place names; queried via model.wv because the
# method on the Word2Vec object itself is deprecated.
model.wv.doesnt_match("france england germany berlin".split())

  """Entry point for launching an IPython kernel.


'berlin'

In [18]:
# Nearest neighbours of "man"; queried via model.wv because
# most_similar on the Word2Vec object itself is deprecated.
model.wv.most_similar("man")

  """Entry point for launching an IPython kernel.


[('man,', 0.800133228302002),
 ('man.', 0.7126742601394653),
 ('woman', 0.6832668781280518),
 ('soldier', 0.6417524814605713),
 ('lady', 0.6401040554046631),
 ('lad', 0.6261498332023621),
 ('monk', 0.6240255236625671),
 ('doctor', 0.6199002265930176),
 ('boy', 0.6128771305084229),
 ("man's", 0.6039730906486511)]

In [19]:
# Nearest neighbours of "queen"; queried via model.wv because
# most_similar on the Word2Vec object itself is deprecated.
model.wv.most_similar("queen")

  """Entry point for launching an IPython kernel.


[('princess', 0.7634856700897217),
 ('sylvia', 0.7149850130081177),
 ('queen,', 0.7045449614524841),
 ('maid', 0.6910901069641113),
 ('countess', 0.6799722909927368),
 ('prince', 0.6751097440719604),
 ('mrs.', 0.6603979468345642),
 ('bride', 0.6592644453048706),
 ('belle', 0.6546525955200195),
 ('maria', 0.6522542238235474)]

In [20]:
# Nearest neighbours of "awful"; queried via model.wv because
# most_similar on the Word2Vec object itself is deprecated.
model.wv.most_similar("awful")

  """Entry point for launching an IPython kernel.


[('atrocious', 0.744181752204895),
 ('terrible', 0.7220227718353271),
 ('horrible', 0.7034926414489746),
 ('awful,', 0.6793063282966614),
 ('dreadful', 0.6791330575942993),
 ('abysmal', 0.6705625057220459),
 ('amateurish', 0.6406466364860535),
 ('horrid', 0.6367478370666504),
 ('appalling', 0.635163426399231),
 ('horrendous', 0.6152829527854919)]

In [21]:
# Reload the trained model from disk, using the same file name it was
# saved under after training.
from gensim.models import Word2Vec
model = Word2Vec.load("300features_40minwords_10context")

2019-03-03 16:09:30,218 : INFO : loading Word2Vec object from 300features_40minwords_10context
2019-03-03 16:09:30,576 : INFO : loading wv recursively from 300features_40minwords_10context.wv.* with mmap=None
2019-03-03 16:09:30,576 : INFO : setting ignored attribute vectors_norm to None
2019-03-03 16:09:30,577 : INFO : loading vocabulary recursively from 300features_40minwords_10context.vocabulary.* with mmap=None
2019-03-03 16:09:30,578 : INFO : loading trainables recursively from 300features_40minwords_10context.trainables.* with mmap=None
2019-03-03 16:09:30,578 : INFO : setting ignored attribute cum_table to None
2019-03-03 16:09:30,579 : INFO : loaded 300features_40minwords_10context


In [22]:
# The learned word vectors live in model.wv.vectors
# (`syn0` is a deprecated alias for the same array).
type(model.wv.vectors)

  """Entry point for launching an IPython kernel.


numpy.ndarray

In [23]:
# One row per vocabulary word, one column per feature
# (`syn0` is a deprecated alias for `vectors`).
model.wv.vectors.shape

  """Entry point for launching an IPython kernel.


(20587, 300)

In [25]:
# Look up a single word vector through model.wv; indexing the Word2Vec
# object directly is deprecated.
model.wv["flower"].shape

  """Entry point for launching an IPython kernel.


(300,)