# Data Processing Functions for Word2Vec

In [49]:
import pandas as pd
import numpy as np

In [50]:
# importing all necessary modules 
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords
import warnings 
  
warnings.filterwarnings(action = 'ignore') 
  
import gensim 
from gensim.models import Word2Vec 
  
#  Reads ‘alice.txt’ file 
sample = open("../DATA_Store/frankenstein_fulltext.txt", "r", encoding="utf8") 
s = sample.read() 
  
# Replaces escape character with space 
f = s.replace("\n", " ")
  
data = [] 
  
# iterate through each sentence in the file 
for i in sent_tokenize(f): 
    temp = [] 
      
    # tokenize the sentence into words 
    for j in word_tokenize(i): 
        temp.append(j.lower())  # all words are lower case
  
    data.append(temp) 

In [55]:
# NEED TO IMPLEMENT PORTER STEMMER and REMOVE PUNCTUATION

In [56]:
# remove stop words
stop_words = set(stopwords.words('english')) 
filtered_data = [[w for w in data[i] if not w in stop_words] for i in range(len(data))] 

In [57]:
filtered_data

[['mrs.', 'saville', ',', 'england', '.'],
 ['st.', 'petersburgh', ',', 'dec.', '11th', ',', '17—', '.'],
 ['rejoice',
  'hear',
  'disaster',
  'accompanied',
  'commencement',
  'enterprise',
  'regarded',
  'evil',
  'forebodings',
  '.'],
 ['arrived',
  'yesterday',
  ',',
  'first',
  'task',
  'assure',
  'dear',
  'sister',
  'welfare',
  'increasing',
  'confidence',
  'success',
  'undertaking',
  '.'],
 ['already',
  'far',
  'north',
  'london',
  ',',
  'walk',
  'streets',
  'petersburgh',
  ',',
  'feel',
  'cold',
  'northern',
  'breeze',
  'play',
  'upon',
  'cheeks',
  ',',
  'braces',
  'nerves',
  'fills',
  'delight',
  '.'],
 ['understand', 'feeling', '?'],
 ['breeze',
  ',',
  'travelled',
  'regions',
  'towards',
  'advancing',
  ',',
  'gives',
  'foretaste',
  'icy',
  'climes',
  '.'],
 ['inspirited',
  'wind',
  'promise',
  ',',
  'daydreams',
  'become',
  'fervent',
  'vivid',
  '.'],
 ['try',
  'vain',
  'persuaded',
  'pole',
  'seat',
  'frost',
  'd

In [58]:
len(filtered_data)

3083

In [59]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 10   # Minimum word count                        
num_workers = -1      # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print ("Training model...")
model = word2vec.Word2Vec(filtered_data, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

2019-11-06 08:25:26,835 : INFO : collecting all words and their counts
2019-11-06 08:25:26,836 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-06 08:25:26,853 : INFO : collected 7165 word types from a corpus of 44635 raw words and 3083 sentences
2019-11-06 08:25:26,853 : INFO : Loading a fresh vocabulary
2019-11-06 08:25:26,864 : INFO : effective_min_count=10 retains 814 unique words (11% of original 7165, drops 6351)
2019-11-06 08:25:26,866 : INFO : effective_min_count=10 leaves 29755 word corpus (66% of original 44635, drops 14880)
2019-11-06 08:25:26,870 : INFO : deleting the raw counts dictionary of 7165 items
2019-11-06 08:25:26,871 : INFO : sample=0.001 downsamples 34 most-common words
2019-11-06 08:25:26,871 : INFO : downsampling leaves estimated 20601 word corpus (69.2% of prior 29755)
2019-11-06 08:25:26,875 : INFO : estimated required memory for 814 words and 300 dimensions: 2360600 bytes
2019-11-06 08:25:26,877 : INFO : resetting layer weig

Training model...


In [66]:
model.most_similar("frankenstein")

[('scenes', 0.16669809818267822),
 ('allow', 0.16083307564258575),
 ('affection', 0.15596412122249603),
 ('former', 0.14493198692798615),
 ('found', 0.1414320170879364),
 ('m.', 0.138254776597023),
 ('story', 0.13059641420841217),
 ('letters', 0.12909497320652008),
 ('beheld', 0.12585705518722534),
 ('union', 0.1223391592502594)]

# TSNE Analysis

In [65]:
model

<gensim.models.word2vec.Word2Vec at 0x1fa048e3208>