# Creating Word Vectors Using word2vec in Python

In this first example we will use the Gutenberg corpus of 18 books.

In [2]:
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
from bokeh.io import output_notebook
from bokeh.plotting import show, figure
%matplotlib inline



#### Using the Gutenberg Corpus
The Gutenberg Corpus is a set of 18 books available in NLTK for research purposes.
NLTK provides utility methods to directly access the sentences and tokens in the corpus, without needing to run the sentence tokenizer.

In [6]:
from nltk.corpus import gutenberg

In [7]:
# Count every token across all of the texts in the Gutenberg corpus.
corpus_tokens = gutenberg.words()
print(len(corpus_tokens))

2621613


In [8]:
# Sentence-level view of the corpus: every sentence is a list of token strings.
sentences = gutenberg.sents()
print(len(sentences))
print(sentences[:5])
# Peek at a single token: index 7 of the sentence at index 4.
print(sentences[4][7])

98552
[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'], ['VOLUME', 'I'], ['CHAPTER', 'I'], ['Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',', 'with', 'a', 'comfortable', 'home', 'and', 'happy', 'disposition', ',', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessings', 'of', 'existence', ';', 'and', 'had', 'lived', 'nearly', 'twenty', '-', 'one', 'years', 'in', 'the', 'world', 'with', 'very', 'little', 'to', 'distress', 'or', 'vex', 'her', '.'], ['She', 'was', 'the', 'youngest', 'of', 'the', 'two', 'daughters', 'of', 'a', 'most', 'affectionate', ',', 'indulgent', 'father', ';', 'and', 'had', ',', 'in', 'consequence', 'of', 'her', 'sister', "'", 's', 'marriage', ',', 'been', 'mistress', 'of', 'his', 'house', 'from', 'a', 'very', 'early', 'period', '.']]
daughters


#### Applying pre-processing

In [None]:
def remove_punctuation(corpus):
    """Drop every token that is made up solely of punctuation characters.

    The previous implementation tested ``token in punctuations`` against a
    string, which is a *substring* test: a multi-character token such as
    ``'--'`` or ``'?!'`` was only removed if it happened to appear as a
    contiguous run inside the punctuation string. Each character is now
    checked individually against a set of punctuation characters.

    Args:
        corpus: iterable of string tokens (e.g. one tokenized sentence).

    Returns:
        A new list with the punctuation-only tokens removed, in the original
        order. Empty-string tokens are removed as well (as before).
    """
    # Same characters as the original literal, without the invalid escape
    # sequences (``\^``, ``\*``, ``\-``) that trigger warnings on newer Pythons.
    punctuation_chars = frozenset(".,\"-\\/#!?$%^&*;:{}=_'~()")
    return [
        token for token in corpus
        if not all(char in punctuation_chars for char in token)
    ]

def apply_stopwording(corpus, min_len, stop_words=None):
    """Lower-case the tokens and drop stop words and short tokens.

    Two issues of the previous version are addressed:

    * the stop-word test ran on the token *before* lower-casing, so
      capitalised stop words (``'This'``, ``'Their'``...) were never matched
      against a lower-case stop-word list;
    * ``stopwords.words('english')`` was evaluated once per token; it is now
      evaluated at most once per call and turned into a set for fast lookup.

    Args:
        corpus: iterable of string tokens.
        min_len: only tokens with ``len(token) > min_len`` are kept.
        stop_words: optional iterable of lower-case stop words. Defaults to
            NLTK's English list; pass a pre-built collection when calling
            this function in a loop to avoid reloading the list every call.

    Returns:
        List of the surviving tokens, lower-cased, in their original order.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = frozenset(stop_words)

    filtered_corpus = []
    for token in corpus:
        lowered = token.lower()
        # Compare the lower-cased form so capitalised stop words are caught too.
        if len(token) > min_len and lowered not in stop_words:
            filtered_corpus.append(lowered)
    return filtered_corpus

def apply_lemmatization(corpus):
    """Lemmatize every token of ``corpus`` with NLTK's WordNetLemmatizer.

    Returns a new list with one lemmatized token per input token, in the
    same order.
    """
    wordnet_lemmatizer = nltk.WordNetLemmatizer()
    return list(map(wordnet_lemmatizer.lemmatize, corpus))

In [None]:
# Run every sentence through the cleaning pipeline:
# punctuation removal -> stop-word / short-token filtering (min_len=3) -> lemmatization.
g_sentences = [
    apply_lemmatization(apply_stopwording(remove_punctuation(sentence), 3))
    for sentence in sentences
]

In [None]:
# Show the first five pre-processed sentences.
print(g_sentences[:5])

#### Creating the Word2Vec model
Using the sentences extracted in the previous step, we will create the Word2Vec model. Keep in mind we don't have a large corpus to generate the w2v model, so I am not expecting great results.

Parameters:
  - Sentences: the list of sentences
  - size: the # of dimensions of the Word2Vec space being generated
  - sg (skip grams): we are going to use the Skip Gram algorithm (this is a small dataset)
  - window: window size for the skip grams
  - min_count: minimum number of times a word must appear to be considered
  - seed: for reproducibility
  - workers: CPU cores to use for running the model

In [9]:
# Train a skip-gram (sg=1) Word2Vec model with 64-dimensional vectors.
# NOTE(review): this trains on the raw `sentences`, not the pre-processed
# `g_sentences` built above -- confirm that is intended.
# NOTE(review): per the gensim docs, `seed` alone may not give bit-identical
# runs when workers > 1 -- confirm if exact reproducibility matters.
w2v_model = Word2Vec(
    sentences=sentences,
    size=64,
    sg=1,
    window=10,
    min_count=5,
    seed=20,
    workers=4,
)

#You can save the model so you can reuse it later
#w2v_model.save('./models/gutenberg.w2v')

#You can reload a saved model
#w2v_model = gensim.models.Word2Vec.load('./models/gutenberg.w2v')

In [10]:
# Size of the vocabulary held by the trained model.
print(len(w2v_model.wv.vocab))

17011


In [11]:
# Each term is a vector in a 64-dimensional space.
# Look the vector up through `.wv` (the model's KeyedVectors): indexing the
# model object directly is deprecated in gensim and removed in 4.0.
len(w2v_model.wv['whale'])

  


64

In [14]:
# Try words like 'ship', 'day', 'father'
# Queried through `.wv`: the model-level `most_similar` is deprecated in gensim.
w2v_model.wv.most_similar('ship')

  


[('Pequod', 0.8435871601104736),
 ('boat', 0.8204550743103027),
 ('sail', 0.818461000919342),
 ('boats', 0.7798950672149658),
 ('whale', 0.7710714936256409),
 ('alongside', 0.7594534158706665),
 ('sailed', 0.7512128949165344),
 ('fore', 0.7473894953727722),
 ('craft', 0.7312824130058289),
 ('wrecked', 0.7290410995483398)]

In [16]:
# Which term is the odd one out among the listed concepts?
# All queries go through `.wv`: the model-level `doesnt_match` / `similarity`
# shortcuts are deprecated in gensim and emit warnings.
print(w2v_model.wv.doesnt_match(['ship','boat','craft']))
print(w2v_model.wv.similarity('ship','boat'))
print(w2v_model.wv.similarity('ship','craft'))
print(w2v_model.wv.similarity('boat','craft'))


ship
0.820455071013
0.731282419685
0.824999268123


  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


In [20]:
# Let's try an operation similar to the famous king - man + woman = queen
# Here we try husband - man + woman = ? and list the 30 nearest terms.
# Queried through `.wv`: the model-level `most_similar` is deprecated in gensim.
print(w2v_model.wv.most_similar(positive=['husband','woman'], negative=['man'],topn=30))

[('wife', 0.732563853263855), ('conceived', 0.7277275323867798), ('child', 0.6995440125465393), ('daughter', 0.6981982588768005), ('sister', 0.6927369832992554), ('mother', 0.6680464148521423), ('adultery', 0.6381374597549438), ('Sarai', 0.6335729360580444), ('maid', 0.6334989070892334), ('womb', 0.6312903165817261), ('whoredoms', 0.6200411319732666), ('bare', 0.6141468286514282), ('Rachel', 0.6133252382278442), ('widow', 0.609911322593689), ('Tamar', 0.6098790168762207), ('elder', 0.60738205909729), ('Sarah', 0.6039078235626221), ('marry', 0.6033474206924438), ('loved', 0.6026597023010254), ('damsel', 0.6025072336196899), ('nurse', 0.6019350290298462), ('brother', 0.5941097736358643), ('harlot', 0.593155026435852), ('Rebekah', 0.5928415656089783), ('married', 0.5917892456054688), ('betrothed', 0.5879618525505066), ('committeth', 0.5867643356323242), ('Abram', 0.5811938643455505), ('eldest', 0.5793421268463135), ('herself', 0.5786761045455933)]


  This is separate from the ipykernel package so we can avoid doing imports until


In [21]:
# son - man + woman = ?  (queried through `.wv`; the model-level call is deprecated in gensim)
print(w2v_model.wv.most_similar(positive=['son','woman'], negative=['man']))

[('daughter', 0.767130970954895), ('Sarai', 0.7470760941505432), ('conceived', 0.7444008588790894), ('Leah', 0.7269483804702759), ('wife', 0.7249138951301575), ('Hagar', 0.7184933423995972), ('Rachel', 0.7184392213821411), ('Sarah', 0.7154568433761597), ('Bilhah', 0.7103646397590637), ('Joseph', 0.7061225771903992)]


  """Entry point for launching an IPython kernel.


In [22]:
# husband - man + woman = ?  (default top 10; queried through `.wv`, the model-level call is deprecated in gensim)
print(w2v_model.wv.most_similar(positive=['husband','woman'], negative=['man']))

[('wife', 0.732563853263855), ('conceived', 0.7277275323867798), ('child', 0.6995440125465393), ('daughter', 0.6981982588768005), ('sister', 0.6927369832992554), ('mother', 0.6680464148521423), ('adultery', 0.6381374597549438), ('Sarai', 0.6335729360580444), ('maid', 0.6334989070892334), ('womb', 0.6312903165817261)]


  """Entry point for launching an IPython kernel.


## Reducing the dimensionality with t-SNE
We covered t-SNE in a previous lecture and used it as a dimensionality-reduction method to project high-dimensional data onto a plane (2D) or into a cube (3D). Here we apply t-SNE to the word vectors to transform the 64-dimensional space into a 2D space.

In [23]:
# Retrieve the vector of every vocabulary word from the 64-dimensional space.
# Indexing goes through `.wv`: indexing the model object directly is deprecated in gensim.
X_64D = w2v_model.wv[w2v_model.wv.vocab]

  


In [24]:
# Project the 64-dimensional vectors onto 2D and load the result into a pandas DataFrame.
# random_state pins t-SNE's stochastic optimisation so that re-running the
# notebook reproduces the same layout.
# The iteration count is left at the library default (1000, the value that
# used to be passed explicitly) because the keyword's name differs between
# scikit-learn releases.
tSNE = TSNE(n_components=2, random_state=20)
X_2D = tSNE.fit_transform(X_64D)
x2D_df = pd.DataFrame(X_2D, columns=['x','y'])
# Materialise the vocabulary keys as a list so the column gets a plain sequence.
# Assumes X_64D was built by iterating this same vocab, so rows and words line up.
x2D_df['word'] = list(w2v_model.wv.vocab.keys())

#Save the pandas dataframe as CSV file
#x2D_df.to_csv('./data/w2vec_gutemberg.csv', index=False)

In [25]:
# Preview the first 10 projected words
x2D_df.head(n=10)

Unnamed: 0,x,y,word
0,-46.282333,-32.949806,[
1,9.751561,-62.076302,Emma
2,27.670023,-6.666984,by
3,14.622907,-62.510429,Jane
4,-46.239613,-32.930584,]
5,-27.949333,-26.615395,I
6,-45.188671,-22.265228,CHAPTER
7,15.243722,-62.110664,Woodhouse
8,26.899984,-7.178734,","
9,-3.838312,-44.267666,handsome


In [26]:
# Direct Bokeh's output to the notebook so that show() renders plots inline
output_notebook()

In [27]:
# Plot a random sample of the vocabulary (up to 17,000 words); lower the cap
# on a less powerful computer.
# Capping at the DataFrame length keeps .sample() from raising when the
# vocabulary holds fewer words than the cap, and random_state makes the
# selection repeatable.
sample_size = min(17000, len(x2D_df))
df = x2D_df.sample(n=sample_size, random_state=20)
plot = figure(plot_width=800, plot_height=800)
_ = plot.text(x=df.x, y=df.y, text=df.word)
show(plot)