# Creating Word Vectors with word2vec

### Load dependencies


In [0]:
# Install dependensies which are not present
!pip install -q gensim
!pip install -q bokeh

In [0]:
#NLTK
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import gutenberg
#Gensim
import gensim
from gensim.models.word2vec import Word2Vec
#TSNE
from sklearn.manifold import TSNE
#Pandas
import pandas as pd
#Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import show, figure
%matplotlib inline

In [17]:
# Download NLTK's 'punkt' resource: an English-language sentence tokenizer
# (not all periods end sentences; not all sentences start with a capital letter).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /content/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load data

In [0]:
# Download the 'gutenberg' resource through the NLTK downloader; the matching
# `gutenberg` corpus reader is imported from nltk.corpus.
nltk.download('gutenberg')

In [0]:
from nltk.corpus import gutenberg

### Tokenize text

In [0]:
# `sents()` is a convenient corpus-reader method: it handles newlines and
# tokenizes the text into sentences and words in one shot.
# presumably the result is a sequence of sentences, each a list of word tokens — verify
gberg_sents = gutenberg.sents()

## Word2Vec

### Run Word2Vec

In [0]:
# Quick tutorial: in IPython, a trailing `?` displays the object's docstring.
gensim.models.Word2Vec?

In [0]:
# Train word vectors on the tokenized Gutenberg sentences.
# Hyperparameters are listed one per line so each can be tuned independently.
model = Word2Vec(
    sentences=gberg_sents,
    size=64,       # length of each word vector (64 dimensions)
    sg=1,          # 1 selects the skip-gram architecture (per the gensim docs)
    window=10,     # context window size
    min_count=5,   # minimum token frequency threshold passed to Word2Vec
    seed=42,       # seed for the random number generator
    workers=8,     # number of worker threads
)

In [0]:
# Persist the trained model to disk so it can be reloaded later instead of re-trained.
model.save('raw_gutenberg_model.w2v')

### Explore model

In [0]:
# Reload the saved model; start from this cell to skip the training step.
model = Word2Vec.load('raw_gutenberg_model.w2v')

In [22]:
# Look up the learned vector for 'dog' through the KeyedVectors accessor
# (`model.wv`); indexing the model object directly is deprecated in gensim.
model.wv['dog']

  """Entry point for launching an IPython kernel.


array([-0.06484351,  0.17654455, -0.13176975, -0.372054  ,  0.4388689 ,
       -0.18276522,  0.19195509,  0.47073245, -0.02077552,  0.07562044,
        0.53683525,  0.12396031,  0.23490392,  0.14997801, -0.05025299,
       -0.16480976, -0.16057798,  0.02594238, -0.04009009, -0.13306007,
       -0.11199772,  0.04397428,  0.4276845 , -0.05954873, -0.32771993,
       -0.2381715 ,  0.4826798 , -0.25736794, -0.20787771,  0.30663237,
        0.0660858 , -0.33547777, -0.6442158 , -0.30506593,  0.70039743,
        0.33785313,  0.52094483, -0.32767165, -0.503962  , -0.459166  ,
       -0.5951493 ,  0.06718218, -0.21808636, -0.35992804,  0.12250103,
        0.00531692, -0.12928498, -0.0825882 , -0.4501762 , -0.03234098,
       -0.23300287,  0.15217711, -0.21890673, -0.04121516, -0.0648862 ,
        0.04013623,  0.25584936, -0.5348273 , -0.25935918,  0.05815421,
       -0.17358312, -0.08283687,  0.19818085, -0.12579338], dtype=float32)

In [23]:
# Length of a single word vector; it matches the `size` the model was trained with.
# Accessed via `model.wv` because indexing the model directly is deprecated.
len(model.wv['dog'])

  """Entry point for launching an IPython kernel.


64

In [24]:
# Nearest neighbours of 'dog'. The scores are cosine similarities (higher = more
# similar), not distances. Queried via `model.wv`; the model-level method is deprecated.
model.wv.most_similar('dog')

  """Entry point for launching an IPython kernel.


[('puppy', 0.834839940071106),
 ('cage', 0.7787510752677917),
 ('sweeper', 0.7713398933410645),
 ('thief', 0.7617578506469727),
 ('shell', 0.7592639923095703),
 ('pet', 0.7566794157028198),
 ('pig', 0.7415035367012024),
 ('chimney', 0.7371214628219604),
 ('arrow', 0.736382007598877),
 ('lazy', 0.7329269647598267)]

In [25]:
# Nearest neighbours of 'think' by cosine similarity, queried via `model.wv`
# (the model-level `most_similar` is deprecated).
model.wv.most_similar('think')

  """Entry point for launching an IPython kernel.


[('suppose', 0.8776276111602783),
 ('contradict', 0.8514833450317383),
 ('manage', 0.8319798111915588),
 ('downright', 0.8241727352142334),
 ('_you_', 0.8201947808265686),
 ('NOW', 0.8155933618545532),
 ('shouldn', 0.8088677525520325),
 ('mean', 0.808617115020752),
 ('know', 0.8074365854263306),
 ('hesitate', 0.8074259757995605)]

In [26]:
# Nearest neighbours of 'day' by cosine similarity, queried via `model.wv`
# (the model-level `most_similar` is deprecated).
model.wv.most_similar('day')

  """Entry point for launching an IPython kernel.


[('morning', 0.7838103771209717),
 ('night', 0.7471582889556885),
 ('time', 0.7438251972198486),
 ('evening', 0.7110657691955566),
 ('month', 0.6936417818069458),
 ('week', 0.6797670125961304),
 ('feasting', 0.6656355261802673),
 ('fourteenth', 0.6622515916824341),
 ('Saturday', 0.6596876382827759),
 ('morrow', 0.6594432592391968)]

In [27]:
# Nearest neighbours of 'father' by cosine similarity, queried via `model.wv`
# (the model-level `most_similar` is deprecated).
model.wv.most_similar('father')

  """Entry point for launching an IPython kernel.


[('mother', 0.8671184778213501),
 ('brother', 0.8366609215736389),
 ('sister', 0.8067150712013245),
 ('Amnon', 0.7866572141647339),
 ('wife', 0.7817331552505493),
 ('daughter', 0.7758300304412842),
 ('bondwoman', 0.7470455169677734),
 ('uncle', 0.7369657158851624),
 ('Tamar', 0.7311744093894958),
 ('Dinah', 0.7291200160980225)]

In [33]:
# Ask which of the four words fits least with the others.
# Called on `model.wv`; the model-level `doesnt_match` is deprecated.
model.wv.doesnt_match("mother father daughter dog".split())

  """Entry point for launching an IPython kernel.


'dog'

In [43]:
# Similarity score between the vectors of 'father' and 'dog'.
# Called on `model.wv`; the model-level `similarity` is deprecated.
model.wv.similarity('father', 'dog')

  """Entry point for launching an IPython kernel.


0.4617500671486862

In [53]:
# Analogy father - man + woman: close, but not quite; distinctly in female direction.
# Queried via `model.wv` because the model-level `most_similar` is deprecated.
model.wv.most_similar(positive=['father', 'woman'], negative=['man'])

  """Entry point for launching an IPython kernel.


[('sister', 0.794893741607666),
 ('mother', 0.7737314701080322),
 ('daughter', 0.7525524497032166),
 ('husband', 0.7513513565063477),
 ('wife', 0.7485069036483765),
 ('Sarah', 0.723505973815918),
 ('Sarai', 0.7067758440971375),
 ('daughters', 0.6953800320625305),
 ('conceived', 0.6904692649841309),
 ('brother', 0.6887805461883545)]

In [43]:
# Analogy son - man + woman: more confident about this one.
# Queried via `model.wv` because the model-level `most_similar` is deprecated.
model.wv.most_similar(positive=['son', 'woman'], negative=['man'])

  """Entry point for launching an IPython kernel.


[('wife', 0.7495712041854858),
 ('Sarah', 0.7458727359771729),
 ('Leah', 0.7353693246841431),
 ('Sarai', 0.7271307110786438),
 ('daughter', 0.7268913984298706),
 ('conceived', 0.7240091562271118),
 ('Bethuel', 0.7214502692222595),
 ('Hagar', 0.7181791067123413),
 ('Bilhah', 0.7109566330909729),
 ('Nahor', 0.7065157890319824)]

In [44]:
# Analogy husband - man + woman, queried via `model.wv`
# (the model-level `most_similar` is deprecated).
model.wv.most_similar(positive=['husband', 'woman'], negative=['man'])

  """Entry point for launching an IPython kernel.


[('wife', 0.7432506084442139),
 ('sister', 0.713078498840332),
 ('conceived', 0.6846432685852051),
 ('daughter', 0.6731592416763306),
 ('child', 0.6649952530860901),
 ('mother', 0.6500774621963501),
 ('maid', 0.6254923343658447),
 ('Amnon', 0.6235975027084351),
 ('Sarai', 0.6207785606384277),
 ('harlot', 0.6203770637512207)]

In [45]:
# Analogy king - man + woman, widened to the top 20 results.
# Queried via `model.wv` because the model-level `most_similar` is deprecated.
model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=20)

  """Entry point for launching an IPython kernel.


[('Sarah', 0.7475117444992065),
 ('Rachel', 0.7210857272148132),
 ('Solomon', 0.7049829959869385),
 ('Pharaoh', 0.6984406113624573),
 ('Sarai', 0.6920511722564697),
 ('Bethuel', 0.6847283840179443),
 ('Leah', 0.6839938163757324),
 ('Laban', 0.6751530766487122),
 ('Padanaram', 0.6706967353820801),
 ('Hagar', 0.6694093346595764),
 ('Judah', 0.6636509895324707),
 ('Abram', 0.6612067818641663),
 ('Rebekah', 0.6609291434288025),
 ('Ephron', 0.6599161624908447),
 ('queen', 0.6539762020111084),
 ('Heth', 0.6535360813140869),
 ('damsel', 0.6526831388473511),
 ('birthright', 0.6513861417770386),
 ('servants', 0.6512434482574463),
 ('Mephibosheth', 0.6501590609550476)]

### Reduce word vector dimensionality with t-SNE

In [54]:
# Vocabulary mapping held by the model's KeyedVectors: token -> Vocab object.
# NOTE(review): this displays the whole mapping; consider previewing a slice instead.
model.wv.vocab

{'[': <gensim.models.keyedvectors.Vocab at 0x7fcc7590eba8>,
 'Emma': <gensim.models.keyedvectors.Vocab at 0x7fcc7590e4a8>,
 'by': <gensim.models.keyedvectors.Vocab at 0x7fcc7590ebe0>,
 'Jane': <gensim.models.keyedvectors.Vocab at 0x7fcc7590eb70>,
 ']': <gensim.models.keyedvectors.Vocab at 0x7fcc7590e898>,
 'I': <gensim.models.keyedvectors.Vocab at 0x7fcc7590e0f0>,
 'CHAPTER': <gensim.models.keyedvectors.Vocab at 0x7fcc7590eb00>,
 'Woodhouse': <gensim.models.keyedvectors.Vocab at 0x7fcc7590ea58>,
 ',': <gensim.models.keyedvectors.Vocab at 0x7fcc7590ef28>,
 'handsome': <gensim.models.keyedvectors.Vocab at 0x7fcc7590ed30>,
 'clever': <gensim.models.keyedvectors.Vocab at 0x7fcc7590e550>,
 'and': <gensim.models.keyedvectors.Vocab at 0x7fcc7590e198>,
 'rich': <gensim.models.keyedvectors.Vocab at 0x7fcc7590ee48>,
 'with': <gensim.models.keyedvectors.Vocab at 0x7fcc7590e7b8>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7fcc7590e780>,
 'comfortable': <gensim.models.keyedvectors.Vocab at 0x7fcc

In [55]:
# Number of distinct tokens in the trained model's vocabulary.
len(model.wv.vocab)

17011

In [56]:
# Collect the vector of every vocabulary token into one array for t-SNE.
# Indexed through `model.wv`; indexing the model object directly is deprecated.
X = model.wv[model.wv.vocab]

  """Entry point for launching an IPython kernel.


In [0]:
# 2-D t-SNE projection. scikit-learn documents that n_iter should be at least 250;
# 1000 is the default. random_state is fixed so the embedding is reproducible.
tsne = TSNE(n_components=2, n_iter=1000, random_state=42)

In [0]:
# Fit t-SNE on the word-vector matrix and keep the resulting 2-D coordinates.
# Slow: takes about 20 min to execute.
X_2d = tsne.fit_transform(X)

In [0]:
# Tabulate the 2-D coordinates next to their tokens for inspection and plotting.
# Row i of X_2d is paired with the i-th vocabulary key, so both must follow the
# same vocabulary iteration order.
coords_df = pd.DataFrame(X_2d, columns=['x', 'y']).assign(token=list(model.wv.vocab))

In [60]:
# Preview the first 20 rows of the coordinate table.
coords_df.head(20)

Unnamed: 0,x,y,token
0,-18.686007,-59.954014,[
1,14.619617,-60.505898,Emma
2,-49.323696,-30.863409,by
3,11.394951,-61.817005,Jane
4,-18.658031,-59.897961,]
5,33.780361,-15.946114,I
6,25.113699,68.715912,CHAPTER
7,10.955431,-61.725494,Woodhouse
8,-48.895615,-32.289482,","
9,23.210464,-36.210934,handsome


### Visualize 2D representation of word vectors



In [0]:
# Keep a random subset of 500 words so that the plotting code runs faster.
# random_state is fixed so the same subset is drawn on every run.
subset_df = coords_df.sample(n=500, random_state=42)

In [1]:
# Static scatter plot of the sampled words. Plot from subset_df itself and pass
# column names: DataFrame.plot.scatter expects x/y as column labels, not Series.
_ = subset_df.plot.scatter(x='x', y='y', figsize=(12, 12), marker='.', s=10, alpha=0.2)

NameError: ignored

### Visualize interactive representation of word vectors

In [64]:
output_notebook() # output bokeh plots inline in notebook
# Draw each sampled token as a text glyph at its 2-D t-SNE coordinates.
p = figure()
_ = p.text(x=subset_df.x, y=subset_df.y, text=subset_df.token)
show(p)