# Creating word vectors with Word2Vec

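In this notebook we train Word2Vec word vectors on the NLTK Project Gutenberg corpus, explore the learned embeddings (nearest neighbours and word analogies), and then prepare to reduce them to two dimensions with t-SNE.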

#### Load Dependencies

In [6]:
import nltk
from nltk import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import show, figure
%matplotlib inline

In [7]:
# Fetch NLTK's 'punkt' tokenizer data; presumably required by the
# sent_tokenize / word_tokenize calls used later in this notebook.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/birbalsrivastava/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#### Load Data

In [9]:
# Fetch the Project Gutenberg sample corpus that is imported from nltk.corpus below.
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/birbalsrivastava/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [11]:
from nltk.corpus import gutenberg

In [13]:
# List the file ids (one per text) that make up the Gutenberg corpus.
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [14]:
# Number of texts in the corpus.
len(gutenberg.fileids())

18

#### Tokenize text

In [17]:
# Split the raw text of the whole corpus into a list of sentence strings.
gberg_sent_tokens = sent_tokenize(gutenberg.raw())

In [23]:
# Peek at the first two sentences produced by sent_tokenize.
gberg_sent_tokens[:2]

['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.',
 "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period."]

In [24]:
# Word-tokenize the second sentence; punctuation and the possessive "'s"
# come out as separate tokens.
word_tokenize(gberg_sent_tokens[1])

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'s",
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [25]:
# Use the corpus reader's own sentence/word segmentation. This object (not
# gberg_sent_tokens above) is what gets passed to Word2Vec for training.
gberg_sents = gutenberg.sents()

In [27]:
# Spot-check the nested structure: token at index 14 of the sentence at index 4.
gberg_sents[4][14]

'father'

In [30]:
# Total number of tokens in the corpus.
len(gutenberg.words())

2621613

#### Run word2vec

In [31]:
# Train Word2Vec on the pre-tokenized Gutenberg sentences.
# NOTE(review): gensim documents that `seed` alone does not give an exactly
# reproducible run when more than one worker thread is used; if that matters,
# train with workers=1 (and a fixed PYTHONHASHSEED) -- confirm before relying
# on the exact similarity scores shown below.
model = Word2Vec(
    sentences=gberg_sents,
    size=64,      # length of each word vector (see len(...) == 64 below)
    sg=1,         # skip-gram training, per the gensim docs
    window=10,    # context window size
    min_count=5,  # minimum word frequency to be kept in the vocabulary
    seed=42,
    workers=8,
)

In [32]:
# Persist the trained model so it can be reloaded below without retraining.
model.save('raw_gutenberg_model.w2v')

#### Explore model

In [33]:
# Reload the saved model from disk, via the Word2Vec class imported at the top.
model = Word2Vec.load('raw_gutenberg_model.w2v')

In [34]:
# Look up the learned vector for 'dog' through the KeyedVectors (`model.wv`).
# Indexing the Word2Vec model directly is deprecated and removed in gensim 4.
model.wv['dog']

  """Entry point for launching an IPython kernel.


array([ 0.65224665, -0.31678504, -0.07444256, -0.40651053, -0.16762032,
       -0.00773732, -0.44117558, -0.22974676,  0.3016187 ,  0.22161503,
        0.43173245,  0.08064608, -0.7999007 , -0.1770745 , -0.02554084,
       -0.12089121, -0.5563962 , -0.11688837, -0.14740464, -0.38032556,
        0.01446535, -0.09058346,  0.23781194, -0.36899292, -0.6001382 ,
        0.02282212, -0.24569364, -0.24811836,  0.06040919,  0.20634122,
       -0.21923754, -0.07506583,  0.13806766, -0.5178154 ,  0.05768376,
       -0.00282938, -0.1656269 ,  0.21414304, -0.33688122, -0.10461961,
        0.35289264,  0.05676689, -0.2482115 ,  0.27623287,  0.21483386,
        0.33606765, -0.10766644,  0.81932056, -0.05996163, -0.02806928,
       -0.23998223, -0.0909708 , -0.00812618,  0.05047794, -0.1794465 ,
        0.48946762, -0.10201432, -0.1869752 ,  0.17972066,  0.05354273,
        0.07621563, -0.123419  ,  0.2046497 ,  0.00495556], dtype=float32)

In [35]:
# Length of one word vector; should equal the `size` used at training time.
# Goes through `model.wv` because indexing the model directly is deprecated.
len(model.wv['dog'])

  """Entry point for launching an IPython kernel.


64

In [36]:
# Nearest neighbours of 'dog' by cosine similarity. Called on `model.wv`
# because `Word2Vec.most_similar` is deprecated and removed in gensim 4.
model.wv.most_similar('dog')

  """Entry point for launching an IPython kernel.


[('puppy', 0.8442329168319702),
 ('sweeper', 0.7741421461105347),
 ('cage', 0.7686111927032471),
 ('shell', 0.7663735747337341),
 ('thief', 0.7649223208427429),
 ('pig', 0.756996750831604),
 ('broth', 0.7555265426635742),
 ('pet', 0.7516772747039795),
 ('Gingerbread', 0.7444334030151367),
 ('lazy', 0.7418845891952515)]

In [39]:
# Nearest neighbours of 'think'. Called on `model.wv` because
# `Word2Vec.most_similar` is deprecated and removed in gensim 4.
model.wv.most_similar('think')

  """Entry point for launching an IPython kernel.


[('suppose', 0.8728477954864502),
 ('contradict', 0.8422753810882568),
 ('manage', 0.8389516472816467),
 ('downright', 0.8242859244346619),
 ('NOW', 0.8136323094367981),
 ('_you_', 0.8083164095878601),
 ('know', 0.8035281896591187),
 ('awfully', 0.8032065033912659),
 ('interfere', 0.8003072738647461),
 ('Mamma', 0.799087405204773)]

In [40]:
# Nearest neighbours of 'day'. Called on `model.wv` because
# `Word2Vec.most_similar` is deprecated and removed in gensim 4.
model.wv.most_similar('day')

  """Entry point for launching an IPython kernel.


[('morning', 0.7747442722320557),
 ('night', 0.738493800163269),
 ('time', 0.7308229804039001),
 ('evening', 0.7160974144935608),
 ('month', 0.7152169942855835),
 ('sabbath', 0.6867563724517822),
 ('fourteenth', 0.6859608888626099),
 ('feasting', 0.6761636137962341),
 ('week', 0.6731440424919128),
 ('morrow', 0.6665116548538208)]

In [42]:
# Odd-one-out: which of the four words is least like the others.
# Called on `model.wv` because `Word2Vec.doesnt_match` is deprecated
# and removed in gensim 4.
model.wv.doesnt_match("mother father brother cat".split())

  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'cat'

In [44]:
# Analogy by vector arithmetic: father - man + woman ~= ?
# Called on `model.wv` because `Word2Vec.most_similar` is deprecated
# and removed in gensim 4.
model.wv.most_similar(positive=["father", "woman"], negative=["man"])

  """Entry point for launching an IPython kernel.


[('mother', 0.7920389175415039),
 ('sister', 0.7764179706573486),
 ('daughter', 0.7551136016845703),
 ('wife', 0.7511124610900879),
 ('husband', 0.7485218048095703),
 ('Sarai', 0.7180959582328796),
 ('Sarah', 0.7158210277557373),
 ('brother', 0.7070860862731934),
 ('daughters', 0.6878156065940857),
 ('child', 0.6825425624847412)]

In [45]:
# Analogy by vector arithmetic: king - man + woman ~= ?  Ask for the top 30
# because, in the recorded output, 'queen' appears well below the first rank.
# Called on `model.wv` because `Word2Vec.most_similar` is deprecated
# and removed in gensim 4.
model.wv.most_similar(positive=["king", "woman"], negative=["man"], topn=30)

  """Entry point for launching an IPython kernel.


[('Sarah', 0.7395731210708618),
 ('Rachel', 0.7066898941993713),
 ('Sarai', 0.6987992525100708),
 ('Abram', 0.6860909461975098),
 ('Hagar', 0.6850262880325317),
 ('Pharaoh', 0.6844435334205627),
 ('Solomon', 0.6767655611038208),
 ('Leah', 0.6763573884963989),
 ('Laban', 0.6728417873382568),
 ('Bethuel', 0.6716956496238708),
 ('queen', 0.6657727956771851),
 ('princes', 0.6612876653671265),
 ('daughter', 0.6496058702468872),
 ('Rebekah', 0.6486358642578125),
 ('tribute', 0.6478548645973206),
 ('Judah', 0.6468213200569153),
 ('damsel', 0.6443588733673096),
 ('Uriah', 0.6427338123321533),
 ('Bilhah', 0.6386175155639648),
 ('Ephron', 0.6378081440925598),
 ('Padanaram', 0.6371746063232422),
 ('birthright', 0.6354619264602661),
 ('Hittite', 0.6347178816795349),
 ('Babylon', 0.6344143152236938),
 ('Zilpah', 0.6339958310127258),
 ('Mephibosheth', 0.6311028003692627),
 ('Esau', 0.6310945749282837),
 ('David', 0.6283500790596008),
 ('servants', 0.6270896196365356),
 ('Hamor', 0.6215534210205078)]

#### Reduce word2vec dimensionality with TSNE

In [46]:
# Vocabulary size: number of distinct words that received a vector.
# NOTE(review): `wv.vocab` is the gensim 3.x API; gensim 4 exposes this as
# `wv.key_to_index` -- confirm against the gensim version this notebook pins.
len(model.wv.vocab)

17011