## Lecture 08 - Word representation, Term-Term Matrix, word2vec

### Install package 

```pip install nltk```

```pip install gensim```

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords 
import nltk 


In [14]:
file = open("Data/J. K. Rowling - Harry Potter 1 - Sorcerer's Stone",'r')
harrypotter_corpus = file.read()

### Document, Tokenization, Dictionary

In [10]:
#nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 
print(stop_words)

{'which', 'doesn', 'very', "haven't", 'was', 'what', 'themselves', 'it', 'here', "you'll", 'being', 'they', 'to', 'again', 'before', 'yours', "isn't", 'herself', 'ain', 'theirs', 'because', 'hasn', "you've", "mightn't", 'no', 'any', 'for', 'hadn', 'shouldn', 'your', 'than', "wasn't", 'their', 'while', 'why', 'll', 'up', 'have', 'same', 'wasn', 'ours', 'if', 'mightn', 'wouldn', "you're", "shan't", 'an', 'i', 'itself', 'haven', 'were', 'out', "she's", 'himself', 'will', 'as', 'so', 'myself', 'm', "it's", "shouldn't", 'in', 'some', 'did', 'of', 'needn', 'who', 'once', 'are', 'both', 'and', 'couldn', 'its', 'aren', 'above', 'from', 'o', "couldn't", "that'll", 'down', 'on', 'ourselves', 'be', 'shan', 'just', 're', 'until', "didn't", 'weren', 'through', "doesn't", 'won', 'this', 'after', 've', "won't", 'a', 'our', 'how', 'been', 'the', 'his', 'between', 't', 'by', "don't", 'against', 'with', 'only', "aren't", 'isn', 'each', 'own', 'those', "mustn't", 'or', "you'd", 'off', 'over', 'don', 'has

In [15]:
#nltk.download('punkt')
word_tokens = wordpunct_tokenize(harrypotter_corpus)

In [16]:
word_tokens = [w.lower() for w in word_tokens] 
word_tokens = [w.lower() for w in word_tokens if not w in stop_words] 
word_tokens = [w.lower() for w in word_tokens if w.isalpha()] 

In [17]:
word_tokens

['harry',
 'potter',
 'sorcerer',
 'stone',
 'chapter',
 'one',
 'boy',
 'lived',
 'mr',
 'mrs',
 'dursley',
 'number',
 'four',
 'privet',
 'drive',
 'proud',
 'say',
 'perfectly',
 'normal',
 'thank',
 'much',
 'last',
 'people',
 'expect',
 'involved',
 'anything',
 'strange',
 'mysterious',
 'hold',
 'nonsense',
 'mr',
 'dursley',
 'director',
 'firm',
 'called',
 'grunnings',
 'made',
 'drills',
 'big',
 'beefy',
 'man',
 'hardly',
 'neck',
 'although',
 'large',
 'mustache',
 'mrs',
 'dursley',
 'thin',
 'blonde',
 'nearly',
 'twice',
 'usual',
 'amount',
 'neck',
 'came',
 'useful',
 'spent',
 'much',
 'time',
 'craning',
 'garden',
 'fences',
 'spying',
 'neighbors',
 'dursleys',
 'small',
 'son',
 'called',
 'dudley',
 'opinion',
 'finer',
 'boy',
 'anywhere',
 'dursleys',
 'everything',
 'wanted',
 'also',
 'secret',
 'greatest',
 'fear',
 'somebody',
 'would',
 'discover',
 'think',
 'could',
 'bear',
 'anyone',
 'found',
 'potters',
 'mrs',
 'potter',
 'mrs',
 'dursley',
 '

In [18]:
len(word_tokens)

40762

In [19]:
vocab = set(word_tokens)

In [20]:
len(vocab)

5615

In [21]:
vocab

{'fixed',
 'defenseless',
 'rob',
 'treacle',
 'limp',
 'discussing',
 'contains',
 'born',
 'perhaps',
 'copyin',
 'perform',
 'belongs',
 'spoken',
 'difference',
 'breathe',
 'presenting',
 'parcel',
 'um',
 'godric',
 'carefully',
 'company',
 'dyeing',
 'shrank',
 'angel',
 'potato',
 'unluckily',
 'reliving',
 'ducks',
 'gettin',
 'mistake',
 'melt',
 'barrels',
 'post',
 'beware',
 'markings',
 'blocks',
 'eight',
 'gleaming',
 'express',
 'teapot',
 'continue',
 'perched',
 'trapdoor',
 'holidays',
 'highway',
 'today',
 'liquids',
 'speared',
 'hall',
 'sandwiches',
 'mystified',
 'walnut',
 'tenpin',
 'platters',
 'shack',
 'doughnuts',
 'muddy',
 'crime',
 'jewel',
 'bundles',
 'breeding',
 'sunday',
 'facing',
 'leaked',
 'self',
 'think',
 'defeated',
 'revenge',
 'expression',
 'shared',
 'twenty',
 'furling',
 'instinct',
 'zoo',
 'black',
 'ronald',
 'damaged',
 'ing',
 'sit',
 'wheeling',
 'betray',
 'cigarette',
 'goggle',
 'perfectly',
 'snoring',
 'animals',
 'cases

In [22]:
char_to_int = dict((c,i) for i,c in enumerate(vocab))
int_to_char = dict((i,c) for i,c in enumerate(vocab))

In [23]:
char_to_int['harry']

5570

In [24]:
X = np.zeros((len(word_tokens), len(vocab)))
Y = np.array([])
Xwords=[]
Ywords=[]
window_size = 5
for i, word in enumerate(word_tokens):
    isetvalue=0
    w2v=np.zeros(len(vocab))
    for icontext in range(max(i-window_size,0), min(i+window_size, len(word_tokens)-1)+1):
        if icontext!=i:
            w2v[char_to_int[word_tokens[icontext]]]=w2v[char_to_int[word_tokens[icontext]]]+1
    X[i]=w2v

In [25]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [26]:
np.nonzero(X[0])

(array([ 783, 2178, 4100, 4133, 5505]),)

In [27]:
int_to_char[2034]

'curses'

In [28]:
X.shape

(40762, 5615)

In [29]:
word_tokens[0:10]

['harry',
 'potter',
 'sorcerer',
 'stone',
 'chapter',
 'one',
 'boy',
 'lived',
 'mr',
 'mrs']

In [30]:
colnames=[int_to_char[i] for i in range(len(vocab))]

In [31]:
word2vecrep=pd.DataFrame(X, columns=colnames)

In [32]:
word2vecrep["wordkey"]=word_tokens

In [33]:
word2vecrep["wordkey"]

0           harry
1          potter
2        sorcerer
3           stone
4         chapter
           ...   
40757         lot
40758         fun
40759      dudley
40760      summer
40761         end
Name: wordkey, Length: 40762, dtype: object

In [34]:
word2vecbycount=word2vecrep.groupby(['wordkey']).sum()

In [35]:
word2vecbycount

Unnamed: 0_level_0,fixed,defenseless,rob,treacle,limp,discussing,contains,born,perhaps,copyin,...,slugs,peace,ube,evil,dream,mainly,disgusting,seeming,tear,reports
wordkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aaaaaaaaaargh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaaargh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaah,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aargh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aback,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zombie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zoo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zoom,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zoomed,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
word2vecbycount.loc["whirled", :]

fixed          0.0
defenseless    0.0
rob            0.0
treacle        0.0
limp           0.0
              ... 
mainly         0.0
disgusting     0.0
seeming        0.0
tear           0.0
reports        0.0
Name: whirled, Length: 5615, dtype: float64

In [37]:
word2vecbycount.loc["potter", :]

fixed          0.0
defenseless    0.0
rob            0.0
treacle        0.0
limp           0.0
              ... 
mainly         0.0
disgusting     0.0
seeming        0.0
tear           0.0
reports        0.0
Name: potter, Length: 5615, dtype: float64

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity([np.array(word2vecbycount.loc["harry", :])], [np.array(word2vecbycount.loc["wand", :])])

NameError: name 'np' is not defined

In [39]:
cosine_similarity([np.array(word2vecbycount.loc["harry", :])], [np.array(word2vecbycount.loc["woodcroft", :])])

array([[0.0430042]])

In [40]:
for i in np.nonzero(X[0])[0]:
    print(colnames[i])
    print(X[0][i])

stone
1.0
potter
1.0
one
1.0
chapter
1.0
sorcerer
1.0


### Geinsim pretrained-word2vec methods

### pretrained word2vec model documentation
https://github.com/RaRe-Technologies/gensim-data

In [1]:
from gensim.models import KeyedVectors
import gensim.downloader as api
print(api.load("word2vec-google-news-300", return_path=True))
# When you use the Gensim download API, all data is stored in your `~/gensim-data` home folder.

[--------------------------------------------------] 1.4% 23.4/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[=-------------------------------------------------] 3.3% 55.4/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[==------------------------------------------------] 5.2% 86.1/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[===-----------------------------------------------] 7.3% 120.6/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[====----------------------------------------------] 9.1% 151.9/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[=====---------------------------------------------] 10.9% 181.8/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



/home/dlai/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz


In [5]:
model = KeyedVectors.load_word2vec_format('/home/dlai/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz', binary=True)

In [6]:
len(model["finance"])

300

In [7]:
print(model.similarity('allergy', 'allergic'))
print(model.similarity('harry', 'potter'))
print(model.similarity('water', 'fluid'))
print(model.similarity('finance', 'financial'))

0.71326107
0.08543932
0.34774813
0.49127555


### Word Relationship
#### $\vec{best}-\vec{good} \sim \vec{worst}-\vec{bad}$
#### $\vec{best}-\vec{good}+\vec{bad} \sim \vec{worst}$

In [8]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
v1=np.array(model['best']-model['good']+model['bad'])
v2=np.array(model['worst'])
cosine_similarity([v1], [v2])

array([[0.6791442]], dtype=float32)