<a href="https://colab.research.google.com/github/Dimildizio/DS_course/blob/main/Neural_networks/NLP/Embeddings/word_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from IPython.display import clear_output, Image

In [4]:
!wget https://www.dropbox.com/s/obaitrix9jyu84r/quora.txt?dl=1 -O ./quora.txt
clear_output()

In [5]:
import bokeh
import bokeh.models as bm
import bokeh.plotting as pl
import gensim
import gensim.downloader as api
import nltk
import numpy as np
import pandas as pd

from bokeh.io import output_notebook
from gensim.models import Word2Vec
from nltk.tokenize import WordPunctTokenizer as WPT
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Bag of words

In [None]:
def splitme(doc):
  return [txt.lower().split() for txt in doc]


In [None]:
docs = ["I love pizza", "I like sushi", "I like pizza", "I enjoy beef"]
tk_docs = splitme(docs)
voc = list(set([word for doc in tk_docs for word in doc]))

wv = []
for d in tk_docs:
  vector = [d.count(word) for word in voc]
  wv.append(vector)

for i, word in enumerate(docs):
  #print(f"doc {i+1}:, {wv[i]}")
  print(f'doc{i+1}: {word} - {wv[i]}')

doc1: I love pizza - [0, 1, 0, 1, 0, 0, 1]
doc2: I like sushi - [0, 0, 0, 0, 1, 1, 1]
doc3: I like pizza - [0, 0, 0, 1, 1, 0, 1]
doc4: I enjoy beef - [1, 0, 1, 0, 0, 0, 1]


In [None]:
df = pd.DataFrame(wv, columns = voc)
df

Unnamed: 0,enjoy,love,beef,pizza,like,sushi,i
0,0,1,0,1,0,0,1
1,0,0,0,0,1,1,1
2,0,0,0,1,1,0,1
3,1,0,1,0,0,0,1


## Load data

In [6]:
data = list(open("quora.txt"))
data[666]

'How damning are the recent DNC leaks that show internal collusion?\n'

# Tokenize data

In [7]:
tokenizer = WPT()
print(tokenizer.tokenize(data[1]))

['What', 'are', 'some', 'ways', 'to', 'overcome', 'a', 'fast', 'food', 'addiction', '?']


In [9]:
new_data = [tokenizer.tokenize(x.lower()) for x in data]
new_data[3]

['what', 'are', 'zip', 'codes', 'in', 'the', 'bay', 'area', '?']

### Bathe in tests

In [None]:
assert all(isinstance(row, list) for row in new_data)
assert all(all(isinstance(token, str) for token in row) for row in new_data)
is_latin = lambda token: all('a' <= x.lower() <= 'z' for x in token)
assert all(map(lambda l: not is_latin(l) or l.islower(), map(' '.join, new_data)))

# Word2Vec

In [10]:
vectorized = Word2Vec(new_data, window = 5, min_count=5, vector_size=32)

In [None]:
vectorized.wv.get_vector('cat')

array([-1.4237214 ,  1.4328684 ,  0.08061663, -1.1621952 ,  1.4041325 ,
        1.5938008 , -2.8783095 , -2.7338724 , -0.98583126,  2.5502975 ,
        1.6452942 , -0.5041936 , -2.7520492 ,  2.3279576 ,  0.96513   ,
       -3.5300837 , -3.0197961 , -0.8883868 , -2.7285845 , -1.502072  ,
        1.6133738 ,  4.4917355 , -0.25572735, -0.42647457, -0.07298391,
        1.0577344 , -0.53869456, -0.24102525, -1.4842737 , -0.16666985,
        2.1852844 ,  0.0678091 ], dtype=float32)

In [None]:
vectorized.wv.vectors

array([[-9.4181556e-01,  1.0615714e+00,  7.9872257e-01, ...,
        -5.1965404e-01, -1.1165336e+00,  6.4052618e-01],
       [ 8.4379539e-02,  3.0592744e+00,  2.2173982e+00, ...,
         1.6768326e+00, -2.0142691e+00,  7.6431537e-01],
       [ 3.0028892e+00,  3.4570889e+00, -7.6673150e-01, ...,
        -4.4761577e+00, -2.4556080e-01,  3.0267870e-01],
       ...,
       [-6.0564321e-02, -1.3258700e-01,  1.6244468e-01, ...,
        -1.2743399e-01, -7.9805180e-02,  7.0498928e-02],
       [-9.4427891e-02, -6.4844862e-02,  6.1839018e-02, ...,
        -1.3591753e-01, -9.5372342e-02,  2.0139792e-03],
       [-4.3899272e-02, -2.4576945e-02,  5.5743452e-02, ...,
        -1.0063543e-01, -5.8383248e-03,  9.8824650e-02]], dtype=float32)

In [None]:
list(zip(vectorized.wv.key_to_index.items()))[1030:1045]

[(('cards', 1030),),
 (('mathematics', 1031),),
 (('easily', 1032),),
 (('photo', 1033),),
 (('suitable', 1034),),
 (('front', 1035),),
 (('12th', 1036),),
 (('may', 1037),),
 (('early', 1038),),
 (('visiting', 1039),),
 (('baby', 1040),),
 (('treat', 1041),),
 (('building', 1042),),
 (('formula', 1043),),
 (('gets', 1044),)]

## **CBoW** and **Skip-Gram** methods of Word2Vec

Word2Vec can be implemented using **Continuous Bag of Words** which is similar to One-hot-encoding and *in* process of *training* **takes context words** and tries to **predict the target word**. This **method** is **default** in Gensim lib. Fast and efficient on smaller datasets.

Another implementation is **Skip-Gram** which** takes a target word** and tries to generate\ **predict context words** surrounding it. More precise but much slower, fits larger datasets. (sg=1) parameter.

In [15]:
# source: https://arxiv.org/pdf/1309.4168v1.pdf
Image(url = "https://miro.medium.com/v2/resize:fit:828/format:webp/1*cuOmGT7NevP9oJFJfVpRKA.png")

In [11]:
text1 = 'the kid said he would be a superman'
text2 = 'the child said he would be a superman'

In [12]:
text1_emb = [vectorized.wv[word] for word in text1.split()]
text2_emb = [vectorized.wv[word] for word in text2.split()]

In [13]:
def diff_w2v(txts, method=0):
  print(f"{'Skip-Gram' if method else 'CBoW'} method:")
  w2v = Word2Vec(new_data, window = 5, min_count=5, vector_size=32, sg = method)
  texts = []
  for doc in txts:
    texts.append([w2v.wv[word] for word in doc.split()])
  for sentence in texts:
    for word in sentence:
      print(w2v.wv.most_similar(word)[0][0], end =' ')
    print()
  return w2v

In [14]:
cbow = diff_w2v([text1, text2])
print()
sg = diff_w2v([text1, text2], 1)

CBoW method:
the kid said he would be a superman 
the child said he would be a superman 

Skip-Gram method:
the kid said he would be a superman 
the child said he would be a superman 


The result is the same.

CBoW method - Skip-Gram method difference in vectors

In [27]:
cbow.wv.get_vector('cat') - sg.wv.get_vector('cat')

array([-2.688287  ,  2.5203018 , -1.7941455 , -0.7318937 ,  0.06231423,
        1.3946053 , -1.1891434 ,  0.39873436, -0.7379125 ,  1.783877  ,
        0.9997353 , -0.30883855,  0.2988268 ,  2.7872539 ,  1.4499317 ,
       -1.1266468 , -3.0114684 , -0.00608536, -1.9513954 , -3.0764132 ,
        2.0152986 ,  2.5789902 , -2.572105  ,  0.1430189 , -0.49150473,
       -1.2039481 , -0.9966858 , -1.4220076 , -2.9390006 ,  1.3437202 ,
        1.0135946 , -0.18805218], dtype=float32)

That of course doesn't mean anything since the seeds are not set and each time model trains differently. Here is a proof.

Two instances of CBoW W2V on the same dataset

In [28]:
vectorized.wv.get_vector('cat') - cbow.wv.get_vector('cat')

array([ 0.09271264, -2.0827785 ,  0.61792064,  0.7394627 ,  0.6140085 ,
        0.0437355 ,  1.2016311 , -2.4036443 , -0.08802891,  0.58001804,
        0.16731197, -1.6987114 ,  0.29057708, -0.786651  , -0.25174356,
       -0.14840078, -0.7491517 , -0.19493616,  1.2210094 , -0.57829285,
       -1.2140036 , -3.5481782 , -1.3250527 ,  0.11458287,  1.4745449 ,
        2.800397  , -0.14703143,  0.5518914 ,  0.4548092 ,  1.0163724 ,
       -0.9245418 , -0.85292244], dtype=float32)

Well, we get the same embeddings but the vectors are of course different

## Bag of Embeddings

In [15]:
doc_1 = "I love cats"
doc_2 = "I love dogs"
doc_3 = "I like cats and dogs"

In [16]:
sent1 = "The idea is not terrible we shall use it"
sent2 = "The idea is terrible we shall not use it"

In [17]:
docs = [doc_1, doc_2, doc_3, sent1, sent2]

### Straightforward BoE, no averaging

In [None]:
def boe(docs = docs):
  embeds = []

  for sent in docs:
    doc_vec = []
    for word in sent.lower().split():
      if word in vectorized.wv.key_to_index:
        doc_vec.append(vectorized.wv[word])
    embeds.append(doc_vec)
  return embeds

In [None]:
boe_vecs = boe()

#### Vectorized word "cats"

In [None]:
boe_vecs[0][2]

array([-2.047111  , -0.24919967, -2.2889216 ,  0.6348777 ,  2.5309365 ,
       -0.09209277,  0.59489864, -0.9968526 , -0.6734233 ,  0.910514  ,
        0.22394092,  0.00663299,  2.1584628 , -0.78183144,  0.67779213,
       -0.43562964,  0.5285604 , -4.0064297 , -1.6233991 ,  0.6897092 ,
        0.95589834,  2.137322  ,  0.94452566, -0.82744163,  0.64450896,
       -0.58000875, -0.8861219 , -1.2161727 ,  0.06213524,  0.27627876,
       -1.7352341 ,  0.49162784], dtype=float32)

#### Reverse vectors to words

In [None]:
for document in boe_vecs:
  for wrd in document:
    print(vectorized.wv.most_similar(wrd)[0][0], end = ' ')
  print()

i love cats 
i love dogs 
i like cats and dogs 
the idea is not terrible we shall use it 
the idea is terrible we shall not use it 


### Average BoE

Since we use cosine similarity it could just be a sum of vectors as well

In [18]:
def avg_boe(w2v=vectorized, docs = docs):
  embeds = []

  for sent in docs:
    doc_vec = np.zeros(vectorized.wv.vector_size)
    #n_w_in_sent = 0
    for word in sent.lower().split():
      if word in w2v.wv.key_to_index:
        #n_w_in_sent+=1
        doc_vec += w2v.wv[word]
    embeds.append(doc_vec)# /n_w_in_sent) Mean doesnt matter, sum would do too
  return embeds

Let's take a look how averaging of sentence words works on two CBoW with no frozen seeds and one Skip-Gram W2V instances.

In concept taking the mean vectors of all words in the sentence should give us top10 words that summarize context of a sentence.

CBoW instance 1

In [60]:
avg_boevecs = avg_boe()
vectorized.wv.most_similar(avg_boevecs[0])

[('depressed', 0.8007091879844666),
 ('girlfriend', 0.7674483060836792),
 ('myself', 0.7512775659561157),
 ('friends', 0.7489374876022339),
 ('boyfriend', 0.7465841174125671),
 ('feelings', 0.7365669012069702),
 ('feeling', 0.7320765256881714),
 ('love', 0.7303024530410767),
 ('i', 0.727887749671936),
 ('happy', 0.7214086651802063)]

CBoW instance 2

In [19]:
avg_boe_cbow = avg_boe(cbow)
cbow.wv.most_similar(avg_boe_cbow[0])

[('depressed', 0.777712345123291),
 ('friends', 0.771709680557251),
 ('girlfriend', 0.7509831786155701),
 ('feelings', 0.7493012547492981),
 ('i', 0.7398567199707031),
 ('boyfriend', 0.7356587052345276),
 ('love', 0.7345700263977051),
 ('feeling', 0.7324872016906738),
 ('myself', 0.7265275120735168),
 ('bored', 0.7233301997184753)]

Skip-Gram

In [20]:
avg_boe_sg = avg_boe(sg)
sg.wv.most_similar(avg_boe_sg[0])

[('eachother', 0.8919486999511719),
 ('homesick', 0.8830142021179199),
 ('madly', 0.87704998254776),
 ('coward', 0.8766603469848633),
 ('articulate', 0.8756245970726013),
 ('heartbroken', 0.8754575252532959),
 ('fantasizing', 0.8747760057449341),
 ('drifting', 0.8707690834999084),
 ('obsessing', 0.8698922395706177),
 ('pity', 0.869583249092102)]

Well, that was unexpected. The sum(or mean) of vectors in a sentence "I love cats" outputs "depressed" as most similar by meaning (think cos angle) to the sentence.

Also notice that Skip-Gram trained model shows far more different (and with more confidence) result that CBoW models compared to each other.

### Problems

The problem with Word2Vec is that it takes the word order into account only to some extent.

> Word2Vec is not bad, we shall use it.

and

> Word2Vec is bad, we shall not use it.

Would *likely* give similar result in vectors.  

In [61]:
for sentence in range(len(docs)):
  m, p = vectorized.wv.most_similar(avg_boevecs[sentence])[0]
  print(f'{docs[sentence]}: "{m}" prob: {round(p,5)}')

I love cats: "depressed" prob: 0.80071
I love dogs: "depressed" prob: 0.78933
I like cats and dogs: "sick" prob: 0.74592
The idea is not terrible we shall use it: "it" prob: 0.80901
The idea is terrible we shall not use it: "it" prob: 0.80901


CBoW

In [74]:
cbow_idea = avg_boe(docs = [sent.lower() for sent in [sent1, sent2]])
cbow_idea[0] - cbow_idea[1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

Skip-Gram

In [75]:
sg_idea = avg_boe(sg, docs = [sent.lower() for sent in [sent1, sent2]])
sg_idea[0] - sg_idea[1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

So here is a proof - it's not *likely*, it's a **fact**, judging by the zero difference of vectors in terms of context the order of words in not taken into account in both CBoW and Skip-Gram methods.

### TF-IDF for weighted average BoE

#### Term frequency

In [50]:
def get_tf(documents):
  tf = []
  for doc in documents:
    doc_tf = {}
    print(doc)
    for word in doc.split():
      if word in doc_tf.keys():
        doc_tf[word]+=1
      else:
        doc_tf[word] = 1
    tf.append(doc_tf)
  return tf

In [25]:
tf = get_tf(docs)
tf

[{'I': 1, 'love': 1, 'cats': 1},
 {'I': 1, 'love': 1, 'dogs': 1},
 {'I': 1, 'like': 1, 'cats': 1, 'and': 1, 'dogs': 1},
 {'The': 1,
  'idea': 1,
  'is': 1,
  'not': 1,
  'terrible': 1,
  'we': 1,
  'shall': 1,
  'use': 1,
  'it': 1},
 {'The': 1,
  'idea': 1,
  'is': 1,
  'terrible': 1,
  'we': 1,
  'shall': 1,
  'not': 1,
  'use': 1,
  'it': 1}]

#### Document frequency

In [27]:
def get_df(tf):
  df = {}
  for doc in tf:
    for word in doc.keys():
      if word in df:
        df[word] += 1
      else:
        df[word] = 1
  return df

In [28]:
doc_freq = get_df(tf)
doc_freq

{'I': 3,
 'love': 2,
 'cats': 2,
 'dogs': 2,
 'like': 1,
 'and': 1,
 'The': 2,
 'idea': 2,
 'is': 2,
 'not': 2,
 'terrible': 2,
 'we': 2,
 'shall': 2,
 'use': 2,
 'it': 2}

#### Inverse document frequency

In [49]:
def get_idf(documents, doc_freq):
  idf = {}
  for word, freq in doc_freq.items():
    idf[word] = np.log(len(documents) / (freq+1))
  return idf

In [31]:
inv_doc_freq = get_idf(docs, doc_freq)
inv_doc_freq

{'I': 0.22314355131420976,
 'love': 0.5108256237659907,
 'cats': 0.5108256237659907,
 'dogs': 0.5108256237659907,
 'like': 0.9162907318741551,
 'and': 0.9162907318741551,
 'The': 0.5108256237659907,
 'idea': 0.5108256237659907,
 'is': 0.5108256237659907,
 'not': 0.5108256237659907,
 'terrible': 0.5108256237659907,
 'we': 0.5108256237659907,
 'shall': 0.5108256237659907,
 'use': 0.5108256237659907,
 'it': 0.5108256237659907}

We already see that values that are encountered 2 times are about 51% "valiable", "I" is in 3 doc so it's value is low, and there was only one instance each of "like" and "and" in ne document so their values are 0.91%

#### **T**erm **F**requency **- I**nverse **D**ocument **F**requency

In [34]:
def calc_tfidf(tf, inv_doc_freq):
  tfidf = []
  for words in tf:
    tfidf_doc = {}
    for word, freq in words.items():
      tfidf_doc[word] = freq * inv_doc_freq[word]
    tfidf.append(tfidf_doc)
  return tfidf

In [35]:
calc_tfidf(tf, inv_doc_freq)

[{'I': 0.22314355131420976,
  'love': 0.5108256237659907,
  'cats': 0.5108256237659907},
 {'I': 0.22314355131420976,
  'love': 0.5108256237659907,
  'dogs': 0.5108256237659907},
 {'I': 0.22314355131420976,
  'like': 0.9162907318741551,
  'cats': 0.5108256237659907,
  'and': 0.9162907318741551,
  'dogs': 0.5108256237659907},
 {'The': 0.5108256237659907,
  'idea': 0.5108256237659907,
  'is': 0.5108256237659907,
  'not': 0.5108256237659907,
  'terrible': 0.5108256237659907,
  'we': 0.5108256237659907,
  'shall': 0.5108256237659907,
  'use': 0.5108256237659907,
  'it': 0.5108256237659907},
 {'The': 0.5108256237659907,
  'idea': 0.5108256237659907,
  'is': 0.5108256237659907,
  'terrible': 0.5108256237659907,
  'we': 0.5108256237659907,
  'shall': 0.5108256237659907,
  'not': 0.5108256237659907,
  'use': 0.5108256237659907,
  'it': 0.5108256237659907}]

#### Combine together

In [37]:
def get_tfidf(documents):
  tf = get_tf(documents)
  df = get_df(tf)
  idf = get_idf(documents, df)
  tfidf = calc_tfidf(tf, idf)
  return tfidf

In [45]:
txt1 = 'The quick brown fox jumps over the lazy dog'
txt2 = 'Richard of York gave the battle in vain'
txt3 = 'She does teach torches to shine bright'
txt4 = 'Evening is the best part of the day'
txt5 = "It is just a flesh wound"
txt6 =  'I know that i know nothing'

texts = [txt1, txt2, txt3, txt4, txt5, txt6]


['The quick brown fox jumps over the lazy dog', 'Richard of York gave the battle in vain', 'She does teach torches to shine bright', 'Evening is the best part of the day', 'It is just a flesh wound', 'I know that i know nothing']


In [51]:
get_tfidf(texts)

The quick brown fox jumps over the lazy dog
Richard of York gave the battle in vain
She does teach torches to shine bright
Evening is the best part of the day
It is just a flesh wound
I know that i know nothing


[{'The': 1.0986122886681098,
  'quick': 1.0986122886681098,
  'brown': 1.0986122886681098,
  'fox': 1.0986122886681098,
  'jumps': 1.0986122886681098,
  'over': 1.0986122886681098,
  'the': 0.4054651081081644,
  'lazy': 1.0986122886681098,
  'dog': 1.0986122886681098},
 {'Richard': 1.0986122886681098,
  'of': 0.6931471805599453,
  'York': 1.0986122886681098,
  'gave': 1.0986122886681098,
  'the': 0.4054651081081644,
  'battle': 1.0986122886681098,
  'in': 1.0986122886681098,
  'vain': 1.0986122886681098},
 {'She': 1.0986122886681098,
  'does': 1.0986122886681098,
  'teach': 1.0986122886681098,
  'torches': 1.0986122886681098,
  'to': 1.0986122886681098,
  'shine': 1.0986122886681098,
  'bright': 1.0986122886681098},
 {'Evening': 1.0986122886681098,
  'is': 0.6931471805599453,
  'the': 0.8109302162163288,
  'best': 1.0986122886681098,
  'part': 1.0986122886681098,
  'of': 0.6931471805599453,
  'day': 1.0986122886681098},
 {'It': 1.0986122886681098,
  'is': 0.6931471805599453,
  'just': 

## Cos similarity (most_similar)

Cosine similary of the words is  
1. get two words in vector interpretation
2. Normalize them
3. Get their dot product
4. $cos\ similarity = \frac{word\_1\ @\ word\_2}{len(word\_1)\ *\  len(word\_2)}$
5. 1 means identical vectors, -1 - opposite, 0 -  no similarity\orthogonal directions (no shared context)

In [None]:
vectorized.wv.most_similar('faux')

[('pas', 0.983573853969574),
 ('extracurricular', 0.7581830620765686),
 ('cultural', 0.7327991127967834),
 ('traditions', 0.7185940146446228),
 ('appropriation', 0.7176169157028198),
 ('etiquette', 0.7153453826904297),
 ('pitfalls', 0.713373601436615),
 ('norms', 0.6962264180183411),
 ('brutality', 0.6933082342147827),
 ('criticisms', 0.6926724910736084)]

# Pre-trained embedding model

In [None]:
api.info() #ruscorpora-300

In [None]:
model = api.load('glove-twitter-100') #weights ~400mb

In [None]:
model.most_similar(["game", "multiplayer"], negative = ["fun"])

[('gameplay', 0.7124838829040527),
 ('crysis', 0.6885968446731567),
 ('dishonored', 0.683879554271698),
 ('battlefield', 0.6811695694923401),
 ('fps', 0.6733326315879822),
 ('rpg', 0.6518135070800781),
 ('starcraft', 0.6491492986679077),
 ('mmorpg', 0.642058253288269),
 ('simcity', 0.6411633491516113),
 ('dlc', 0.6395991444587708)]

In [None]:
words = model.index_to_key[:1000]
print(words)

### Get vectorized words

In [None]:
words_array = np.array([model[word] for word in words])

### Bathe in tests

In [None]:
assert isinstance(words_array, np.ndarray)
assert words_array.shape == (len(words), 100)
assert np.isfinite(words_array).all()

## Use PCA and normalize. reduce to two dim

In [None]:
pca = PCA(n_components=2)
components = pca.fit_transform(words_array)

scaler = StandardScaler()
pca_normalized = scaler.fit_transform(components)

### Bathe in tests

In [None]:
assert pca_normalized.shape == (len(words_array), 2), "there must be a 2d vector for each word"
assert max(abs(pca_normalized.mean(0))) < 1e-5, "points must be zero-centered"
assert max(abs(1.0 - pca_normalized.std(0))) < 1e-2, "points must have unit variance"

### Draw. (Simply copied snippet)

In [None]:
output_notebook() #bokeh.io
def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """ draws an interactive plot for data points with auxilirary info on hover """
    if isinstance(color, str): color = [color] * len(x)
    data_source = bm.ColumnDataSource({ 'x' : x, 'y' : y, 'color': color, **kwargs })

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    fig.add_tools(bm.HoverTool(tooltips=[(key, "@" + key) for key in kwargs.keys()]))
    if show: pl.show(fig)
    return fig

draw_vectors(pca_normalized[:, 0], pca_normalized[:, 1], token=words)

## t-SNE (t-distributed Stochastic neighbour embedding)

In [None]:
tsne = TSNE(n_components=2)
tsne_result  = tsne.fit_transform(words_array)

In [None]:
draw_vectors(tsne_result[:, 0], tsne_result[:, 1], color='green', token=words)

## Process phrases

In [None]:
ZEROS=0

In [None]:
def embed_phrase(model, phrase: str, dataset: bool=True):
  """
  args:
    model: - word embedding model
    phrase: str - text phrase
  process:
    1. split, lowercase
    2. tokenize
    3. avg word vectors for all words in phrase
    skip words unknown to model
    if none of the words in model return zeros

  """
  vectors = []
  for token in tokenizer.tokenize(phrase.lower()):
    if token in model.key_to_index:
      vectors.append(model.get_vector(token))
    else:
      pass
      #print(token, "not in model vocabulary")
  if len(vectors) == 0:
    print('Unknown phrase', phrase)
    if dataset:
      global ZEROS      #absurdly inelegant decision but test require that
      ZEROS+=1
    return np.zeros([model.vector_size], dtype='float32')

  return np.mean(vectors, axis=0)

In [None]:
embed_phrase(model, 'jksjkskj shssj')

Unknown phrase jksjkskj shssj


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

###Moar tests

In [None]:
vector = embed_phrase(model, "I'm very sure. This never happened to me before... gjkdfs jnvksc ")
assert np.allclose(vector[::10],
                   np.array([ 0.31807372, -0.02558171,  0.0933293 , -0.1002182 , -1.0278689 ,
                             -0.16621883,  0.05083408,  0.17989802,  1.3701859 ,  0.08655966],
                              dtype=np.float32))

... not in model vocabulary
gjkdfs not in model vocabulary
jnvksc not in model vocabulary


In [None]:
chosen_phrases = data[::len(data) // 1000]
phrase_vectors = np.array([embed_phrase(model, vec) for vec in chosen_phrases])

### Pour some tests on meee

In [None]:
assert isinstance(phrase_vectors, np.ndarray) and np.isfinite(phrase_vectors).all()
assert phrase_vectors.shape == (len(chosen_phrases), model.vector_size)

## Visualize phrases

In [None]:
phrase_vectors_2d = TSNE().fit_transform(phrase_vectors)

phrase_vectors_2d = (phrase_vectors_2d - phrase_vectors_2d.mean(axis=0)) / phrase_vectors_2d.std(axis=0)

draw_vectors(phrase_vectors_2d[:, 0], phrase_vectors_2d[:, 1],
             phrase=[phrase[:50] for phrase in chosen_phrases],
             radius=20)

## KNN "similar questions" engine

In [None]:
ZEROS=0
data_vectors = np.array([embed_phrase(model, phrase) for phrase in data])

Unknown phrase AOSDHIADSOIHADSO DASODASHDASOH

Unknown phrase parisflatlist



In [None]:
#datavectors = np.array([x for x in data_vectors if np.any(x !=0)])
#seems like this is not a solution, i'd still just exclude the unknown values
#since all zeros mean maximum cosine similarity

In [None]:
data_vectors[35227]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

Wow, that took a while

In [None]:
def k_nearest(model, phrase, k=10, zeros=ZEROS):
  vector = embed_phrase(model, phrase, dataset=False)
  #cosine similarities each row of phrases * phrase (sum of multiplied vectors by row)
  #divided by / element wise multipliation of  product of L2 (Eucledian) norms by cols
  #by same l2 norm for the  entire vector array (which is scalar)
  similarity_scores = np.dot(data_vectors, vector) / (np.linalg.norm(data_vectors, axis=1) * np.linalg.norm(vector))
  #sort cos similarities (ascending order), take last K indices and reverse them
  top_idx = np.argsort(similarity_scores)[-k-zeros:-zeros][::-1]
  #select resulting indices from the original dataset
  top_phrases = [data[i] for i in top_idx]
  return top_phrases

In [None]:
results = k_nearest(model, "How do i enter the matrix?", k=10)

print(''.join(results))

assert len(results) == 10 and isinstance(results[0], str)
assert results[0] == 'How do I get to the dark web?\n'
assert results[3] == 'What can I do to save the world?\n'

How do I get to the dark web?
What should I do to enter hollywood?
How do I use the Greenify app?
What can I do to save the world?
How do I win this?
How do I think out of the box? How do I learn to think out of the box?
How do I find the 5th dimension?
How do I use the pad in MMA?
How do I estimate the competition?
What do I do to enter the line of event management?



  similarity_scores = np.dot(data_vectors, vector) / (np.linalg.norm(data_vectors, axis=1) * np.linalg.norm(vector))
