## word2vec: How to Implement word2vec

### Explore Pre-trained Embeddings

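This notebook loads `glove-wiki-gigaword-100`; some other pre-trained embeddings available through `gensim.downloader`: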
   * glove-twitter-{25/50/100/200}
   * glove-wiki-gigaword-{50/200/300}
   * word2vec-google-news-300
   * word2vec-ruscorpora-300

In [1]:
# Install gensim

# !pip install -U gensim

In [2]:
# Load pre-trained GloVe word vectors through gensim's downloader API
import gensim.downloader as api

wiki_embeddings = api.load('glove-wiki-gigaword-100')

In [3]:
# Look up the pre-trained embedding vector for the word 'king'
wiki_embeddings['king']

array([-0.32307 , -0.87616 ,  0.21977 ,  0.25268 ,  0.22976 ,  0.7388  ,
       -0.37954 , -0.35307 , -0.84369 , -1.1113  , -0.30266 ,  0.33178 ,
       -0.25113 ,  0.30448 , -0.077491, -0.89815 ,  0.092496, -1.1407  ,
       -0.58324 ,  0.66869 , -0.23122 , -0.95855 ,  0.28262 , -0.078848,
        0.75315 ,  0.26584 ,  0.3422  , -0.33949 ,  0.95608 ,  0.065641,
        0.45747 ,  0.39835 ,  0.57965 ,  0.39267 , -0.21851 ,  0.58795 ,
       -0.55999 ,  0.63368 , -0.043983, -0.68731 , -0.37841 ,  0.38026 ,
        0.61641 , -0.88269 , -0.12346 , -0.37928 , -0.38318 ,  0.23868 ,
        0.6685  , -0.43321 , -0.11065 ,  0.081723,  1.1569  ,  0.78958 ,
       -0.21223 , -2.3211  , -0.67806 ,  0.44561 ,  0.65707 ,  0.1045  ,
        0.46217 ,  0.19912 ,  0.25802 ,  0.057194,  0.53443 , -0.43133 ,
       -0.34311 ,  0.59789 , -0.58417 ,  0.068995,  0.23944 , -0.85181 ,
        0.30379 , -0.34177 , -0.25746 , -0.031101, -0.16285 ,  0.45169 ,
       -0.91627 ,  0.64521 ,  0.73281 , -0.22752 , 

In [4]:
# Find the words most similar to 'queen' based on the pre-trained word vectors

wiki_embeddings.most_similar('queen')

[('princess', 0.7947245240211487),
 ('king', 0.7507690787315369),
 ('elizabeth', 0.7355712652206421),
 ('royal', 0.7065026760101318),
 ('lady', 0.7044796943664551),
 ('victoria', 0.6853758096694946),
 ('monarch', 0.6683257818222046),
 ('crown', 0.6680562496185303),
 ('prince', 0.6640505790710449),
 ('consort', 0.6570538282394409)]

### Train Our Own Model

In [5]:
# Import the libraries used to train our own model and configure pandas display
import gensim
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Allow up to 100 characters per column so message text is readable in displayed frames
pd.set_option('display.max_colwidth', 100)

In [6]:
# Read in the data, discard the three unnamed extra columns, and clean up the column names
messages = (
    pd.read_csv('data/spam.csv', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
)
messages.columns = ['label', 'text']
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [7]:
# Tokenize every message with gensim's built-in preprocessor
# (judging by the rows displayed below, it lowercases and drops digits, punctuation and one-letter tokens)

messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)
messages.head()

Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, until, jurong, point, crazy, available, only, in, bugis, great, world, la, buffet, cine, th..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, wkly, comp, to, win, fa, cup, final, tkts, st, may, text, fa, to, to, receive,..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, don, think, he, goes, to, usf, he, lives, around, here, though]"


In [8]:
# Split data into train and test set (80% / 20%).
# A fixed random_state keeps the split identical on every Restart & Run All.

X_train, X_test, y_train, y_test = train_test_split(
    messages['text_clean'],
    messages['label'],
    test_size=0.2,
    random_state=42,
)

In [9]:
# Train the word2vec model on the tokenized training messages.
# gensim 4.0 renamed the `size` argument to `vector_size`; the old keyword raises a
# TypeError on gensim >= 4, which is what `pip install -U gensim` installs.
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,  # dimensionality of each word vector
                                   window=5,         # context window around each target word
                                   min_count=2)      # skip words that occur fewer than 2 times

In [10]:
# Explore the word vector for 'king' based on our trained model
w2v_model.wv['king']

array([-0.0382229 , -0.08024513,  0.00363246, -0.00600205,  0.00704879,
        0.00827206, -0.02581133, -0.02520023, -0.01668486, -0.00843029,
        0.03472609, -0.01818832,  0.02245494, -0.03729883,  0.01100051,
       -0.00573212, -0.03460041,  0.0910534 ,  0.030505  , -0.05989588,
        0.00347106, -0.05961369, -0.06237208, -0.06955219,  0.01672626,
        0.00273216,  0.01379673, -0.02019451, -0.04200555,  0.02316518,
       -0.015997  , -0.06479035, -0.0141787 ,  0.02666479,  0.028094  ,
       -0.04148806,  0.01400859, -0.00085329,  0.03799758,  0.02256233,
        0.04087701,  0.03179458, -0.05219363, -0.00702705,  0.05379624,
        0.01334256, -0.03127933, -0.0758174 , -0.05065704,  0.01773514,
       -0.01881688, -0.02277704, -0.03768161,  0.04188111,  0.00352809,
       -0.03256638,  0.04371538,  0.02179884, -0.01535647,  0.06309256,
       -0.01528405, -0.01593807, -0.04690412,  0.00105966, -0.01214068,
       -0.01989657,  0.00788707, -0.00904602,  0.01925175, -0.02

In [11]:
# Find the most similar words to 'king' based on word vectors from our trained model
# NOTE(review): in the recorded output every neighbour scores ~0.997 and none is related to 'king',
# so these vectors look barely differentiated -- presumably the corpus is too small; confirm.
w2v_model.wv.most_similar('king')

[('national', 0.9970955848693848),
 ('busy', 0.9970611333847046),
 ('ugh', 0.9970089793205261),
 ('able', 0.9969658255577087),
 ('again', 0.9969191551208496),
 ('both', 0.9969143867492676),
 ('eat', 0.9968969821929932),
 ('ok', 0.9968967437744141),
 ('now', 0.9968866109848022),
 ('after', 0.9968665242195129)]