## Vector Representations: word2vec in Python 3.6

DOCUMENTATION 
https://radimrehurek.com/gensim/models/word2vec.html

https://rare-technologies.com/word2vec-tutorial/

http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/

### Importing Packages

In [1]:
import numpy as np
import pandas as pd

# --- NLTK PACKAGE ---
import nltk
# Tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize, PunktSentenceTokenizer, RegexpTokenizer
# Stemming and Lemmatizing
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Stopwords
from nltk.corpus import stopwords, state_union, brown, movie_reviews, treebank

# --- GENSIM PACKAGE ---
import gensim, logging
from gensim.models import Word2Vec, Doc2Vec

### Loading Datasets/Inputs

In [2]:
# Sentences
brown_sents = brown.sents()
movie_sents= movie_reviews.sents()
treebank_sents = treebank.sents()

In [3]:
test_input = '''My name is Pranjal Pathak. 
                My gender is Male. I am 23 years old. 
                I live in Bangalore. I like driving. 
                I have lived in Varanasi before but I like Bangalore more. 
                Phani is a nice girl. Her gender is Female.'''

### Tokenizing

In [4]:
my_sents = sent_tokenize(test_input)

In [5]:
my_sent_words = []

for line in my_sents:
    my_sent_words.append(word_tokenize(line))

In [6]:
my_sent_words

[['My', 'name', 'is', 'Pranjal', 'Pathak', '.'],
 ['My', 'gender', 'is', 'Male', '.'],
 ['I', 'am', '23', 'years', 'old', '.'],
 ['I', 'live', 'in', 'Bangalore', '.'],
 ['I', 'like', 'driving', '.'],
 ['I',
  'have',
  'lived',
  'in',
  'Varanasi',
  'before',
  'but',
  'I',
  'like',
  'Bangalore',
  'more',
  '.'],
 ['Phani', 'is', 'a', 'nice', 'girl', '.'],
 ['Her', 'gender', 'is', 'Female', '.']]

### MODEL

In [7]:
''' MODEL ARCHITECTURE

    Vocab(V) = {word1, word2, word3,...., wordV}; Set of all unique words in the input doc
    
                  Input = Word1 [1,0,0,0,.....0]; V dim
           Hidden Layer = 600 Neurons; Weights  = word1: w1,w2,w3,....wn; N dim weights
    Second Hidden Layer = 600 Neurons; Weights' = w'1,w'2,w'3,....w'n; N dim weights'
        Output(Softmax) = [0.78, 0.21, 0.11, ....]; V dim (Prob of relation of word1 with other words)
        
    KEY--
    `size` is the dimensionality of the feature vectors = 100; 100 weights or features(w0,w1,w2......w99)
    `window` is the maximum distance between the current and predicted word within a sentence.
    `min_count` = ignore all words with total frequency lower than this.
'''

## Training our model with our input data
model_word2vec = Word2Vec(my_sent_words, size = 100, window = 10, hs=1, negative=0, workers = 4, min_count=1)

### Word2Vec Methods

In [8]:
# Most Similar n words with prob
model_word2vec.most_similar('Varanasi', topn=5)

[('Her', 0.14535093307495117),
 ('.', 0.1166599690914154),
 ('23', 0.11133888363838196),
 ('nice', 0.09093289077281952),
 ('I', 0.08679273724555969)]

In [9]:
model_word2vec.doesnt_match("Pranjal")

'a'

In [10]:
# Comparison between two words
model_word2vec.similarity('Bangalore', 'My')*100

6.2146100324732911

In [11]:
# Array of Vectors
model_word2vec['Bangalore']

array([-0.00014494,  0.00131348,  0.00441421,  0.00300143,  0.00473957,
        0.00296812, -0.00146338, -0.0013057 ,  0.00242703,  0.00011909,
       -0.00186478, -0.00403753, -0.0013407 ,  0.00423598, -0.0024287 ,
       -0.00244268,  0.00202265,  0.00329478,  0.00188473,  0.00108233,
       -0.0017921 , -0.00485223,  0.00362264, -0.00436838, -0.00102882,
        0.00131091, -0.00204977,  0.00246129,  0.00071651,  0.00208738,
        0.00320785, -0.00190853, -0.00083322,  0.0049674 ,  0.00241672,
       -0.0021599 , -0.00330988,  0.00305857, -0.00023285, -0.00459173,
       -0.00202542, -0.00114177, -0.00228285, -0.00089524,  0.0035717 ,
       -0.00485337,  0.00451534,  0.00496266,  0.00022294,  0.00022845,
       -0.00487773, -0.00331864,  0.00469837, -0.00227686,  0.00336816,
       -0.00282172, -0.00391813, -0.00362246, -0.00134024,  0.0028178 ,
        0.00066636, -0.00085644, -0.00201165,  0.0043496 , -0.00130573,
        0.00073373, -0.00222039, -0.00478293, -0.00183628, -0.00

In [12]:
model_word2vec.score(["My name is Pranjal".split()])[0]

-14.195734

In [13]:
model_word2vec.most_similar(positive=['Male', 'Female'], negative=['Pranjal'])

[('.', 0.27271872758865356),
 ('a', 0.20817896723747253),
 ('like', 0.18685702979564667),
 ('Her', 0.18171341717243195),
 ('before', 0.1614219844341278),
 ('in', 0.13635139167308807),
 ('Phani', 0.1289747804403305),
 ('lived', 0.1246543675661087),
 ('girl', 0.11097618192434311),
 ('nice', 0.0878562405705452)]

In [14]:
model_word2vec.most_similar_cosmul(positive=['Male', 'Female'], negative=['Pranjal'])

[('.', 0.7263284921646118),
 ('a', 0.6934047937393188),
 ('Her', 0.6659020781517029),
 ('like', 0.6518145203590393),
 ('before', 0.6323356628417969),
 ('in', 0.6142411828041077),
 ('lived', 0.597561776638031),
 ('Phani', 0.5940378904342651),
 ('girl', 0.5771189332008362),
 ('nice', 0.576112687587738)]