# Word2Vec:

Here I train a Word2Vec model from scratch with gensim. I am not using a pre-trained model
such as Google's word2vec-google-news-300, which provides 300-dimensional vectors for each word.

In [1]:
import re
import gensim
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import numpy as np

In [2]:
# Let's start with a small piece of text data first:

data= 'hello everybody!. Welcome to NLP tutorial. We are going to see the simple text preprocessing. Going \
to use stemmer & lemmatization from nltk-library.'

In [3]:
# Let's break the text down into sentences:

sentences= nltk.sent_tokenize(data)

In [4]:
sentences

['hello everybody!.',
 'Welcome to NLP tutorial.',
 'We are going to see the simple text preprocessing.',
 'Going to use stemmer & lemmatization from nltk-library.']

In [5]:
# Clean the data, drop stop words and apply lemmatization


lemma= WordNetLemmatizer()

# for sents in corpus:
#     words= nltk.word_tokenize(sents)
#     for word in words:
#         if word not in set(stopwords.words('english')):
#             print(lemma.lemmatize(word))
                           
# Build the stop-word set once; rebuilding it for every token is
# loop-invariant work.
stop_words = set(stopwords.words('english'))

corpus = []
for sentence in sentences:
    # Replace every character that is not a letter or digit with a space.
    cleaned = re.sub('[^a-zA-Z0-9]', ' ', sentence)
    tokens = nltk.word_tokenize(cleaned)
    # NOTE: the stop-word test is case-sensitive, so capitalized tokens
    # such as 'We' are kept (as the echoed corpus shows).
    kept = [lemma.lemmatize(token) for token in tokens if token not in stop_words]
    corpus.append(' '.join(kept))

In [6]:
corpus

['hello everybody',
 'Welcome NLP tutorial',
 'We going see simple text preprocessing',
 'Going use stemmer lemmatization nltk library']

In [7]:
# Here simple_preprocess lowercases the words and splits each sentence into a list of word tokens.

# One token list per cleaned sentence.
words = [simple_preprocess(sentence) for sentence in corpus]

In [8]:
words

[['hello', 'everybody'],
 ['welcome', 'nlp', 'tutorial'],
 ['we', 'going', 'see', 'simple', 'text', 'preprocessing'],
 ['going', 'use', 'stemmer', 'lemmatization', 'nltk', 'library']]

In [9]:
# Let's build the Word2Vec model. By default, each word vector has 100 dimensions.

# Train on the tokenized sentences; min_count=1 keeps every token in the
# vocabulary, which is needed for a corpus this small.
model = gensim.models.Word2Vec(
    sentences=words,
    window=3,
    min_count=1,
)


In [10]:
# Vocabulary of the trained model (every distinct token, since min_count=1).
model.wv.index_to_key

['going',
 'library',
 'nltk',
 'lemmatization',
 'stemmer',
 'use',
 'preprocessing',
 'text',
 'simple',
 'see',
 'we',
 'tutorial',
 'nlp',
 'welcome',
 'everybody',
 'hello']

In [11]:
# Words ranked by their similarity score to 'use', highest first.
# NOTE(review): with only four training sentences these scores are unlikely
# to be meaningful.
model.wv.similar_by_word('use')

[('tutorial', 0.16694684326648712),
 ('stemmer', 0.13887985050678253),
 ('lemmatization', 0.13149002194404602),
 ('nltk', 0.06408979743719101),
 ('simple', 0.06059184670448303),
 ('welcome', 0.044106729328632355),
 ('we', 0.020000355318188667),
 ('preprocessing', 0.019152285531163216),
 ('library', 0.009391160681843758),
 ('hello', -0.0262751504778862)]

In [12]:
# Number of sentences in the training corpus (4 here).
model.corpus_count

4

In [13]:
# Number of training epochs; no value was passed to Word2Vec, so this is the default.
model.epochs

5

In [14]:
# Shape of a single word vector: one value per embedding dimension.
model.wv['going'].shape

(100,)

In [15]:
# Similarity score between the vectors of two specific words.
model.wv.similarity('going','see')

0.21617141

In [16]:
# Vocabulary words ranked by similarity score to 'going', highest first.
model.wv.most_similar('going')

[('see', 0.21617142856121063),
 ('nlp', 0.0931011214852333),
 ('text', 0.09291722625494003),
 ('tutorial', 0.07963486760854721),
 ('we', 0.06285078823566437),
 ('hello', 0.05433367192745209),
 ('simple', 0.0270574688911438),
 ('preprocessing', 0.016134677454829216),
 ('library', -0.01083916611969471),
 ('stemmer', -0.027750369161367416)]

In [17]:
# Let's collect the vectors for all the words in each sentence

def w2vector(doc):
    """Return the word vectors for the tokens of *doc*.

    Reads the module-level ``model``. Tokens that are not in the model's
    vocabulary are skipped, so the result may be shorter than *doc*.

    Args:
        doc: Iterable of word tokens.

    Returns:
        A list with one vector per in-vocabulary token, in input order.
    """
    # key_to_index is a dict, so membership is a hash lookup rather than a
    # scan of the index_to_key list for every token.
    vocab = model.wv.key_to_index
    return [model.wv[word] for word in doc if word in vocab]

In [18]:
# One list of word vectors per tokenized sentence.
word_vectors = [w2vector(tokens) for tokens in words]

In [19]:
type(word_vectors)

list

In [21]:
word_vectors[0]

[array([-7.1909428e-03,  4.2328904e-03,  2.1633946e-03,  7.4407146e-03,
        -4.8892652e-03, -4.5643463e-03, -6.0981740e-03,  3.2993674e-03,
        -4.4994629e-03,  8.5228849e-03, -4.2888271e-03, -9.1054197e-03,
        -4.8163556e-03,  6.4164903e-03, -6.3713240e-03, -5.2615367e-03,
        -7.3044109e-03,  6.0222615e-03,  3.3575939e-03,  2.8483903e-03,
        -3.1385506e-03,  6.0308911e-03, -6.1527453e-03, -1.9801008e-03,
        -5.9830821e-03, -9.9568011e-04, -2.0209861e-03,  8.4859459e-03,
         7.8001023e-05, -8.5753258e-03, -5.4290984e-03, -6.8759858e-03,
         2.6923812e-03,  9.4566476e-03, -5.8159959e-03,  8.2650259e-03,
         8.5320519e-03, -7.0626391e-03, -8.8832127e-03,  9.4691841e-03,
         8.3743641e-03, -4.6908916e-03, -6.7260410e-03,  7.8421365e-03,
         3.7633455e-03,  8.0955038e-03, -7.5715459e-03, -9.5250849e-03,
         1.5774060e-03, -9.8057678e-03, -4.8858845e-03, -3.4601032e-03,
         9.6209226e-03,  8.6235693e-03, -2.8356076e-03,  5.82687

# AVG_WORD2VEC

In [22]:
def avg_w2vec(doc):
    """Return the element-wise mean of the word vectors for *doc*.

    Reads the module-level ``model``. Tokens that are not in the model's
    vocabulary are ignored.

    Args:
        doc: Iterable of word tokens.

    Returns:
        A 1-D array with one value per embedding dimension. If no token of
        *doc* is in the vocabulary, a zero vector is returned instead of the
        NaN (plus RuntimeWarning) that ``np.mean`` gives for an empty list.
    """
    # key_to_index is a dict, so membership is a hash lookup rather than a
    # scan of the index_to_key list for every token.
    vocab = model.wv.key_to_index
    vectors = [model.wv[word] for word in doc if word in vocab]
    if not vectors:
        return np.zeros(model.wv.vector_size, dtype=model.wv.vectors.dtype)
    return np.mean(vectors, axis=0)

In [23]:
# One averaged vector per tokenized sentence.
avg_word_vectors = [avg_w2vec(tokens) for tokens in words]

In [24]:
avg_word_vectors[0]

array([ 2.52851751e-04,  6.67676609e-03,  1.64944818e-03, -4.42182412e-04,
        1.76787539e-03, -4.13028849e-03, -1.78000424e-04,  3.84547329e-03,
        2.59524095e-03, -3.85306310e-04,  2.45978916e-03, -9.19347443e-03,
       -5.86203393e-03, -1.34285213e-03, -5.95921697e-03,  1.05367973e-03,
        9.30033391e-04,  1.34845497e-03,  3.54032218e-03, -3.88406566e-04,
        2.37146020e-03,  5.94888348e-03, -3.07626836e-03, -2.80438783e-03,
       -6.60369406e-03,  1.88646792e-03, -2.84003618e-04,  2.93637998e-03,
        3.95790394e-03, -6.31246995e-03, -7.28904223e-03, -4.56572836e-03,
        1.40876416e-03,  1.40869617e-03, -5.65130590e-03, -1.17375515e-04,
        8.88096262e-03,  1.80694507e-04, -4.58922796e-03,  8.41842405e-03,
        8.16257671e-03, -2.73723248e-03, -5.69750555e-05,  5.80483023e-03,
        4.42009419e-03,  7.67424749e-03, -6.15546759e-03, -5.85530885e-03,
        1.22526474e-03, -2.78478093e-03, -7.90776568e-04,  8.17862106e-04,
        7.10370392e-03,  