# word2vec Implementation

In [2]:
# importing all necessary modules 
from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings 
  
warnings.filterwarnings(action = 'ignore') 
  
import gensim 
from gensim.models import Word2Vec 

In [10]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open for the
# lifetime of the kernel.
with open("../data/alice.txt", "r") as sample:
    s = sample.read()

In [14]:
# Flatten the text onto a single line: every newline becomes one space
f = " ".join(s.split("\n"))

In [15]:
# Build the training corpus: one list of lower-cased word tokens per sentence
data = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(f)
]

In [16]:
# Create CBOW model (no `sg` argument is passed; the training log reports sg=0).
# min_count = 1 keeps every word, including those that occur only once.
# NOTE: `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to
# `vector_size`, so adjust the keyword if running on a newer release.
model1 = gensim.models.Word2Vec(data, min_count = 1, size = 100, window = 5)

# Print results
# Similarity is queried through the model's KeyedVectors (`.wv`); calling
# `similarity` directly on the Word2Vec object is deprecated.
print("Cosine similarity between 'alice' and 'wonderland' - CBOW : ",
      model1.wv.similarity('alice', 'wonderland'))

print("Cosine similarity between 'alice' and 'machines' - CBOW : ",
      model1.wv.similarity('alice', 'machines'))

INFO - 16:43:15: collecting all words and their counts
INFO - 16:43:15: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:43:15: collected 3532 word types from a corpus of 38098 raw words and 1102 sentences
INFO - 16:43:15: Loading a fresh vocabulary
INFO - 16:43:15: effective_min_count=1 retains 3532 unique words (100% of original 3532, drops 0)
INFO - 16:43:15: effective_min_count=1 leaves 38098 word corpus (100% of original 38098, drops 0)
INFO - 16:43:16: deleting the raw counts dictionary of 3532 items
INFO - 16:43:16: sample=0.001 downsamples 51 most-common words
INFO - 16:43:16: downsampling leaves estimated 24840 word corpus (65.2% of prior 38098)
INFO - 16:43:16: estimated required memory for 3532 words and 100 dimensions: 4591600 bytes
INFO - 16:43:16: resetting layer weights
INFO - 16:43:16: training model with 3 workers on 3532 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
INFO - 16:43:16: worker thread finished; awa

Cosine similarity between 'alice' and 'wonderland' - CBOW :  0.9994733
Cosine similarity between 'alice' and 'machines' - CBOW :  0.9893601


In [17]:
# Create Skip Gram model (sg = 1 selects skip-gram instead of CBOW).
# min_count = 1 keeps every word, including those that occur only once.
# NOTE: `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to
# `vector_size`, so adjust the keyword if running on a newer release.
model2 = gensim.models.Word2Vec(data, min_count = 1, size = 100, window = 5, sg = 1)

# Print results
# Similarity is queried through the model's KeyedVectors (`.wv`); calling
# `similarity` directly on the Word2Vec object is deprecated.
print("Cosine similarity between 'alice' and 'wonderland' - Skip Gram : ",
      model2.wv.similarity('alice', 'wonderland'))

print("Cosine similarity between 'alice' and 'machines' - Skip Gram : ",
      model2.wv.similarity('alice', 'machines'))

INFO - 16:43:42: collecting all words and their counts
INFO - 16:43:42: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:43:42: collected 3532 word types from a corpus of 38098 raw words and 1102 sentences
INFO - 16:43:42: Loading a fresh vocabulary
INFO - 16:43:42: effective_min_count=1 retains 3532 unique words (100% of original 3532, drops 0)
INFO - 16:43:42: effective_min_count=1 leaves 38098 word corpus (100% of original 38098, drops 0)
INFO - 16:43:42: deleting the raw counts dictionary of 3532 items
INFO - 16:43:42: sample=0.001 downsamples 51 most-common words
INFO - 16:43:42: downsampling leaves estimated 24840 word corpus (65.2% of prior 38098)
INFO - 16:43:42: estimated required memory for 3532 words and 100 dimensions: 4591600 bytes
INFO - 16:43:42: resetting layer weights
INFO - 16:43:43: training model with 3 workers on 3532 vocabulary and 100 features, using sg=1 hs=0 sample=0.001 negative=5 window=5
INFO - 16:43:43: worker thread finished; awa

Cosine similarity between 'alice' and 'wonderland' - Skip Gram :  0.8719112
Cosine similarity between 'alice' and 'machines' - Skip Gram :  0.842918


**References**
- [Word Embedding using Word2Vec](https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/)
- [Gensim Word2Vec Tutorial](https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial/data?select=simpsons_dataset.csv)