## Training Embeddings

Word embeddings are an approach to representing text in NLP. In this notebook we will demonstrate how to train embeddings using Genism. Gensim is an open source Python library for natural language processing, with a focus on topic modeling

In [1]:
from gensim.models import Word2Vec

In [2]:
# define training data
#Genism word2vec requires that a format of ‘list of lists’ be provided for training where every document contained in a list.
#Every list contains lists of tokens of that document.
corpus = [['dog','bites','man'], ["man", "bites" ,"dog"],["dog","eats","meat"],["man", "eats","food"]]

Hyperparameters : 
    
- **sg - Selecting the training algorithm: 1 for skip-gram else its 0 for CBOW. Default is CBOW.**
- **min_count- Ignores all words with total frequency lower than this.**

In [3]:
#Training the model
model_cbow = Word2Vec(corpus, min_count=1,sg=0) #using CBOW Architecture for trainnig
model_skipgram = Word2Vec(corpus, min_count=1,sg=1)#using skipGram Architecture for training 

### Continuous Bag of Words (CBOW)

In [4]:
#Summarize the loaded model
print(model_cbow)

#Summarize vocabulary
words = list(model_cbow.wv.key_to_index)
print(words)

Word2Vec<vocab=6, vector_size=100, alpha=0.025>
['man', 'dog', 'eats', 'bites', 'food', 'meat']


In [5]:
#Acess vector for one word
print(model_cbow.wv['dog'])

[-8.6196875e-03  3.6657380e-03  5.1898835e-03  5.7419385e-03
  7.4669183e-03 -6.1676754e-03  1.1056137e-03  6.0472824e-03
 -2.8400505e-03 -6.1735227e-03 -4.1022300e-04 -8.3689485e-03
 -5.6000124e-03  7.1045388e-03  3.3525396e-03  7.2256695e-03
  6.8002474e-03  7.5307419e-03 -3.7891543e-03 -5.6180597e-04
  2.3483764e-03 -4.5190323e-03  8.3887316e-03 -9.8581640e-03
  6.7646410e-03  2.9144168e-03 -4.9328315e-03  4.3981876e-03
 -1.7395747e-03  6.7113843e-03  9.9648498e-03 -4.3624435e-03
 -5.9933780e-04 -5.6956373e-03  3.8508223e-03  2.7866268e-03
  6.8910765e-03  6.1010956e-03  9.5384968e-03  9.2734173e-03
  7.8980681e-03 -6.9895042e-03 -9.1558648e-03 -3.5575271e-04
 -3.0998408e-03  7.8943167e-03  5.9385742e-03 -1.5456629e-03
  1.5109634e-03  1.7900408e-03  7.8175711e-03 -9.5101865e-03
 -2.0553112e-04  3.4691966e-03 -9.3897223e-04  8.3817719e-03
  9.0107834e-03  6.5365066e-03 -7.1162102e-04  7.7104042e-03
 -8.5343346e-03  3.2071066e-03 -4.6379971e-03 -5.0889552e-03
  3.5896183e-03  5.37033

In [6]:
#Compute similarity 
print("Similarity between eats and bites:",model_cbow.wv.similarity('eats', 'bites'))
print("Similarity between eats and man:",model_cbow.wv.similarity('eats', 'man'))

Similarity between eats and bites: -0.013497081
Similarity between eats and man: -0.052354366


In [7]:
#Most similarity
model_cbow.wv.most_similar('meat')

[('food', 0.13887982070446014),
 ('bites', 0.13149003684520721),
 ('eats', 0.06422407925128937),
 ('dog', 0.009391157887876034),
 ('man', -0.05987631157040596)]

### SkipGram

In [8]:
#Summarize the loaded model
print(model_skipgram)

#Summarize vocabulary
words = list(model_skipgram.wv.key_to_index)
print(words)

#Acess vector for one word
print(model_skipgram.wv['dog'])

Word2Vec<vocab=6, vector_size=100, alpha=0.025>
['man', 'dog', 'eats', 'bites', 'food', 'meat']
[-8.6196875e-03  3.6657380e-03  5.1898835e-03  5.7419385e-03
  7.4669183e-03 -6.1676754e-03  1.1056137e-03  6.0472824e-03
 -2.8400505e-03 -6.1735227e-03 -4.1022300e-04 -8.3689485e-03
 -5.6000124e-03  7.1045388e-03  3.3525396e-03  7.2256695e-03
  6.8002474e-03  7.5307419e-03 -3.7891543e-03 -5.6180597e-04
  2.3483764e-03 -4.5190323e-03  8.3887316e-03 -9.8581640e-03
  6.7646410e-03  2.9144168e-03 -4.9328315e-03  4.3981876e-03
 -1.7395747e-03  6.7113843e-03  9.9648498e-03 -4.3624435e-03
 -5.9933780e-04 -5.6956373e-03  3.8508223e-03  2.7866268e-03
  6.8910765e-03  6.1010956e-03  9.5384968e-03  9.2734173e-03
  7.8980681e-03 -6.9895042e-03 -9.1558648e-03 -3.5575271e-04
 -3.0998408e-03  7.8943167e-03  5.9385742e-03 -1.5456629e-03
  1.5109634e-03  1.7900408e-03  7.8175711e-03 -9.5101865e-03
 -2.0553112e-04  3.4691966e-03 -9.3897223e-04  8.3817719e-03
  9.0107834e-03  6.5365066e-03 -7.1162102e-04  7.7

In [9]:
#Compute similarity 
print("Similarity between eats and bites:",model_skipgram.wv.similarity('eats', 'bites'))
print("Similarity between eats and man:",model_skipgram.wv.similarity('eats', 'man'))

Similarity between eats and bites: -0.013518769
Similarity between eats and man: -0.052345097


In [10]:
#Most similarity
model_skipgram.wv.most_similar('meat')

[('food', 0.13887983560562134),
 ('bites', 0.13149002194404602),
 ('eats', 0.06406079977750778),
 ('dog', 0.009391169995069504),
 ('man', -0.059876300394535065)]

## Training Your Embedding on Wiki Corpus

In [11]:
file_name = "./data/enwiki-latest-pages-articles14.xml-p13159683p14324602.bz2"

In [12]:
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText
import time

In [13]:
wiki = WikiCorpus(file_name,dictionary={})
sentences = list(wiki.get_texts())



In [14]:
len(sentences)

84243

In [15]:
start = time.time()
word2vec_cbow = Word2Vec(sentences,min_count=10, sg=0)
end = time.time()

print("CBOW Model Training Complete.\nTime taken for training is:{:.2f} hrs ".format((end-start)/3600.0))

CBOW Model Training Complete.
Time taken for training is:0.07 hrs 


In [16]:
#Summarize the loaded model
print(word2vec_cbow)
print("-"*30)

#Summarize vocabulary
words = list(word2vec_cbow.wv.key_to_index)
print(f"Length of vocabulary: {len(words)}")
print("Printing the first 30 words.")
print(words[:30])
print("-"*30)

#Acess vector for one word
print(f"Length of vector: {len(word2vec_cbow.wv['film'])}")
print(word2vec_cbow.wv['film'])
print("-"*30)

#Compute similarity 
print("Similarity between film and drama:",word2vec_cbow.wv.similarity('film', 'drama'))
print("Similarity between film and tiger:",word2vec_cbow.wv.similarity('film', 'tiger'))
print("-"*30)

Word2Vec<vocab=114701, vector_size=100, alpha=0.025>
------------------------------
Length of vocabulary: 114701
Printing the first 30 words.
['the', 'of', 'and', 'in', 'to', 'was', 'is', 'for', 'on', 'as', 'by', 'with', 'he', 'at', 'from', 'that', 'his', 'it', 'an', 'also', 'were', 'which', 'are', 'this', 'new', 'first', 'be', 'or', 'had', 'one']
------------------------------
Length of vector: 100
[ 2.4798193   1.6299644  -1.9524587  -2.1632767  -0.8015589  -2.4180744
 -0.9288921   0.6917903  -0.7427525  -0.9527898   0.33223364 -1.7923205
  1.3321561  -0.573788   -0.56123525  0.7331837  -0.7210635  -1.9034867
 -1.0594628  -3.8944588   0.82641625 -1.0394018  -0.27613482  0.999485
 -2.9894524   0.7632584  -2.2548194  -2.9809794  -0.796569    1.1142012
 -0.78130347  0.03968257  0.6680792   0.46015945  1.2809261   1.9576032
 -2.3656642   1.3889844   1.8410498  -2.6152942   0.4751164   0.5279017
 -0.09113584 -3.0415146  -1.247852   -0.46009392  2.3148615  -1.9470344
  3.566701   -2.957744

In [17]:
# save model
from gensim.models import Word2Vec, KeyedVectors   
word2vec_cbow.wv.save_word2vec_format('word2vec_cbow.bin', binary=True)

# load model
# new_modelword2vec_cbow = Word2Vec.load('word2vec_cbow.bin')
# print(word2vec_cbow)

In [25]:
#SkipGram
start = time.time()
word2vec_skipgram = Word2Vec(sentences,min_count=10, sg=1)
end = time.time()

print("SkipGram Model Training Complete\nTime taken for training is:{:.2f} hrs ".format((end-start)/3600.0))

SkipGram Model Training Complete
Time taken for training is:0.24 hrs 


In [28]:
#Summarize the loaded model
print(word2vec_skipgram)
print("-"*30)

#Summarize vocabulary
words = list(word2vec_skipgram.wv.key_to_index)
print(f"Length of vocabulary: {len(words)}")
print("Printing the first 30 words.")
print(words[:30])
print("-"*30)

#Acess vector for one word
print(f"Length of vector: {len(word2vec_skipgram.wv['film'])}")
print(word2vec_skipgram.wv['film'])
print("-"*30)

#Compute similarity 
print("Similarity between film and drama:",word2vec_skipgram.wv.similarity('film', 'drama'))
print("Similarity between film and tiger:",word2vec_skipgram.wv.similarity('film', 'tiger'))
print("-"*30)

Word2Vec<vocab=114701, vector_size=100, alpha=0.025>
------------------------------
Length of vocabulary: 114701
Printing the first 30 words.
['the', 'of', 'and', 'in', 'to', 'was', 'is', 'for', 'on', 'as', 'by', 'with', 'he', 'at', 'from', 'that', 'his', 'it', 'an', 'also', 'were', 'which', 'are', 'this', 'new', 'first', 'be', 'or', 'had', 'one']
------------------------------
Length of vector: 100
[ 0.25086927  0.34926364  0.32676318  0.34735006 -0.45518517 -0.00476236
 -0.00720366  0.6344674  -0.35669264 -0.27744132 -0.25112808 -0.42255458
  0.08205581  0.19419374 -0.586225    0.23438483  0.50972617  0.04808903
 -0.00883848 -0.49342445  0.12556285  0.19866785  0.21498752 -0.17337641
 -0.2987177   0.15012065 -0.50872517  0.3349206  -0.1557765   0.04826428
 -0.1847301  -0.50891805  0.35367274 -0.21916309 -0.19245264  0.3085634
 -0.00641272 -0.02233523  0.02466761 -0.59897727 -0.21583591  0.51929414
 -0.18047728 -0.30008414  0.44648957  0.09496464  0.01635326 -0.39480928
  0.19230269 -

In [29]:
# save model
# word2vec_skipgram.wv.save_word2vec_format('word2vec_sg.bin', binary=True)

# load model
# new_model_skipgram = Word2Vec.load('model_skipgram.bin')
# print(model_skipgram)

## FastText

In [30]:
#CBOW
start = time.time()
fasttext_cbow = FastText(sentences, sg=0, min_count=10)
end = time.time()

print("FastText CBOW Model Training Complete\nTime taken for training is:{:.2f} hrs ".format((end-start)/3600.0))

FastText CBOW Model Training Complete
Time taken for training is:0.36 hrs 


In [32]:
#Summarize the loaded model
print(fasttext_cbow)
print("-"*30)

#Summarize vocabulary
words = list(fasttext_cbow.wv.key_to_index)
print(f"Length of vocabulary: {len(words)}")
print("Printing the first 30 words.")
print(words[:30])
print("-"*30)

#Acess vector for one word
print(f"Length of vector: {len(fasttext_cbow.wv['film'])}")
print(fasttext_cbow.wv['film'])
print("-"*30)

#Compute similarity 
print("Similarity between film and drama:",fasttext_cbow.wv.similarity('film', 'drama'))
print("Similarity between film and tiger:",fasttext_cbow.wv.similarity('film', 'tiger'))
print("-"*30)

FastText<vocab=114701, vector_size=100, alpha=0.025>
------------------------------
Length of vocabulary: 114701
Printing the first 30 words.
['the', 'of', 'and', 'in', 'to', 'was', 'is', 'for', 'on', 'as', 'by', 'with', 'he', 'at', 'from', 'that', 'his', 'it', 'an', 'also', 'were', 'which', 'are', 'this', 'new', 'first', 'be', 'or', 'had', 'one']
------------------------------
Length of vector: 100
[-1.1028962  -1.1424731   4.2971435   3.093765   -1.9298153  -1.573785
  0.7352989  -0.06200884  4.0854506   2.5948439   6.03334    -1.5191512
 -2.8226764   2.422655   -0.02007237  4.245388    1.9633343  -1.2209948
 -0.225725    5.744732    0.15987073  2.4027047  -4.1807213   2.2000227
 -1.6735848   1.5635186   3.8370767   1.9344715  -2.2298977   2.3143842
  0.30345052 -4.899117    0.57759863 -2.098047   -3.9966545   4.096216
  6.4856396   1.3798109   2.9979613   0.41648358 -0.56078905 -2.5152786
  1.3450183   1.1636597  -4.128712   -4.2033005  -0.20914441  3.489923
 -1.6094836  -1.2102098 

In [33]:
#SkipGram
start = time.time()
fasttext_skipgram = FastText(sentences, sg=1, min_count=10)
end = time.time()

print("FastText SkipGram Model Training Complete\nTime taken for training is:{:.2f} hrs ".format((end-start)/3600.0))

FastText SkipGram Model Training Complete
Time taken for training is:0.56 hrs 


In [35]:
#Summarize the loaded model
print(fasttext_cbow)
print("-"*30)

#Summarize vocabulary
words = list(fasttext_cbow.wv.key_to_index)
print(f"Length of vocabulary: {len(words)}")
print("Printing the first 30 words.")
print(words[:30])
print("-"*30)

#Acess vector for one word
print(f"Length of vector: {len(fasttext_cbow.wv['film'])}")
print(fasttext_cbow.wv['film'])
print("-"*30)

#Compute similarity 
print("Similarity between film and drama:",fasttext_cbow.wv.similarity('film', 'drama'))
print("Similarity between film and tiger:",fasttext_cbow.wv.similarity('film', 'tiger'))
print("-"*30)

FastText<vocab=114701, vector_size=100, alpha=0.025>
------------------------------
Length of vocabulary: 114701
Printing the first 30 words.
['the', 'of', 'and', 'in', 'to', 'was', 'is', 'for', 'on', 'as', 'by', 'with', 'he', 'at', 'from', 'that', 'his', 'it', 'an', 'also', 'were', 'which', 'are', 'this', 'new', 'first', 'be', 'or', 'had', 'one']
------------------------------
Length of vector: 100
[-1.1028962  -1.1424731   4.2971435   3.093765   -1.9298153  -1.573785
  0.7352989  -0.06200884  4.0854506   2.5948439   6.03334    -1.5191512
 -2.8226764   2.422655   -0.02007237  4.245388    1.9633343  -1.2209948
 -0.225725    5.744732    0.15987073  2.4027047  -4.1807213   2.2000227
 -1.6735848   1.5635186   3.8370767   1.9344715  -2.2298977   2.3143842
  0.30345052 -4.899117    0.57759863 -2.098047   -3.9966545   4.096216
  6.4856396   1.3798109   2.9979613   0.41648358 -0.56078905 -2.5152786
  1.3450183   1.1636597  -4.128712   -4.2033005  -0.20914441  3.489923
 -1.6094836  -1.2102098 

In [36]:
#SkipGram
start = time.time()
fasttext_skipgram = FastText(sentences, sg=1, min_count=10)
end = time.time()

print("FastText SkipGram Model Training Complete\nTime taken for training is:{:.2f} hrs ".format((end-start)/3600.0))

FastText SkipGram Model Training Complete
Time taken for training is:0.50 hrs 


In [37]:
#Summarize the loaded model
print(fasttext_skipgram)
print("-"*30)

#Summarize vocabulary
words = list(fasttext_skipgram.wv.key_to_index)
print(f"Length of vocabulary: {len(words)}")
print("Printing the first 30 words.")
print(words[:30])
print("-"*30)

#Acess vector for one word
print(f"Length of vector: {len(fasttext_skipgram.wv['film'])}")
print(fasttext_skipgram.wv['film'])
print("-"*30)

#Compute similarity 
print("Similarity between film and drama:",fasttext_skipgram.wv.similarity('film', 'drama'))
print("Similarity between film and tiger:",fasttext_skipgram.wv.similarity('film', 'tiger'))
print("-"*30)

FastText<vocab=114701, vector_size=100, alpha=0.025>
------------------------------
Length of vocabulary: 114701
Printing the first 30 words.
['the', 'of', 'and', 'in', 'to', 'was', 'is', 'for', 'on', 'as', 'by', 'with', 'he', 'at', 'from', 'that', 'his', 'it', 'an', 'also', 'were', 'which', 'are', 'this', 'new', 'first', 'be', 'or', 'had', 'one']
------------------------------
Length of vector: 100
[ 4.36875135e-01  3.03066134e-01  3.18868384e-02 -2.42166236e-01
 -1.42180741e-01  1.99038222e-01  6.16607890e-02 -2.71127194e-01
 -4.55392361e-01 -1.19545072e-01 -7.79057965e-02 -2.10862458e-02
 -7.12426722e-01  4.58694428e-01 -5.59816360e-01 -5.22863805e-01
 -2.62240857e-01 -5.13919815e-02  2.89212465e-01 -4.99249756e-01
 -7.91127037e-04  4.60370719e-01  1.74896657e-01  3.30697030e-01
  1.16966538e-01  7.87553489e-02  1.82809398e-01 -4.64557379e-01
 -2.84144521e-01 -3.36322375e-02 -4.36833858e-01  7.42739737e-02
  3.63787651e-01 -2.61090606e-01 -4.08437163e-01 -7.42567778e-01
  5.75121343