# Word Embeddings
### Dr. Sal Barbosa, Department of Computer Science, Middle Tennessee State University

In [1]:
import gensim
from gensim.models import Word2Vec
from nltk.corpus import brown
from time import time

### Warning: Demonstration code is for information/home use. <u>Do not</u> load vectors onto Azure Jupyter Hub

In [2]:
# If memory-limited, locate the pruned word2vec sample in NLTK
#from nltk.data import find
#word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))

### Load word2vec pre-trained embeddings

In [3]:
#start = time()
#model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False) # load in word2vec format
word2vec_file = "/home/sbarbosa/data/word-embeddings/GoogleNews-vectors-negative300-SLIM.bin"
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_file, binary=True) # load in word2vec format
#print(time()-start)

### Load Glove pre-trained embeddings

In [4]:
glove_file = "/home/sbarbosa/data/word-embeddings/glove.6B/glove.6B.300d.txt"
glv_model = gensim.models.KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

### Load fastText pre-trained embeddings

In [5]:
fastxt_file = "/home/sbarbosa/data/word-embeddings/fasttext-wiki-news-300d-1M.vec"
fastxt_model = gensim.models.KeyedVectors.load_word2vec_format(fastxt_file)

In [6]:
# Get each model's vocabulary
w2v_vocab = set(w2v_model.index_to_key)
glv_vocab = set(glv_model.index_to_key)
ft_vocab = set(fastxt_model.index_to_key)

### Number of words in each model

In [7]:
print(len(w2v_vocab), len(glv_vocab), len(ft_vocab), sep='\t\t')

299567		400000		999994


### Vector dimensions

In [8]:
# Note: words may have to be converted between lower and title case (and possibly upper case?) to see if they exist in the vocabulary
print(len(w2v_model['avocado']), len(glv_model['avocado']), len(fastxt_model['avocado']), sep='\t\t') # dimensions of vector representing word 'avocado' (and all other words)

300		300		300


### Most similar to target word

In [9]:
tgt_word = 'wagon'
w2v_most_sim = w2v_model.most_similar(tgt_word, topn=15)
glv_most_sim = glv_model.most_similar(tgt_word, topn=15)
ft_most_sim = fastxt_model.most_similar(tgt_word, topn=15)
for i in range(15):
    print(f"{str(w2v_most_sim[i]):<42}{str(glv_most_sim[i]):<42}{str(ft_most_sim[i]):<42}")

('wagons', 0.6923391819000244)            ('wagons', 0.7252909541130066)            ('wagons', 0.7765060663223267)            
('haywagon', 0.5886642336845398)          ('sedan', 0.5901995897293091)             ('waggon', 0.6985791921615601)            
('waggon', 0.5873774290084839)            ('carriage', 0.5468588471412659)          ('Wagon', 0.6592865586280823)             
('hayrack', 0.5782635807991028)           ('suv', 0.5423933267593384)               ('truck', 0.6552196741104126)             
('buckboard', 0.5725746750831604)         ('sedans', 0.5229743719100952)            ('buckboard', 0.6439283490180969)         
('caboose', 0.5667509436607361)           ('jeep', 0.5170666575431824)              ('cart', 0.6253586411476135)              
('tractor', 0.5646648406982422)           ('horse-drawn', 0.5018145442008972)       ('dray', 0.6129499077796936)              
('truck', 0.5361797213554382)             ('truck', 0.49931150674819946)            ('sedan', 0.610536754131317

### Most similar to target word (using cosine similarity) 

In [10]:
# most similar using cosine similarity
tn = 25
w2v_most_sim_cos = w2v_model.most_similar_cosmul('wagon', topn=tn)
glv_most_sim_cos = glv_model.most_similar_cosmul('wagon', topn=tn)
ft_most_sim_cos = fastxt_model.most_similar_cosmul('wagon', topn=tn)
for i in range(tn):
    print(f"{str(w2v_most_sim_cos[i]):<42}{str(glv_most_sim_cos[i]):<42}{str(ft_most_sim_cos[i]):<42}")

('wagons', 0.8461688160896301)            ('wagons', 0.8626446723937988)            ('wagons', 0.8882521986961365)            
('haywagon', 0.7943313717842102)          ('sedan', 0.7950990200042725)             ('waggon', 0.8492887616157532)            
('waggon', 0.7936879992485046)            ('carriage', 0.7734286785125732)          ('Wagon', 0.8296424746513367)             
('hayrack', 0.7891311049461365)           ('suv', 0.7711959481239319)               ('truck', 0.8276090621948242)             
('buckboard', 0.7862865328788757)         ('sedans', 0.7614864706993103)            ('buckboard', 0.821963369846344)          
('caboose', 0.7833747267723083)           ('jeep', 0.7585326433181763)              ('cart', 0.8126785159111023)              
('tractor', 0.782331645488739)            ('horse-drawn', 0.7509065866470337)       ('dray', 0.8064741492271423)              
('truck', 0.7680891752243042)             ('truck', 0.74965500831604)               ('sedan', 0.805267632007598

### Impact of pluralization and capitalization - Note: Glove is lowercase only)

In [11]:
# [Capitalization and] Pluralization can lead to different most_similar results
for w in ('room', 'Room', 'rooms'):
    w2v_most_sim = w2v_model.most_similar(w, topn=10)
    glv_most_sim = glv_model.most_similar(w.lower(), topn=10)
    ft_most_sim = fastxt_model.most_similar(w, topn=10)
    print(w)
    for i in range(10):
        print(f"{str(w2v_most_sim[i]):<42}{str(glv_most_sim[i]):<42}{str(ft_most_sim[i]):<42}")
        #print(f"{str(w2v_most_sim[i]):<42}{str(ft_most_sim[i]):<42}")
    print('-'*50)

room
('rooms', 0.7605787515640259)             ('rooms', 0.7976906299591064)             ('rooms', 0.8117178678512573)             
('upstairs', 0.6226500868797302)          ('upstairs', 0.6701579093933105)          ('hallway', 0.7001889944076538)           
('hallway', 0.6086892485618591)           ('floor', 0.6689373850822449)             ('room-', 0.6919785737991333)             
('downstairs', 0.5930778384208679)        ('bedroom', 0.6352986693382263)           ('locker', 0.6848980188369751)            
('bathroom', 0.5513426661491394)          ('locker', 0.6267264485359192)            ('bedroom', 0.6742653846740723)           
('kitchenette', 0.5502403974533081)       ('dining', 0.620658278465271)             ('Room', 0.660754919052124)               
('basement', 0.5274614095687866)          ('hallway', 0.6061124801635742)           ('bathroom', 0.6546382904052734)          
('lounge', 0.5249817967414856)            ('bathroom', 0.5993543267250061)          ('house', 0.6517252922

### Similarity between two words

In [12]:
# cosine similarity between two words
print(w2v_model.similarity('bolt','bread'), glv_model.similarity('bolt','bread'), fastxt_model.similarity('bolt','bread'), sep='\t\t')

0.096661426		0.06928948		0.35737234


### Similarity between sets of words (e.g., tokenized sentences)

In [13]:
# cosine similarity between two sets of words
print(w2v_model.n_similarity(['Elena','bought','a','book','today'], ['New','York','is','a','very','large','city']),
      glv_model.n_similarity(['Elena','bought','a','book','today'], ['New','York','is','a','very','large','city']),
      fastxt_model.n_similarity(['Elena','bought','a','book','today'], ['New','York','is','a','very','large','city']),sep='\t\t')

0.23679209		0.7455188		0.7492945


### Vector similarity

In [14]:
#model.similar_by_word('roadster',topn=15) does not return self but is otherwise similar
w2v_sim_by_vec = w2v_model.similar_by_vector(w2v_model['roadster'],topn=15)
glv_sim_by_vec = glv_model.similar_by_vector(glv_model['roadster'],topn=15)
ft_sim_by_vec = fastxt_model.similar_by_vector(fastxt_model['roadster'],topn=15)
for i in range(15):
    print(f"{str(w2v_sim_by_vec[i]):<42}{str(glv_sim_by_vec[i]):<42}{str(ft_sim_by_vec[i]):<42}")

('roadster', 0.9999998211860657)          ('roadster', 1.0)                         ('roadster', 1.0)                         
('coupe', 0.8324609994888306)             ('coupe', 0.645466685295105)              ('Roadster', 0.7570767998695374)          
('Roadster', 0.8124524354934692)          ('roadsters', 0.626401960849762)          ('roadsters', 0.7363091111183167)         
('supercar', 0.7651873230934143)          ('miata', 0.5967004299163818)             ('two-seater', 0.7275443077087402)        
('ragtop', 0.7415428161621094)            ('z3', 0.5841705799102783)                ('coupe', 0.7113063335418701)             
('coupes', 0.7365368008613586)            ('z4', 0.5711422562599182)                ('sportscar', 0.7004302740097046)         
('cabriolet', 0.7353311777114868)         ('hatchback', 0.5568931102752686)         ('tourer', 0.6802168488502502)            
('Coupe', 0.7215216755867004)             ('slk', 0.5495831370353699)               ('sedan', 0.677300751209259

### One of these things is not like the others

In [15]:
# select most "unlike" item
print(w2v_model.doesnt_match(['guitar', 'trumpet', 'tuba', 'flute']),
      glv_model.doesnt_match(['guitar', 'trumpet', 'tuba', 'flute']), 
      fastxt_model.doesnt_match(['guitar', 'trumpet', 'tuba', 'flute']), sep='\t\t')

guitar		tuba		guitar


### Extracting analogies

In [16]:
# most_similar takes collections of vectors to be added (positive) or subtracted (negative) 
# Can use to specify an analogy: Read as "Paris is to France as Madrid is to ?""
w2v_analogy = w2v_model.most_similar(positive=['France', 'Madrid'], negative=['Paris'], topn=15)
glv_analogy = glv_model.most_similar(positive=['france', 'madrid'], negative=['paris'], topn=15)
ft_analogy = fastxt_model.most_similar(positive=['france', 'madrid'], negative=['paris'], topn=15)
for i in range(15):
    print(f"{str(w2v_analogy[i]):<42}{str(glv_analogy[i]):<42}{str(ft_analogy[i]):<42}")

('Spain', 0.7776164412498474)             ('spain', 0.7692385315895081)             ('spain', 0.6449561715126038)             
('Portugal', 0.6343989372253418)          ('valencia', 0.5684595108032227)          ('barcelona', 0.6360149383544922)         
('Spaniards', 0.6030073165893555)         ('spanish', 0.5646007061004639)           ('atletico', 0.6198720335960388)          
('Barcelona', 0.5761322379112244)         ('barcelona', 0.5638340711593628)         ('barca', 0.6192523241043091)             
('Zapatero', 0.572990894317627)           ('argentina', 0.539618730545044)          ('bayern', 0.6114077568054199)            
('Catalan', 0.5721346139907837)           ('portugal', 0.5387835502624512)          ('uruguay', 0.6079416871070862)           
('Catalonia', 0.5707486867904663)         ('sevilla', 0.5046039819717407)           ('england', 0.5941749811172485)           
('Argentina', 0.5653747916221619)         ('italy', 0.5028786659240723)             ('athletico', 0.59402942657

### Training a custom Word2Vec model

In [17]:
# How-to train custom Word2Vec model (this is trained with the Brown corpus)
custommodel = Word2Vec(list(brown.sents()), vector_size=300, window=5, min_count=20)
cust_vocab = set(custommodel.wv.index_to_key)
print(len(cust_vocab))

5164


In [18]:
# To supress deprecation error in custom models or those loaded from saved files, prefix method call with "wv.""
# so to call the most_similar method, use
cst_most_sim = custommodel.wv.most_similar('wagon',topn=15)  # Note that the answers are much different from those of the pre-trained word2vec model
w2v_most_sim = w2v_model.most_similar('wagon', topn=15)
glv_most_sim = glv_model.most_similar('wagon', topn=15)
ft_most_sim = fastxt_model.most_similar('wagon', topn=15)
for i in range(15):
    print(f"{str(w2v_most_sim[i]):<40}{str(glv_most_sim[i]):<40}{str(ft_most_sim[i]):<40}")

('wagons', 0.6923391819000244)          ('wagons', 0.7252909541130066)          ('wagons', 0.7765060663223267)          
('haywagon', 0.5886642336845398)        ('sedan', 0.5901995897293091)           ('waggon', 0.6985791921615601)          
('waggon', 0.5873774290084839)          ('carriage', 0.5468588471412659)        ('Wagon', 0.6592865586280823)           
('hayrack', 0.5782635807991028)         ('suv', 0.5423933267593384)             ('truck', 0.6552196741104126)           
('buckboard', 0.5725746750831604)       ('sedans', 0.5229743719100952)          ('buckboard', 0.6439283490180969)       
('caboose', 0.5667509436607361)         ('jeep', 0.5170666575431824)            ('cart', 0.6253586411476135)            
('tractor', 0.5646648406982422)         ('horse-drawn', 0.5018145442008972)     ('dray', 0.6129499077796936)            
('truck', 0.5361797213554382)           ('truck', 0.49931150674819946)          ('sedan', 0.6105367541313171)           
('Wagons', 0.5292890071868896)  

In [19]:
for i in range(15):
    print(f"{str(cst_most_sim[i]):<40}")

('seat', 0.9624314904212952)            
('tent', 0.9600862860679626)            
('shoulders', 0.9581727981567383)       
('flying', 0.9530327320098877)          
('driving', 0.9519249796867371)         
('tree', 0.9506455659866333)            
('gathered', 0.9504652619361877)        
('knee', 0.949884831905365)             
('stretched', 0.9496535658836365)       
('valley', 0.9487352967262268)          
('holding', 0.948331892490387)          
('rifle', 0.9481486082077026)           
('coat', 0.9473273158073425)            
('grabbed', 0.9471802711486816)         
('wind', 0.9471650719642639)            


In [20]:
cst_sims = set([itm[0] for itm in cst_most_sim])
#cst_sims.add('wagon')
#cst_sims.add('Wagon')

In [21]:
cst_sims

{'coat',
 'driving',
 'flying',
 'gathered',
 'grabbed',
 'holding',
 'knee',
 'rifle',
 'seat',
 'shoulders',
 'stretched',
 'tent',
 'tree',
 'valley',
 'wind'}

In [22]:
len(brown.sents())

57340

In [23]:
# It is instructive to see the sentences that actually contain the search term and its most similars
for sent in list(brown.sents()):
    if ('wagon' in sent or 'Wagon' in sent) and set(sent) & cst_sims:
        print(sent)
        print()

['When', 'the', 'two', 'cars', 'were', 'equidistant', 'from', 'him', ',', 'the', 'station', 'wagon', 'started', 'up', 'again', 'and', 'the', 'Ford', 'gathered', 'speed', '.']

['Leaving', 'his', 'rifle', 'in', 'the', 'wagon', ',', 'Tilghman', 'walked', 'up', 'to', 'the', 'door', 'and', 'hammered', 'on', 'it', '.']

