# GloVe (Gensim)

For looking at word vectors, we'll use **Gensim**. **Gensim** isn't really a deep learning package. It's a package for word and text similarity modeling, which started with (LDA-style) topic models and grew into SVD and neural word representations. But it's efficient and scalable, and quite widely used. We are going to use **GloVe** embeddings, downloaded from [the GloVe page](https://nlp.stanford.edu/projects/glove/). They're inside [this zip file](https://nlp.stanford.edu/data/glove.6B.zip).

In [1]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Path to the 100-dimensional GloVe vectors (extracted from glove.6B.zip),
# given relative to the notebook's working directory.
glove_file = 'file/glove.6B.100d.txt'
# The file is read as plain text (binary=False) without expecting a word2vec
# header line (no_header=True).
# NOTE(review): no_header presumably needs gensim >= 4.0 — confirm the installed version.
model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

In [2]:
#return the vectors
model['coffee'].shape

(100,)

In [3]:
def open_file(path_to_file):
    """Read a UTF-8 text file and return its lines.

    This is a best-effort reader: problems are reported on stdout instead of
    being raised, so callers always get a list back.

    Args:
        path_to_file: Path of the text file to read.

    Returns:
        A list with one string per line (line endings are kept). The list is
        empty when the file does not exist or cannot be read.
    """
    try:
        # Pin the encoding so the result does not depend on the platform's
        # locale default (e.g. cp1252 on Windows).
        with open(path_to_file, 'r', encoding='utf-8') as file:
            return file.readlines()
    except FileNotFoundError:
        print(f"The file {path_to_file} does not exist.")  # File not found error
    except Exception as e:
        # Deliberately broad: permission or decoding problems are reported, not raised.
        print(f"An error occurred: {e}")

    return []  # Never None, so callers can iterate unconditionally


In [4]:
file_path = "file/word-test.v1.1.txt"

content = open_file(file_path)

semantic = []
syntatic = []

# Questions before the first ':' section header are collected as semantic;
# everything after it is collected as syntactic.
# NOTE(review): this assumes the file holds exactly one ':' header separating
# the two groups — confirm against the actual test file.
current_test = semantic
for sent in content:
    if sent.startswith(':'):
        current_test = syntatic
        continue

    question = sent.strip()
    # Skip blank lines: an empty question would break the evaluation loops,
    # which index four words per line.
    if question:
        current_test.append(question)

semantic

In [6]:
# Evaluate the semantic analogies "a b c d": the model is correct when its top
# answer for (b + c - a) equals d.
sem_total = len(semantic)
sem_correct = 0

for sent in semantic:
    words = sent.lower().split()

    # A malformed line (fewer than four words) counts as a wrong answer
    # instead of raising IndexError.
    if len(words) < 4:
        continue

    try:
        result = model.most_similar(positive=[words[1], words[2]], negative=[words[0]])[0][0]
    except KeyError:
        # Out-of-vocabulary word: count the question as answered incorrectly.
        result = "<UNK>"

    if result == words[3]:
        sem_correct += 1

# Avoid ZeroDivisionError when no questions were loaded (e.g. missing file).
sem_accuracy = sem_correct / sem_total if sem_total else 0.0
print(f"Semantic accuracy: {sem_accuracy:2.2f}")

Semantic accuracy: 0.53


syntatic

In [8]:
# Evaluate the syntactic analogies "a b c d": the model is correct when its top
# answer for (b + c - a) equals d.
syn_total = len(syntatic)
syn_correct = 0
for sent in syntatic:
    words = sent.lower().split()

    # A malformed line (fewer than four words) counts as a wrong answer
    # instead of raising IndexError.
    if len(words) < 4:
        continue

    try:
        result = model.most_similar(positive=[words[1], words[2]], negative=[words[0]])[0][0]
    except KeyError:
        # Out-of-vocabulary word: count the question as answered incorrectly,
        # the same way the semantic evaluation does.
        result = "<UNK>"

    if result == words[3]:
        syn_correct += 1

In [9]:
# Fraction of syntactic analogies answered correctly. Falls back to 0.0 when no
# questions were loaded (e.g. the test file was missing) instead of raising
# ZeroDivisionError.
syn_accuracy = syn_correct / syn_total if syn_total else 0.0
print(f"Syntatic accuracy: {syn_accuracy:2.2f}")

Syntatic accuracy: 0.55


### Similarity

In [10]:
# Load the word-similarity gold standard, keeping one whitespace-trimmed
# record per line of the file.
file_path = "file/wordsim_similarity_goldstandard.txt"
content = open_file(file_path)
sim_data = [record.strip() for record in content]

In [11]:
import numpy as np

# Probe the lookup of a token that may be missing from the vocabulary: fall
# back to an all-zeros vector of the model's dimensionality.
default_vector = np.zeros(model.vector_size)
try:
    result = model.get_vector('123123')
except KeyError:
    # Only a missing key triggers the fallback; other errors should surface.
    result = default_vector

In [27]:
import numpy as np

def compute_similarity(model, test_data):
    """Score one tab-separated word-pair record with the embedding model.

    Args:
        model: Object exposing ``vector_size`` and ``get_vector(word)``, where
            ``get_vector`` raises ``KeyError`` for unknown words.
        test_data: One record of the form ``word1<TAB>word2<TAB>score``.

    Returns:
        A tuple ``(similarity_provided, similarity_model)``: the gold score
        parsed as a float, and the dot product of the two word vectors. The
        dot product is not length-normalised, so it is not a cosine
        similarity. When either word is missing from the model, both vectors
        are replaced by zeros and the model score is 0.0.

    Raises:
        IndexError: If the record has fewer than three tab-separated fields.
        ValueError: If the third field is not a valid float.
    """
    fields = test_data.split("\t")
    first_word, second_word = fields[0].strip(), fields[1].strip()
    similarity_provided = float(fields[2].strip())

    try:
        embed0 = model.get_vector(first_word)
        embed1 = model.get_vector(second_word)
    except KeyError:
        # Only an out-of-vocabulary lookup is treated as "no similarity";
        # any other failure should surface instead of becoming a zero score.
        embed0 = embed1 = np.zeros(model.vector_size)

    similarity_model = embed1 @ embed0

    return similarity_provided, similarity_model

In [28]:
# Score every gold-standard record once, then split the (gold, model) pairs
# into two parallel lists.
scored_pairs = [compute_similarity(model, record) for record in sim_data]
ds_scores = [gold for gold, _ in scored_pairs]
model_scores = [predicted for _, predicted in scored_pairs]

In [29]:
from scipy.stats import spearmanr

# Rank correlation between the gold scores and the model scores; the
# accompanying p-value is not used.
corr, _p_value = spearmanr(ds_scores, model_scores)

print(f"Correlation between the dataset metrics and model scores is {corr:2.2f}.")

Correlation between the dataset metrics and model scores is 0.53.


save model

In [14]:
model.most_similar('obama')

[('barack', 0.937216579914093),
 ('bush', 0.9272855520248413),
 ('clinton', 0.896000325679779),
 ('mccain', 0.8875634074211121),
 ('gore', 0.8000320196151733),
 ('hillary', 0.7933663129806519),
 ('dole', 0.7851963639259338),
 ('rodham', 0.7518897652626038),
 ('romney', 0.7488929629325867),
 ('kerry', 0.7472624182701111)]

In [15]:
model.most_similar('coke')

[('cola', 0.8380194902420044),
 ('pepsi', 0.7717816233634949),
 ('coca', 0.7455084323883057),
 ('beer', 0.6947751045227051),
 ('pepsico', 0.6941383481025696),
 ('bottling', 0.6818284392356873),
 ('soda', 0.6482692360877991),
 ('drink', 0.6394657492637634),
 ('drinks', 0.6368611454963684),
 ('bottlers', 0.6348695755004883)]

In [16]:
model.most_similar('banana')

[('coconut', 0.7097253799438477),
 ('mango', 0.7054824829101562),
 ('bananas', 0.6887733936309814),
 ('potato', 0.6629636883735657),
 ('pineapple', 0.6534532308578491),
 ('fruit', 0.6519854068756104),
 ('peanut', 0.6420576572418213),
 ('pecan', 0.6349173188209534),
 ('cashew', 0.6294420957565308),
 ('papaya', 0.6246591210365295)]

In [17]:
model.most_similar('language')

[('languages', 0.8260655999183655),
 ('word', 0.7464082837104797),
 ('spoken', 0.7381494045257568),
 ('arabic', 0.7318817377090454),
 ('english', 0.7214903235435486),
 ('dialect', 0.6912704110145569),
 ('vocabulary', 0.6908208727836609),
 ('text', 0.685594916343689),
 ('translation', 0.6810674071311951),
 ('words', 0.6715823411941528)]

In [18]:
#multiple meanings....
model.most_similar("plant")

[('plants', 0.8918153047561646),
 ('factory', 0.7068111896514893),
 ('farm', 0.6553632616996765),
 ('facility', 0.6538199782371521),
 ('production', 0.6336487531661987),
 ('produce', 0.6246358156204224),
 ('processing', 0.6155514121055603),
 ('fertilizer', 0.6091734170913696),
 ('waste', 0.6080261468887329),
 ('factories', 0.6015971302986145)]

In [19]:
model.most_similar(negative='banana')

[('shunichi', 0.49618104100227356),
 ('ieronymos', 0.4736502170562744),
 ('pengrowth', 0.4668096601963043),
 ('höss', 0.4636845886707306),
 ('damaskinos', 0.46178486943244934),
 ('yadin', 0.4617375135421753),
 ('hundertwasser', 0.458895742893219),
 ('ncpa', 0.4577339291572571),
 ('maccormac', 0.45661094784736633),
 ('rothfeld', 0.4523947536945343)]

In [20]:
#woman + king - man
result = model.most_similar(positive=['woman', 'king'], negative=['man'])
print("{}: {:.4f}".format(*result[0]))

queen: 0.7699


In [21]:
result = model.most_similar(positive=['italy', 'sushi'], negative=['japan'])
print("{}: {:.4f}".format(*result[0]))

tapas: 0.6232


### Cosine Similarity

We have talked about this in the last class.  Here we can conveniently use `distance` to find the cosine distance between two words. Note that distance = 1 - similarity.

In [22]:
# Distance from "dog" to a related word ("cat") and to an unrelated one ("fruit").
w1, w2, w3 = "dog", "cat", "fruit"
w1_w2_dist, w1_w3_dist = (model.distance(w1, other) for other in (w2, w3))

# dog is much closer to cat than dog is to fruit
w1_w2_dist, w1_w3_dist

(0.1201925277709961, 0.6231490671634674)

In [23]:
# Distance from "happy" to a synonym ("cheerful") and to an antonym ("sad").
w1, w2, w3 = "happy", "cheerful", "sad"
w1_w2_dist, w1_w3_dist = (model.distance(w1, other) for other in (w2, w3))

# $w_1$="happy" is closer to $w_3$="sad" than to $w_2$="cheerful"!
# This kind of similarity does not separate antonyms from synonyms.
w1_w2_dist, w1_w3_dist

(0.4540063142776489, 0.31988638639450073)

### Bias

One very important thing to keep in mind is that NLP models are biased: word embeddings absorb the stereotypes present in the text they were trained on, which is a serious problem.

In [24]:
import pprint

pprint.pprint(model.most_similar(positive=['woman', 'worker'], negative=['man']))

[('nurse', 0.6614274978637695),
 ('employee', 0.6432636976242065),
 ('workers', 0.6231536865234375),
 ('migrant', 0.6021152138710022),
 ('immigrant', 0.5768847465515137),
 ('child', 0.5701467394828796),
 ('nurses', 0.5673795342445374),
 ('pregnant', 0.5660357475280762),
 ('nursing', 0.5648376941680908),
 ('teacher', 0.5609063506126404)]


In [None]:
pprint.pprint(model.most_similar(positive=['man', 'worker'], negative=['woman']))

In [None]:
pprint.pprint(model.most_similar(positive=["woman", "doctor"], negative=["man"]))

### Analogy

In [25]:
def analogy(x1, x2, y1):
    """Complete the analogy "x1 is to x2 as y1 is to ?".

    Queries the notebook-level ``model`` with ``y1`` and ``x2`` as positive
    words and ``x1`` as the negative word, and returns the word of the
    top-ranked result.
    """
    candidates = model.most_similar(positive=[y1, x2], negative=[x1])
    best_word, _score = candidates[0]
    return best_word

In [26]:
analogy('japan', 'japanese', 'australia')

'australian'

In [None]:
analogy('japan', 'sushi', 'italy')

In [None]:
analogy('australia', 'beer', 'france')

In [None]:
analogy('obama', 'clinton', 'reagan')

In [None]:
analogy('tall', 'tallest', 'long')

In [None]:
analogy('good', 'fantastic', 'bad')

In [None]:
analogy('bird', 'fly', 'human')

In [None]:
#which word in the list does not belong
print(model.doesnt_match("coke pepsi sprite water".split()))

### Visualization

In [None]:
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

# Words to plot, grouped by theme (drinks, food, animals, countries, school).
# Each word appears once: a duplicate would be plotted twice and would count
# twice when fitting the PCA.
words = ['coffee', 'tea', 'beer', 'wine', 'brandy', 'rum', 'champagne', 'water',
        'spaghetti', 'borscht', 'hamburger', 'pizza', 'falafel', 'sushi', 'meatballs',
        'dog', 'horse', 'cat', 'monkey', 'parrot', 'koala', 'lizard',
        'frog', 'toad', 'ape', 'kangaroo', 'wombat', 'wolf',
        'france', 'germany', 'hungary', 'luxembourg', 'australia', 'fiji', 'china',
        'homework', 'assignment', 'problem', 'exam', 'test', 'class',
        'school', 'college', 'university', 'institute']

# One row per word, looked up in the loaded embedding model.
word_vectors = np.array([model[w] for w in words])

# Project the embeddings onto their first two principal components; only
# these two are needed for the scatter plot.
twodim = PCA(n_components=2).fit_transform(word_vectors)

plt.figure(figsize=(10,10))
plt.scatter(twodim[:,0], twodim[:,1], edgecolors='k', c='r')
for word, (x,y) in zip(words, twodim):
    # Offset each label slightly so it does not sit on top of its marker.
    plt.text(x+0.05, y+0.05, word)
plt.show()