### Words similarity by GloVe

In order to capture in a quantitative way the nuance necessary to distinguish man from woman, it is necessary for a model to associate more than a single number to the word pair. A natural and simple candidate for an enlarged set of discriminative numbers is the <b>vector difference between the two word vectors</b>.  
  

GloVe is designed in order that such vector differences capture as much as possible the meaning specified by the juxtaposition of two words.

In [None]:
import os
# Show the current working directory; the relative glove_path used below is resolved against it.
os.getcwd()

In [None]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from gensim.models import KeyedVectors
import pandas as pd

In [None]:
# Download the NLTK 'stopwords' and 'punkt' resources.
# NOTE(review): no visible cell uses these resources — confirm they are still needed.
nltk.download('stopwords')
nltk.download('punkt')

glove_path = "../../Data/glove/glove.6B.100d.txt"  # Path to the GloVe file

In [None]:
def load_glove_embeddings(glove_path):
    """Load GloVe vectors from a text file into a dictionary.

    Each non-blank line is expected to hold a token followed by its
    whitespace-separated vector components.

    Args:
        glove_path: Path to the GloVe text file (read as UTF-8).

    Returns:
        dict mapping each token (str) to a 1-D float32 NumPy array.
    """
    embeddings_index = {}
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # A blank or whitespace-only line (e.g. a stray newline at the end
            # of the file) has no token; skip it instead of failing with an
            # IndexError on the first field.
            if not values:
                continue
            word, *components = values
            embeddings_index[word] = np.asarray(components, dtype='float32')
    return embeddings_index

In [None]:
# Read the whole GloVe file into memory as a {word: vector} dictionary.
embeddings_index = load_glove_embeddings(glove_path)

### Words similarity

The semantic similarity of two words is measured by the cosine similarity of their word vectors.

In [None]:
def cosine_similarity(embedding1, embedding2):
    """Return the cosine of the angle between two vectors.

    The value is the dot product divided by the product of the Euclidean norms.
    """
    dot_product = np.dot(embedding1, embedding2)
    norm_product = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    return dot_product / norm_product

def word_similarity(word1, word2, embeddings_index):
    """Cosine similarity between the embeddings of two words.

    Both words are lower-cased before lookup. Returns None when either word
    is missing from embeddings_index.
    """
    first, second = word1.lower(), word2.lower()

    if not (first in embeddings_index and second in embeddings_index):
        return None

    return cosine_similarity(embeddings_index[first], embeddings_index[second])

In [None]:
# Words to analyze with GloVe, grouped as two pairs: (word1, word2) and (word3, word4)
word1, word2 = 'man', 'woman'
word3, word4 = 'king', 'queen'

In [None]:
# Cosine similarity of each word pair (None when a word is out of vocabulary).
similarity_1, similarity_2 = (
    word_similarity(word1, word2, embeddings_index),
    word_similarity(word3, word4, embeddings_index),
)

In [None]:
def _report_similarity(first, second, similarity):
    """Print the cosine similarity of a word pair and the matching angle in degrees.

    Prints a notice instead when similarity is None (a word was not found).
    """
    print(f'Similarity between "{first}"-"{second}": ', end='')
    if similarity is None:
        print('One or both words are not present in the GloVe embeddings.')
        return
    print("{0:0.3f}. ".format(similarity), end='')
    # Rounding can push the cosine marginally outside [-1, 1], where arccos
    # returns NaN, so clip before converting the angle to degrees.
    angle = np.arccos(np.clip(similarity, -1.0, 1.0)) * 180 / np.pi
    print("\u03F4 = {0:0.1f}° ".format(angle))


# Each pair is checked on its own, so that a missing word in one pair neither
# hides nor crashes the report for the other pair.
_report_similarity(word1, word2, similarity_1)
_report_similarity(word3, word4, similarity_2)

## Words analogy task

How can we tell whether the semantic difference between two words is similar to the difference between another pair of words?

This is useful information, which text generation engines exploit.

You need to find the word that completes the following proportion:  
_man_ __is to__ _king_ __as__ _woman_ __is to__ [...]



In [None]:
def pair_similarity(pair1, pair2, embeddings_index):
    """Cosine similarity between the difference vectors of two word pairs.

    Compares (pair1[0] - pair1[1]) with (pair2[0] - pair2[1]) after
    lower-casing all four words. Returns None when any word is missing from
    embeddings_index.
    """
    first_pair = (pair1[0].lower(), pair1[1].lower())
    second_pair = (pair2[0].lower(), pair2[1].lower())

    for pair in (first_pair, second_pair):
        if not all(word in embeddings_index for word in pair):
            return None

    # Unlike word_similarity, the cosine is taken between the two within-pair
    # difference vectors rather than between the raw word vectors.
    diff_first = embeddings_index[first_pair[0]] - embeddings_index[first_pair[1]]
    diff_second = embeddings_index[second_pair[0]] - embeddings_index[second_pair[1]]
    return cosine_similarity(diff_first, diff_second)

In [None]:
# Words to analyze with GloVe, grouped as two pairs: (word1, word2) and (word3, word4)
word1, word2 = 'man', 'king'
word3, word4 = 'woman', 'queen'

In [None]:
pair_1 = [word1, word2]
pair_2 = [word3, word4]
# pair_similarity returns None when any of the four words is out of vocabulary.
pair_sim = pair_similarity(pair_1, pair_2, embeddings_index)
print(f'Similarity between "{word1}" - "{word2}" and "{word3}" - "{word4}": ', end='')
if pair_sim is None:
    print('One or more words are not present in the GloVe embeddings.')
else:
    print("{0:0.3f}. ".format(pair_sim), end='')
    # Clip first: rounding can push the cosine marginally outside [-1, 1],
    # where arccos returns NaN.
    print("\u03F4 = {0:0.1f}° ".format(np.arccos(np.clip(pair_sim, -1.0, 1.0))*180/np.pi))

### Find closest words for any given word

In [None]:
def find_closest_embeddings_cosine(embedding, embeddings_index):
    """Return every word in embeddings_index, most cosine-similar to embedding first."""

    def similarity_to_query(word):
        return cosine_similarity(embeddings_index[word], embedding)

    ranked_words = list(embeddings_index)
    ranked_words.sort(key=similarity_to_query, reverse=True)
    return ranked_words

In [None]:
# Rank the whole vocabulary by cosine similarity to word1's vector and show the first six words.
input_vec = embeddings_index[word1]
print(find_closest_embeddings_cosine(input_vec,embeddings_index)[0:6])

### Complete analogies

In [None]:
def find_analogy(word1, word2, word3, embeddings_index):
    """Rank candidate words x for the analogy word1 : word2 = word3 : x.

    Every vocabulary word x other than the three inputs is scored by the
    cosine similarity between (word1 - word2) and (word3 - x).

    Args:
        word1, word2, word3: Query words; lower-cased before lookup.
        embeddings_index: Mapping from word to embedding vector.

    Returns:
        A pandas DataFrame with columns 'words' and 'cosine_sim', sorted by
        decreasing cosine similarity, or None when any query word is missing
        from embeddings_index.
    """
    word1 = word1.lower()
    word2 = word2.lower()
    word3 = word3.lower()

    if word1 not in embeddings_index or word2 not in embeddings_index or word3 not in embeddings_index:
        return None

    # Loop-invariant: the reference difference is the same for every candidate.
    reference_diff = embeddings_index[word1] - embeddings_index[word2]
    embedding3 = embeddings_index[word3]
    query_words = (word1, word2, word3)

    words_cl = []
    cosine_sim = []
    for w, candidate in embeddings_index.items():
        # Skip the query words so that none of them can be returned as the answer.
        if w in query_words:
            continue
        words_cl.append(w)
        # Equivalent to comparing (word2 - word1) with (x - word3): negating
        # both vectors leaves the cosine unchanged.
        cosine_sim.append(cosine_similarity(reference_diff, embedding3 - candidate))

    results = pd.DataFrame({'words': words_cl, 'cosine_sim': cosine_sim})
    return results.sort_values(by='cosine_sim', ascending=False, ignore_index=True)
    

In [None]:
# Solve man : king = woman : ? and display the top-ranked candidate.
res = find_analogy('man', 'king', 'woman', embeddings_index)
res.head(1)

### Gender bias 

In [None]:
# The difference between the 'woman' and 'man' vectors is used as a direction representing the abstract concept of gender
gender = embeddings_index['woman']-embeddings_index['man']

In [None]:
# The gender vector is expected to correlate negatively with male names and
# positively with female names.
# NOTE(review): 'jhon' may be a typo for 'john' — confirm before changing it.
names = ['daniel','james','william', 'jhon', 'emma', 'alice' ,'sophia','charlotte']
for n in names:
    # Report out-of-vocabulary names instead of stopping with a KeyError.
    if n not in embeddings_index:
        print(n, ' is not present in the GloVe embeddings.')
        continue
    print(n,' = ',cosine_similarity(embeddings_index[n],gender))


In [None]:
# The gender vector is correlated with some professions.
professions = ['engineer','lawyer', 'warrior','doctor', 'nurse', 'receptionist', 'teacher' ,'singer']
for n in professions:
    # Report out-of-vocabulary words instead of stopping with a KeyError.
    if n not in embeddings_index:
        print(n, ' is not present in the GloVe embeddings.')
        continue
    print(n,' = ',cosine_similarity(embeddings_index[n],gender))

### Appendix a: alternative similarity computation 

In [None]:
from scipy import spatial
def find_closest_embeddings(embedding, embeddings_index):
    """Return every word in embeddings_index, nearest to embedding first (Euclidean distance)."""

    def distance_to_query(word):
        return spatial.distance.euclidean(embeddings_index[word], embedding)

    ranked_words = list(embeddings_index)
    ranked_words.sort(key=distance_to_query)
    return ranked_words

In [None]:
triade = ['man','king','woman'] # man:king = woman:?
#triade = ['king','man','queen']
# Rank words by Euclidean distance to (king - man + woman) and show the first six.
# Unlike find_analogy, the input words are not excluded from the ranking.
print(find_closest_embeddings(
    embeddings_index[triade[1]] - embeddings_index[triade[0]] + embeddings_index[triade[2]],embeddings_index)[0:6]) # Top results are similar but not identical to the earlier cosine-based results: the two approaches optimize different functions.

In [None]:
# she:doctor = he:? — rank words by Euclidean distance to (doctor - she + he) and show the first six.
triade = ['she','doctor','he'] 
print(find_closest_embeddings(
    embeddings_index[triade[1]] - embeddings_index[triade[0]] + embeddings_index[triade[2]],embeddings_index)[0:6])

In [None]:
# he:doctor = she:? — rank words by Euclidean distance to (doctor - he + she) and show the first six.
triade = ['he','doctor','she'] 
print(find_closest_embeddings(
    embeddings_index[triade[1]] - embeddings_index[triade[0]] + embeddings_index[triade[2]],embeddings_index)[0:6])

### Appendix b: words distance
Euclidean distance can be used as an alternative to cosine similarity. Note that it is a distance, not a similarity: it is never negative, and smaller values mean closer words.

In [None]:
def word_raw_distance(word1, word2, embeddings_index):
    """Euclidean distance between the embeddings of two words.

    Both words are lower-cased before lookup. Returns None when either word
    is missing from embeddings_index.
    """
    first, second = word1.lower(), word2.lower()

    if not (first in embeddings_index and second in embeddings_index):
        return None

    offset = embeddings_index[first] - embeddings_index[second]
    return np.linalg.norm(offset)

In [None]:
word = 'cat'
other = ['dog', 'bike', 'kitten', 'puppy', 'kite', 'computer', 'neuron']
for w in other:
    dist = word_raw_distance(word, w, embeddings_index) # euclidean distance
    # word_raw_distance returns None for out-of-vocabulary words, which
    # float() cannot convert; report the case instead of raising TypeError.
    if dist is None:
        print(f'{w}: one or both words are not present in the GloVe embeddings.')
        continue
    print(w, float(dist))

In [None]:
# Rank the whole vocabulary by Euclidean distance to the vector of `word` and show the first six words.
input_vec = embeddings_index[word]
find_closest_embeddings(input_vec,embeddings_index)[:6]