- GloVe vectors are distributed as plain text and were trained on a relatively smaller vocabulary than the pretrained word2vec vectors.
- Advantage: the file can be parsed with plain Python.

Link - [Stanford GloVe download (glove.6B.zip)](https://nlp.stanford.edu/data/glove.6B.zip)

In [None]:
# Shell escape (not Python): asks JupyterLab to disable the
# @jupyterlab/cell-toolbar-extension extension.
!jupyter labextension disable @jupyterlab/cell-toolbar-extension

In [4]:
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
import os

In [3]:
def dist1(a, b) :
    """Return the Euclidean (L2) distance between the vectors ``a`` and ``b``."""
    difference = a - b
    return np.linalg.norm(difference)

def dist2(a, b) :
    """Return the cosine distance between ``a`` and ``b``.

    Computed as one minus the cosine similarity, i.e. the dot product of the
    two vectors divided by the product of their L2 norms.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    cosine_similarity = a.dot(b) / norm_product
    return 1 - cosine_similarity

In [17]:
# Load the 50-dimensional GloVe file into a dict mapping word -> vector.
print('Reading Embeddings...')
glove_embedding = {}
with open(os.path.join('data', 'glove.6B', 'glove.6B.50d.txt'), encoding='utf-8') as f:
    for line in f :
        # Each line is whitespace-separated: the first token is the word,
        # every remaining token is one component of its vector.
        split = line.split()
        # The string components are converted to a float32 NumPy array.
        glove_embedding[split[0]] = np.fromiter(split[1:], dtype=np.float32)

# Report how many distinct words were loaded.
print(f'Total Embeddings found : {len(glove_embedding)}')

Reading Embeddings...
Total Embeddings found : 400000


In [29]:
# Compare both distance functions on the word pair "king" / "queen".
print(f'''The Euclidean distance : {dist1(glove_embedding["king"], glove_embedding["queen"])} 
The Cosine distance : {dist2(glove_embedding["king"], glove_embedding["queen"])}''')

The Euclidean distance : 3.4777565002441406 
The Cosine distance : 0.21609562635421753


In [30]:
# Compare both distance functions on the word pair "king" / "water".
print(f'''The Euclidean distance : {dist1(glove_embedding["king"], glove_embedding["water"])} 
The Cosine distance : {dist2(glove_embedding["king"], glove_embedding["water"])}''')

The Euclidean distance : 7.025454521179199 
The Cosine distance : 0.7648409008979797


In [31]:
# Demo only: shows what pairwise_distances returns on a tiny hand-made input
# before it is used on the embeddings.
# Three 3-dimensional points, given as a plain list of lists.
data_points = [[1, 2, 3], 
               [4, 5, 6], 
               [7, 8, 9]]

# Only one collection is passed, so every point is compared with every point
# of that same collection, using the Euclidean metric.
distances = pairwise_distances(data_points, metric='euclidean')

# Entry [i][j] is the distance between point i and point j.
print(distances)


[[ 0.          5.19615242 10.39230485]
 [ 5.19615242  0.          5.19615242]
 [10.39230485  5.19615242  0.        ]]


In [128]:
def find_analogies(w1, w2, w3, metric = 'euclidean') :
    """Print the word that best completes the analogy ``w1 : w2 :: ? : w3``.

    The target point is ``glove_embedding[w1] - glove_embedding[w2] +
    glove_embedding[w3]``. The answer is the vocabulary word, other than the
    three query words, whose vector is closest to that point under ``metric``.

    Parameters
    ----------
    w1, w2, w3 : str
        Query words; each must be a key of ``glove_embedding``. The same word
        may be passed more than once.
    metric : str
        Metric name forwarded to ``pairwise_distances``.

    Raises
    ------
    KeyError
        If any query word is missing from ``glove_embedding``.
    """
    #w1 : w2 :: ? : w3
    vector = glove_embedding[w1] + glove_embedding[w3] - glove_embedding[w2]

    # The query words must not be offered as answers. A set tolerates repeated
    # query words, and filtering while iterating keeps the vocabulary order
    # without copying the whole embedding dict.
    excluded = {w1, w2, w3}
    candidate_words = []
    candidate_vectors = []
    for word, word_vector in glove_embedding.items() :
        if word not in excluded :
            candidate_words.append(word)
            candidate_vectors.append(word_vector)

    # pairwise_distances returns a (1, n_candidates) matrix; flatten it to 1-D.
    distance_vector = pairwise_distances(X = [vector], Y = candidate_vectors, metric=metric).reshape(len(candidate_words))
    r = candidate_words[np.argmin(distance_vector)]
    print(f'{w1} : {w2} :: {r} : {w3}')
    
def nearest_neighbors(w, n = 3, metric = 'euclidean') :
    """Print the ``n`` vocabulary words closest to ``w``, nearest first.

    Parameters
    ----------
    w : str
        Query word; must be a key of ``glove_embedding``.
    n : int
        Number of neighbours to print (expected to be non-negative).
    metric : str
        Metric name forwarded to ``pairwise_distances``.

    Raises
    ------
    KeyError
        If ``w`` is missing from ``glove_embedding``.
    """
    vector = glove_embedding[w]
    keys = list(glove_embedding.keys())
    distance_vector = pairwise_distances(X = [vector], Y = list(glove_embedding.values()), metric = metric).reshape(len(keys))

    # Fetch one extra candidate, then drop the query word by key. Skipping
    # "whatever is ranked first" is not safe: if another word ties with the
    # query at the minimum distance, the query may not be in position 0.
    closest = distance_vector.argsort()[:n + 1]
    idxs = [idx for idx in closest if keys[idx] != w][:n]

    print(f'Neighbours of {w} : ')
    for idx in idxs :
        print(f'{keys[idx]}')

In [118]:
# Solve king : man :: ? : woman, i.e. search near king - man + woman.
find_analogies('king', 'man', 'woman')

king : man :: queen : woman


In [119]:
# Solve france : paris :: ? : london, i.e. search near france - paris + london.
find_analogies('france', 'paris', 'london')

france : paris :: britain : london


In [120]:
# Solve france : paris :: ? : rome, i.e. search near france - paris + rome.
find_analogies('france', 'paris', 'rome')

france : paris :: italy : rome


In [121]:
# Solve france : french :: ? : english, i.e. search near france - french + english.
find_analogies('france', 'french', 'english')

france : french :: england : english


In [122]:
# Solve japan : japanese :: ? : chinese, i.e. search near japan - japanese + chinese.
find_analogies('japan', 'japanese', 'chinese')

japan : japanese :: china : chinese


In [129]:
# Ten closest words to "king" under the default euclidean metric.
nearest_neighbors('king', n = 10)

Neighbours of king : 
prince
queen
uncle
ii
grandson
brother
kingdom
son
nephew
elder


In [134]:
# Ten closest words to "king" using cosine distance instead of the default euclidean.
nearest_neighbors('king', n = 10, metric='cosine')

Neighbours of king : 
prince
queen
ii
emperor
son
uncle
kingdom
throne
brother
ruler


In [130]:
# Ten closest words to "queen" under the default euclidean metric.
nearest_neighbors('queen', n= 10)

Neighbours of queen : 
princess
lady
elizabeth
prince
coronation
king
consort
victoria
crown
bride


In [136]:
# Ten closest words to "france" under the default euclidean metric.
nearest_neighbors('france', n=10)

Neighbours of france : 
french
belgium
paris
netherlands
spain
italy
switzerland
germany
europe
belgian
