# GloVE

Let's work on implementation of GloVE.

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

## 1. Load Data

In [2]:
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Ekkar\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [3]:
from nltk.corpus import brown

brown.categories()
corpus = brown.sents(categories="news")
corpus

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [4]:
#get word sequences and unique words
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(corpus)))
# vocab

In [111]:
#numericalization
word2index = {w: i for i, w in enumerate(vocab)}
# print(word2index)

In [6]:
#vocab size
voc_size = len(vocab)
print(voc_size)

14394


In [7]:
#append UNK
vocab.append('<UNK>')

In [8]:
vocab[:5]

['Clearwater', 'rush', 'Indonesia', 'association', 'slashed']

In [9]:
word2index['<UNK>'] = 0

In [10]:
#just in case we need to use
index2word = {v:k for k, v in word2index.items()} 

## 2. Build Co-occurence Matrix X

Here, we need to count the co-occurence of two words given some window size.  We gonna use window size of 1.

In [11]:
from collections import Counter

X_i = Counter(flatten(corpus)) # X_i
X_i

Counter({'The': 806,
         'Fulton': 14,
         'County': 35,
         'Grand': 6,
         'Jury': 2,
         'said': 402,
         'Friday': 41,
         'an': 300,
         'investigation': 9,
         'of': 2849,
         "Atlanta's": 4,
         'recent': 20,
         'primary': 17,
         'election': 38,
         'produced': 6,
         '``': 732,
         'no': 109,
         'evidence': 17,
         "''": 702,
         'that': 802,
         'any': 90,
         'irregularities': 3,
         'took': 47,
         'place': 25,
         '.': 4030,
         'jury': 44,
         'further': 16,
         'in': 1893,
         'term-end': 1,
         'presentments': 1,
         'the': 5580,
         'City': 44,
         'Executive': 6,
         'Committee': 37,
         ',': 5188,
         'which': 244,
         'had': 279,
         'over-all': 2,
         'charge': 17,
         'deserves': 3,
         'praise': 2,
         'and': 2146,
         'thanks': 6,
         'Atlanta': 14,

In [12]:
# Make skip gram of one size window
window_size = 2
# Make skip gram of one size window
skip_grams = []
# loop each word sequence
# we starts from 1 because 0 has no context
# we stop at second last for the same reason
for sent in corpus:
    for i in range(window_size, len(sent) - window_size):
        target = sent[i]

        context = []
        for j in range(1, window_size):
            context.append(sent[i - j])
            context.append(sent[i + j])

        # for each outside word, append to a skip_grams
        for w in context:
            skip_grams.append((target, w))

skip_grams

[('County', 'Fulton'),
 ('County', 'Grand'),
 ('Grand', 'County'),
 ('Grand', 'Jury'),
 ('Jury', 'Grand'),
 ('Jury', 'said'),
 ('said', 'Jury'),
 ('said', 'Friday'),
 ('Friday', 'said'),
 ('Friday', 'an'),
 ('an', 'Friday'),
 ('an', 'investigation'),
 ('investigation', 'an'),
 ('investigation', 'of'),
 ('of', 'investigation'),
 ('of', "Atlanta's"),
 ("Atlanta's", 'of'),
 ("Atlanta's", 'recent'),
 ('recent', "Atlanta's"),
 ('recent', 'primary'),
 ('primary', 'recent'),
 ('primary', 'election'),
 ('election', 'primary'),
 ('election', 'produced'),
 ('produced', 'election'),
 ('produced', '``'),
 ('``', 'produced'),
 ('``', 'no'),
 ('no', '``'),
 ('no', 'evidence'),
 ('evidence', 'no'),
 ('evidence', "''"),
 ("''", 'evidence'),
 ("''", 'that'),
 ('that', "''"),
 ('that', 'any'),
 ('any', 'that'),
 ('any', 'irregularities'),
 ('irregularities', 'any'),
 ('irregularities', 'took'),
 ('took', 'irregularities'),
 ('took', 'place'),
 ('further', 'jury'),
 ('further', 'said'),
 ('said', 'furthe

In [13]:
X_ik_skipgram = Counter(skip_grams) # Co-occurece in window size 2
X_ik_skipgram

Counter({('County', 'Fulton'): 6,
         ('County', 'Grand'): 1,
         ('Grand', 'County'): 1,
         ('Grand', 'Jury'): 1,
         ('Jury', 'Grand'): 2,
         ('Jury', 'said'): 1,
         ('said', 'Jury'): 1,
         ('said', 'Friday'): 4,
         ('Friday', 'said'): 3,
         ('Friday', 'an'): 1,
         ('an', 'Friday'): 1,
         ('an', 'investigation'): 3,
         ('investigation', 'an'): 4,
         ('investigation', 'of'): 4,
         ('of', 'investigation'): 5,
         ('of', "Atlanta's"): 1,
         ("Atlanta's", 'of'): 1,
         ("Atlanta's", 'recent'): 1,
         ('recent', "Atlanta's"): 1,
         ('recent', 'primary'): 1,
         ('primary', 'recent'): 1,
         ('primary', 'election'): 2,
         ('election', 'primary'): 2,
         ('election', 'produced'): 1,
         ('produced', 'election'): 1,
         ('produced', '``'): 1,
         ('``', 'produced'): 1,
         ('``', 'no'): 1,
         ('no', '``'): 1,
         ('no', 'evidence'): 2

### Weighting function

GloVe includes a weighting function to scale down too frequent words.

<img src = "figures/glove_weighting_func.png" width=400>

In [14]:
#simply a normalized function...don't worry too much
def weighting(w_i, w_j, X_ik):
        
    #check whether the co-occurrences exist between these two words
    try:
        x_ij = X_ik[(w_i, w_j)]
    except:
        x_ij = 1  #if does not exist, set it to 1
                
    x_max = 100 #100 # fixed in paper  #cannot exceed 100 counts
    alpha = 0.75
    
    #if co-occurrence does not exceed 100, scale it based on some alpha
    if x_ij < x_max:
        result = (x_ij/x_max)**alpha  #scale it
    else:
        result = 1  #if is greater than max, set it to 1 maximum
    
    return result

In [15]:
from itertools import combinations_with_replacement

X_ik = {}  #for keeping the co-occurences
weighting_dic = {} #scaling the percentage of sampling

for bigram in combinations_with_replacement(vocab, 2):
    if X_ik_skipgram.get(bigram) is not None:  #matches 
        co_occer = X_ik_skipgram[bigram]  #get the count from what we already counted
        X_ik[bigram] = co_occer + 1 # + 1 for stability issue
        X_ik[(bigram[1],bigram[0])] = co_occer+1   #count also for the opposite
    else:
        pass
        
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

print(f"{X_ik=}")
print(f"{weighting_dic=}")


Exception in callback BaseAsyncIOLoop._handle_events(1316, 1)
handle: <Handle BaseAsyncIOLoop._handle_events(1316, 1)>
Traceback (most recent call last):
  File "c:\Users\Ekkar\anaconda3\lib\asyncio\events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "c:\Users\Ekkar\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 189, in _handle_events
    handler_func(fileobj, events)
  File "c:\Users\Ekkar\anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 577, in _handle_events
    self._handle_recv()
  File "c:\Users\Ekkar\anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 606, in _handle_recv
    self._run_callback(callback, msg)
  File "c:\Users\Ekkar\anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 556, in _run_callback
    callback(*args, **kwargs)
  File "c:\Users\Ekkar\anaconda3\lib\site-packages\ipykernel\iostream.py", line 120, in _handle_event
    event_f()
  File "c:\Users\Ekkar\anaconda3\lib\site-package

## 3. Prepare train data

In [16]:
for c in corpus:
    print(c)

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.']
['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'Mayor-nominate', 'Ivan', 'Allen', 'Jr.', '.']
['``', 'Only', 'a', 'relative', 'handful', 'of', 'such', 'reports

In [17]:
import math

def random_batch(batch_size, word_sequence, skip_grams_id, X_ik, weighting_dic):
    
    #convert to id since our skip_grams is word, not yet id
    # skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    random_inputs = []
    random_labels = []
    random_coocs  = []
    random_weightings = []
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False) #randomly pick without replacement
        
    for i in random_index:
        random_inputs.append([skip_grams_id[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams_id[i][1]])  # context word, e.g., 3
        
        #get cooc
        pair = skip_grams[i]
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
        
        #get weighting
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
                    
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

### Testing the method

In [18]:
weighting_dic

{('Clearwater', 'Clearwater'): 0.03162277660168379,
 ('Clearwater', 'rush'): 0.03162277660168379,
 ('rush', 'Clearwater'): 0.03162277660168379,
 ('Clearwater', 'Indonesia'): 0.03162277660168379,
 ('Indonesia', 'Clearwater'): 0.03162277660168379,
 ('Clearwater', 'association'): 0.03162277660168379,
 ('association', 'Clearwater'): 0.03162277660168379,
 ('Clearwater', 'slashed'): 0.03162277660168379,
 ('slashed', 'Clearwater'): 0.03162277660168379,
 ('Clearwater', "writers'"): 0.03162277660168379,
 ("writers'", 'Clearwater'): 0.03162277660168379,
 ('Clearwater', 'miscount'): 0.03162277660168379,
 ('miscount', 'Clearwater'): 0.03162277660168379,
 ('Clearwater', 'forecasts'): 0.03162277660168379,
 ('forecasts', 'Clearwater'): 0.03162277660168379,
 ('Clearwater', 'Griffith-Jones'): 0.03162277660168379,
 ('Griffith-Jones', 'Clearwater'): 0.03162277660168379,
 ('Clearwater', 'bumper'): 0.03162277660168379,
 ('bumper', 'Clearwater'): 0.03162277660168379,
 ('Clearwater', 'sacker'): 0.03162277660

In [19]:
skip_grams

[('County', 'Fulton'),
 ('County', 'Grand'),
 ('Grand', 'County'),
 ('Grand', 'Jury'),
 ('Jury', 'Grand'),
 ('Jury', 'said'),
 ('said', 'Jury'),
 ('said', 'Friday'),
 ('Friday', 'said'),
 ('Friday', 'an'),
 ('an', 'Friday'),
 ('an', 'investigation'),
 ('investigation', 'an'),
 ('investigation', 'of'),
 ('of', 'investigation'),
 ('of', "Atlanta's"),
 ("Atlanta's", 'of'),
 ("Atlanta's", 'recent'),
 ('recent', "Atlanta's"),
 ('recent', 'primary'),
 ('primary', 'recent'),
 ('primary', 'election'),
 ('election', 'primary'),
 ('election', 'produced'),
 ('produced', 'election'),
 ('produced', '``'),
 ('``', 'produced'),
 ('``', 'no'),
 ('no', '``'),
 ('no', 'evidence'),
 ('evidence', 'no'),
 ('evidence', "''"),
 ("''", 'evidence'),
 ("''", 'that'),
 ('that', "''"),
 ('that', 'any'),
 ('any', 'that'),
 ('any', 'irregularities'),
 ('irregularities', 'any'),
 ('irregularities', 'took'),
 ('took', 'irregularities'),
 ('took', 'place'),
 ('further', 'jury'),
 ('further', 'said'),
 ('said', 'furthe

In [None]:
#testing the method
batch_size = 2 # mini-batch size
skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams_id, X_ik, weighting_dic)

print("Input: ", input_batch)
print("Target: ", target_batch)
print("Cooc: ", cooc_batch)
print("Weighting: ", weighting_batch)

#we will convert them to tensor during training, so don't worry...

Input:  [[13377]
 [  624]]
Target:  [[ 9539]
 [14283]]
Cooc:  [[1.09861229]
 [0.69314718]]
Weighting:  [[0.07208434]
 [0.05318296]]


## 4. Model

<img src ="figures/glove.png">

In [92]:
class GloVe(nn.Module):
    
    def __init__(self, voc_size,emb_size, word2index):
        super(GloVe,self).__init__()
        self.embedding_v = nn.Embedding(voc_size, emb_size) # center embedding
        self.embedding_u = nn.Embedding(voc_size, emb_size) # out embedding
        
        self.v_bias = nn.Embedding(voc_size, 1)
        self.u_bias = nn.Embedding(voc_size, 1)

        self.word2index = word2index
        
    def forward(self, center_words, target_words, coocs, weighting):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        
        center_bias = self.v_bias(center_words).squeeze(1)
        target_bias = self.u_bias(target_words).squeeze(1)
        
        inner_product = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]
        
        #note that coocs already got log
        loss = weighting*torch.pow(inner_product +center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)
    
    def get_embed(self, word):
        word2index = self.word2index
        
        try:
            index = word2index[word]
        except:
            index = word2index['<UNK>']
            
        word = torch.LongTensor([index])
        
        embed_c = self.embedding_v(word)
        embed_o = self.embedding_u(word)
        embed   = (embed_c + embed_o) / 2
        
        return embed[0][0].item(), embed[0][1].item()

## 5. Training

In [93]:
batch_size     = 10 # mini-batch size
embedding_size = 2 #so we can later plot
model          = GloVe(voc_size, embedding_size, word2index)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [94]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [95]:
import time

# Training
num_epochs = 1000
start = time.time()
for epoch in range(num_epochs):
    
    # start = time.time()
    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams_id, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch) #[batch_size, 1]
    
    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)

    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

print(f"\nComplete: \nTotal Loss: {loss:2.2f} | Time Taken: {epoch_mins} minutes and {epoch_secs} seconds")



Epoch: 100 | cost: 51.588993 | time: 0m 2s
Epoch: 200 | cost: 3.067827 | time: 0m 4s
Epoch: 300 | cost: 12.578943 | time: 0m 6s
Epoch: 400 | cost: 13.456717 | time: 0m 8s
Epoch: 500 | cost: 63.927670 | time: 0m 11s
Epoch: 600 | cost: 6.131532 | time: 0m 13s
Epoch: 700 | cost: 4.805292 | time: 0m 15s
Epoch: 800 | cost: 73.498535 | time: 0m 17s
Epoch: 900 | cost: 42.829739 | time: 0m 19s
Epoch: 1000 | cost: 22.812899 | time: 0m 21s

Complete: 
Total Loss: 22.81 | Time Taken: 0 minutes and 21 seconds


## 6. Testing

In [96]:
def open_file(path_to_file):
    content = []  # Initialize content to an empty list to avoid returning None
    try:
        with open(path_to_file, 'r') as file:
            content = file.readlines()  # Read all lines of the file into a list
    except FileNotFoundError:
        print(f"The file {path_to_file} does not exist.")  # File not found error
    except Exception as e:
        print(f"An error occurred: {e}")  # Handle any other exceptions (e.g., permission issues)

    return content  # Return content even if it's empty, but not None


In [97]:
file_path = "file/word-test.v1.1.txt"

content = open_file(file_path)

semantic = []
syntatic = []

current_test = semantic
for sent in content:
    if sent[0] == ':':
        current_test = syntatic
        continue
    
    current_test.append(sent.strip())

In [98]:
vector_space = []

for word in vocab:
    vector_space.append(model.get_embed(word))

vector_space = np.array(vector_space)

In [99]:
#scipy version
from scipy import spatial

def cos_sim(a, b):
    norm_a = a / np.linalg.norm(a)  # Normalize vector a
    norm_b = b / np.linalg.norm(b)  # Normalize vector b
    return 1 - spatial.distance.cosine(norm_a, norm_b)  # Cosine similarity after normalization


def cos_sim_scores(vector_space, target_vector):
    scores = []
    for each_vect in vector_space:
        scores.append(cos_sim(target_vector, each_vect))

    return np.array(scores)

def similarity(model, test_data):
    words = test_data.split(" ")
    embeddings = [np.array(model.get_embed(word)) for word in words[:3]]  # Precompute embeddings for all words
    embed0, embed1, embed2 = embeddings  # Unpack embeddings
    similar_vector = embed1 - embed0 + embed2  # Perform vector arithmetic

    similarity_scores = cos_sim_scores(vector_space, similar_vector)
    max_score_idx = np.argmax(similarity_scores)
    similar_word = index2word[max_score_idx]

    return similar_word == words[3]  # Directly return the result

Semantic accuracy

In [100]:
sem_total = len(semantic)
sem_correct = 0
for sent in semantic:
    if similarity(model, sent):
        sem_correct += 1

sem_accuracy = sem_correct / sem_total
print(f"Semantic accuracy: {sem_accuracy:2.2f}")

Semantic accuracy: 0.00


Syntactic Accuracy

In [101]:
syn_total = len(syntatic)
syn_correct = 0
for sent in syntatic:
    if similarity(model, sent):
        syn_correct += 1

syn_accuracy = syn_correct / syn_total
print(f"Syntatic accuracy: {syn_accuracy:2.2f}")

Syntatic accuracy: 0.00


Similarity Accuracy

In [102]:
file_path = "file/wordsim_similarity_goldstandard.txt"

content = open_file(file_path)

sim_data = []

for sent in content:
    sim_data.append(sent.strip())

In [103]:
def compute_similarity(model, test_data):
    words = test_data.split("\t")

    embed0 = np.array(model.get_embed(words[0].strip()))
    embed1 = np.array(model.get_embed(words[1].strip()))

    similarity_model = embed1 @ embed0.T
    similarity_provided = float(words[2].strip())

    return similarity_provided, similarity_model

In [104]:
ds_scores = []
model_scores = []
for sent in sim_data:
    ds_score, model_score = compute_similarity(model, sent)

    ds_scores.append(ds_score)
    model_scores.append(model_score)

In [105]:
from scipy.stats import spearmanr

correlation = spearmanr(ds_scores, model_scores)[0]

print(f"Correlation between the dataset similarity metrics and models’ dot product is {correlation:2.2f}.")

Correlation between the dataset similarity metrics and models’ dot product is 0.13.


## 7. Save model

In [108]:
import torch
import pickle

# Define the folder where want to save the files
model_folder = 'model'  # Change this to your desired folder path

# Save the model's state_dict
torch.save(model.state_dict(), f'{model_folder}/glove.model')

# Save the arguments (such as voc_size, emb_size, word2index)
glove_args = {
    'voc_size': voc_size,
    'emb_size': embedding_size,
    'word2index': word2index,
}
with open(f'{model_folder}/glove.args', 'wb') as f:
    pickle.dump(glove_args, f)

print(f"Model and arguments saved to {model_folder}")


Model and arguments saved to model


In [109]:
import torch
import pickle

# Define the folder where the files are saved
model_folder = 'model'  # Change this to the folder where you saved the files

# Load the arguments from the pickle file
with open(f'{model_folder}/glove.args', 'rb') as f:
    glove_args = pickle.load(f)

# Define the model class and initialize it with the loaded arguments
# Make sure the model class and arguments match the training code
model_glove = GloVe(**glove_args)  # Assuming you have a GloVe model class

# Now, load the model weights (this should be from glove.model, not glove.args)
model_glove.load_state_dict(torch.load(f'{model_folder}/glove.model'))

# Now the model is loaded with the arguments and weights, and you're ready to use it
model_glove.eval()  # Set the model to evaluation mode if you're not training

print("Model loaded successfully.")


Model loaded successfully.


  model_glove.load_state_dict(torch.load(f'{model_folder}/glove.model'))


In [110]:
model_glove.get_embed('sad')

(-1.0061289072036743, -1.2175040245056152)