In [None]:
# Imports
import requests
import nltk
# Download NLTK's 'punkt' resource (presumably required by word_tokenize below).
nltk.download('punkt')
from IPython.display import display, Markdown
import random, more_itertools
# NOTE(review): this installs the PyPI package named `sparse`, which looks
# unrelated to the `scipy.sparse` module imported on the next line -- confirm
# whether the install is needed at all.
! pip install sparse
from scipy import sparse
import numpy as np
from collections import defaultdict
from random import shuffle
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Collecting sparse
[?25l  Downloading https://files.pythonhosted.org/packages/e3/82/d58361f8107e8686196b91319edf2c26490667b8340cc229b668ee7a1582/sparse-0.11.2-py2.py3-none-any.whl (73kB)
[K     |████████████████████████████████| 81kB 6.7MB/s 
[?25hCollecting numba>=0.49
[?25l  Downloading https://files.pythonhosted.org/packages/48/78/31f620c3469287f4255d9a1054bee713cd3596fda2711c392ce3021b3c98/numba-0.51.2-cp36-cp36m-manylinux2014_x86_64.whl (3.1MB)
[K     |████████████████████████████████| 3.1MB 20.1MB/s 
Collecting llvmlite<0.35,>=0.34.0.dev0
[?25l  Downloading https://files.pythonhosted.org/packages/f3/b7/8a91b513f165e0affdeb975c1fef307c39d1051ce71e8aec1da9dcb317ad/llvmlite-0.34.0-cp36-cp36m-manylinux2010_x86_64.whl (24.6MB)
[K     |████████████████████████████████| 24.6MB 130kB/s 
[?25hInstalling collected packages: llvmlite, numba, sparse
  Found existing installation: 

In [None]:
# Downloading the corpus
en_url_down = "https://drive.google.com/uc?export=download&id=1H3cNxmsG8k79Vr3FkSa0hkLcC2AGIxSy"
response = requests.get(en_url_down)
# Fail loudly on an HTTP error instead of tokenising an error page as if it
# were the corpus.
response.raise_for_status()
en_data = response.text
# Join lines with a space: replacing '\n' with '' glued the last word of each
# line to the first word of the next one, producing bogus merged tokens.
en_data = en_data.replace('\n', ' ')

In [None]:
# Split the English corpus into word-level tokens with NLTK
en_tokens = word_tokenize(en_data)
display(
    Markdown(
        "###Number of tokens after performing word tokenisation: {}".format(
            str(len(en_tokens))
        )
    )
)
display(Markdown("###Some examples of tokens are:"))
print(en_tokens[:10])
# The raw text is not needed once it has been tokenised; drop it to free memory
del en_data

###Number of tokens after performing word tokenisation: 19183786

###Some examples of tokens are:

['The', 'word', '``', 'atom', "''", 'was', 'coined', 'by', 'ancient', 'Greek']


# Building vocabulary

In [None]:
# function to build the vocab out of the training data
# will return a dictionary of the format {word: (id, frequency)}
def build_vocab(tokens, max_size=5000):
    """Build a frequency-ranked vocabulary from a sequence of tokens.

    Args:
        tokens: Iterable of hashable tokens (e.g. a list of strings).
        max_size: Maximum number of distinct tokens to keep; only the
            ``max_size`` most frequent ones are retained. Defaults to 5000,
            the cut-off that was previously hard-coded. ``None`` keeps every
            distinct token.

    Returns:
        dict mapping ``token -> (id, frequency)``. Ids are consecutive
        integers starting at 0, assigned in order of decreasing frequency;
        tokens with equal frequency keep their first-seen order (the sort is
        stable).
    """
    # Local import so this definition does not depend on the notebook's import cell.
    from collections import Counter

    token_counts = Counter(tokens)
    ranked = sorted(token_counts.items(), key=lambda item: item[1], reverse=True)[:max_size]
    return {token: (token_id, count) for token_id, (token, count) in enumerate(ranked)}

# Vocabulary of the most frequent corpus tokens: {token: (id, frequency)}
vocab = build_vocab(tokens=en_tokens)

# Co-occurrence Matrix

In [None]:
# function to build the co-occurrence matrix for a given window size
# will return a mapping where cooccurence_matrix[(i, j)] = weight, with i the id of the main word and j the id of the context word
def build_cooccurence_matrix(tokens, vocab, window_size):
    """Count how often vocabulary words appear within a window of each other.

    Every position in ``tokens`` is treated in turn as the centre (main)
    word. Each in-vocabulary token located at most ``window_size`` positions
    to its left or right adds 1 to the entry for
    ``(centre word id, context word id)``. Pairs where either token is not
    in ``vocab`` are ignored.

    Args:
        tokens: Indexable sequence of tokens (e.g. a list of strings).
        vocab: dict mapping ``token -> (id, frequency)``.
        window_size: Number of neighbouring positions inspected on each side
            of the centre token.

    Returns:
        ``defaultdict`` mapping ``(main_id, context_id) -> count``; a pair
        that was never observed reads as 0.
    """
    token2id = {token: token_id for token, (token_id, _) in vocab.items()}

    cooccurence_matrix = defaultdict(int)
    total = len(tokens)

    for index, center_token in enumerate(tokens):
        center_id = token2id.get(center_token)
        # An out-of-vocabulary centre word cannot contribute any pair, so
        # skip its whole window instead of re-testing it per context word.
        if center_id is None:
            continue

        # Walk the window by position (left context first, then right)
        # rather than copying two slices of the corpus for every token.
        first = max(0, index - window_size)
        stop = min(total, index + window_size + 1)
        for position in range(first, stop):
            if position == index:
                continue
            context_id = token2id.get(tokens[position])
            if context_id is not None:
                cooccurence_matrix[(center_id, context_id)] += 1

    return cooccurence_matrix

In [None]:
# NOTE(review): the window is the literal 4 here, while a WINDOW_SIZE constant
# with a different value is declared further down -- confirm which is intended.
cooccurence_matrix = build_cooccurence_matrix(
    tokens=en_tokens,
    vocab=vocab,
    window_size=4,
)

# Training GloVe embeddings

In [None]:
# main function to train GLoVe embeddings
def train(vocab, cooccurence_matrix, vector_size, epochs, alpha, x_max, learning_rate):
    """Train GloVe-style embeddings by repeatedly calling ``run_iter``.

    Each token gets two vectors of dimension ``vector_size`` (one for its
    role as main word, one for its role as context word) plus two scalar
    biases. All parameters are drawn uniformly from
    ``[-0.5 / vector_size, 0.5 / vector_size)``.

    Args:
        vocab: dict mapping ``token -> (id, frequency)``; its length fixes
            the number of rows in the parameter arrays.
        cooccurence_matrix: mapping ``(main_id, context_id) -> count``.
        vector_size: Dimensionality of each word vector.
        epochs: Number of calls to ``run_iter``.
        alpha: Exponent of the co-occurrence weighting function.
        x_max: Count at which the weighting function saturates at 1.
        learning_rate: Step size handed to ``run_iter``.

    Returns:
        Tuple ``(costs, W_main, W_context, bias_main, bias_context)`` where
        ``costs`` lists the cost reported by ``run_iter`` for each epoch.
    """
    total_tokens = len(vocab)
    scale = float(vector_size)

    # Uniform noise centred on zero. The previous `randn(...) - 0.5` drew from
    # a normal distribution and then shifted it, giving every parameter a
    # systematic negative offset of -0.5 / vector_size.
    W_main = (np.random.rand(total_tokens, vector_size) - 0.5) / scale
    W_context = (np.random.rand(total_tokens, vector_size) - 0.5) / scale

    bias_main = (np.random.rand(total_tokens) - 0.5) / scale
    bias_context = (np.random.rand(total_tokens) - 0.5) / scale

    costs = []
    for i in range(epochs):
        # The values must be interpolated with `%`; passing them as a second
        # print argument printed the literal "%i" / "%f" placeholders.
        print("Iteration: %i" % i)
        cost, W_main, W_context, bias_main, bias_context = run_iter(
            vocab, cooccurence_matrix, W_main, W_context, bias_main, bias_context,
            alpha, x_max, learning_rate, vector_size)
        costs.append(cost)
        print("Cost: %f" % cost)
    return costs, W_main, W_context, bias_main, bias_context

In [None]:
# In each iteration compute the cost and change the weights according 
# to the adagrad optimization
def run_iter(vocab, cooccurence_matrix, W_main, W_context, bias_main, bias_context, alpha, x_max, learning_rate, vector_size):
    """Run one pass over the co-occurrence entries with AdaGrad-scaled updates.

    For every stored pair ``(i, j)`` whose ids both occur in ``vocab`` the
    weighted squared error
    ``f(x) * (W_main[i] . W_context[j] + bias_main[i] + bias_context[j] - log(x))**2``
    is added (halved) to the total cost and the four parameters involved are
    updated. ``f(x)`` is ``(x / x_max) ** alpha`` below ``x_max`` and 1
    otherwise.

    Pairs are visited in the same order as a nested loop over ``vocab``
    (outer: main word, inner: context word), so results match the previous
    full V x V scan while only touching the entries that actually exist.
    Ids in ``vocab`` are assumed to be unique.

    NOTE(review): the squared-gradient accumulators are re-created on every
    call, so AdaGrad history does not carry over from one epoch to the next.
    Persisting them needs a coordinated change in the caller.

    Args:
        vocab: dict mapping ``token -> (id, frequency)``.
        cooccurence_matrix: mapping ``(main_id, context_id) -> count``.
        W_main, W_context: 2-D arrays indexed by token id (updated in place).
        bias_main, bias_context: 1-D arrays indexed by token id (updated in place).
        alpha: Exponent of the weighting function.
        x_max: Count at which the weighting function saturates at 1.
        learning_rate: Base step size.
        vector_size: Dimensionality of the word vectors.

    Returns:
        Tuple ``(total_cost, W_main, W_context, bias_main, bias_context)``.
    """
    total_tokens = len(vocab)

    # Accumulators start at 1 so the first update is scaled by exactly 1.
    gradient_matrix_W_main = np.ones((total_tokens, vector_size), dtype=np.float64)
    gradient_matrix_W_context = np.ones((total_tokens, vector_size), dtype=np.float64)
    gradient_matrix_bias_main = np.ones(total_tokens, dtype=np.float64)
    gradient_matrix_bias_context = np.ones(total_tokens, dtype=np.float64)

    # Position of each id in vocab iteration order; used to reproduce the
    # visiting order of the old nested loop over vocab x vocab.
    visit_rank = {}
    for rank, (token_id, _) in enumerate(vocab.values()):
        visit_rank.setdefault(token_id, rank)

    # Only entries that exist are visited (ids unknown to vocab are ignored,
    # exactly as the nested scan never reached them).
    pairs = sorted(
        (pair for pair in cooccurence_matrix
         if pair[0] in visit_rank and pair[1] in visit_rank),
        key=lambda pair: (visit_rank[pair[0]], visit_rank[pair[1]]),
    )

    total_cost = 0
    for i, j in pairs:
        cooccurence = cooccurence_matrix[(i, j)]
        # A stored count of 0 would give log(0) = -inf and poison every
        # parameter with NaN; such an entry carries no information anyway.
        if cooccurence <= 0:
            continue

        f_x = (cooccurence / x_max) ** alpha if cooccurence < x_max else 1

        # Residual of the model for this pair, computed once and reused.
        residual = (W_main[i]).dot(W_context[j]) + bias_main[i] + bias_context[j] - np.log(cooccurence)
        cost = f_x * residual ** 2

        # this is done for the ease of computation
        total_cost += 0.5 * cost

        # All gradients are taken at the pre-update parameter values.
        weighted_residual = f_x * residual
        gradient_w_main = weighted_residual * W_context[j]
        gradient_w_context = weighted_residual * W_main[i]
        gradient_bias_main = weighted_residual
        gradient_bias_context = weighted_residual

        W_main[i] = W_main[i] - (learning_rate * gradient_w_main) / np.sqrt(gradient_matrix_W_main[i])
        W_context[j] = W_context[j] - (learning_rate * gradient_w_context) / np.sqrt(gradient_matrix_W_context[j])

        bias_main[i] = bias_main[i] - (learning_rate * gradient_bias_main) / np.sqrt(gradient_matrix_bias_main[i])
        bias_context[j] = bias_context[j] - (learning_rate * gradient_bias_context) / np.sqrt(gradient_matrix_bias_context[j])

        gradient_matrix_W_main[i] += np.square(gradient_w_main)
        gradient_matrix_W_context[j] += np.square(gradient_w_context)
        gradient_matrix_bias_main[i] += gradient_bias_main ** 2
        gradient_matrix_bias_context[j] += gradient_bias_context ** 2

    return total_cost, W_main, W_context, bias_main, bias_context


def exists(obj, chain):
    """Follow a chain of keys through nested containers.

    Args:
        obj: Outermost container (anything supporting ``in`` and ``[]``).
        chain: Iterable of keys to follow, outermost first. It is only read;
            the previous implementation consumed the caller's list with
            ``pop(0)``.

    Returns:
        The value reached after following every key, or ``None`` as soon as
        a key is missing. An empty ``chain`` returns ``obj`` itself (it used
        to raise ``IndexError``).
    """
    current = obj
    for key in chain:
        if key not in current:
            return None
        current = current[key]
    return current

In [None]:
# Hyper-parameters for the training run started in the next cell.
# NOTE(review): WINDOW_SIZE is not referenced anywhere in the visible notebook;
# the co-occurrence matrix above was built with a literal window of 4 --
# confirm which value is intended.
WINDOW_SIZE = 5
NUM_EPOCHS = 25  # number of run_iter passes performed by train()
VECTOR_SIZE = 100  # dimensionality of each word vector
alpha = 0.75  # exponent of the co-occurrence weighting function f(x)
x_max = 100  # co-occurrence count at which f(x) saturates at 1
learning_rate = 0.001  # base step size used in the parameter updates

In [None]:
# Fit the embeddings; keeps the per-epoch costs and the learned parameters
costs, W_main, W_context, bias_main, bias_context = train(
    vocab=vocab,
    cooccurence_matrix=cooccurence_matrix,
    vector_size=VECTOR_SIZE,
    epochs=NUM_EPOCHS,
    alpha=alpha,
    x_max=x_max,
    learning_rate=learning_rate,
)

# Spearman’s rank correlation coefficient

In [None]:
# downloading the datasets
# Mounts Google Drive (this only works inside Google Colab) and copies the
# `web` folder from it into the working directory; the next cell imports from
# a package named `web`.
from google.colab import drive
drive.mount('/content/drive')
! cp -R '/content/drive/My Drive/web' ./

(5000, 100)
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999
from web.embeddings import fetch_GloVe
from web.evaluate import evaluate_similarity
from six import iteritems
# Pretrained reference embeddings used as the baseline in the comparison below.
# dim=100 is the same value as VECTOR_SIZE, the size of the vectors trained above.
w_glove = fetch_GloVe(corpus="wiki-6B", dim=100)


Dataset created in /root/web_data/embeddings

Downloading data from http://nlp.stanford.edu/data/glove.6B.zip ...


100%|██████████| 862M/862M [06:28<00:00, 2.22Mb/s]


...done. (388 seconds, 6 min)
Extracting data from /root/web_data/embeddings/glove.6B/glove.6B.zip...
   ...done.


In [None]:
# This particular section is adapted from
# https://github.com/kudkudak/word-embeddings-benchmarks/blob/master/examples/evaluate_similarity.py


def _subset_in_vocab(dataset, known_words, score_column=None):
    """Keep only the word pairs of a similarity dataset we have vectors for.

    Args:
        dataset: Object exposing ``X`` (rows whose first two items are the
            two words of a pair) and ``y`` (one score entry per row).
        known_words: Container supporting ``in``; a pair is kept only when
            both of its words are present.
        score_column: When ``None`` the score entry ``y[i]`` is stored as is;
            otherwise ``y[i][score_column]`` is stored.

    Returns:
        Two-element list ``[pairs, scores]`` of NumPy arrays, in dataset order.
    """
    pairs, scores = [], []
    for row_index, pair in enumerate(dataset.X):
        first_word, second_word = pair[0], pair[1]
        if first_word in known_words and second_word in known_words:
            pairs.append([first_word, second_word])
            score = dataset.y[row_index]
            scores.append(score if score_column is None else score[score_column])
    return [np.array(pairs), np.array(scores)]


# Define tasks
tasks = {
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353()
}

# storing W_main vectors in vocab_vector
# (a comprehension keeps the loop variables local, so the builtin `id` is no
# longer shadowed for the rest of the notebook)
vocab_vector = {word: W_main[word_id] for word, (word_id, _) in vocab.items()}

# taking only the subset of each dataset whose two words are both present in
# our vocab
subset_WS353 = _subset_in_vocab(tasks["WS353"], vocab_vector)
# MEN scores are read as y[i][0] (each entry of y is presumably a one-element
# row), hence score_column=0.
subset_MEN = _subset_in_vocab(tasks["MEN"], vocab_vector, score_column=0)

In [None]:
# One row per reported score: (message template, dataset label, embeddings, [pairs, scores])
score_reports = [
    ("Spearman correlation of our model of scores on {} {}", "WS353", vocab_vector, subset_WS353),
    ("Spearman correlation of Stanford of scores on {} {}", "WS353", w_glove, subset_WS353),
    ("Spearman correlation of our model scores on {} {}", "MEN", vocab_vector, subset_MEN),
    ("Spearman correlation of stanfords scores on {} {}", "MEN", w_glove, subset_MEN),
]
for message, dataset_label, embeddings, (word_pairs, gold_scores) in score_reports:
    # Scores are computed one at a time, in the listed order, right before printing
    print(message.format(dataset_label, evaluate_similarity(embeddings, word_pairs, gold_scores)))

  A = np.vstack(w.get(word, mean_vector) for word in X[:, 0])
  B = np.vstack(w.get(word, mean_vector) for word in X[:, 1])
Missing 10 words. Will replace them with mean vector
Spearman correlation of our model of scores on WS353 0.114336386249 
Spearman correlation of Stanford of scores on WS353 0.57896566434124226 
Spearman correlation of our model scores on MEN 0.1335435436359873 
Spearman correlation of stanfords scores on MEN 0.68465287102879
