In [None]:
import nltk
import numpy as np
import random
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from copy import deepcopy
from zipfile import ZipFile
import re


In [None]:
# We use transcripts of sessions of the European Parliament as our input.
# Source: http://www.statmt.org/europarl/
# Read every archive member whose name matches '.*ep.*en' as raw bytes; the
# `with` block makes sure the zip file handle is closed once we are done.
with ZipFile('europarl_raw.zip') as zfile:
    fNames = zfile.namelist()
    contents = [zfile.read(fName) for fName in fNames if re.match(r'.*ep.*en',fName)]
# Concatenate the decoded files, separated by a real line break. (A raw string
# r'\n' is a literal backslash followed by the letter 'n', not a newline.)
rawText = '\n'.join([string.decode('utf-8') for string in contents])


In [None]:
# Turn the raw text into the list of training tokens, shrinking the vocabulary
# along the way:
#   * keep only purely alphabetic tokens (this removes punctuation),
#   * make every word lowercase,
#   * get rid of stop words
#     (words that appear often and don't add much to the meaning of a sentence).
from nltk.corpus import stopwords
# READ THIS!!!! If the following line gives you errors, run the line
#nltk.download('stopwords')
stops = set(stopwords.words('english'))
# lazily lowercase the alphabetic tokens, then keep the ones that are not stop words
lowered = (word.lower() for word in nltk.wordpunct_tokenize(rawText) if word.isalpha())
tokens = [token for token in lowered if token not in stops]
numTokens = len(tokens)


In [None]:
# Compute the relative frequency of occurrence of each word in the vocabulary.
# np.unique returns the distinct words already sorted, with the counts aligned
# to them, so no additional argsort / reindexing step is required.
vocab, vocabCounts = np.unique(tokens,return_counts=True)
vocabSize = len(vocab)
print("The vocabulary has {} words".format(vocabSize))
# turn the raw counts into fractions of the total number of tokens
vocabFreqs = vocabCounts / numTokens
# map words to their indices in the vocab array
word2Ind = {word:i for i, word in enumerate(vocab)}
# generate a new list of tokens that maps each word to its index in the vocabulary
# Note this is equivalent to a one hot encoding of the words!!
indexedTokens = [word2Ind[token] for token in tokens]


In [None]:
# Embedding dimension: each word is mapped to a vector with d entries.
d = 50
# Seed numpy's global generator so the initial embeddings are repeatable.
seed = 10000
np.random.seed(seed)
# Center embeddings: one row per vocabulary word, drawn from a standard normal
# and scaled down by 2 / (d + vocabSize).
initScale = 2.0/(d + vocabSize)
centerEmb = initScale * np.random.randn(vocabSize, d)
# Context embeddings start as an independent copy of the center embeddings,
# stored transposed (d rows, one column per vocabulary word).
contextEmb = deepcopy(centerEmb).T


# Part C


In [None]:
# Train the embeddings with one stochastic update per randomly drawn center word.
# the hyperparameter L sets the length of the window we are using
# (L tokens on each side of the center word)
L = 3
# step size of every update
eta = 0.001
seed = 100
random.seed(seed)
# Cumulative sampling weights for the negative examples. They never change, so
# they are built once here; passing `weights=` to random.choices would make it
# re-accumulate the whole frequency vector on every single iteration.
negCumWeights = np.cumsum(vocabFreqs).tolist()
epbar = tqdm(range(3))
iterable = range(numTokens)
itbar = tqdm(iterable)
for epoch in epbar:
    epbar.set_description("Processing epoch %d" % epoch)
    itbar.refresh()
    itbar.reset()
    for i in iterable:
        itbar.update()
        # draw a center position that leaves room for a full window on both sides
        centerIdx = random.randint(L,numTokens-L-1)
        center = indexedTokens[centerIdx]
        # positive examples -> every word in the window is a context
        for j in range(centerIdx-L,centerIdx+L+1):
            # the center word is not its own context: skip it
            # (the former `if j == 0 : next` only named the builtin `next`
            # and therefore never skipped anything)
            if j == centerIdx:
                continue
            context = indexedTokens[j]
            ### start code ###

            ### end code ###
        # negative examples -> choose words randomly from the vocab and call them negative examples
        # (5 draws with replacement, proportional to the word frequencies)
        negIdx = random.choices(range(vocabSize),cum_weights=negCumWeights,k=5)
        for j in negIdx:
            context = j
            # e = exp(-score), where score is the center . context dot product
            e = np.exp(-(centerEmb[[center],:] @ contextEmb[:,[context]])[0][0])
            coeff = - 1.0 / (1 + e)
            # NOTE(review): coeff is negative, so the step below adds a positive
            # multiple of the context vector to the center vector (raising their
            # dot product) -- confirm this matches the intended sign convention
            # for negative examples.
            centerEmb[[center],:] -= eta * coeff * contextEmb[:,[context]].transpose()
            # this step already sees the center vector updated on the line above
            contextEmb[:,[context]] -= eta * coeff * centerEmb[[center],:].transpose()


In [None]:
# Cosine distance only cares about direction, so rescale every embedding to unit length.
# Center embeddings are stored one word per row -> norms are taken along axis 1.
centerNorms = np.linalg.norm(centerEmb, axis=1, keepdims=True)
centerEmb = centerEmb / centerNorms
# Context embeddings are stored one word per column -> norms are taken along axis 0.
contextNorms = np.linalg.norm(contextEmb, axis=0, keepdims=True)
contextEmb = contextEmb / contextNorms


# Part D


In [None]:
# Look up the words whose context vectors are best aligned with the center
# vector of the query word.
word = 'diplomat'
# position of the query word in the vocabulary (IndexError if it is absent)
matches = np.where(vocab == word)[0]
idx = matches[0]
# keep v two-dimensional (1 x d) so the product below is a row of scores
v = centerEmb[[idx]]
# find 5 closest words (in angle) to word: absolute value of the dot product
# with every context vector
a = np.abs((v @ contextEmb)[0])
topK = 5
# argpartition places the topK largest scores in the last topK slots (unordered)
ind = np.argpartition(a, -topK)[-topK:]
vocab[ind]


# Part F


In [None]:
# we will introduce a new word in exactly the same contexts as an existing word
origWord = 'rights'
newWord = 'eskubideak' # Basque for rights -> note that we don't even need to use this string to find its embedding
origVocabIdx = word2Ind[origWord] # index of our word in the dictionary

# find all places where the original word appeared in the token list
origTokenIndices = np.where(np.array(indexedTokens) == origVocabIdx)[0]

# define the embedding vector for the new word. We will only train the center embedding.
# A single word needs a single row, i.e. shape (1, d). Allocating (vocabSize, d)
# here would turn the later similarity query `newCenterEmb @ contextEmb` into a
# vocabSize x vocabSize product even though only its first row is ever read.
newCenterEmb = 2.0/(d + vocabSize) * np.random.randn(1, d)


In [None]:
# we train the new vector: the center position is always drawn from the places
# where the original word occurs, and only newCenterEmb is updated
seed = 100
random.seed(seed)
# Cumulative sampling weights for the negative examples, built once instead of
# being re-accumulated by random.choices on every iteration.
negCumWeights = np.cumsum(vocabFreqs).tolist()
epbar = tqdm(range(3))
iterable = range(len(origTokenIndices))
itbar = tqdm(iterable)
for epoch in epbar:
    epbar.set_description("Processing epoch %d" % epoch)
    itbar.refresh()
    itbar.reset()
    for i in iterable:
        itbar.update()
        centerIdx = random.choice(origTokenIndices)
        # positive examples -> every word in the window is a context
        # An occurrence may sit closer than L tokens to either end of the corpus,
        # so clamp the window: a negative j would silently wrap around to the end
        # of the list and a j past the end would raise IndexError.
        windowStart = max(0, centerIdx-L)
        windowStop = min(len(indexedTokens), centerIdx+L+1)
        for j in range(windowStart, windowStop):
            # the center word is not its own context: skip it
            # (the former `if j == 0 : next` only named the builtin `next`
            # and therefore never skipped anything)
            if j == centerIdx:
                continue
            context = indexedTokens[j]
            ### start code ###

            ### end code ###
        # negative examples -> choose words randomly from the vocab and call them negative examples
        # (5 draws with replacement, proportional to the word frequencies)
        negIdx = random.choices(range(vocabSize),cum_weights=negCumWeights,k=5)
        for j in negIdx:
            context = j
            # Only row 0 of newCenterEmb is the new word's vector, so score that
            # row alone; this works whether newCenterEmb has one row or many.
            e = np.exp(-(newCenterEmb[[0],:] @ contextEmb[:,[context]])[0][0])
            coeff = - 1.0 / (1 + e)
            # NOTE(review): coeff is negative, so this step adds a positive multiple
            # of the context vector -- confirm the intended sign convention.
            newCenterEmb -= eta * coeff * contextEmb[:,[context]].transpose()


In [None]:
# now find the words closest to the new word
# find 5 closest words (in angle) to word
# Only row 0 of newCenterEmb is read, so multiply just that row: the result is a
# single row of vocabulary scores instead of one full row of scores per row of
# newCenterEmb (a huge matrix when newCenterEmb has vocabSize rows).
a = np.abs(newCenterEmb[[0], :] @ contextEmb)[0]
# argpartition places the 5 largest scores in the last 5 slots (unordered)
ind = np.argpartition(a, -5)[-5:]
print("word = score: " + ', '.join(['{} = {:.2f}'.format(vocab[i],a[i]) for i in ind]))


# Part G


In [None]:
from sklearn.decomposition import PCA


In [None]:
# Load pre-trained GloVe vectors from the first file inside the archive.
# Each non-empty line is parsed as: a word, followed by the components of its vector.
# NOTE: this replaces the `vocab` defined for the Europarl corpus earlier in the notebook.
with ZipFile('glove.6B.50d.zip') as zfile:
    fName = zfile.namelist()[0]
    vocab = []
    vectors = []
    with zfile.open(fName) as gloveFile:
        for line in gloveFile:
            lst = line.split()
            if not lst:
                # ignore blank lines instead of failing on lst[0]
                continue
            vocab.append(lst[0].decode('utf-8'))
            # `np.float` was removed from NumPy (1.24); it was only an alias of
            # the builtin float, so this keeps the same float64 dtype.
            vector = np.array(lst[1:],dtype=float)
            vectors.append(vector)
# stack the per-word vectors into one 2-D array, one row per word
vectors = np.array(vectors)


In [None]:
# Lookup table from each word to its position in the vocab list
# (for a repeated word the last position wins).
word2Ind = dict(zip(vocab, range(len(vocab))))


In [None]:
# Words whose embeddings we want to look at in two dimensions.
wordList = [
    'chair', 'banana', 'apple', 'car', 'wheel',
    'table', 'desk', 'building', 'gas',
]

# Fetch the vector of every listed word and project the set onto its first two
# principal components.
indices = [word2Ind[word] for word in wordList]
theseVecs = vectors[indices]
pca = PCA(n_components=2)
transf = pca.fit_transform(theseVecs)

# The scatter uses an empty marker, so nothing is drawn at the points themselves
# and only the word labels added below are visible.
fig, ax = plt.subplots()
ax.scatter(*transf.T, marker='')
ax.set(xlabel='PCA dim 1', ylabel='PCA dim 2')

# label each projected point with its word
for txt, (x, y) in zip(wordList, transf):
    ax.annotate(txt, (x, y))

# write the figure to disk
fig.savefig('glove-scatter.png')


# Part H


In [None]:
def tellAnalogy(an1a, an1b, an2b):
    """Print the word completing "an1b is to an1a as an2b is to ...".

    The target point is vectors[an1a] - vectors[an1b] + vectors[an2b]; the
    answer is the vocabulary word whose vector has the smallest Euclidean
    distance to that point.

    Relies on the module-level `vectors`, `word2Ind` and `vocab`.
    Raises KeyError if one of the three words is missing from `word2Ind`.
    Returns nothing; the sentence is written to stdout.
    """
    rowA, rowB, rowC = (word2Ind[w] for w in (an1a, an1b, an2b))
    target = vectors[rowA] - vectors[rowB] + vectors[rowC]

    # distance from every vocabulary vector to the target point
    gaps = np.linalg.norm(vectors - target, axis=1)
    best = gaps.argmin()
    print('{} is to {} as {} is to {}'.format(an1b, an1a,an2b, vocab[best]))


# two examples, each given as (an1a, an1b, an2b)
for an1a, an1b, an2b in (('madrid', 'spain', 'ghana'),
                         ('uncle', 'nephew', 'niece')):
    tellAnalogy(an1a, an1b, an2b)
