In [1]:
import torch
import time
import pandas as pd
import numpy as np
import re
import gensim
import collections
import nltk
from nltk import word_tokenize
from stopwordsallforms import STOPWORDS as arb_stopwords
import elements
#nltk.download("stopwords")
nltk.download('punkt')
import pandas as pd

[nltk_data] Downloading package punkt to /home/amr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Data
### Helping Functions

In [2]:
def normalization(t):
    return t.translate(t.maketrans(''.join(elements.ALEF_HAMZA_FORMS+elements.NON_ALIF_HAMZA_FORMS+tuple(elements.ALIF_MAQSURA)),"اااويي"))

In [3]:
def clean_str(text):
    search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى","\\",'\n', '\t','&quot;','?','؟','!']
    replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا","","","","ي","",' ', ' ',' ',' ? ',' ؟ ',' ! ']
    
    #remove tashkeel
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel,"", text)
    
    #remove longation
    p_longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)
    
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')
    
    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])
    
    #trim    
    text = text.strip()

    return text

In [4]:
arb_stopwords = [clean_str(normalization(i)) for  i in set(arb_stopwords.keys())]

In [5]:
def stopwords_removal(t,stop=arb_stopwords):
    return ' '.join([word for word in word_tokenize(t) if word not in stop])

### Data

In [6]:
data= np.load('../translation project/AD_NMT-master/LAV-MSA-2-both.pkl',allow_pickle=True)

In [7]:
data[0] # lav , msa

['لا انا بعرف وحدة راحت ع فرنسا و معا شنتا حطت فيها الفرش',
 'لا اعرف واحدة ذهبت الى فرنسا و لها غرفة و ضعت فيها الافرشة']

In [8]:
# separate two types of text
lav=[]
msa=[]
for i,ex in enumerate(data):
    lav_text = stopwords_removal(clean_str(normalization(ex[0])))
    msa_text = stopwords_removal(clean_str(normalization(ex[1])))
    data[i][0],data[i][1] = lav_text,msa_text
    lav.append(lav_text)
    msa.append(msa_text)

In [9]:
lav = ' '.join(lav)
msa = ' '.join(msa)

Dictionaries

In [10]:
msa_d=collections.Counter(msa.split())
lav_d=collections.Counter(lav.split())

In [11]:
min_count = 2

In [12]:
idx2msa = np.array([word for word,freq in msa_d.items() if freq > min_count ])
idx2lav = np.array([word for word,freq in lav_d.items() if freq > min_count ])

In [13]:
msa2idx = {word:i for i,word in enumerate(idx2msa)}
lav2idx = {word:i for i,word in enumerate(idx2lav)}

In [14]:
msa_data = [' '.join([i for i in t[1].split() if (msa2idx.get(i,-1) != -1)]) for t in data]
lav_data = [' '.join([i for i in t[0].split() if (lav2idx.get(i,-1) != -1)]) for t in data]

In [15]:
#generate 50 random integer
np.random.seed(42)
random_index = np.random.randint(0,min(len(lav2idx),len(msa2idx)), 50)

In [16]:
src_msa = idx2msa[random_index]

#### Stem similarity

In [17]:
st = nltk.ISRIStemmer()

In [18]:
stem_start = time.time()
sim_lav = []
sim_msa = []
sim=[]
alone_msa = []
for l,m in zip(lav_data,msa_data): #loop over data
    msa_words,lav_words=m.split(),l.split() 
    stems=[st.stem(i) for i in lav_words]
    
    for c,j in enumerate(msa_words):
        msa_stem = st.stem(j)
        if msa_stem in stems:
            lav_idx = stems.index(msa_stem)
            sim.append((j,lav_words[lav_idx])) #append pair
            sim_lav.append(lav_words[lav_idx]) 
            sim_msa.append(j)
stem_end = time.time()
stem_time = stem_end-stem_start

In [19]:
msa2lav_stem = {i:v for i,v in sim}

In [20]:
sim_res = []
for i in idx2msa[random_index]:
    sim_res.append((i,msa2lav_stem.get(i,'None')))

In [21]:
stem_lav = [i[1] for i in sim_res]

## Second Approach: Coocurence Matrix
In order to get word pairs from lav to msa we try coocurence matric approach

In [22]:
cooc = np.zeros((len(idx2msa), len(idx2lav) ))

In [23]:
cooc.shape

(6118, 5272)

In [24]:
cooc_start= time.time()
for l,m in zip(lav_data,msa_data):
    l,m = l.split(),m.split()
    lav_count = collections.Counter(l)
    msa_count = collections.Counter(m)
    for k_lav,v_lav in lav_count.items():    
        for k_msa,v_msa in msa_count.items():
            cooc[ msa2idx[k_msa] , lav2idx[k_lav] ] += (min(v_msa,v_lav)/max(v_lav,v_msa))
cooc_end= time.time()
cooc_time = cooc_end-cooc_start

In [25]:
cooc_match = np.argmax(cooc[random_index],axis=1)

In [26]:
cooc_res = []
for msa_idx,lav_idx in zip(random_index,cooc_match):
    cooc_res.append((idx2msa[msa_idx],idx2lav[lav_idx]))

In [27]:
cooc_lav = [i[1] for i in cooc_res]

## Third approach : word vector similarity

In [28]:
#Count occurence of each word per text
lav_d_list=[]
msa_d_list=[]
for lav,msa in zip(lav_data,msa_data):
    lav_d_list.append(collections.Counter(lav.split()))
    msa_d_list.append(collections.Counter(msa.split()))

In [29]:
def create_word_vector(ds_freq:dict,word:str):
    wv=[]
    #l: Counter dictionary
    for j,l in enumerate(ds_freq):
        wv.append(l.get(word,0))
    return wv

In [30]:
def create_ds_word_vectors(dataset:dict,ds_d_list:dict):
    dataset_wv = []
    for i,w in enumerate(dataset.keys()): #loop over words
        wv = create_word_vector(ds_d_list,w)
        dataset_wv.append(wv)
    return dataset_wv

In [31]:
msa_word_vector=create_ds_word_vectors(msa2idx,msa_d_list)

In [32]:
lav_word_vector=create_ds_word_vectors(lav2idx,lav_d_list)

In [33]:
#Convert list of lists to matrix 
m_vec = np.stack(msa_word_vector)
l_vec = np.stack(lav_word_vector)

In [34]:
def vectorized_cos_sim_cuda(X,Y):
    X,Y = torch.from_numpy(X).type(torch.float).cuda(),torch.from_numpy(Y).type(torch.float).cuda()
    mod_v1 = torch.linalg.norm(X,axis=-1)
    mod_v2 = torch.linalg.norm(Y,axis=-1)
    mod = torch.matmul(mod_v1.view(-1,1),mod_v2.view(1,-1))
    dot = torch.matmul(X,Y.T)
    return (dot/mod).cpu().numpy()

In [35]:
def vectorized_cos_sim(X,Y):
    mod_v1 = np.linalg.norm(X,axis=-1)
    mod_v2 = np.linalg.norm(Y,axis=-1)
    mod = np.dot(mod_v1.reshape(-1,1),mod_v2.reshape(1,-1))

    dot = np.dot(X,Y.T)
    return (dot/mod)

In [36]:
sim_start=time.time()
sim_matrix = vectorized_cos_sim_cuda(m_vec,l_vec)
cos_sim_match = np.argmax(sim_matrix,axis=1)
sim_end=time.time()
cos_sim_time = sim_end-sim_start

In [37]:
cos_sim_res = []
for msa_idx in random_index:
    lav_idx = cos_sim_match[msa_idx]
    cos_sim_res.append((idx2msa[msa_idx],idx2lav[lav_idx]))

In [38]:
cos_sim_lav = [i[1] for i in cos_sim_res]

## Fourth Approach: K-nearest neighbours
 <ol>
  <li>get word emb </li>
  <li>document vector and ind2doc dictionary </li>
  <li>calculate number of planes and number of universes </li>
  <li>implment hash puckets</li>
  <li>creat hash tables and universes</li>
  <li>approximate knn universes</li>
</ol> 

Following the rules of the assignment, let us say that in average we want to have 128 example in each index, therefore we need 

In [39]:
examples_per_index = 128
num_of_planes = np.ceil(np.log2(len(data)/examples_per_index)).astype(int)
num_of_planes

7

In [40]:
num_of_universes=20
n_dim = l_vec.shape[1]

Hash puckets

In [41]:
planes = np.random.normal(0,1,(num_of_universes,n_dim,num_of_planes))

In [42]:
def matrix_hash(mat,planes):
    num_of_universes,_,num_planes = planes.shape
    num_puckets=2**num_planes

    temp = np.dot(mat,planes) #num_words x num_of_universes x num_planes
    temp = np.sign(temp) == 1
    
    pow_2 = np.power(2,np.arange(num_planes))
    
    #Get the place where the word will be stored at
    hash_index = np.dot(temp,pow_2) #num_words x num_of_universes
    
    #Create dictionaries to store data in
    hash_table = []
    id_table = []    
    for i in enumerate(range(num_of_universes)):
        hash_table.append({i:[] for i in range(num_puckets)})
        id_table.append({i:[] for i in range(num_puckets)})
    #Fill dictionaries
    for universe,hashes in enumerate(hash_index.T):
        for i,h in enumerate(hashes):
            hash_table[universe][h].append(mat[i])
            id_table[universe][h].append(i)
        
    return hash_table ,id_table ,hash_index

In [43]:
#hash_index has the size of given list, contains the index of each word
lav_hash2vec,lav_hash2id,_ = matrix_hash(l_vec,planes)

In [44]:
#hash_index has the size of given list, contains the index of each word
_,_,msa_id2hash = matrix_hash(m_vec,planes)

There are words that existed multiple times for the same word.
<br>so, we will ignore the words that point to each other.

#### Apply k-nearsest negihbours
 <ol>
  <li>pass over each word and get its ID index</li>
  <li>get the contents from l_id_table</li>
  <li>Stack the vectors existed in l_id_table from l_hash_table</li>
  <li>Perform cosine similarity </li>
  <li>Get the largest value index</li>
  <li>define its id</li>
  <li>store the word in a list</li>
</ol> 

In [45]:
def most_frequent(List):
    return max(set(List), key = List.count)

In [46]:
def search_per_word(X,key_id2hash,value_hash2vec,value_hash2id):
    lav_idx_per_word=[]
    #candidate similarities in each universe
    for UNIVERSE,pucket_id in enumerate(key_id2hash): 
        #extract lav hash table of universe number k
        y_vec = value_hash2vec[UNIVERSE][pucket_id]
        if len(y_vec) ==0:
            continue
        Y = np.stack(y_vec)
        #calculate similarity between msa vector and k neghibours in lav id 
        sim = vectorized_cos_sim(X,Y)
        max_idx = np.argmax(sim)
        #Get its index from id_table
        l_word_idx = value_hash2id[UNIVERSE][pucket_id][max_idx]
        lav_idx_per_word.append(l_word_idx)
    return most_frequent(lav_idx_per_word)

In [47]:
def search_per_list(k_vec,random_index,key_id2hash,value_hash2vec,value_hash2id):
    pair = []
    #size numWords x n_dim
    key_hash_list = key_id2hash[random_index]
    
    for i,value_ids in enumerate(key_hash_list):
        X = k_vec[random_index[i]]
        
        k_neghibours_match = search_per_word(X,value_ids,value_hash2vec=value_hash2vec,value_hash2id=value_hash2id)
        pair.append((random_index[i],k_neghibours_match))
    return pair

In [48]:
k_n_start = time.time()
kn_res = search_per_list(m_vec,random_index,msa_id2hash,lav_hash2vec,lav_hash2id)
k_n_end = time.time()
k_n_time = k_n_end - k_n_start

In [49]:
k_n_lav = [idx2lav[i[1]] for i in kn_res]

## Fifth Approach: Language Model

In [50]:
import gensim
import gensim.models
from gensim import utils

In [51]:
class MyCorpus:
    def __init__(self,ds):
        self.ds = ds
    """An iterator that yields sentences (lists of str)."""
    def __iter__(self):
        for line in self.ds:
            # assume there's one document per line, tokens separated by whitespace
            yield utils.simple_preprocess(line)

In [52]:
lav_corp = MyCorpus(lav_data)
msa_corp = MyCorpus(msa_data)

In [53]:
m_lav = gensim.models.Word2Vec(epochs=10,sentences=lav_corp,min_count=0,vector_size=300)
m_msa = gensim.models.Word2Vec(epochs=10,sentences=msa_corp,min_count=0,vector_size=300)

In [54]:
m_msa.wv.most_similar('الله'),m_lav.wv.most_similar('الله')

([('خير', 0.9969905614852905),
  ('السلامه', 0.9969214200973511),
  ('خيرا', 0.9965105056762695),
  ('نلتقي', 0.9962499737739563),
  ('البكالوريا', 0.9956582188606262),
  ('نكمل', 0.9956363439559937),
  ('قريب', 0.9951711297035217),
  ('القي', 0.9949621558189392),
  ('واتخلص', 0.9947621822357178),
  ('هدي', 0.9943655729293823)],
 [('خير', 0.9967350959777832),
  ('تصبحوا', 0.9945842027664185),
  ('قدر', 0.9927830100059509),
  ('قريب', 0.9921440482139587),
  ('تصبحي', 0.9918696880340576),
  ('امان', 0.9917701482772827),
  ('ايامك', 0.9914921522140503),
  ('انشاء', 0.9914281368255615),
  ('يسامحك', 0.9909295439720154),
  ('السلامه', 0.9907274842262268)])

In [55]:
lm_start = time.time()
sim = vectorized_cos_sim(m_msa.wv.vectors,m_lav.wv.vectors)
lm_end = time.time()
lm_time = lm_end - lm_start

In [56]:
lm_match = np.argmax(sim,axis=1)

In [57]:
lm_res = []
for msa_idx in random_index:
    lav_idx = lm_match[msa_idx]
    lm_res.append((idx2msa[msa_idx],idx2lav[lav_idx]))

In [58]:
lm_lav = [i[1] for i in lm_res]

# Comparison

In [59]:
df = pd.DataFrame()

In [60]:
src_msa = list(src_msa)
src_msa.append('time')

In [61]:
stem_lav.append(stem_time)
cooc_lav.append(cooc_time)
cos_sim_lav.append(cos_sim_time)
k_n_lav.append(k_n_time)
lm_lav.append(lm_time)

In [62]:
df['msa'] = src_msa
df['stem_sim'] = stem_lav
df['cooc'] = cooc_lav
df['cos_sim'] = cos_sim_lav
df['k-neighbours'] = k_n_lav
df['word2vec'] = lm_lav

In [63]:
df.set_index('msa')

Unnamed: 0_level_0,stem_sim,cooc,cos_sim,k-neighbours,word2vec
msa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
بهاء,بهاء,بهاء,بهاء,بهاء,اتركيه
اثق,,ممكن,ممكن,الجزاير,بتعيش
احسنت,احسنت,احسنت,احسنت,احسنت,للرجال
مامعني,,كلمه,سلسبيل,سلسبيل,بتقع
نمزح,,عم,احنا,عشره,تشتغلي
انهض,,قوم,قوم,قوم,بتدافع
نوره,نوره,نوره,نوره,نوره,استوي
جمعت,جمعت,جمعت,جمعت,جمعت,@
خايف,خايف,خايف,القرد,خايفه,نزلت
طبيعه,طبيعه,طبيعه,طبيعه,طبيعه,الحقد


In [67]:
df2 = pd.DataFrame(index=df.columns[1:])

In [70]:
df2['durations'] = [stem_time,cooc_time,cos_sim_time,k_n_time,lm_time]
df2['accuracy'] = np.array([24,25,30,24,0]) / 50

In [71]:
df2

Unnamed: 0,durations,accuracy
stem_sim,0.584471,0.48
cooc,0.460136,0.5
cos_sim,2.707899,0.6
k-neighbours,3.756567,0.48
word2vec,0.193292,0.0


# Computation
## 1.1 Generate embedding and transform matrices
We will use the word pair generated by cosine similarity and the embeddings generated by word2vec.  

In [68]:
word_pairs = [(i,idx2lav[ret[j]]) for i,j in msa2idx.items()]

In [70]:
word_pairs

[('اعرف', 'بعرف'),
 ('واحده', 'وحده'),
 ('ذهبت', 'رحت'),
 ('فرنسا', 'فرنسا'),
 ('غرفه', 'اوضه'),
 ('وضعت', 'حطيت'),
 ('اذهب', 'روح'),
 ('تقدم', 'ماشيه'),
 ('يسارا', 'عاليسار'),
 ('يجب', 'لازم'),
 ('يكون', 'يكون'),
 ('موضوع', 'موضوع'),
 ('تبتعد', 'تبعد'),
 ('تفقد', 'تبعد'),
 ('الطريق', 'الطريق'),
 ('اقصد', 'قصدي'),
 ('صراحه', 'صراحه'),
 ('ياامي', 'يما'),
 ('ايضا', 'كمان'),
 ('كرهته', 'وصرت'),
 ('يوم', 'يوم'),
 ('حضرته', 'حضرته'),
 ('مثلما', 'بلح'),
 ('خالتي', 'خالتي'),
 ('قال', 'قالي'),
 ('موضوعك', 'موضوعك'),
 ('جيد', 'منيح'),
 ('ساقول', 'حاقولك'),
 ('كنت', 'كنت'),
 ('اردت', 'بدي'),
 ('اخوتي', 'طيبه'),
 ('اشتري', 'اشتري'),
 ('الطيبه', 'صدقه'),
 ('اخر', 'تاني'),
 ('الليل', 'الليل'),
 ('متاكده', 'متاكده'),
 ('تصبح', 'شاعر'),
 ('عندما', 'تكره'),
 ('بدوا', 'وصاروا'),
 ('المنزل', 'البيت'),
 ('مرضت', 'مرضت'),
 ('لماذا', 'ليش'),
 ('كلمات', 'كلمات'),
 ('جميله', 'حلوه'),
 ('كليب', 'فيديو'),
 ('جميل', 'حلو'),
 ('ويحتوي', 'استغربت'),
 ('تشجيع', 'استغربت'),
 ('لبس', 'كيس'),
 ('الحجاب', 'الحجاب'),
 

In [71]:
msa_wv_list=[]
lav_wv_list=[]
for i,j in word_pairs:
    if i in m_msa.wv.key_to_index and j in m_lav.wv.key_to_index :
        msa_wv_list.append(m_msa.wv.key_to_index[i])
        lav_wv_list.append(m_lav.wv.key_to_index[j])
    else:
        print(i,j)

: بيتمسخروا
20 20
« na3saa
» na3saa
362 362
للامثال 362
العديد na3saa
12 12
الحادي 11
na3saa na3saa
@ na3saa
hotmailcom na3saa
… اهل
اللاختيار 11
الوحده 1
تتخيلوا 700
700 700
ريال 700
الحيله 700


In [72]:
len(msa_wv_list)

6099

In [74]:
m_msa.wv.index_to_key[27],m_lav.wv.index_to_key[22]

('اعرف', 'بعرف')

In [75]:
msa_wv_list[0]

27

In [76]:
trn_msa_emb,val_msa_emb = m_msa.wv.vectors[msa_wv_list],m_msa.wv.vectors[msa_wv_list][5500:]
trn_lav_emb,val_lav_emb = m_lav.wv.vectors[lav_wv_list],m_lav.wv.vectors[lav_wv_list][5500:]

### Pre trained model

In [81]:
import gensim

In [82]:
t_model = gensim.models.Word2Vec.load('../models/full_grams_cbow_100_wiki/full_grams_cbow_100_wiki.mdl')

In [86]:
msa_wv_list=[]
lav_wv_list=[]
for i,j in word_pairs:
    if i in t_model.wv.key_to_index and j in t_model.wv.key_to_index :
        msa_wv_list.append(t_model.wv.key_to_index[i])
        lav_wv_list.append(t_model.wv.key_to_index[j])

In [92]:
#lav_wv_list

In [93]:
t_model.wv.index_to_key[20183],t_model.wv.index_to_key[100521]

('اعرف', 'بعرف')

In [94]:
c/len(word_pairs)

0.7103628636809415

In [99]:
trn_msa_emb = t_model.wv.vectors[msa_wv_list]
trn_lav_emb = t_model.wv.vectors[lav_wv_list]

We use Frobenius square loss

In [100]:
def compute_loss(X,Y,R):
    diff_squared = (torch.dot(X,R) - Y)**2
    loss = torch.sum(diff_squared) / X.shape[0]
    return loss

In [101]:
def compute_gradient(X, Y, R):
    m = X.shape[0]
    return np.dot(X.T,(np.dot(X,R)-Y)) * (2/m)

In [102]:
def align_embeddings(X, Y, train_steps=100, learning_rate=0.0003):
    '''
    Inputs:
        X: a matrix of dimension (m,n) where the columns are the English embeddings.
        Y: a matrix of dimension (m,n) where the columns correspong to the French embeddings.
        train_steps: positive int - describes how many steps will gradient descent algorithm do.
        learning_rate: positive float - describes how big steps will  gradient descent algorithm do.
    Outputs:
        R: a matrix of dimension (n,n) - the projection matrix that minimizes the F norm ||X R -Y||^2
    '''

    # the number of columns in X is the number of dimensions for a word vector (e.g. 300)
    # R is a square matrix with length equal to the number of dimensions in th  word embedding
    R = np.random.rand(X.shape[1], X.shape[1])

    for i in range(train_steps):
    #    if i % 25 == 0:
    #        print(f"loss at iteration {i} is: {compute_loss(X, Y, R):.4f}")
        ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
        # use the function that you defined to compute the gradient
        gradient = compute_gradient(X,Y,R)

        # update R by subtracting the learning rate times gradient
        R -= learning_rate * gradient
        ### END CODE HERE ###
    return R

In [103]:
for lr in [0.1,0.2,0.4,0.6,0.8,1,1.5,2]:
    for steps in [400,600,800,1000,1200]:
        R_train = align_embeddings(trn_msa_emb, trn_lav_emb, train_steps=steps, learning_rate=lr)
        #Testing
        X = np.dot(trn_msa_emb,R_train)
        sim_matrix = vectorized_cos_sim_cuda(X,trn_lav_emb)
        ret = np.argmax(sim_matrix,axis=1)
        c = (ret == range(len(trn_lav_emb))).sum()
        print(f'# correct examples for lr: {lr} and steps of {steps} = {c}')

# correct examples for lr: 0.1 and steps of 400 = 1




# correct examples for lr: 0.1 and steps of 600 = 1
# correct examples for lr: 0.1 and steps of 800 = 1
# correct examples for lr: 0.1 and steps of 1000 = 1
# correct examples for lr: 0.1 and steps of 1200 = 1
# correct examples for lr: 0.2 and steps of 400 = 1
# correct examples for lr: 0.2 and steps of 600 = 1
# correct examples for lr: 0.2 and steps of 800 = 1
# correct examples for lr: 0.2 and steps of 1000 = 1
# correct examples for lr: 0.2 and steps of 1200 = 1
# correct examples for lr: 0.4 and steps of 400 = 1
# correct examples for lr: 0.4 and steps of 600 = 1
# correct examples for lr: 0.4 and steps of 800 = 1
# correct examples for lr: 0.4 and steps of 1000 = 1
# correct examples for lr: 0.4 and steps of 1200 = 1
# correct examples for lr: 0.6 and steps of 400 = 1
# correct examples for lr: 0.6 and steps of 600 = 1
# correct examples for lr: 0.6 and steps of 800 = 1
# correct examples for lr: 0.6 and steps of 1000 = 1
# correct examples for lr: 0.6 and steps of 1200 = 1
# co

### Test

In [135]:
X = np.dot(trn_msa_emb,R_train)

In [136]:
X.shape

(6099, 300)

In [137]:
sim_matrix = vectorized_cos_sim_cuda(X,trn_lav_emb)
ret = np.argmax(sim_matrix,axis=1)

In [141]:
(ret == range(len(trn_lav_emb))).sum()

2

## Pytorch

In [104]:
import torch
import torch.nn as nn
import torch.nn.functional as F

Data

In [105]:
from torch.utils.data import Dataset,DataLoader

In [106]:
class arrDs(Dataset):
    def __init__(self,X,Y):
        self.data=torch.from_numpy(X)
        self.labels=torch.from_numpy(Y)
    def __len__(self):
        return len(self.data)
    def __getitem__(self,idx):
        return torch.stack([self.data[idx],self.labels[idx]])

Loss

In [110]:
class frob(nn.Module):    
    def __call__(self,X,Y):
        m = X.shape[0]
        # diff is XR - Y
        diff = X - Y
        # diff_squared is the element-wise square of the difference
        diff_squared = diff**2
        # sum_diff_squared is the sum of the squared elements
        sum_diff_squared = torch.sum(diff_squared)
        # loss i the sum_diff_squard divided by the number of examples (m)
        loss = sum_diff_squared / m
        return loss

In [111]:
frob_loss=frob()

In [112]:
mse_loss = F.mse_loss

Model

In [113]:
class simple(nn.Module):
    def __init__(self):
        super().__init__()
        self.lin = nn.Sequential(nn.Linear(100,100,bias=True),
                                 nn.ReLU(),
                                 nn.Linear(100,100,bias=True),
                                 nn.ReLU(),
                                 nn.Linear(100,100,bias=True))
    def forward(self,x):
        return self.lin(x)

In [114]:
model = simple()

Optim

In [115]:
optim = torch.optim.SGD(model.parameters(), lr=0.2)

In [116]:
#sced = torch.optim.lr_scheduler.StepLR(optim,200,gamma=0.8)

Train

In [117]:
for i in range(800):
    model.train()
    optim.zero_grad()
    data = torch.from_numpy(trn_msa_emb)
    label = torch.from_numpy(trn_lav_emb)
    op = model(data)
    loss = mse_loss(op,label)
    loss.backward()
    optim.step()
    if i%25==0:
        print(loss)
        with torch.no_grad():
            model.eval()
            X = model(torch.from_numpy(trn_msa_emb))
            sim_matrix = vectorized_cos_sim_cuda(X.detach().numpy(),trn_lav_emb)
            ret = np.argmax(sim_matrix,axis=1)
            print((ret == range(len(trn_lav_emb))).sum())

tensor(3.1418, grad_fn=<MseLossBackward>)
0
tensor(2.9455, grad_fn=<MseLossBackward>)
4
tensor(2.8687, grad_fn=<MseLossBackward>)
8
tensor(2.7941, grad_fn=<MseLossBackward>)
27
tensor(2.7096, grad_fn=<MseLossBackward>)
101
tensor(2.6280, grad_fn=<MseLossBackward>)
234
tensor(2.5588, grad_fn=<MseLossBackward>)
386
tensor(2.5015, grad_fn=<MseLossBackward>)
556
tensor(2.4529, grad_fn=<MseLossBackward>)
689
tensor(2.4107, grad_fn=<MseLossBackward>)
821
tensor(2.3737, grad_fn=<MseLossBackward>)
933
tensor(2.3412, grad_fn=<MseLossBackward>)
1014
tensor(2.3126, grad_fn=<MseLossBackward>)
1064
tensor(2.2870, grad_fn=<MseLossBackward>)
1117
tensor(2.2641, grad_fn=<MseLossBackward>)
1158
tensor(2.2435, grad_fn=<MseLossBackward>)
1186
tensor(2.2248, grad_fn=<MseLossBackward>)
1214
tensor(2.2076, grad_fn=<MseLossBackward>)
1235
tensor(2.1917, grad_fn=<MseLossBackward>)
1256
tensor(2.1776, grad_fn=<MseLossBackward>)
1265
tensor(2.2024, grad_fn=<MseLossBackward>)
1273
tensor(2.1523, grad_fn=<MseLoss

Evaluate

In [118]:
X = model(torch.from_numpy(trn_msa_emb))

In [119]:
sim_matrix = vectorized_cos_sim_cuda(X.detach().numpy(),trn_lav_emb)
ret = np.argmax(sim_matrix,axis=1)

In [120]:
(ret == range(len(trn_lav_emb))).sum()

1324

## Manual Torch

In [122]:
data = torch.from_numpy(trn_msa_emb)
label = torch.from_numpy(trn_lav_emb)
w1 = torch.rand(100,100,requires_grad=True)
lr= 1e-2
for i in range(600):
    op = torch.matmul(data,w1)
    loss = frob_loss(op,label)
    loss.backward()
    if i%50==0:
        print(loss)
    with torch.no_grad():
        w1 -=  lr*w1.grad
    w1.requires_grad = True

tensor(9592.1963, grad_fn=<DivBackward0>)
tensor(5323.6069, grad_fn=<DivBackward0>)
tensor(5444.4238, grad_fn=<DivBackward0>)
tensor(4764.9780, grad_fn=<DivBackward0>)
tensor(5485.4663, grad_fn=<DivBackward0>)
tensor(6249.7515, grad_fn=<DivBackward0>)
tensor(4756.2710, grad_fn=<DivBackward0>)
tensor(5128.9648, grad_fn=<DivBackward0>)
tensor(5056.0591, grad_fn=<DivBackward0>)
tensor(5009.1401, grad_fn=<DivBackward0>)
tensor(5340.9111, grad_fn=<DivBackward0>)
tensor(5592.8408, grad_fn=<DivBackward0>)


Evaluate

In [123]:
X = torch.matmul(data,w1) 

In [124]:
sim_matrix = vectorized_cos_sim_cuda(X.detach().numpy(),label.detach().numpy())
ret = np.argmax(sim_matrix,axis=1)

In [125]:
(ret == range(len(trn_lav_emb))).sum()

69

In [126]:
X

tensor([[-6.1029, -7.9874, -6.5976,  ...,  1.9660, -5.4957, -4.1504],
        [13.0289, 10.8297,  4.4311,  ...,  5.7603,  0.1946, -4.6342],
        [-3.4615,  1.5145,  0.0152,  ..., -9.1015, -3.6867, -6.0899],
        ...,
        [-0.5324, -0.2738, -0.4118,  ..., -0.0648, -0.9023, -0.4321],
        [-0.7898, -1.1300, -0.3995,  ..., -0.7782, -0.9758, -0.0220],
        [-0.5042, -1.3221,  4.0849,  ..., -5.8279, -2.3902, -7.8763]],
       grad_fn=<MmBackward>)

In [110]:
ret[10]

150

Rest

In [132]:
def nearest_neighbor(v, candidates, k=1):

    # for each candidate vector...  
    similarity_l = vectorized_cos_sim(v,candidates)
  
    # sort the similarity list and get the indices of the sorted list
    sorted_ids = np.argsort(similarity_l).squeeze()

    # get the indices of the k most similar candidate vectors
    k_idx = sorted_ids[-k:]
    ### END CODE HERE ###
    return k_idx

In [133]:
def test_vocabulary(X, Y, R):
    '''
    Input:
        X: a matrix where the columns are the English embeddings.
        Y: a matrix where the columns correspong to the French embeddings.
        R: the transform matrix which translates word embeddings from
        English to French word vector space.
    Output:
        accuracy: for the English to French capitals
    '''

    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # The prediction is X times R
    pred = np.dot(X,R)

    # initialize the number correct to zero
    num_correct = 0

    # loop through each row in pred (each transformed embedding)
    for i in range(len(pred)):
        # get the index of the nearest neighbor of pred at row 'i'; also pass in the candidates in Y
        pred_idx = nearest_neighbor(pred[i],Y,k=1)

        # if the index of the nearest neighbor equals the row of i... \
        if pred_idx == i:
            # increment the number correct by 1.
            num_correct += 1

    # accuracy is the number correct divided by the number of rows in 'pred' (also number of rows in X)
    accuracy = num_correct/X.shape[0]

    ### END CODE HERE ###

    return accuracy

In [134]:
acc = test_vocabulary(trn_msa_emb,trn_lav_emb, R_train)

KeyboardInterrupt: 

In [None]:
acc