In [1]:
import torch
import pandas as pd
import numpy as np
import re
import gensim
import collections
import pyarabic.araby as araby



# Data
### Helping Functions

In [2]:
def normalization(t):
    t = araby.strip_tashkeel(t)
    t = araby.normalize_hamza(t)
    t = araby.normalize_alef(t)
    t = araby.strip_tatweel(t)
    t = araby.normalize_teh(t)
    t = re.sub("ى","ي",t)
    return t

### Data

In [3]:
data= np.load('../translation project/AD_NMT-master/LAV-MSA-2-both.pkl',allow_pickle=True)

In [4]:
data[0] # lav , msa

['لا انا بعرف وحدة راحت ع فرنسا و معا شنتا حطت فيها الفرش',
 'لا اعرف واحدة ذهبت الى فرنسا و لها غرفة و ضعت فيها الافرشة']

In [5]:
# extract only msa text
msa=[]
for i,ex in enumerate(data):
    msa_text = normalization(ex[1])
    data[i][1] = msa_text
    msa.append(msa_text)

In [6]:
msa = ' '.join(msa)

Dictionaries

In [7]:
msa_d=collections.Counter(msa.split())

In [8]:
min_count = 2

In [9]:
idx2msa = np.array([word for word,freq in msa_d.items() if freq > min_count ])

In [10]:
msa2idx = {word:i for i,word in enumerate(idx2msa)}

In [11]:
msa_data = [' '.join([i for i in t[1].split() if (msa2idx.get(i,-1) != -1 and t[1] != '')]) for t in data]

In [12]:
msa_data = [i.replace('','') for i in msa_data if i != '']

Load Embeddings

In [13]:
t_model = gensim.models.Word2Vec.load('../resources/models/word vectors/word2vec/wiki/full_grams_cbow_100_wiki/full_grams_cbow_100_wiki.mdl')

In [53]:
i2l = list(set(normalization(araby.LETTERS)))
i2v = {}
for index,letter in enumerate(i2l):
    if letter in t_model.wv.index_to_key :
        i2v[index] = t_model.wv.get_vector(letter)

In [54]:
i2v.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28])

In [55]:
i2l.append(' ')
i2l.append('')
i2l.append('X')

In [56]:
l2i = {v:i for i,v in enumerate(i2l)}

Deep Learning

In [58]:
from torch.utils.data import Dataset,DataLoader

In [59]:
from torch.nn.utils.rnn import pad_sequence 

In [60]:
def noise(txt):
    noise_sz = np.random.randint(0,len(txt),1)
    replace_idx = np.random.choice(len(txt),noise_sz,replace=False)
    letters_idx = np.random.choice(len(i2l),noise_sz,replace=True)
    txt = list(txt)
    for rep,let in zip(replace_idx,letters_idx):
        txt[rep] = i2l[let]
    return ''.join(txt)

In [61]:
class arrDs(Dataset):
    def __init__(self,txt_list,l2i):
        self.data = txt_list
        self.l2i = l2i
    def __len__(self):
        return len(self.data)
    def __getitem__(self,idx):
        X = noise(self.data[idx])
        Y = self.data[idx]
        
        X = torch.tensor([self.l2i.get(i,31) for i in X])
        Y = torch.tensor([self.l2i.get(i,31) for i in Y])
        #numerilize
        return (X,Y)

In [267]:
trn_data, val_data = msa_data[:int(0.8*len(msa_data))],msa_data[int(0.8*len(msa_data)):]

In [270]:
trn_ds,val_ds = arrDs(trn_data,l2i),arrDs(val_data,l2i)

In [275]:
def collate_fn(data):
    label  = [i for _,i in data]
    label = pad_sequence(label,batch_first=True)
    data = [i for i,_ in data]
    data = pad_sequence(data,batch_first=True)
    return data,label

In [286]:
trn_dl = DataLoader(trn_ds,batch_size=4,collate_fn=collate_fn)
val_dl = DataLoader(val_ds,batch_size=4,collate_fn=collate_fn)

Model

In [287]:
import torch, torch.nn as nn

In [326]:
class autocorrect(nn.Module):
    def __init__(self,num_emb,vs,hs,bidirectional=True):
        super().__init__()
        self.emb = nn.Embedding(num_emb,vs)
        self.gru = nn.GRU(vs,hs,num_layers=2,bidirectional=bidirectional,batch_first=True)
        self.lin = nn.Linear(2*hs if bidirectional == True else hs,num_emb)
    def forward(self,x):
        x = self.emb(x)
        x,_ = self.gru(x)
        x = nn.functional.relu(x)
        x = self.lin(x)
        return torch.softmax(x,dim=-1)

In [327]:
num_emb = len(i2l)

In [328]:
model = autocorrect(num_emb,100,128)

In [329]:
#Load available vectors
model.emb.weight.requires_grad_(False)
for i in i2v.keys():
    model.emb.weight[i] = nn.Parameter(torch.from_numpy(i2v[i])).requires_grad_(False)
model.emb.weight.requires_grad_(True)

Parameter containing:
tensor([[-0.7845,  0.8250,  0.5404,  ..., -3.3356,  1.9303,  0.4786],
        [ 1.6764,  1.5715, -0.1923,  ..., -0.5650, -1.1535, -0.9867],
        [ 0.1313, -0.5060,  0.1031,  ..., -1.2080,  0.4892,  0.5918],
        ...,
        [-3.2594,  0.5910,  0.3052,  ...,  0.6713,  0.2578,  0.5516],
        [-0.4394, -0.1234,  1.4807,  ..., -1.5268,  0.1156, -1.2336],
        [ 0.1978, -0.7140, -1.2850,  ..., -0.2416,  1.7916, -0.8613]],
       requires_grad=True)

In [330]:
opt = torch.optim.Adam(model.parameters(),lr=1e-4)

In [331]:
loss = nn.CrossEntropyLoss()

In [332]:
def train(epoch,model,trn_dl,val_dl,loss_fnc):
    model.train()
    for i in range(epoch):
        for batch in trn_dl:
            opt.zero_grad()
            ip,label = batch
            op = model(ip)
            trn_l = loss_fnc(op,label)
            trn_l.backward()
            opt.step()
        
        model.eval()
        for batch in val_dl:
            ip,label = batch
            op = model(ip)
            val_loss = loss_fnc(op,label)
        print('train_ loss ->',trn_l , 'val_loss ->',val_loss)

In [333]:
train(5,model,trn_dl,val_dl,loss)

RuntimeError: Expected target size [4, 32], got [4, 46]

In [347]:
a = torch.softmax(torch.rand(5,10,2),dim=-1)

In [348]:
nn.functional.cross_entropy(a,torch.randint(0,2,(5,10,)))

RuntimeError: Expected target size [5, 2], got [5, 10]