In [7]:
import torch
import pandas as pd
import numpy as np
import re
import gensim
import collections
import pyarabic.araby as araby

# Data
### Helping Functions

In [30]:
def normalization(t):
    t = araby.strip_tashkeel(t)
    t = araby.normalize_hamza(t)
    t = araby.normalize_alef(t)
    t = araby.strip_tatweel(t)
    t = araby.normalize_teh(t)
    t = re.sub("ى","ي",t)
    return t

### Data

In [31]:
data= np.load('../translation project/AD_NMT-master/LAV-MSA-2-both.pkl',allow_pickle=True)

In [32]:
data[0] # lav , msa

['لا انا بعرف وحدة راحت ع فرنسا و معا شنتا حطت فيها الفرش',
 'لا اعرف واحدة ذهبت الى فرنسا و لها غرفة و ضعت فيها الافرشة']

In [33]:
# extract only msa text
msa=[]
for i,ex in enumerate(data):
    msa_text = normalization(ex[1])
    data[i][1] = msa_text
    msa.append(msa_text)

In [34]:
msa = ' '.join(msa)

Dictionaries

In [35]:
msa_d=collections.Counter(msa.split())

In [36]:
min_count = 2

In [37]:
idx2msa = np.array([word for word,freq in msa_d.items() if freq > min_count ])

In [38]:
msa2idx = {word:i for i,word in enumerate(idx2msa)}

In [39]:
msa_data = [' '.join([i for i in t[1].split() if (msa2idx.get(i,-1) != -1 and t[1] != '')]) for t in data]

In [40]:
msa_data = [i.replace('','') for i in msa_data if i != '']

Load Embeddings

In [41]:
t_model = gensim.models.Word2Vec.load('../resources/models/word vectors/word2vec/wiki/full_grams_cbow_100_wiki/full_grams_cbow_100_wiki.mdl')

In [88]:
i2l = list(set(normalization(araby.LETTERS)))
i2v = {}
for letter in i2l:
    if letter in t_model.wv.index_to_key :
        i2v[i] = t_model.wv.get_vector(letter)

In [89]:
i2l.append(' ')
i2l.append('')
i2l.append('X')

In [90]:
l2i = {v:i for i,v in enumerate(i2l)}

Deep Learning

In [91]:
from torch.utils.data import Dataset,DataLoader

In [92]:
from torch.nn.utils.rnn import pad_sequence 

In [170]:
def noise(txt):
    noise_sz = np.random.randint(0,len(txt),1)
    replace_idx = np.random.choice(len(txt),noise_sz,replace=False)
    letters_idx = np.random.choice(len(i2l),noise_sz,replace=True)
    txt = list(txt)
    for rep,let in zip(replace_idx,letters_idx):
        txt[rep] = i2l[let]
    return ''.join(txt)

In [171]:
class arrDs(Dataset):
    def __init__(self,txt_list,l2i):
        self.data = txt_list
        self.l2i = l2i
    def __len__(self):
        return len(self.data)
    def __getitem__(self,idx):
        X = noise(self.data[idx])
        Y = self.data[idx]
        
        X = torch.tensor([self.l2i.get(i,31) for i in X])
        Y = torch.tensor([self.l2i.get(i,31) for i in Y])
        #numerilize
        return (X,Y)

In [172]:
ds = arrDs(msa_data,l2i)

In [173]:
ds[0]

(tensor([23, 11,  8,  8, 19, 11,  8, 19,  8,  6,  5, 18, 24, 22, 24, 31,  8, 22,
         22, 17,  3, 29,  5, 20,  5,  7, 14,  9, 17, 25,  6,  4, 26, 27, 20,  4,
         17, 22, 22, 17, 28, 29, 27,  8, 26]),
 tensor([23, 19, 29, 19, 28,  0, 21, 29, 13, 19, 26,  6, 17, 29, 24, 17, 12, 22,
         29, 19, 23, 19, 29, 21,  0,  5,  9, 19, 29, 13, 29, 23, 17, 19, 29, 16,
          0, 21, 17, 29, 13, 29, 21,  2, 17, 19]))

In [174]:
class catcher:
    def __init__(self):
        self.catch=None

In [175]:
catch = catcher()

In [176]:
def collate_fn(data):
    label  = [i for _,i in data]
    data = [i for i,_ in data]
    #data = [torch.tensor(i).clone().detach() for i in data]
    print(len(data),len(data[0]))
    data = pad_sequence(data,batch_first=True)
    print('shape',data.shape)
    return data

In [177]:
dl = DataLoader(ds,batch_size=4,collate_fn=collate_fn)

Model

In [179]:
import torch, torch.nn as nn

In [191]:
class autocorrect(nn.Module):
    def __init__(self,num_emb,vs,hs):
        super().__init__()
        self.emb = nn.Embedding(num_emb,vs)
        self.gru = nn.GRU(vs,hs,num_layers=2,bidirectional=True,batch_first=True)
        self.lin = nn.Linear(hs,num_emb)
        self.seq = nn.Sequential(self.emb,
                                 self.gru,
                                 nn.ReLU(),
                                 self.lin,
                                 nn.Softmax())
    def forward(self,x):
        return self.seq(x)

In [192]:
num_emb = len(l2i)

In [194]:
model = autocorrect(num_emb,100,128)

In [200]:
for i in model.emb.parameters():
    i = torch.zeros_like(i)

In [202]:
?model.emb.from_pretrained

In [201]:
for i in model.emb.parameters():
    print(i)

Parameter containing:
tensor([[-1.7467, -1.1606, -1.1070,  ...,  0.4234, -0.6524,  1.8570],
        [ 1.6554,  1.5280, -1.9122,  ...,  1.0085,  0.1442, -0.7672],
        [-0.7371,  0.0341, -0.5613,  ...,  0.2370,  1.6372, -1.5546],
        ...,
        [ 0.0218, -0.4320,  0.4238,  ..., -0.6130,  0.2996, -0.5980],
        [ 0.0895, -1.0804,  1.4918,  ...,  1.1884, -1.3827, -1.2079],
        [ 0.9144,  1.5032,  0.5403,  ...,  0.0819, -1.2501, -1.5617]],
       requires_grad=True)
