In [2]:
from pytest import importorskip
from train_collections import DS_HARAKAT , DS_ARABIC_LETTERS
from text_encoder import TextEncoder
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

In [3]:

    

output_encoder = TextEncoder(DS_HARAKAT)
input_encoder = TextEncoder(DS_ARABIC_LETTERS)

c = input_encoder.encode("السلام عليكم")
input_encoder.decode(c)
device = torch.device('cpu')


In [4]:
import numpy as np


input_file = 'clean_out/X.csv'
output_file = 'clean_out/Y.csv'
input_val = 'clean_out/X_val.csv'
output_val = 'clean_out/Y_val.csv'
# input is a sequence of letters and output is a sequence of harakat

def read_data(input_file, output_file):
    X = []
    Y = []
    # read csv files 
    with open(input_file, 'r') as f:
        X = f.readlines()
    with open(output_file, 'r') as f:
        Y = f.readlines()
        
    # remove the \n from the end of each line
    X = [x.strip().split(',') for x in X]
    Y = [y.strip().split(',') for y in Y]
    return X, Y


x,y = read_data(input_file, output_file)
x_val, y_val = read_data(input_val, output_val)
print("x: ", x[0])
print("y: ", y[0])
encoded_x = [ input_encoder.encode(x[i]) for i in range(len(x)) ]
encoded_y = [ output_encoder.encode(y[i]) for i in range(len(y)) ]
encoded_x_val = [ input_encoder.encode(x_val[i]) for i in range(len(x_val)) ]
encoded_y_val = [ output_encoder.encode(y_val[i]) for i in range(len(y_val)) ]

max_len = max([len(x) for x in encoded_x])
min_len = min([len(x) for x in encoded_x])
print("max_len: ", max_len)
print("min_len: ", min_len)



def _paddata(data, max_len):
    for i in range(len(data)):
        data[i] = np.pad(data[i], (0, max_len - len(data[i])), 'constant', constant_values=(-1,-1))
    return data

encoded_x = _paddata(encoded_x, max_len)
encoded_y = _paddata(encoded_y, max_len)
encoded_x_val = _paddata(encoded_x_val, max_len)
encoded_y_val = _paddata(encoded_y_val, max_len)





x:  ['ق', 'و', 'ل', 'ه', ' ', ':', ' ', '(', ' ', 'أ', 'و', ' ', 'ق', 'ط', 'ع', ' ', 'ا', 'ل', 'أ', 'و', 'ل', ' ', 'ي', 'د', 'ه', ' ', 'إ', 'ل', 'خ', ' ', ')', ' ', 'ق', 'ا', 'ل', ' ', 'ا', 'ل', 'ز', 'ر', 'ك', 'ش', 'ي', '(', ' ', '/', ' ', ')']
y:  ['َ', 'ْ', 'ُ', 'ُ', '$', '$', '$', '$', '$', 'َ', 'ْ', '$', 'َ', 'َ', 'َ', '$', '$', 'ْ', 'َ', 'َّ', 'ُ', '$', 'َ', 'َ', 'ُ', '$', '$', 'َ', 'ْ', '$', '$', '$', 'َ', '$', 'َ', '$', '$', '$', 'َّ', 'ْ', 'َ', 'ِ', 'ُّ', '$', '$', '$', '$', '$']
max_len:  7581
min_len:  3


In [4]:
max_len = max([len(x) for x in encoded_x])
min_len = min([len(x) for x in encoded_x])
print("max_len: ", max_len)
print("min_len: ", min_len)


max_len:  7095
min_len:  7095


In [5]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# create Tensor datasets
train_data = TensorDataset(torch.tensor(encoded_x), torch.tensor(encoded_y))
valid_data = TensorDataset(torch.tensor(encoded_x_val), torch.tensor(encoded_y_val))


# dataloaders
batch_size = 50


# make sure to SHUFFLE your data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)


# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)


print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print('Sample output: \n', sample_y)




  train_data = TensorDataset(torch.tensor(encoded_x), torch.tensor(encoded_y))


Sample input size:  torch.Size([50, 7095])
Sample input: 
 tensor([[23, 13,  9,  ..., -1, -1, -1],
        [26, 22,  0,  ..., -1, -1, -1],
        [26, 32, 24,  ..., -1, -1, -1],
        ...,
        [19,  9, 17,  ..., -1, -1, -1],
        [31,  9, 31,  ..., -1, -1, -1],
        [26, 20,  7,  ..., -1, -1, -1]], dtype=torch.int32)
Sample output: 
 tensor([[ 2,  3,  1,  ..., -1, -1, -1],
        [ 0,  0, 14,  ..., -1, -1, -1],
        [ 0,  2,  3,  ..., -1, -1, -1],
        ...,
        [ 0,  3,  5,  ..., -1, -1, -1],
        [ 0,  0,  0,  ..., -1, -1, -1],
        [ 0,  0,  3,  ..., -1, -1, -1]], dtype=torch.int32)


In [6]:

output_dim = len(DS_HARAKAT)

class SentimentRNN(nn.Module):
    def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5):
        super(SentimentRNN,self).__init__()
 
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
 
        self.no_layers = no_layers
        self.vocab_size = vocab_size
   
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
       
        #lstm
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim,
                           num_layers=no_layers, batch_first=True)
       
        # dropout layer
        self.dropout = nn.Dropout(0.3)
   
        # linear and sigmoid layer
        self.fc = nn.Linear(self.hidden_dim, output_dim)
        self.sfmax = nn.Softmax()
       
    def forward(self,x,hidden):
        batch_size = x.size(0)
        # embeddings and lstm_out
        embeds = self.embedding(x)  # shape: B x S x Feature   since batch = True
        #print(embeds.shape)  #[50, 500, 1000]
        lstm_out, hidden = self.lstm(embeds, hidden)
       
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
       
        # dropout and fully connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
       
        # sigmoid function
        sig_out = self.sfmax(out)
       
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)


        sig_out = sig_out[:, -1] # get last batch of labels
       
        # return last sigmoid output and hidden state
        return sig_out, hidden
       
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim))
        c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim))
        hidden = (h0,c0)
        return hidden 

In [7]:
no_layers = 2
vocab_size = len(DS_ARABIC_LETTERS) + 1 #extra 1 for padding
embedding_dim = 64
output_dim = 1
hidden_dim = 256


model = SentimentRNN(no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5)


#moving to gpu
# model.to(device)
print(model)

SentimentRNN(
  (embedding): Embedding(38, 64)
  (lstm): LSTM(64, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sfmax): Softmax(dim=None)
)


In [8]:
lr=0.001


criterion = nn.BCELoss()


optimizer = torch.optim.Adam(model.parameters(), lr=lr)


# function to predict accuracy
def acc(pred,label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()

In [9]:
from torch.nn.utils import clip_grad_norm_
clip = 5
epochs = 5
valid_loss_min = np.Inf
# train for some number of epochs
epoch_tr_loss,epoch_vl_loss = [],[]
epoch_tr_acc,epoch_vl_acc = [],[]


for epoch in range(epochs):
    train_losses = []
    train_acc = 0.0
    model.train()
    # initialize hidden state
    h = model.init_hidden(batch_size)
    for inputs, labels in train_loader:
       
        inputs, labels = inputs.to(device), labels.to(device)  
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])
       
        model.zero_grad()
        output,h = model(inputs,h)
       
        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        train_losses.append(loss.item())
        # calculating accuracy
        accuracy = acc(output,labels)
        train_acc += accuracy
        #`clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
       
    val_h = model.init_hidden(batch_size)
    val_losses = []
    val_acc = 0.0
    model.eval()
    for inputs, labels in valid_loader:
            val_h = tuple([each.data for each in val_h])


            inputs, labels = inputs.to(device), labels.to(device)


            output, val_h = model(inputs, val_h)
            val_loss = criterion(output.squeeze(), labels.float())


            val_losses.append(val_loss.item())
           
            accuracy = acc(output,labels)
            val_acc += accuracy
           
    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)
    epoch_train_acc = train_acc/len(train_loader.dataset)
    epoch_val_acc = val_acc/len(valid_loader.dataset)
    epoch_tr_loss.append(epoch_train_loss)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_tr_acc.append(epoch_train_acc)
    epoch_vl_acc.append(epoch_val_acc)
    print(f'Epoch {epoch+1}')
    print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')
    print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')
    if epoch_val_loss <= valid_loss_min:
        torch.save(model.state_dict(), 'state_dict.pt')
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,epoch_val_loss))
        valid_loss_min = epoch_val_loss
    print(25*'==')

IndexError: index out of range in self

In [None]:

from torch.utils.data import Dataset, DataLoader
import torch

class MyDataset(Dataset):
    def __init__(self, x, y, input_encoder, output_encoder):
        self.x = x
        self.y = y
        self.input_encoder = input_encoder
        self.output_encoder = output_encoder
        self.fixed_len = np.max([len(x) for x in self.x])
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        x = self.x[idx]
        y = self.y[idx]
        
        # pad the sequence to fixed length
        x = x + ['P'] * (self.fixed_len - len(x))
        y = y + ['P'] * (self.fixed_len - len(y))
        
        
        x = self.input_encoder.encode(x)
        y = self.output_encoder.encode(y)
        
        return x, y

dataset = MyDataset(x, y, input_encoder, output_encoder)
dataloader = DataLoader(dataset, batch_size=10, shuffle=True)

for x, y in dataloader:
    print(x)
    print(y)
    break

[tensor([20, 26,  5, 20, 20, 26, 27, 26, 26, 19]), tensor([26, 21,  7, 26, 26,  7,  4, 32, 23,  9]), tensor([22,  8,  3, 22, 22,  6, 23, 24, 24, 17]), tensor([25,  0, 24, 25, 25, 26, 17, 36, 25, 36]), tensor([36, 36,  0, 36, 36, 22, 36, 20,  0, 32]), tensor([ 2, 22, 36, 26, 26, 25,  9, 22, 36,  8]), tensor([17,  0, 27, 31,  0, 36, 33, 24, 31,  0]), tensor([ 0, 36,  5, 20, 22, 27, 26,  0, 23, 36]), tensor([22, 27, 27,  9, 20, 26, 11, 36,  9,  3]), tensor([28, 23, 28, 36, 26, 23, 36, 27, 25,  1]), tensor([36, 21, 36, 21, 22, 36, 31, 19, 36,  2]), tensor([ 8, 24, 17,  0, 36,  6, 13,  9, 13, 36]), tensor([22, 25, 24, 19,  1, 23,  0, 20, 22,  8]), tensor([21, 36, 36,  9, 31, 27,  1, 36, 28, 22]), tensor([37, 20, 12, 36, 24, 11, 17,  1, 36, 21]), tensor([37,  9, 17, 31, 25, 36, 25, 27,  0, 36]), tensor([37,  1,  1, 27, 36, 31, 36, 24, 22, 19]), tensor([37,  0, 29, 36, 31, 26, 26, 25, 22, 22]), tensor([37, 24, 36,  1,  9, 36, 27, 23, 25,  0]), tensor([37, 36,  5, 21,  1,  0,  5,  0, 36, 36]),

In [None]:
# Training the model 
# we will use a simple RNN model with 2 layers
# the input is a sequence of letters and the output is a sequence of harakat

    



