In [1]:
# !pip install --upgrade pip
# !pip install gensim
# !pip install nltk
# !pip install tokenizers
# !pip install sentencepiece
# !pip install python-bidi
# !pip install arabic-reshaper
# !pip install PyArabic

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor
import pandas as pd
# from sklearn.metrics import accuracy_score
import numpy as np
from utilities import *
import textProcessing as tp
import nltk
import data_preprocessing as dp
from torch.optim.lr_scheduler import StepLR
# # nltk.download('punkt')
# train_text = load_text("dataset/train.txt")
# tp.preprocessing_text(train_text,"train_preprocessed.txt")


DIACRITIC2INDEX:  dict_items([('ً', 0), ('ٌ', 1), ('ٍ', 2), ('َ', 3), ('ُ', 4), ('ِ', 5), ('ّ', 6), ('ًّ', 7), ('ٌّ', 8), ('ٍّ', 9), ('َّ', 10), ('ُّ', 11), ('ِّ', 12), ('ْ', 13), ('', 14), ('0', 15)])
{'ً': 0, 'ٌ': 1, 'ٍ': 2, 'َ': 3, 'ُ': 4, 'ِ': 5, 'ّ': 6, 'ًّ': 7, 'ٌّ': 8, 'ٍّ': 9, 'َّ': 10, 'ُّ': 11, 'ِّ': 12, 'ْ': 13, '': 14, '0': 15}
16
{'ء': 0, 'آ': 1, 'أ': 2, 'ؤ': 3, 'إ': 4, 'ئ': 5, 'ا': 6, 'ب': 7, 'ة': 8, 'ت': 9, 'ث': 10, 'ج': 11, 'ح': 12, 'خ': 13, 'د': 14, 'ذ': 15, 'ر': 16, 'ز': 17, 'س': 18, 'ش': 19, 'ص': 20, 'ض': 21, 'ط': 22, 'ظ': 23, 'ع': 24, 'غ': 25, 'ف': 26, 'ق': 27, 'ك': 28, 'ل': 29, 'م': 30, 'ن': 31, 'ه': 32, 'و': 33, 'ى': 34, 'ي': 35, ' ': 36, '0': 37, 's': 38}
39
قوله أو قطع الأول يده إلخ قال الزركشي
sentence قَوْلُهُ أَوْ قَطَعَ الأَوَّلُ يَدَهُ إلَخْ قَالَ الزَّرْكَشِيُّ
reference قَوْلُهُ أَوْ قَطَعَ الأَوَّلُ يَدَهُ إلَخْ قَالَ الزَّرْكَشِيُّ


In [3]:
class MyDataset(Dataset):
    """Training sentences paired with per-character diacritic labels.

    Every item is cut or padded to exactly ``self.T`` entries. Padding uses
    the character '0' for the sentence and the pair ['0', '0'] for the labels.
    """

    def __init__(self):
        self.data = load_text("dataset/train_preprocessed.txt")
        self.transform = ToTensor()
        self.T = 300

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(sentence, labels)`` tensors for the sentence at ``idx``.

        ``sentence`` is a float32 tensor with ``self.T`` rows; ``labels`` is a
        LongTensor reshaped to a single column.
        """
        # 's' is prepended before letters and diacritics are separated
        # (presumably a start-of-sentence marker -- confirm in textProcessing).
        labels, sentence = tp.extract_diacritics_with_previous_letter('s' + self.data[idx])

        # Bring both sequences to exactly T entries: cut long ones, pad short ones.
        missing = self.T - len(sentence)
        if missing < 0:
            sentence = sentence[:self.T]
            labels = labels[:self.T]
        else:
            sentence += '0' * missing
            for _ in range(missing):
                labels.append(['0', '0'])

        assert len(sentence) == self.T
        assert len(labels) == self.T

        # Encode characters and labels with the project helpers.
        sentence = convert_sentence_to_vector(sentence)
        labels = convert_labels_to_vector(labels)

        # Force the encoded sentence to T rows: zero-pad if short, cut if long.
        rows = sentence.shape[0]
        if rows < self.T:
            padded = np.zeros((self.T, 39))
            padded[:rows, :] = sentence
            sentence = padded
        else:
            sentence = sentence[:self.T, :]

        # One label per row.
        labels = labels.reshape(-1, 1)

        return torch.tensor(sentence, dtype=torch.float32), torch.LongTensor(labels)


In [4]:
class RNNClassifier(nn.Module):
    """Two-layer bidirectional RNN that emits class scores for every time step.

    Input is ``(batch, T, input_size)`` (``batch_first=True``); output is
    ``(batch, T, output_size)``.

    Args:
        input_size: width of each input vector.
        hidden_size: hidden-state width per direction.
        output_size: number of class scores produced per time step.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(RNNClassifier, self).__init__()

        self.hidden_size = hidden_size
        self.input_size = input_size
        self.output_size = output_size
        self.num_layers = 2
        # define the embedding layer
        # self.embedding = nn.Embedding(input_size, hidden_size)
        # self.rnn = nn.RNN(hidden_size, hidden_size,num_layers=self.num_layers, batch_first=True)

        self.rnn = nn.RNN(self.input_size, hidden_size, num_layers=self.num_layers, batch_first=True, bidirectional=True)
        # Forward and backward hidden states are concatenated, hence hidden_size * 2.
        # The head honours ``output_size`` instead of a hard-coded 15.
        self.linear = nn.Linear(hidden_size * 2, output_size)
        self.init_weight()

    def forward(self, input):
        """Run the RNN over ``input`` and project every step to class scores."""
        # input = self.embedding(input)
        output, _ = self.rnn(input)
        return self.linear(output)

    def init_hidden(self, batch_size):
        """Return a zero initial hidden state usable as ``h_0`` for ``self.rnn``.

        A bidirectional RNN expects ``num_layers * 2`` states along dim 0.
        """
        return torch.zeros(self.num_layers * 2, batch_size, self.hidden_size)

    def init_weight(self):
        """Zero every bias and Xavier-normal initialise every weight matrix."""
        for name, param in self.named_parameters():
            if 'bias' in name:
                # In-place variants; the non-underscore names are deprecated.
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_normal_(param)

In [5]:
class LSTMClassifier(nn.Module):
    """Five-layer unidirectional LSTM followed by tanh and a linear head.

    Input is ``(batch, T, input_size)`` (``batch_first=True``); output is
    ``(batch, T, output_size)``.

    Args:
        input_size: width of each input vector.
        hidden_size: hidden/cell-state width.
        output_size: number of class scores produced per time step.
    """

    # Spelled ``__init__`` (double underscores) so that construction actually
    # runs this method; ``_init_`` was never invoked by Python.
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMClassifier, self).__init__()

        self.hidden_size = hidden_size
        self.input_size = input_size
        self.num_layers = 5

        self.lstm = nn.LSTM(input_size, hidden_size, self.num_layers, batch_first=True)
        self.tanh = nn.Tanh()
        self.linear = nn.Linear(hidden_size, output_size)
        self.init_weight()

    def forward(self, input, hidden, c):
        """Run the LSTM from state ``(hidden, c)`` and score every time step."""
        output, _ = self.lstm(input, (hidden, c))
        # reshape the output to be able to pass it to the linear layer
        # output = output.contiguous().view(-1, self.hidden_size)
        output = self.tanh(output)
        output = self.linear(output)
        # output = self.soft(output)
        return output

    def init_hidden(self, batch_size):
        """Return a zero ``h_0`` with one state per LSTM layer."""
        return torch.zeros(self.num_layers, batch_size, self.hidden_size)

    def init_c(self, batch_size):
        """Return a zero ``c_0`` with one state per LSTM layer."""
        return torch.zeros(self.num_layers, batch_size, self.hidden_size)

    def init_weight(self):
        """Zero every bias and Xavier-normal initialise every weight matrix."""
        for name, param in self.named_parameters():
            if 'bias' in name:
                # In-place variants; the non-underscore names are deprecated.
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_normal_(param)

In [6]:
# Hyperparameters
# -- model dimensions --
input_size = 39     # width of one encoded character vector
output_size = 15    # number of class scores emitted per character
hidden_size = 128   # recurrent hidden-state width
# -- training schedule --
num_epochs = 10
batch_size = 512

In [7]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU
cuda_available = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda_available else "cpu")
print(device)
print(cuda_available)
print(torch.cuda.device_count())

# Build the RNN classifier and move it to the selected device
model = RNNClassifier(input_size, hidden_size, output_size).to(device)

# Training sentences are read by MyDataset from its preprocessed text file
dataset = MyDataset()

# Serve fixed-size batches in dataset order (no shuffling)
train_dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size)

# Loss ignores targets equal to 15; Adam starts at lr=0.001 and
# StepLR multiplies the rate by 0.5 every 2 scheduler steps
criterion = nn.CrossEntropyLoss(ignore_index=15)
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = StepLR(optimizer, gamma=0.5, step_size=2)

cuda:0
True
1


  nn.init.xavier_normal(param)
  nn.init.constant(param, 0.0)


In [10]:
# model.load_state_dict(torch.load('model1.pth'))
# Build the optimizer and its scheduler together: StepLR only adjusts the
# optimizer instance it was constructed with, so a scheduler left over from an
# earlier cell would have no effect on a freshly created optimizer.
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = StepLR(optimizer, step_size=2, gamma=0.5)
model.train()
debug = True
for epoch in range(num_epochs):
    for inputs, labels in train_dataloader:
        # Zero the gradients
        optimizer.zero_grad()
        # Named ``batch`` rather than ``input`` so the builtin is not shadowed.
        batch = inputs.view(inputs.shape[0], -1, input_size)
        batch, labels = batch.to(device), labels.to(device)

        # Forward pass; flatten to (batch * T, classes) vs (batch * T,) for the loss
        output = model(batch)
        output = output.view(-1, output_size)
        labels = labels.view(-1)

        loss = criterion(output, labels)

        # Backward pass and optimization (gradient norm clipped at 0.5)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        # Print loss for monitoring
        print(f"Epoch: {epoch+1}, Batch Loss: {loss.item()}")
        # break

    # Advance the LR schedule once per epoch, after that epoch's optimizer steps.
    scheduler.step()




Epoch: 1, Batch Loss: 0.42573869228363037
Epoch: 1, Batch Loss: 0.6393350958824158
Epoch: 1, Batch Loss: 0.4733433127403259
Epoch: 1, Batch Loss: 0.4819697141647339
Epoch: 1, Batch Loss: 0.4980274736881256
Epoch: 1, Batch Loss: 0.47648102045059204
Epoch: 1, Batch Loss: 0.47995302081108093
Epoch: 1, Batch Loss: 0.4537152945995331
Epoch: 1, Batch Loss: 0.4711759686470032
Epoch: 1, Batch Loss: 0.45292484760284424
Epoch: 1, Batch Loss: 0.4510199725627899
Epoch: 1, Batch Loss: 0.45789119601249695
Epoch: 1, Batch Loss: 0.4475744962692261
Epoch: 1, Batch Loss: 0.4626626968383789
Epoch: 1, Batch Loss: 0.4470590054988861
Epoch: 1, Batch Loss: 0.4388487637042999
Epoch: 1, Batch Loss: 0.4568155109882355
Epoch: 1, Batch Loss: 0.4423525929450989
Epoch: 1, Batch Loss: 0.42561468482017517
Epoch: 1, Batch Loss: 0.42810267210006714
Epoch: 1, Batch Loss: 0.4238121807575226
Epoch: 1, Batch Loss: 0.4280634820461273
Epoch: 1, Batch Loss: 0.43570396304130554
Epoch: 1, Batch Loss: 0.43989402055740356
Epoch: 

KeyboardInterrupt: 

In [None]:
# Persist only the learned parameters (the state_dict), not the module object.
model_state = model.state_dict()
torch.save(model_state, 'model_1.pth')

In [11]:
# Set the model to evaluation mode
# model.load_state_dict(torch.load('model1.pth'))
model.eval()

# NOTE: this iterates train_dataloader, so the figure reported is accuracy on
# the training data, not on a held-out set.
pad_label = 15  # label value used for padded positions; excluded from scoring
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in train_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass, flattened to one row of scores per time step
        outputs = model(inputs).view(-1, output_size)
        labels = labels.view(-1)

        # Highest-scoring class at each time step
        predicted = outputs.argmax(dim=1)

        # Drop padded positions before counting
        keep = labels != pad_label
        correct += (predicted[keep] == labels[keep]).sum().item()
        total += int(keep.sum().item())

        # Running accuracy; skipped while nothing has been counted yet
        if total:
            print(f"Accuracy: {correct/total}")

# Final accuracy; NaN instead of a ZeroDivisionError when no label was scored
accuracy = correct / total if total else float("nan")
print(f"Accuracy: {accuracy}")

Accuracy: 0.8640772346180862
Accuracy: 0.862483589927199
Accuracy: 0.8617083523414721
Accuracy: 0.8622044875025386
Accuracy: 0.8614400228773648
Accuracy: 0.861373356721919
Accuracy: 0.8609851185829954
Accuracy: 0.861200781904496
Accuracy: 0.8608187424565581
Accuracy: 0.8609902733023034
Accuracy: 0.8612244953494321
Accuracy: 0.8611960400955839
Accuracy: 0.8613701831993462
Accuracy: 0.8609581656507862


KeyboardInterrupt: 

In [None]:
# clear the memory of the GPU
import gc

# Collect first: objects kept alive only by reference cycles are destroyed here,
# and only after that can empty_cache() hand their cached blocks back.
freed_objects = gc.collect()
torch.cuda.empty_cache()
# Show the number of unreachable objects found, as the cell's displayed value.
freed_objects

717