# In [5]:
import torch

# Encodes categorical labels into numerical format (used for label preprocessing)
from sklearn.preprocessing import LabelEncoder

# Calculates the accuracy of a classification model (used for model evaluation)
from sklearn.metrics import accuracy_score

# Defines a custom dataset class for PyTorch (used for handling data)
from torch.utils.data import Dataset

# Creates a DataLoader for efficient batch processing in PyTorch (used for data loading)
from torch.utils.data import DataLoader

# Splits a dataset into training and validation sets (used for data splitting)
from torch.utils.data import random_split

# Represents a multi-dimensional matrix in PyTorch (used for tensor manipulation)
from torch import Tensor

# Implements a linear layer in a neural network (used for defining neural network architecture)
from torch.nn import Linear

# Applies rectified linear unit (ReLU) activation function (used for introducing non-linearity)
from torch.nn import ReLU

# Applies sigmoid activation function (used for binary classification output)
from torch.nn import Sigmoid

# Base class for all neural network modules in PyTorch (used for creating custom models)
from torch.nn import Module

# Stochastic Gradient Descent optimizer (used for model optimization during training)
from torch.optim import SGD

# Binary Cross Entropy Loss function (used for binary classification problems)
from torch.nn import BCELoss

# Initializes weights using Kaiming uniform initialization (used for weight initialization)
from torch.nn.init import kaiming_uniform_

# Initializes weights using Xavier (Glorot) uniform initialization (used for weight initialization)
from torch.nn.init import xavier_uniform_



# In [8]:
import pickle
def _load_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    # NOTE: pickle.load can execute arbitrary code while deserializing;
    # only load files that come from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


word_sequences = _load_pickle('word_sequences.pkl')
char_sequences = _load_pickle('char_sequences.pkl')
# `labels` is read from the tashkeel list file.
labels = _load_pickle('tashkeel_list.pkl')

# In [9]:
# dataset definition
# A custom Dataset class must implement three functions: __init__, __len__, and __getitem__.
class CSVDataset(Dataset):
    """Map-style dataset pairing each entry of ``x`` with the same-index entry of ``y``.

    Attributes:
        x: Indexable collection of input samples.
        y: Indexable collection of targets, same length as ``x``.
        encoding_mapping: Always ``None`` here, because this class performs no
            label encoding. It is defined because ``prepare_data`` reads
            ``dataset.encoding_mapping``.
    """

    def __init__(self, x=None, y=None):
        """Store the samples and targets.

        Args:
            x: Input samples. Defaults to the module-level ``char_sequences``.
            y: Targets. Defaults to the module-level ``labels``.

        Raises:
            ValueError: If ``x`` and ``y`` do not have the same length.
        """
        # Resolve the module-level defaults lazily so the class can also be
        # built from explicitly supplied data.
        self.x = char_sequences if x is None else x
        self.y = labels if y is None else y
        # __len__ is driven by x while __getitem__ also indexes y, so a
        # mismatch would otherwise only surface as an IndexError mid-iteration
        # (or silently ignore trailing labels).
        if len(self.x) != len(self.y):
            raise ValueError(
                f"x and y must have the same length, got {len(self.x)} and {len(self.y)}"
            )
        self.encoding_mapping = None

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(sample, target)`` pair stored at position ``idx``."""
        return self.x[idx], self.y[idx]

    def get_splits(self, n_test=0.33):
        """Randomly split the dataset into train and test subsets.

        Args:
            n_test: Fraction of the samples assigned to the test subset.

        Returns:
            The result of ``random_split``: the train subset followed by the
            test subset.
        """
        total = len(self.x)
        test_size = round(n_test * total)
        # Whatever is not assigned to the test subset goes to training, so the
        # two sizes always sum to the dataset length.
        train_size = total - test_size
        return random_split(self, [train_size, test_size])


# prepare the dataset
def prepare_data(*, n_test=0.33, train_batch_size=32, test_batch_size=1024):
    """Build the dataset and wrap its train/test splits in DataLoaders.

    Args:
        n_test: Fraction of the samples assigned to the test split.
        train_batch_size: Batch size of the training loader.
        test_batch_size: Batch size of the test loader.

    Returns:
        A 3-tuple ``(encoding_mapping, train_dl, test_dl)``.
        ``encoding_mapping`` is the dataset's attribute of that name, or
        ``None`` when the dataset does not expose one.
    """
    dataset = CSVDataset()
    train, test = dataset.get_splits(n_test)
    # The training loader reshuffles its samples; the test loader keeps a
    # fixed order.
    train_dl = DataLoader(train, batch_size=train_batch_size, shuffle=True)
    test_dl = DataLoader(test, batch_size=test_batch_size, shuffle=False)
    # Fall back to None when the dataset exposes no `encoding_mapping`
    # attribute instead of raising AttributeError.
    encoding_mapping = getattr(dataset, "encoding_mapping", None)
    return encoding_mapping, train_dl, test_dl