# **Importing Required Modules**

In [74]:
import pandas as pd
from collections import Counter
import numpy as np
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import warnings
warnings.filterwarnings('ignore')

# **Loading Dataset**

## Defining Class for Dataset

In [65]:
class UrduSentimentDataset:
    def __init__(self, csv_file, max_vocab_size=None):
        self.df = pd.read_csv(csv_file, delimiter='\t')
        self.df['Class'] = self.df['Class'].map({'P': 1, 'N': 0})
        self.tokenize_and_pad(max_vocab_size)

    def tokenize_and_pad(self, max_vocab_size):
        all_text = ' '.join(self.df['Tweet'])
        words = all_text.split()
        word_counts = Counter(words)
        sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
        if max_vocab_size is not None:
            sorted_vocab = sorted_vocab[:max_vocab_size]
        self.int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
        self.vocab_to_int = {w: k for k, w in self.int_to_vocab.items()}

        self.encoded_tweets = [[self.vocab_to_int.get(word, 0) for word in tweet.split()] for tweet in self.df['Tweet']]

        max_len = max([len(tweet) for tweet in self.encoded_tweets])
        self.padded_tweets = np.array([tweet + [0]*(max_len-len(tweet)) for tweet in self.encoded_tweets])

    def get_data(self):
        return self.padded_tweets, self.df['Class'], self.vocab_to_int

    def print_samples(self, num_samples=5):
        print("Random samples from the dataset:")
        samples_indices = np.random.choice(len(self.df), num_samples, replace=False)
        for idx in samples_indices:
            tweet = self.df.loc[idx, 'Tweet']
            label = self.df.loc[idx, 'Class']
            print(f"Tweet: {tweet} | Label: {'Positive' if label == 1 else 'Negative'}")

    def preprocess_data(self, test_size=0.25, random_state=42):
        X, y = self.padded_tweets, self.df['Class']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

        X_train_clean = X_train.copy()
        y_train_clean = y_train.copy()
        X_test_clean = X_test.copy()
        y_test_clean = y_test.copy()

        nan_mask_train = ~np.isnan(y_train_clean)
        X_train_clean = X_train_clean[nan_mask_train]
        y_train_clean = y_train_clean[nan_mask_train]

        nan_mask_test = ~np.isnan(y_test_clean)
        X_test_clean = X_test_clean[nan_mask_test]
        y_test_clean = y_test_clean[nan_mask_test]

        y_train_clean = y_train_clean.astype(int)
        y_test_clean = y_test_clean.astype(int)

        y_train_clean = to_categorical(y_train_clean, num_classes=2)
        y_test_clean = to_categorical(y_test_clean, num_classes=2)

        return X_train_clean, X_test_clean, y_train_clean, y_test_clean

## Verifying the Dataset

In [66]:
URL = 'https://raw.githubusercontent.com/MuhammadYaseenKhan/Urdu-Sentiment-Corpus/master/urdu-sentiment-corpus-v1.tsv'
dataset = pd.read_csv(URL, delimiter='\t')
dataset.to_csv('urdu-sentiment-corpus-v1.tsv', sep='\t', index=False)

dataset = UrduSentimentDataset('urdu-sentiment-corpus-v1.tsv')
dataset.print_samples(num_samples=5)

Random samples from the dataset:
Tweet:  آپ کی بات ٹھیک ہو سکتی ہے ،کیونکہ یہ تو کویؑ خاتون ہی صحیح حقیقت حال واضح کر سکتی ہے ۔شکریہ | Label: Positive
Tweet: عابدشُرلی وہ شرُلی ہے جو دھماکہ کم کرتی ہے بدبُو زیادہ پھیلاتی ہے  | Label: Negative
Tweet: لاہور: آئی جی پنجاب مشتاق سکھیرا کی زیر صدارت آر پی او کانفرنس۔  | Label: Positive
Tweet: آسٹریلوی ماہرین کا طبی میدان میں اہم کارنامہ ، مردہ دل کی کامیاب پیوند کاری  | Label: Positive
Tweet: یکم محرم الحرام یوم فاروق اعظم رضی الله عنه ملک بھر میں مذهبی عقیدت و احترام سے منایا جاے گا سیدنا عمر فاروق | Label: Positive


# **Training Process**

## Recurrent Neural Networks (RNN)

In [67]:
hyperparameters = [
    {'num_layers': 2, 'dropout_rate': 0.3},
    {'num_layers': 2, 'dropout_rate': 0.7},
    {'num_layers': 3, 'dropout_rate': 0.3},
    {'num_layers': 3, 'dropout_rate': 0.7}
]

results = []

for params in hyperparameters:
    num_layers = params['num_layers']
    dropout_rate = params['dropout_rate']

    X_train, X_test, y_train, y_test = dataset.preprocess_data()

    model = models.Sequential()
    model.add(layers.SimpleRNN(units=64, input_shape=(X_train.shape[1], 1), dropout=dropout_rate, recurrent_dropout=dropout_rate, return_sequences=True, kernel_initializer='he_normal'))
    for _ in range(num_layers - 1):
        model.add(layers.SimpleRNN(units=64, dropout=dropout_rate, recurrent_dropout=dropout_rate, return_sequences=True, kernel_initializer='he_normal'))
    model.add(layers.SimpleRNN(units=64, dropout=dropout_rate, recurrent_dropout=dropout_rate, kernel_initializer='he_normal'))
    model.add(layers.Flatten())
    model.add(layers.Dense(2, activation='softmax'))

    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True, clipnorm=1.0)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    print(f"Training with {num_layers} layers and dropout rate {dropout_rate}")
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1, verbose=0)

    y_pred = model.predict(X_test)
    y_pred = np.argmax(y_pred, axis=1)
    y_true = np.argmax(y_test, axis=1)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

    results.append({
        'num_layers': num_layers,
        'dropout_rate': dropout_rate,
        'accuracy': round(accuracy, 2),
        'precision': round(precision, 2),
        'recall': round(recall, 2),
        'f1_score': round(f1, 2)
    })

results_df = pd.DataFrame(results)
results_df

Training with 2 layers and dropout rate 0.3
Training with 2 layers and dropout rate 0.7
Training with 3 layers and dropout rate 0.3
Training with 3 layers and dropout rate 0.7


Unnamed: 0,num_layers,dropout_rate,accuracy,precision,recall,f1_score
0,2,0.3,0.48,0.49,0.5,0.34
1,2,0.7,0.53,0.55,0.52,0.44
2,3,0.3,0.56,0.59,0.55,0.51
3,3,0.7,0.53,0.54,0.54,0.53


## Gated Recurrent  Unit (GRU)

In [68]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

hyperparameters = [
    {'num_layers': 2, 'dropout_rate': 0.3},
    {'num_layers': 2, 'dropout_rate': 0.7},
    {'num_layers': 3, 'dropout_rate': 0.3},
    {'num_layers': 3, 'dropout_rate': 0.7}
]

results = []

for params in hyperparameters:
    num_layers = params['num_layers']
    dropout_rate = params['dropout_rate']

    X_train, X_test, y_train, y_test = dataset.preprocess_data()

    model = models.Sequential()
    model.add(layers.GRU(units=64, input_shape=(X_train.shape[1], 1), dropout=dropout_rate, recurrent_dropout=dropout_rate, return_sequences=True, kernel_initializer='he_normal'))
    for _ in range(num_layers - 1):
        model.add(layers.GRU(units=64, dropout=dropout_rate, recurrent_dropout=dropout_rate, return_sequences=True, kernel_initializer='he_normal'))
    model.add(layers.GRU(units=64, dropout=dropout_rate, recurrent_dropout=dropout_rate, kernel_initializer='he_normal'))
    model.add(layers.Flatten())
    model.add(layers.Dense(2, activation='softmax'))

    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True, clipnorm=1.0)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    print(f"Training with {num_layers} layers and dropout rate {dropout_rate}")
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1, verbose=0)

    y_pred = model.predict(X_test)
    y_pred = np.argmax(y_pred, axis=1)
    y_true = np.argmax(y_test, axis=1)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

    results.append({
        'num_layers': num_layers,
        'dropout_rate': dropout_rate,
        'accuracy': round(accuracy, 2),
        'precision': round(precision, 2),
        'recall': round(recall, 2),
        'f1_score': round(f1, 2)
    })

results_df = pd.DataFrame(results)
results_df

Training with 2 layers and dropout rate 0.3
Training with 2 layers and dropout rate 0.7


  _warn_prf(average, modifier, msg_start, len(result))


Training with 3 layers and dropout rate 0.3
Training with 3 layers and dropout rate 0.7


Unnamed: 0,num_layers,dropout_rate,accuracy,precision,recall,f1_score
0,2,0.3,0.46,0.42,0.48,0.36
1,2,0.7,0.52,0.26,0.5,0.34
2,3,0.3,0.44,0.39,0.45,0.36
3,3,0.7,0.48,0.49,0.5,0.36


## Long Short Term Memory (LSTM)

In [69]:
hyperparameters = [
    {'num_layers': 2, 'dropout_rate': 0.3},
    {'num_layers': 2, 'dropout_rate': 0.7},
    {'num_layers': 3, 'dropout_rate': 0.3},
    {'num_layers': 3, 'dropout_rate': 0.7}
]

results = []

for params in hyperparameters:
    num_layers = params['num_layers']
    dropout_rate = params['dropout_rate']

    X_train, X_test, y_train, y_test = dataset.preprocess_data()

    model = models.Sequential()
    model.add(layers.LSTM(units=64, input_shape=(X_train.shape[1], 1), dropout=dropout_rate, recurrent_dropout=dropout_rate, return_sequences=True, kernel_initializer='he_normal'))
    for _ in range(num_layers - 1):
        model.add(layers.LSTM(units=64, dropout=dropout_rate, recurrent_dropout=dropout_rate, return_sequences=True, kernel_initializer='he_normal'))
    model.add(layers.LSTM(units=64, dropout=dropout_rate, recurrent_dropout=dropout_rate, kernel_initializer='he_normal'))
    model.add(layers.Flatten())
    model.add(layers.Dense(2, activation='softmax'))

    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True, clipnorm=1.0)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    print(f"Training with {num_layers} layers and dropout rate {dropout_rate}")
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1, verbose=0)

    y_pred = model.predict(X_test)
    y_pred = np.argmax(y_pred, axis=1)
    y_true = np.argmax(y_test, axis=1)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

    results.append({
        'num_layers': num_layers,
        'dropout_rate': dropout_rate,
        'accuracy': round(accuracy, 2),
        'precision': round(precision, 2),
        'recall': round(recall, 2),
        'f1_score': round(f1, 2)
    })

results_df = pd.DataFrame(results)
results_df

Training with 2 layers and dropout rate 0.3
Training with 2 layers and dropout rate 0.7
Training with 3 layers and dropout rate 0.3
Training with 3 layers and dropout rate 0.7


Unnamed: 0,num_layers,dropout_rate,accuracy,precision,recall,f1_score
0,2,0.3,0.44,0.44,0.44,0.44
1,2,0.7,0.42,0.42,0.42,0.41
2,3,0.3,0.51,0.49,0.5,0.41
3,3,0.7,0.44,0.43,0.44,0.43


## Bi-Directional Long Short Term (BiLSTM)

In [76]:
hyperparameters = [
    {'num_layers': 2, 'dropout_rate': 0.3},
    {'num_layers': 2, 'dropout_rate': 0.7},
    {'num_layers': 3, 'dropout_rate': 0.3},
    {'num_layers': 3, 'dropout_rate': 0.7}
]

results = []

for params in hyperparameters:
    num_layers = params['num_layers']
    dropout_rate = params['dropout_rate']

    X_train, X_test, y_train, y_test = dataset.preprocess_data()

    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)

    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

    model = models.Sequential()
    model.add(layers.Bidirectional(layers.LSTM(units=64, input_shape=(X_train.shape[1], 1), dropout=dropout_rate, recurrent_dropout=dropout_rate, return_sequences=True, kernel_initializer='he_normal')))
    for _ in range(num_layers - 1):
        model.add(layers.Bidirectional(layers.LSTM(units=64, dropout=dropout_rate, recurrent_dropout=dropout_rate, return_sequences=True, kernel_initializer='he_normal')))
    model.add(layers.Bidirectional(layers.LSTM(units=64, dropout=dropout_rate, recurrent_dropout=dropout_rate, kernel_initializer='he_normal')))
    model.add(layers.Flatten())
    model.add(layers.Dense(2, activation='softmax'))

    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True, clipnorm=1.0)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    print(f"Training with {num_layers} layers and dropout rate {dropout_rate}")
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1, verbose=0)

    y_pred = model.predict(X_test)
    y_pred = np.argmax(y_pred, axis=1)
    y_true = np.argmax(y_test, axis=1)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

    results.append({
        'num_layers': num_layers,
        'dropout_rate': dropout_rate,
        'accuracy': round(accuracy, 2),
        'precision': round(precision, 2),
        'recall': round(recall, 2),
        'f1_score': round(f1, 2)
    })

results_df = pd.DataFrame(results)
results_df

Training with 2 layers and dropout rate 0.3
Training with 2 layers and dropout rate 0.7
Training with 3 layers and dropout rate 0.3
Training with 3 layers and dropout rate 0.7


Unnamed: 0,num_layers,dropout_rate,accuracy,precision,recall,f1_score
0,2,0.3,0.46,0.45,0.47,0.41
1,2,0.7,0.4,0.4,0.4,0.4
2,3,0.3,0.47,0.44,0.49,0.35
3,3,0.7,0.42,0.41,0.43,0.39
