In [None]:
import json
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, GRU, LSTM, Bidirectional, Dense, Dropout, Input, Lambda
from keras.optimizers import Adam
import tensorflow as tf
import gensim
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.metrics import precision_score
import tensorflow_hub as hub
# Must stay after `import tensorflow as tf`: this line rebinds `tf` to the
# compat.v1 API, which is the binding the rest of the notebook ends up using.
import tensorflow.compat.v1 as tf

In [None]:
# Pick the device used for training: the first GPU TensorFlow reports,
# otherwise fall back to the CPU.
gpu_name = tf.test.gpu_device_name()
device_name = gpu_name if gpu_name else "/device:CPU:0"
message = "Found GPU at: {}" if gpu_name else "No GPU, using {}."
print(message.format(device_name))

Found GPU at: /device:GPU:0


# `Choosing Best Model`

In [None]:
# Define the models
def create_model(model_type, num_layers, dropout_rate):
    """Build an uncompiled recurrent binary classifier over token-id sequences.

    The network is: trainable Embedding (100-d) -> stacked recurrent layers ->
    Dropout -> single sigmoid unit.

    Reads two notebook-level globals: ``tokenizer`` (for the vocabulary size)
    and ``max_sequence_length`` (for the Embedding input length).

    Args:
        model_type: One of 'RNN', 'GRU', 'LSTM' or 'BiLSTM'.
        num_layers: Number of stacked recurrent layers. All but the last one
            return full sequences; the last returns only its final state.
        dropout_rate: Rate passed to the Dropout layer placed before the
            output unit.

    Returns:
        A ``Sequential`` model that still has to be compiled by the caller.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
            Previously an unknown name silently produced a model with no
            recurrent layer at all.
    """
    # Validate before touching Keras so a typo fails fast and clearly.
    supported_types = ('RNN', 'GRU', 'LSTM', 'BiLSTM')
    if model_type not in supported_types:
        raise ValueError(
            "Unknown model_type {!r}; expected one of {}.".format(model_type, supported_types)
        )

    def recurrent_layer(return_sequences):
        # Single place that maps the model name to its layer configuration.
        if model_type == 'RNN':
            return SimpleRNN(units=128, return_sequences=return_sequences)
        if model_type == 'GRU':
            return GRU(units=128, return_sequences=return_sequences)
        if model_type == 'LSTM':
            return LSTM(units=128, return_sequences=return_sequences)
        # 'BiLSTM': the wrapped LSTM uses 64 units.
        return Bidirectional(LSTM(units=64, return_sequences=return_sequences))

    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=max_sequence_length))
    # Intermediate layers must emit the whole sequence so the next recurrent
    # layer has something to iterate over; only the last one collapses it.
    for _ in range(num_layers - 1):
        model.add(recurrent_layer(True))
    model.add(recurrent_layer(False))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    return model


In [None]:
# Fetch the tab-separated Urdu sentiment corpus and discard incomplete rows
url = "https://raw.githubusercontent.com/MuhammadYaseenKhan/Urdu-Sentiment-Corpus/master/urdu-sentiment-corpus-v1.tsv"
data = pd.read_csv(url, sep='\t').dropna()

# Inputs are the raw tweets; the target is 1 for class 'P' and 0 for every other class
texts = data['Tweet'].values
labels = data['Class'].eq('P').astype('int64').values

# Fit the vocabulary on the tweets and turn each tweet into a list of token ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
token_id_lists = tokenizer.texts_to_sequences(texts)

# Pad every tweet to the length of the longest one
max_sequence_length = max(map(len, token_id_lists))
sequences = pad_sequences(token_id_lists, maxlen=max_sequence_length)

# Hold out 25% of the tweets for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    sequences,
    labels,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Grid of configurations to compare: architecture x depth x dropout
model_types = ['RNN', 'GRU', 'LSTM', 'BiLSTM']
num_layers = [2, 3]
dropout_rates = [0.3, 0.7]

# Train every configuration for a few epochs and score it on the held-out split.
# NOTE(review): the configuration is selected on the same split that is later
# reported as the test score; consider carving out a validation split.
results = []
with tf.device(device_name):
    for model_type in model_types:
        for num_layer in num_layers:
            for dropout_rate in dropout_rates:
                model = create_model(model_type, num_layer, dropout_rate)
                model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
                model.fit(X_train, y_train, epochs=5, batch_size=64, verbose=0)
                # Sigmoid outputs -> hard 0/1 predictions at the 0.5 threshold
                y_pred = model.predict(X_test).flatten()
                y_pred_binary = np.where(y_pred > 0.5, 1, 0)
                accuracy = accuracy_score(y_test, y_pred_binary)
                # zero_division=0.0 matches the embedding-comparison cell: a model
                # that predicts a single class scores 0 instead of raising a warning.
                precision = precision_score(y_test, y_pred_binary, zero_division=0.0)
                recall = recall_score(y_test, y_pred_binary, zero_division=0.0)
                f1 = f1_score(y_test, y_pred_binary, zero_division=0.0)
                results.append({
                    'Model': model_type,
                    'Num Layers': num_layer,
                    'Dropout Rate': dropout_rate,
                    'Accuracy': accuracy,
                    'Precision': precision,
                    'Recall': recall,
                    'F1 Score': f1
                })

# Display results
results_df = pd.DataFrame(results)
print(results_df)


     Model  Num Layers  Dropout Rate  Accuracy  Precision    Recall  F1 Score
0      RNN           2           0.3     0.444   0.414141  0.336066  0.371041
1      RNN           2           0.7     0.556   0.544715  0.549180  0.546939
2      RNN           3           0.3     0.500   0.487805  0.491803  0.489796
3      RNN           3           0.7     0.524   0.511811  0.532787  0.522088
4      GRU           2           0.3     0.636   0.616541  0.672131  0.643137
5      GRU           2           0.7     0.612   0.581699  0.729508  0.647273
6      GRU           3           0.3     0.632   0.607143  0.696721  0.648855
7      GRU           3           0.7     0.612   0.582781  0.721311  0.644689
8     LSTM           2           0.3     0.636   0.630252  0.614754  0.622407
9     LSTM           2           0.7     0.660   0.633094  0.721311  0.674330
10    LSTM           3           0.3     0.624   0.600000  0.688525  0.641221
11    LSTM           3           0.7     0.644   0.646018  0.598

In [None]:
# Render the model-search results table (one row per trained configuration)
results_df

Unnamed: 0,Model,Num Layers,Dropout Rate,Accuracy,Precision,Recall,F1 Score
0,RNN,2,0.3,0.444,0.414141,0.336066,0.371041
1,RNN,2,0.7,0.556,0.544715,0.54918,0.546939
2,RNN,3,0.3,0.5,0.487805,0.491803,0.489796
3,RNN,3,0.7,0.524,0.511811,0.532787,0.522088
4,GRU,2,0.3,0.636,0.616541,0.672131,0.643137
5,GRU,2,0.7,0.612,0.581699,0.729508,0.647273
6,GRU,3,0.3,0.632,0.607143,0.696721,0.648855
7,GRU,3,0.7,0.612,0.582781,0.721311,0.644689
8,LSTM,2,0.3,0.636,0.630252,0.614754,0.622407
9,LSTM,2,0.7,0.66,0.633094,0.721311,0.67433


In [None]:
# Rank the configurations by F1 (best first) and keep only the top row
sorted_results = results_df.sort_values(by='F1 Score', ascending=False).iloc[:1]
sorted_results

Unnamed: 0,Model,Num Layers,Dropout Rate,Accuracy,Precision,Recall,F1 Score
9,LSTM,2,0.7,0.66,0.633094,0.721311,0.67433


# `Implementing Different Embeddings`

In [None]:
def create_LSTM_model(embedding_dim, embedding_matrix, max_sequence_length, num_words, num_layers=2, dropout_rate=0.7):
    """Build an uncompiled stacked-LSTM binary classifier on frozen embeddings.

    Args:
        embedding_dim: Width of each embedding vector.
        embedding_matrix: Initial weights for the Embedding layer; the layer
            is created with ``trainable=False`` so they are not updated.
        max_sequence_length: Length of the padded input sequences.
        num_words: Size of the embedding vocabulary (rows of the Embedding).
        num_layers: Number of stacked 128-unit LSTM layers.
        dropout_rate: Rate of the Dropout layer before the output unit.

    Returns:
        A ``Sequential`` model ending in a single sigmoid unit.
    """
    frozen_embedding = Embedding(
        num_words,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=max_sequence_length,
        trainable=False,
    )

    # Every LSTM except the last one hands the full sequence to the next layer.
    sequence_flags = [True] * (num_layers - 1) + [False]

    model = Sequential()
    model.add(frozen_embedding)
    for keep_sequence in sequence_flags:
        model.add(LSTM(units=128, return_sequences=keep_sequence))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    return model

In [None]:
# Vocabulary shared by every pre-trained embedding experiment below
word_index = tokenizer.word_index
# One extra row on top of the vocabulary size (presumably for the padding index 0 — confirm)
num_words = 1 + len(word_index)

# Per-embedding registries, keyed by embedding name: vector width and weight matrix
embedding_dim, embedding_matrix = {}, {}

## Word2Vec Loading and Embedding

In [None]:
# Load Google's pre-trained Word2Vec model.
# The location can be overridden with the WORD2VEC_PATH environment variable so
# the notebook is not tied to one machine; the default is the original path.
WORD2VEC_PATH = os.environ.get(
    'WORD2VEC_PATH',
    'C:/Users/Afnan Hussain/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin',
)
google_model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [None]:
# Vector width of the loaded Word2Vec model
embedding_dim['Word2Vec'] = 300

# Start from all zeros; rows for tokens the Word2Vec model does not know stay zero
word2vec_embedding_matrix = np.zeros((num_words, embedding_dim['Word2Vec']))
for token, row in word_index.items():
    if token not in google_model:
        continue
    word2vec_embedding_matrix[row] = google_model[token]

embedding_matrix['Word2Vec'] = word2vec_embedding_matrix

## GloVe Loading and Embedding

In [None]:
# Location of the GloVe text file. Override with the GLOVE_PATH environment
# variable to run on another machine; the default is the original path.
GLOVE_PATH = os.environ.get('GLOVE_PATH', 'C:/Users/Afnan Hussain/glove.6B.300d.txt')

# Parse the file into {token: vector}; each line is "<token> <v1> <v2> ..."
glove_embeddings_index = {}
with open(GLOVE_PATH, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        embeddings = np.asarray(values[1:], dtype='float32')
        glove_embeddings_index[word] = embeddings


embedding_dim['GloVe'] = 300  # as in the GloVe embeddings

# Rows for tokens missing from GloVe are left as zeros
glove_embedding_matrix = np.zeros((num_words, embedding_dim['GloVe']))
for word, i in word_index.items():
    embedding_vector = glove_embeddings_index.get(word)
    if embedding_vector is not None:
        glove_embedding_matrix[i] = embedding_vector

embedding_matrix['GloVe'] = glove_embedding_matrix

## FastText Loading and Embedding

In [None]:
# Load FastText embeddings (text word2vec format).
# The location can be overridden with the FASTTEXT_PATH environment variable so
# the notebook is not tied to one machine; the default is the original path.
FASTTEXT_PATH = os.environ.get('FASTTEXT_PATH', 'C:/Users/Afnan Hussain/wiki-news-300d-1M.vec')
fasttext_model = gensim.models.KeyedVectors.load_word2vec_format(FASTTEXT_PATH, binary=False)

In [None]:
# Vector width of the FastText embeddings
embedding_dim['FastText'] = 300

# Tokenizer index -> token, restricted to tokens FastText has a vector for
fasttext_hits = {idx: token for token, idx in word_index.items() if token in fasttext_model}

# Rows without a FastText vector remain all-zero
fasttext_embedding_matrix = np.zeros((num_words, embedding_dim['FastText']))
for idx, token in fasttext_hits.items():
    fasttext_embedding_matrix[idx] = fasttext_model[token]

embedding_matrix['FastText'] = fasttext_embedding_matrix

## ELMo Loading, Embedding, and Training

In [None]:
# ELMo experiment: embed the raw tweets with ELMo, then train the same
# 2-layer LSTM head directly on those embeddings.
# NOTE(review): the tweets are Urdu; confirm this ELMo module produces
# meaningful vectors for them before reading much into the scores.
with tf.device('/CPU:0'):
    # Load the pre-trained ELMo model (loaded once; a second hub.load is redundant)
    elmo = hub.load("https://tfhub.dev/google/elmo/3")

    # Convert the tweet text into embeddings using the loaded model
    X = elmo.signatures["default"](tf.constant(data['Tweet'].tolist()))["elmo"]

    # The model consumes 1024-wide vectors per token, so no Embedding layer is needed
    model = Sequential([
        LSTM(128, return_sequences=True, input_shape=(None, 1024)),
        LSTM(128, return_sequences=False),
        Dropout(0.7),
        Dense(1, activation='sigmoid')
    ])

# Split the ELMo features into training and test sets.
# - The target is the 0/1 `labels` array; the raw data['Class'] column holds
#   class strings, which cannot be used with binary cross-entropy.
# - ELMo-specific names keep the token-id split (X_train, X_test, y_train,
#   y_test) intact for the Embedding-based models trained elsewhere.
X_train_elmo, X_test_elmo, y_train_elmo, y_test_elmo = train_test_split(
    X.numpy(), labels, test_size=0.2, random_state=42
)

# Compile and train the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train_elmo, y_train_elmo, epochs=5, batch_size=64, verbose=0)

# Evaluate on the held-out ELMo features (0.5 threshold on the sigmoid output)
y_pred = (model.predict(X_test_elmo) > 0.5).astype(int)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test_elmo, y_pred)
precision = precision_score(y_test_elmo, y_pred)
recall = recall_score(y_test_elmo, y_pred)
f1 = f1_score(y_test_elmo, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.5357142857142857
Precision: 0.5
Recall: 0.01098901098901099
F1 Score: 0.021505376344086023


In [None]:
# Start a fresh results list for the embedding comparison, seeded with the
# metrics currently held in accuracy / precision / recall / f1 (the ELMo run)
results = [{
    'Model': 'LSTM',
    'Embedding Type': 'Elmo',
    'Num Layers': 2,
    'Dropout Rate': 0.7,
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
}]

## Model Compilation and Training

In [None]:
# Build the token-id split locally so this cell does not depend on whatever
# another cell last bound to the global X_train / X_test / y_train / y_test.
# Same arguments as the original split, so the partition is identical.
X_train_seq, X_test_seq, y_train_seq, y_test_seq = train_test_split(
    sequences, labels, test_size=0.25, random_state=42
)

# Configuration used for every run in this comparison (the top row of the
# model search: LSTM, 2 layers, dropout 0.7). The baseline is built from these
# same values, so the recorded metadata always matches the trained model;
# previously the baseline was a BiLSTM with dropout 0.3 but was logged as LSTM / 0.7.
best_model_type = 'LSTM'
best_num_layers = 2
best_dropout_rate = 0.7

with tf.device(device_name):
    for embedding_type in ['None','Word2Vec', 'GloVe', 'FastText']:
        if embedding_type=='None':
            # Baseline: trainable embedding learned from scratch
            model = create_model(best_model_type, best_num_layers, best_dropout_rate)
        else:
            # Frozen pre-trained embedding
            model = create_LSTM_model(
                embedding_dim[embedding_type],
                embedding_matrix[embedding_type],
                max_sequence_length,
                num_words,
                num_layers=best_num_layers,
                dropout_rate=best_dropout_rate,
            )
        # NOTE(review): learning_rate is 0.01 here but 0.001 in the model search — confirm this is intended.
        model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy'])
        model.fit(X_train_seq, y_train_seq, epochs=5, batch_size=64, verbose=0)
        y_pred = (model.predict(X_test_seq) > 0.5).astype(int)
        accuracy = accuracy_score(y_test_seq, y_pred)
        # zero_division=0.0 on all three: a model that predicts a single class
        # scores 0 instead of emitting an undefined-metric warning.
        precision = precision_score(y_test_seq, y_pred, zero_division=0.0)
        recall = recall_score(y_test_seq, y_pred, zero_division=0.0)
        f1 = f1_score(y_test_seq, y_pred, zero_division=0.0)
        results.append({
            'Model': best_model_type,
            'Embedding Type' : embedding_type,
            'Num Layers': best_num_layers,
            'Dropout Rate': best_dropout_rate,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1
        })



In [None]:
# Display results
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,Embedding Type,Num Layers,Dropout Rate,Accuracy,Precision,Recall,F1 Score
0,LSTM,Elmo,2,0.7,0.576531,0.586957,0.296703,0.394161
1,LSTM,,2,0.7,0.612,0.601626,0.606557,0.604082
2,LSTM,Word2Vec,2,0.7,0.5,0.493724,0.967213,0.65374
3,LSTM,GloVe,2,0.7,0.528,0.6,0.098361,0.169014
4,LSTM,FastText,2,0.7,0.464,0.455224,0.5,0.476562
