In [None]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, GRU, LSTM, Bidirectional, Dense, Dropout
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Load the tab-separated Urdu sentiment corpus into a DataFrame.
data = pd.read_csv("urdu-sentiment-corpus-v1 (2).tsv" , sep='\t')

In [None]:
# Show column names, dtypes and non-null counts to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Tweet   1000 non-null   object
 1   Class   999 non-null    object
dtypes: object(2)
memory usage: 15.8+ KB


In [None]:
# Count missing values per column before cleaning.
data.isna().sum()

Tweet    0
Class    1
dtype: int64

In [None]:
# Drop every row that has a missing value in any column.
# Note: this rebinds `data`, so the raw frame is no longer available afterwards.
data = data.dropna()

In [None]:
# Re-check: every column should now report zero missing values.
data.isna().sum()

Tweet    0
Class    0
dtype: int64

In [None]:
# List the distinct labels present in the Class column.
data["Class"].unique()

array(['P', 'N', 'O'], dtype=object)

In [None]:
# Number of rows labelled 'O'; falls back to 0 when the label never occurs.
class_counts = data["Class"].value_counts()
count_O = class_counts['O'] if 'O' in class_counts.index else 0
print("Count of 'O':", count_O)

Count of 'O': 20


In [None]:
#dropping rows with O
# Keep only the 'P' and 'N' rows so the task is binary.
# .copy() makes the result an independent frame, so assigning to one of its
# columns later does not raise pandas' SettingWithCopyWarning.
data = data[data["Class"] != 'O'].copy()

In [None]:
# Confirm that only the two labels used for binary classification remain.
data["Class"].unique()

array(['P', 'N'], dtype=object)

In [None]:
# Tokenization
# Learn the vocabulary from all tweets, then encode each tweet as a list of
# integer word ids.
tweet_texts = data['Tweet']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweet_texts)
sequences = tokenizer.texts_to_sequences(tweet_texts)

In [None]:
# Vocabulary lookup (word -> integer id) learned by the tokenizer.
word_index = tokenizer.word_index
# One slot more than the number of known words (ids are used as row indices).
vocab_size = 1 + len(word_index)

# Pad every encoded tweet to the length of the longest one.
max_len = max(map(len, sequences))
X = pad_sequences(sequences, maxlen=max_len)

# Binary target: 1 for 'P', 0 for every other label.
y = np.array([int(label == 'P') for label in data['Class']])

In [None]:
# Hold out 25% of the samples for testing; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
def create_model(model_type, num_layers, dropout_rate, units=64, embed_dim=100):
    """Build and compile a stacked recurrent binary classifier.

    The network is: trainable Embedding -> `num_layers` recurrent layers (each
    followed by Dropout) -> Dense(1, sigmoid). All recurrent layers except the
    last return full sequences so they can be stacked; the last one returns
    only its final state.

    Reads the module-level `vocab_size` and `max_len` for the embedding layer.

    Args:
        model_type: One of 'RNN', 'GRU', 'LSTM' or 'BiLSTM'.
        num_layers: Total number of recurrent layers. Values below 1 still
            produce a single recurrent layer.
        dropout_rate: Dropout rate applied after every recurrent layer.
        units: Hidden units per recurrent layer (per direction for 'BiLSTM').
        embed_dim: Output dimension of the embedding layer.

    Returns:
        A compiled Keras `Sequential` model (adam, binary cross-entropy,
        accuracy metric).

    Raises:
        ValueError: If `model_type` is not one of the supported names.
    """
    # One factory per architecture; the only thing that varies between stacked
    # and final layers is `return_sequences`.
    layer_factories = {
        'RNN': lambda return_sequences: SimpleRNN(units, return_sequences=return_sequences),
        'GRU': lambda return_sequences: GRU(units, return_sequences=return_sequences),
        'LSTM': lambda return_sequences: LSTM(units, return_sequences=return_sequences),
        'BiLSTM': lambda return_sequences: Bidirectional(LSTM(units, return_sequences=return_sequences)),
    }
    if model_type not in layer_factories:
        # Without this check an unknown name would silently yield a network
        # with no recurrent layer at all.
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {sorted(layer_factories)}"
        )
    build_layer = layer_factories[model_type]

    model = Sequential()
    model.add(Embedding(vocab_size, embed_dim, input_length=max_len))

    #Layers
    # Intermediate layers must emit the whole sequence for the next layer.
    for _ in range(num_layers - 1):
        model.add(build_layer(True))
        model.add(Dropout(dropout_rate))

    # Final layer with return_sequences=False
    model.add(build_layer(False))
    model.add(Dropout(dropout_rate))

    # Using sigmoid
    model.add(Dense(1, activation='sigmoid'))

    # Compiling the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model


In [None]:
# Grid of configurations to evaluate: every combination of layer count and
# dropout rate, ordered by layer count first.
hyperparameters = [
    {'num_layers': layer_count, 'dropout': rate}
    for layer_count in (2, 3)
    for rate in (0.3, 0.7)
]

In [None]:
results = []

# Train one model per (configuration, architecture) pair and record its
# metrics on the held-out test set.
for param in hyperparameters:
    depth, drop = param['num_layers'], param['dropout']
    for model_type in ('RNN', 'GRU', 'LSTM', 'BiLSTM'):
        model = create_model(model_type, depth, drop)
        history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=0)

        # Threshold the sigmoid output at 0.5 to get hard 0/1 predictions.
        y_pred = (model.predict(X_test) > 0.5).astype(int)

        row = {'Model': model_type, 'Layers': depth, 'Dropout Rate': drop}
        for metric_name, metric_fn in (
            ('Accuracy', accuracy_score),
            ('Precision', precision_score),
            ('Recall', recall_score),
            ('F-score', f1_score),
        ):
            row[metric_name] = metric_fn(y_test, y_pred)
        results.append(row)



In [None]:
# One row per trained model: architecture, configuration and test metrics.
evaluation = pd.DataFrame(results)
# Bare last expression so the notebook renders the table.
evaluation

Unnamed: 0,Model,Layers,Dropout Rate,Accuracy,Precision,Recall,F-score
0,RNN,2,0.3,0.453061,0.421488,0.443478,0.432203
1,GRU,2,0.3,0.591837,0.561983,0.591304,0.576271
2,LSTM,2,0.3,0.604082,0.561644,0.713043,0.628352
3,BiLSTM,2,0.3,0.608163,0.587156,0.556522,0.571429
4,RNN,2,0.7,0.546939,0.514925,0.6,0.554217
5,GRU,2,0.7,0.595918,0.567797,0.582609,0.575107
6,LSTM,2,0.7,0.608163,0.579832,0.6,0.589744
7,BiLSTM,2,0.7,0.595918,0.546512,0.817391,0.655052
8,RNN,3,0.3,0.497959,0.467213,0.495652,0.481013
9,GRU,3,0.3,0.6,0.587629,0.495652,0.537736


In [None]:
# Row of the run with the highest F-score.
# Deliberately not named `max`: rebinding that name would shadow the builtin
# max(), which the padding cell relies on when it is re-run.
best_run = evaluation.loc[evaluation['F-score'].idxmax()]
print(best_run)

Model             BiLSTM
Layers                 2
Dropout Rate         0.7
Accuracy        0.595918
Precision       0.546512
Recall          0.817391
F-score         0.655052
Name: 7, dtype: object


In [None]:
# Select the BiLSTM run with 2 layers and dropout 0.7; its metrics are reused
# as the baseline row of the final comparison table.
bilstm = evaluation[(evaluation['Model'] == 'BiLSTM') & (evaluation['Layers'] == 2) & (evaluation['Dropout Rate'] == 0.7)]


if bilstm.empty:
    # Fail here rather than later: otherwise the *_bilstm variables would be
    # undefined (or stale from an earlier run) when the summary is built.
    raise ValueError("No BiLSTM run with 2 layers and dropout 0.7 found in `evaluation`")

accuracy_bilstm = bilstm['Accuracy'].values[0]
precision_bilstm = bilstm['Precision'].values[0]
recall_bilstm = bilstm['Recall'].values[0]
f_score_bilstm = bilstm['F-score'].values[0]

print(f"Accuracy: {accuracy_bilstm}")
print(f"Precision: {precision_bilstm}")
print(f"Recall: {recall_bilstm}")
print(f"F-Score: {f_score_bilstm}")

Accuracy: 0.5959183673469388
Precision: 0.5465116279069767
Recall: 0.8173913043478261
F-Score: 0.6550522648083624


In [None]:
#GloVe embeddings
def load_glove_embeddings(embedding_file):
    """Read whitespace-separated word vectors from a text file.

    Each non-empty line is expected to be a word followed by its vector
    components. Blank lines and lines with no components are skipped.

    Args:
        embedding_file: Path to the UTF-8 encoded embeddings text file.

    Returns:
        dict mapping each word to a float32 numpy array of its components.
    """
    embeddings_index = {}
    with open(embedding_file, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if len(values) < 2:
                # Blank line or a token without a vector: nothing to store.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

embedding_dim = 300
glove_embeddings = load_glove_embeddings('glove.6B.300d.txt')

#embedding matrix
# Row i holds the pretrained vector of the word whose tokenizer id is i;
# words without a GloVe entry keep an all-zero row.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
glove_hits = 0
for word, i in word_index.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        glove_hits += 1

# The embedding layer below is frozen, so every unmatched word is fed to the
# network as a zero vector. Report the coverage to make that visible.
# NOTE(review): glove.6B is presumably trained on English text; few tokens of
# an Urdu corpus are likely to match - confirm with the number printed here.
print(f"GloVe coverage: {glove_hits}/{len(word_index)} vocabulary words have a pretrained vector")

#model
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

#compile it
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#train it
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)


# Threshold the sigmoid output at 0.5 to get hard 0/1 predictions.
y_pred = (model.predict(X_test) > 0.5).astype(int)
accuracy_glove = accuracy_score(y_test, y_pred)
precision_glove = precision_score(y_test, y_pred)
recall_glove = recall_score(y_test, y_pred)
f_score_glove = f1_score(y_test, y_pred)

print("Accuracy:", accuracy_glove)
print("Precision:", precision_glove)
print("Recall:", recall_glove)
print("F-score:", f_score_glove)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.46938775510204084
Precision: 0.46835443037974683
Recall: 0.9652173913043478
F-score: 0.6306818181818182


In [None]:
# Word2vec embeddings
from gensim.models import KeyedVectors

word2vec_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

#embedding matrix
embedding_dim_wordvec = 300
# Size the matrix with this cell's own dimension; relying on `embedding_dim`
# from another cell only works if that cell ran first with the same value.
embedding_matrix_wordvec = np.zeros((len(word_index) + 1, embedding_dim_wordvec))
wordvec_hits = 0
for word, i in word_index.items():
    if word in word2vec_model:
        embedding_matrix_wordvec[i] = word2vec_model[word]
        wordvec_hits += 1

# The embedding layer below is frozen, so every unmatched word is fed to the
# network as a zero vector. Report the coverage to make that visible.
# NOTE(review): the GoogleNews vectors are presumably English; few tokens of
# an Urdu corpus are likely to match - confirm with the number printed here.
print(f"Word2Vec coverage: {wordvec_hits}/{len(word_index)} vocabulary words have a pretrained vector")

#model
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim_wordvec, weights=[embedding_matrix_wordvec], input_length=max_len, trainable=False))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

#compile it
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#train it
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)


# Threshold the sigmoid output at 0.5 to get hard 0/1 predictions.
y_pred = (model.predict(X_test) > 0.5).astype(int)
accuracy_wordvec = accuracy_score(y_test, y_pred)
precision_wordvec = precision_score(y_test, y_pred)
recall_wordvec = recall_score(y_test, y_pred)
f_score_wordvec = f1_score(y_test, y_pred)

print("Accuracy:", accuracy_wordvec)
print("Precision:", precision_wordvec)
print("Recall:", recall_wordvec)
print("F-score:", f_score_wordvec)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.46122448979591835
Precision: 0.46473029045643155
Recall: 0.9739130434782609
F-score: 0.6292134831460674


In [None]:
#FastText embeddings
def load_fasttext_embeddings(embedding_file):
    """Read word vectors from a fastText-style `.vec` text file.

    Each data line is a word followed by space-separated vector components.
    A first line consisting of exactly two integers is treated as the
    "<vocabulary size> <dimension>" header and skipped; blank lines and lines
    without components are skipped as well.

    Args:
        embedding_file: Path to the UTF-8 encoded `.vec` file.

    Returns:
        dict mapping each word to a float32 numpy array of its components.
    """
    embeddings_index = {}
    with open(embedding_file, encoding='utf-8') as f:
        for line_number, line in enumerate(f):
            values = line.rstrip().split(' ')
            if len(values) < 2:
                # Blank line or a token without a vector: nothing to store.
                continue
            if line_number == 0 and len(values) == 2 and all(v.isdigit() for v in values):
                # Header row, not a word: storing it would add a bogus
                # one-component "vector" under the vocabulary-size key.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

embedding_dim = 300
fasttext_embeddings = load_fasttext_embeddings('wiki-news-300d-1M.vec')

#embedding matrix
# Row i holds the pretrained vector of the word whose tokenizer id is i;
# words without a FastText entry keep an all-zero row.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
fasttext_hits = 0
for word, i in word_index.items():
    embedding_vector = fasttext_embeddings.get(word)
    # Only accept vectors of the expected length: a shorter entry (such as a
    # parsed header row) would otherwise be broadcast across the whole row.
    if embedding_vector is not None and embedding_vector.shape == (embedding_dim,):
        embedding_matrix[i] = embedding_vector
        fasttext_hits += 1

# The embedding layer below is frozen, so every unmatched word is fed to the
# network as a zero vector. Report the coverage to make that visible.
# NOTE(review): wiki-news-300d-1M is presumably trained on English text; few
# tokens of an Urdu corpus are likely to match - confirm with this number.
print(f"FastText coverage: {fasttext_hits}/{len(word_index)} vocabulary words have a pretrained vector")

#model
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

#compile it
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#train it
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)

# Threshold the sigmoid output at 0.5 to get hard 0/1 predictions.
y_pred = (model.predict(X_test) > 0.5).astype(int)
accuracy_fasttext = accuracy_score(y_test, y_pred)
precision_fasttext = precision_score(y_test, y_pred)
recall_fasttext = recall_score(y_test, y_pred)
f_score_fasttext = f1_score(y_test, y_pred)

print("Accuracy:", accuracy_fasttext)
print("Precision:", precision_fasttext)
print("Recall:", recall_fasttext)
print("F-score:", f_score_fasttext)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.6081632653061224
Precision: 0.6
Recall: 0.4956521739130435
F-score: 0.5428571428571429


In [None]:
#Elmo
import tensorflow as tf
import tensorflow_hub as hub

# Encode the labels into a fresh integer array (1 for 'P', 0 otherwise)
# instead of overwriting data['Class'], so `data` keeps its string labels and
# this cell gives the same result when re-run.
elmo_labels = np.array([1 if label == 'P' else 0 for label in data['Class']])


#loading the pre-trained ELMo model
# NOTE(review): this hub module is presumably trained on English text, so its
# representations of Urdu tweets may carry little signal - confirm.
elmo = hub.load("https://tfhub.dev/google/elmo/3")

#converting the Tweets into embeddings
# All tweets are embedded in a single call; the "elmo" output is fed to the
# model below as (batch, timesteps, 1024) sequences.
X_elmo = elmo.signatures["default"](tf.constant(data['Tweet'].tolist()))["elmo"]

#model
model = Sequential([
    Bidirectional(LSTM(64, return_sequences=True), input_shape=(None, 1024)),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.7),
    Dense(1, activation='sigmoid')
])

# Same test fraction and random_state as the split used for the other models,
# so the metrics in the final comparison refer to the same held-out tweets.
# ELMo-specific names keep the token-id split (X_train, X_test, ...) intact
# for the cells that depend on it.
X_train_elmo, X_test_elmo, y_train_elmo, y_test_elmo = train_test_split(
    X_elmo.numpy(), elmo_labels, test_size=0.25, random_state=42
)

#compling it
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train_elmo, y_train_elmo, epochs=10, batch_size=64, validation_split=0.2)


# Threshold the sigmoid output at 0.5 to get hard 0/1 predictions.
y_pred = (model.predict(X_test_elmo) > 0.5).astype(int)


accuracy_elmo = accuracy_score(y_test_elmo, y_pred)
precision_elmo  = precision_score(y_test_elmo, y_pred)
recall_elmo  = recall_score(y_test_elmo, y_pred)
f1_elmo  = f1_score(y_test_elmo, y_pred)


print("Accuracy:", accuracy_elmo)
print("Precision:", precision_elmo)
print("Recall:", recall_elmo)
print("F1 Score:", f1_elmo)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.6020408163265306
Precision: 0.5970149253731343
Recall: 0.43956043956043955
F1 Score: 0.5063291139240506


In [None]:
# One tuple per model: (label, accuracy, precision, recall, F-score).
model_rows = [
    ('BiLSTM (Without Embedding)', accuracy_bilstm, precision_bilstm, recall_bilstm, f_score_bilstm),
    ('BiLSTM GloVe', accuracy_glove, precision_glove, recall_glove, f_score_glove),
    ('BiLSTM Word2Vec', accuracy_wordvec, precision_wordvec, recall_wordvec, f_score_wordvec),
    ('BiLSTM FastText', accuracy_fasttext, precision_fasttext, recall_fasttext, f_score_fasttext),
    ('BiLSTM Elmo', accuracy_elmo, precision_elmo, recall_elmo, f1_elmo),
]
column_names = ['Model', 'Accuracy', 'Precision', 'Recall', 'F-score']

# Transpose the rows into a column-name -> list-of-values mapping.
evaluation_data = {
    column: [row[position] for row in model_rows]
    for position, column in enumerate(column_names)
}

# Comparison table indexed by model label.
df = pd.DataFrame(evaluation_data).set_index('Model')

print(df)

                            Accuracy  Precision    Recall   F-score
Model                                                              
BiLSTM (Without Embedding)  0.595918   0.546512  0.817391  0.655052
BiLSTM GloVe                0.469388   0.468354  0.965217  0.630682
BiLSTM Word2Vec             0.461224   0.464730  0.973913  0.629213
BiLSTM FastText             0.608163   0.600000  0.495652  0.542857
BiLSTM Elmo                 0.602041   0.597015  0.439560  0.506329
