Name: Abdullah Tahir\
CMS: 385714\
Section: B


In [1]:
import pandas as pd
#load the tab-separated corpus file into a dataframe and show it as the cell output
df = pd.read_csv('urdu-sentiment-corpus-v1.tsv', sep='\t')
df

Unnamed: 0,Tweet,Class
0,میں نے ایٹم بم بنایا ھے ۔۔۔۔او بھائی ایٹم بمب ...,P
1,چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن...,N
2,ٹویٹر کا خیال کیسے آیا ؟,O
3,"سرچ انجن گوگل کے نائب صدر نے فضا میں ، 130,000...",P
4,ابھی تک اسکی لہریں کبھی کبھی آ جاتی ہیں یار :أْ,P
...,...,...
995,اُس آدمی نے اِس سالار کو کافی معقول ٹپ دی ہے ۔,P
996,چچا غالب کی روح سے معذرت کے ساتھہم نے مانا کہ ...,P
997,واہ جناب واہ! اچھی رہی۔ جناب خود کو فرشتہ سمجو...,P
998,اسلام آباد :پی اے ٹی کا دھرنا ختم، صفائی کے کا...,P


In [2]:
#counting how many rows carry each label in the Class column
df.loc[:, 'Class'].value_counts()

Class
N    499
P    480
O     20
Name: count, dtype: int64

In [3]:
#keep only rows whose Class is not 'O' (too few examples of that label),
#then discard any row that still contains a missing value
df = df[df['Class'] != 'O'].dropna()

In [4]:
#re-counting the labels to confirm that the 'O' rows are no longer present
df.loc[:, 'Class'].value_counts()

Class
N    499
P    480
Name: count, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
import re
from nltk.tokenize import word_tokenize

#fetch the Punkt tokenizer data that word_tokenize relies on.
#NLTK >= 3.8.2 loads the 'punkt_tab' resource instead of the pickled 'punkt'
#models, so both are requested to keep the notebook runnable on old and new
#NLTK releases (nltk.download reports a failure instead of raising).
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

#cleaning the tweet data
def clean_tweet(tweet):
    """Delete every character that is not Arabic-script, '#', '@' or a space.

    A character survives only if it lies in U+0600-U+06FF or U+0750-U+077F,
    or is one of '#', '@', ' '. Everything else (Latin letters, ASCII digits
    and punctuation, tabs, newlines, emoji, ...) is removed with no
    replacement, so text on both sides of a removed character is joined.
    No stopword removal happens here; this is purely a character filter.

    Args:
        tweet: Raw tweet text as a string.

    Returns:
        The tweet with all disallowed characters stripped out.
    """
    disallowed_chars = re.compile(r"[^\u0600-\u06FF\u0750-\u077F#@ ]")
    return disallowed_chars.sub("", tweet)

#apply the clean_tweet method to the tweet column of the dataframe to return cleaned tweets
df['Cleaned_Tweet'] = df['Tweet'].apply(clean_tweet)

#tokenizing the cleaned tweets
df['Tokenized'] = df['Cleaned_Tweet'].apply(word_tokenize)

#initializing tokenizer and building the vocabulary from the token lists
#NOTE(review): the vocabulary is fitted on every row, including the rows that
#end up in the test split further down
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Tokenized'])

#converting the text to sequences
sequences = tokenizer.texts_to_sequences(df['Tokenized'])

#padding sequences to have uniform length (the longest sequence sets the length)
max_seq_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_seq_length, padding='post')

#mapping text labels to numeric values
label_mapping = {'P': 1, 'N': 0}
#values that are not keys of label_mapping are passed through unchanged, so
#re-running this cell after 'Class' already holds 0/1 no longer turns the
#column into NaN (which made the astype(int) below fail on a second run)
df['Class'] = df['Class'].map(lambda label: label_mapping.get(label, label))

#converting class to numeric type after mapping
#(still raises if a label is neither a known key nor convertible to int)
df['Class'] = df['Class'].astype(int)
#getting labels from the numeric values of the dataframe column
labels = df['Class'].values

#defining train test split: 25% of the rows are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.25, random_state=42
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
#width of each learned token embedding vector
embedding_size = 64
#number of distinct tokens seen by the tokenizer, plus one extra slot
#NOTE(review): the +1 presumably leaves room for index 0, since Keras'
#word_index is 1-based - confirm against the Tokenizer docs
vocab_size = 1 + len(tokenizer.word_index)


In [16]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.layers import Dropout


# Seed the Python, NumPy and TensorFlow random generators so repeated runs of
# this cell start from the same random state (same value as the random_state
# used for the train/test split).
tf.keras.utils.set_random_seed(42)

# Create the model: embedding -> LSTM -> dropout -> bidirectional LSTM -> dense head
model = Sequential()
# embedding_size is the constant defined next to vocab_size; using it here
# keeps the embedding width in one place instead of repeating the literal 64
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=max_seq_length))
# return_sequences=True so the next recurrent layer receives the full sequence
model.add(LSTM(32,return_sequences=True))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(64, activation='relu'))
# single sigmoid unit: the labels are binary (0/1)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.000075), loss='binary_crossentropy', metrics=['accuracy'])

# Multiply the learning rate by 0.75 whenever val_loss has not improved for 5 epochs
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.75, patience=5, verbose=1, min_delta=0.0001, cooldown=0, min_lr=0.0)

# Train the model, holding out 20% of the training data for validation
model.fit(X_train, y_train, epochs=40, validation_split=0.2, callbacks=[lr_scheduler])

# Evaluate the model on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 6: ReduceLROnPlateau reducing learning rate to 5.6250002671731636e-05.
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 11: ReduceLROnPlateau reducing learning rate to 4.218750200379873e-05.
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 16: ReduceLROnPlateau reducing learning rate to 3.164062582072802e-05.
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 21: ReduceLROnPlateau reducing learning rate to 2.3730469365546014e-05.
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 26: ReduceLROnPlateau reducing learning rate to 1.7797852706280537e-05.
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 31: ReduceLROnPlateau reducing learning rate to 1.3348389529710403e-05.
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 36: ReduceLROnPlateau reducing learning rate to 1.0011292488343315e-05.
Epoch 37/40
Epoch 38/40
Epoch 39/4

In [17]:
#persist the trained model to disk; this is the model as left by the fit call
#(the training cell registers no checkpoint or early-stopping callback)
model.save(filepath='best_model.keras')