# Import Libraries

In [2]:
import os
import re

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences





# Load corpus

### Load Positives

In [3]:
train_tweets, train_labels = [], []

pos = os.getcwd() + '/corpus/arabic_tweets/pos/'  # Replace with the actual directory path

# Walk the positive folder; the full content of every .txt file becomes one
# tweet entry paired with the label "positive".
for entry in os.listdir(pos):
    if not entry.endswith('.txt'):
        continue  # ignore anything that is not a text file
    with open(os.path.join(pos, entry), 'r', encoding='utf-8-sig') as handle:
        train_tweets.append(handle.read())
    train_labels.append("positive")

### Load Negatives

In [4]:
# Directory holding the negative tweets. It has its own name so it cannot be
# confused with the positive-tweet directory used by the previous cell.
neg_dir = os.getcwd() + '/corpus/arabic_tweets/neg/'  # Replace with the actual directory path

# Append the content of every .txt file in the folder to the shared lists,
# labelled "negative". Note: re-running this cell appends the files again.
for filename in os.listdir(neg_dir):
    if filename.endswith('.txt'):  # Select only text files
        file_path = os.path.join(neg_dir, filename)
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            train_tweets.append(file.read())
            train_labels.append("negative")

### Build a dataframe

In [5]:
# Pair each tweet with its label, one row per tweet.
train_dic = dict(Tweets=train_tweets, Labels=train_labels)
train_corpus = pd.DataFrame(data=train_dic)

train_corpus.head()

Unnamed: 0,Tweets,Labels
0,نحن الذين يتحول كل ما نود أن نقوله إلى دعاء لل...,positive
1,وفي النهاية لن يبقىٰ معك آحدإلا من رأىٰ الجمال...,positive
2,نمش ننوم ما دا ديل ولادنا 💚\n,positive
3,تعدل النت وشفتها ✌\n,positive
4,"🎥 المهمة الأولى في ""جدة"" ✔💪🏼 💙 #الهلال #فيديو_...",positive


# EDA

##### Explore your dataset

In [6]:
# (rows, columns) of the combined positive + negative corpus.
train_corpus.shape

(58164, 2)

In [7]:
# Last rows of the frame; negatives were appended after positives, so these come from the negative folder.
train_corpus.tail()

Unnamed: 0,Tweets,Labels
58159,#أمي فقيدتي وأن مرت الأيام.. وبدأ الجميع بنسيا...,negative
58160,مره في السنه ما كل اسبوع عاد 😢\n,negative
58161,#يوم_الجمعه اسال الله عز وجل في هذا اليوم الفض...,negative
58162,يعني الغاء العقود الاولي كانت تسكيته لنا شسالف...,negative
58163,الفار 🐀 في عهد خليل جلال 😲\n,negative


In [8]:
# Summary statistics per column, transposed so that each column becomes one row.
train_corpus.describe().round().T

Unnamed: 0,count,unique,top,freq
Tweets,58164,36419,بمناسبة فوز الهلال .. 💙 سحب على آيفون XR📱 رتوي...,473
Labels,58164,2,positive,29262


In [9]:
# Column dtypes, non-null counts and memory usage.
train_corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58164 entries, 0 to 58163
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Tweets  58164 non-null  object
 1   Labels  58164 non-null  object
dtypes: object(2)
memory usage: 908.9+ KB


In [10]:
# Number of missing values in each column.
train_corpus.isna().sum()

Tweets    0
Labels    0
dtype: int64

In [11]:
# Class balance: how many tweets carry each label.
train_corpus["Labels"].value_counts()

Labels
positive    29262
negative    28902
Name: count, dtype: int64

In [12]:
# Count rows that repeat an earlier row (same tweet text and same label).
train_corpus.duplicated().sum()

21619

# Data Preprocessing

### Shuffle all rows

In [13]:
# Shuffle all rows so positive and negative tweets are interleaved, then renumber
# the index from 0. The fixed random_state makes the row order, and therefore the
# later positional 80/20 split, identical on every "Restart & Run All".
df = train_corpus.sample(frac=1, random_state=42).reset_index(drop=True)
df

Unnamed: 0,Tweets,Labels
0,افا ليش 💔\n,negative
1,#يسقط_حكم_تميم نصر_اللات المكنى بزميره_ابليس ي...,positive
2,گم #ﻫﻲ ﺻﻌبة لحظﺎﺕ #ﺍﻹﺷتيﺎﻕ ﻟﻤﻦ😔 ﻻ ﻳﻤگن ﺭوﻳﺘﻬﻢ ...,negative
3,اوه مااي قادد 😱😱 نهاية ترايجيدية بكل معنى الكل...,negative
4,سيم سيم 😢\n,negative
...,...,...
58159,سحب على مبلغ مالي 💰 لمتابعي #كشكول 👍🏻 المطلوب:...,positive
58160,“علامة حب الله 🌱 قال ابن أبي الحواري : علامة ح...,positive
58161,🛑همتكم مساعدتها في تعقيم واخصاء العدد٢ عشان تر...,negative
58162,سمراء تكحلت فأربكت قلب ذاك الذي تاب عن العشق 💚\n,positive


### Data cleaning

**Hint: remove URLs, Hashtags, alphanumeric characters, punctuation marks, stop words, extra spaces**

In [14]:
# Regular expressions used by process_text(); every match is deleted from the tweet.

# "http://" or "https://" followed by a run of URL characters (letters, digits,
# listed symbols, or %XX escapes).
URL_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
# A '#' together with the word characters that follow it (the whole hashtag goes).
hashtag_pattern = r"#\w+"
# An '@' together with the word characters that follow it (user mentions).
mention_pattern = r"@\w+"
# Any run of word characters that contains at least one digit.
alphanumeric_pattern = r"\w*\d\w*"
# Any single character that is neither a word character nor whitespace.
punctuation_pattern = r"[^\w\s]"
# "RT" followed by whitespace, only at the very start of the text.
retweet_pattern = r"^RT[\s]+"

In [15]:
def load_stopwords(file_path):
    """Read a stop-word list (one word per line) into an immutable set.

    Args:
        file_path: Path to a UTF-8 encoded text file, with or without a BOM.

    Returns:
        frozenset containing every non-empty line with surrounding whitespace
        stripped.
    """
    # 'utf-8-sig' drops a leading byte-order mark when present. With plain
    # 'utf-8' the BOM stays attached to the first word, which then never matches.
    with open(file_path, 'r', encoding="utf-8-sig") as f:
        stripped = (line.strip() for line in f)
        # Blank lines would only add a useless '' entry, so they are skipped.
        return frozenset(word for word in stripped if word)

def process_text(text, stop_words):
    """Clean one tweet and return its remaining tokens joined by single spaces.

    The module-level patterns are applied in a fixed order, each match being
    deleted: URLs, hashtags, mentions, digit-bearing tokens, punctuation and a
    leading "RT" marker. Afterwards every token whose lowercase form is in
    ``stop_words`` is dropped.

    Args:
        text: Raw tweet text.
        stop_words: Container of stop words supporting ``in`` lookups.

    Returns:
        The cleaned tweet as a string (possibly empty).
    """
    removal_order = (
        URL_pattern,
        hashtag_pattern,
        mention_pattern,
        alphanumeric_pattern,
        punctuation_pattern,
        retweet_pattern,
    )
    for pattern in removal_order:
        text = re.sub(pattern, '', text)

    kept_tokens = [token for token in text.split() if token.lower() not in stop_words]
    # Tokens come from split(), so joining with one space leaves no extra whitespace.
    return ' '.join(kept_tokens)

#### Now clean your text using the function above, or implement it from scratch

In [16]:
# Resolve the stop-word file from the working directory, the same way the tweet
# folders are located, instead of a machine-specific absolute path.
# NOTE(review): assumes the notebook is run from the folder that contains
# 'corpus/' and that Stop_Words.txt sits directly inside it — confirm.
stopwords = load_stopwords(os.path.join(os.getcwd(), 'corpus', 'Stop_Words.txt'))

# Clean every tweet in a single pass over the column; this replaces the
# manual-counter loop that wrote each row back through .loc.
df["Tweets"] = df["Tweets"].apply(lambda tweet: process_text(tweet, stopwords))

In [97]:
# Inspect the cleaned tweets. The saved output below was produced by a later
# execution (In [97]), which is why it already shows the 'length' column that
# is only created in the Tokenizer section.
df

Unnamed: 0,Tweets,Labels,length
0,افا ليش,negative,2
1,نصر_اللات المكنى بزميره_ابليس يقول خميني عربي ...,positive,17
2,گم ﺻﻌبة لحظﺎﺕ ﻟﻤﻦ ﻻ ﻳﻤگن ﺭوﻳﺘﻬﻢ ﺣﺘﻰ,negative,8
3,اوه مااي قادد ترايجيدية بكل معنى الكلمة ابددعو...,negative,16
4,سيم سيم,negative,2
...,...,...,...
58159,سحب مبلغ مالي لمتابعي المطلوب شي بس رتويت السح...,positive,12
58160,علامة حب الله ابن أبي الحواري علامة حب الله حب...,positive,20
58161,همتكم مساعدتها تعقيم واخصاء عشان تروح تبني وتت...,negative,11
58162,سمراء تكحلت فأربكت قلب ذاك تاب العشق,positive,7


#### Extra: you could do stemming or lemmatization before training

# Tokenizer

In [19]:
# Token count of each cleaned tweet. split() without an argument returns [] for
# an empty string, so tweets that were emptied by cleaning count as 0 tokens
# (split(" ") would report 1 for them and bias the average upwards).
df["length"] = df["Tweets"].apply(lambda x: len(x.split()))
df

Unnamed: 0,Tweets,Labels,length
0,افا ليش,negative,2
1,نصر_اللات المكنى بزميره_ابليس يقول خميني عربي ...,positive,17
2,گم ﺻﻌبة لحظﺎﺕ ﻟﻤﻦ ﻻ ﻳﻤگن ﺭوﻳﺘﻬﻢ ﺣﺘﻰ,negative,8
3,اوه مااي قادد ترايجيدية بكل معنى الكلمة ابددعو...,negative,16
4,سيم سيم,negative,2
...,...,...,...
58159,سحب مبلغ مالي لمتابعي المطلوب شي بس رتويت السح...,positive,12
58160,علامة حب الله ابن أبي الحواري علامة حب الله حب...,positive,20
58161,همتكم مساعدتها تعقيم واخصاء عشان تروح تبني وتت...,negative,11
58162,سمراء تكحلت فأربكت قلب ذاك تاب العشق,positive,7


In [51]:
# Average number of tokens per tweet, converted to an integer. Later cells use
# this value as the padded sequence length (maxlen / input_length).
mean=df["length"].mean().astype(int)
mean

9

In [98]:
# Plain Python list of the cleaned tweets, each forced to str, for the tokenizer.
corpus = [str(tweet) for tweet in df["Tweets"]]

In [99]:
# Sanity check: one corpus entry per row of df.
len(corpus)

58164

In [54]:
# Learn the vocabulary from the cleaned tweets and turn each tweet into a list
# of integer word ids; unknown words map to the '<oov>' token.
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)

# One more than the number of known words; used as embedding input size and
# as the width of the softmax output.
vocabulary = tokenizer.word_index
num_classes = 1 + len(vocabulary)

print("Total number of words: ", num_classes)

Total number of words:  72840


# Text to sequence

In [55]:
# Build next-word training pairs: for every tweet, each proper prefix of its
# token ids is an input and the token right after that prefix is the label.
input_sequences, labels = [], []
for token_ids in sequences:
    for end in range(1, len(token_ids)):
        input_sequences.append(token_ids[:end])
        labels.append(token_ids[end])

# Pad sequence

In [56]:
# Bring every prefix to exactly `mean` token ids so the inputs form one rectangular array.
# NOTE(review): with the pad_sequences defaults, shorter prefixes are presumably
# zero-padded and longer ones truncated at the front — confirm against the Keras docs.
input_sequences = pad_sequences(input_sequences, maxlen=mean)

# RNN Model

In [57]:
# Positional hold-out split: the first 80% of the samples train the models,
# the remaining 20% are kept for evaluation.
split_ratio = 0.8 # 80% for the train
split_index = int(split_ratio * len(input_sequences))

x_train, x_test = input_sequences[:split_index], input_sequences[split_index:]
y_train, y_test = labels[:split_index], labels[split_index:]

In [64]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence yielding (padded inputs, one-hot next-word targets) batches.

    Every sample is served exactly once per epoch: batch ``index`` covers a
    fixed slice of the current sample order, so evaluation sees the whole
    dataset and gives repeatable results.

    Args:
        tokenizer: Fitted tokenizer; stored on the instance, not used for batching.
        sequences: Indexable collection of token-id sequences (model inputs).
        labels: Integer target word id for each entry of ``sequences``.
        batch_size: Number of samples per batch.
        max_sequence_length: Length each input is padded/truncated to.
        num_classes: Width of the one-hot target vectors.
        shuffle: When True (default) the sample order is reshuffled at
            construction and after every epoch.
    """

    def __init__(self, tokenizer, sequences, labels, batch_size, max_sequence_length, num_classes,
                 shuffle=True):
        super().__init__()
        self.tokenizer = tokenizer
        self.sequences = sequences
        self.labels = labels
        self.batch_size = batch_size
        self.max_sequence_length = max_sequence_length
        self.num_classes = num_classes
        self.shuffle = shuffle
        # Order in which samples are served during the current epoch.
        self._order = np.arange(len(self.sequences))
        if self.shuffle:
            np.random.shuffle(self._order)

    def __len__(self):
        # Ceiling division so the last, possibly smaller, batch is not dropped.
        return -(-len(self.sequences) // self.batch_size)

    def __getitem__(self, index):
        # Use the requested batch index; drawing a fresh random batch here would
        # make an epoch skip some samples and repeat others.
        start = index * self.batch_size
        batch_indices = self._order[start:start + self.batch_size]
        batch_sequences = [self.sequences[i] for i in batch_indices]
        batch_labels = [self.labels[i] for i in batch_indices]
        x = pad_sequences(batch_sequences, maxlen=self.max_sequence_length)
        y = self.one_hot_encode(batch_labels)

        return x, y

    def on_epoch_end(self):
        # Keras calls this hook between epochs; reshuffle for the next pass.
        if self.shuffle:
            np.random.shuffle(self._order)

    def one_hot_encode(self, labels):
        """Return a float32 array of shape (len(labels), num_classes) with a 1.0 at each label id."""
        encoded_labels = np.zeros((len(labels), self.num_classes), dtype=np.float32)
        # Explicit integer dtype keeps the indexing valid for an empty batch too.
        columns = np.asarray(labels, dtype=np.intp)
        encoded_labels[np.arange(len(labels)), columns] = 1.0
        return encoded_labels

# Split data to train and test

In [65]:
# Both generators share the same batching settings; only the data differs.
generator_settings = dict(batch_size=32, max_sequence_length=mean, num_classes=num_classes)

train_data_generator = DataGenerator(tokenizer, x_train, y_train, **generator_settings)
test_data_generator = DataGenerator(tokenizer, x_test, y_test, **generator_settings)

In [67]:
# Next-word model: embedding -> two stacked SimpleRNN layers -> softmax over
# the whole vocabulary.
model = Sequential()
model.add(Embedding(input_dim=num_classes, output_dim=100, input_length=mean))
# return_sequences=True so the second recurrent layer receives the full sequence.
model.add(SimpleRNN(100, return_sequences=True))
model.add(SimpleRNN(100))
model.add(Dense(units=num_classes, activation='softmax'))

# Targets from the generator are one-hot vectors, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# The generator already yields complete batches (its size is fixed when it is
# constructed), so batch_size is not passed to fit(): Keras documents that it
# must not be specified for Sequence/generator inputs.
model.fit(train_data_generator, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1cf653e97f0>

# the time to run the RNN on the train set is 554 minutes ≈  9.233 hours.

# LSTM Model

In [68]:
# Next-word model: embedding -> single LSTM layer -> softmax over the whole vocabulary.
model_LSTM = Sequential()
model_LSTM.add(Embedding(input_dim=num_classes, output_dim=100, input_length=mean))
model_LSTM.add(LSTM(units=128))
model_LSTM.add(Dense(units=num_classes, activation='softmax'))

# Targets from the generator are one-hot vectors, hence categorical cross-entropy.
model_LSTM.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# The generator already yields complete batches (its size is fixed when it is
# constructed), so batch_size is not passed to fit(): Keras documents that it
# must not be specified for Sequence/generator inputs.
model_LSTM.fit(train_data_generator, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x1cf671d52e0>

#  the time to run the LSTM on the train set is 214 minutes ≈ 3.5667 hours.

# Evaluation and Comparison

In [69]:
# Layer-by-layer overview of the SimpleRNN model with output shapes and parameter counts.
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 9, 100)            7284000   
                                                                 
 simple_rnn_10 (SimpleRNN)   (None, 9, 100)            20100     
                                                                 
 simple_rnn_11 (SimpleRNN)   (None, 100)               20100     
                                                                 
 dense_6 (Dense)             (None, 72840)             7356840   
                                                                 
Total params: 14681040 (56.00 MB)
Trainable params: 14681040 (56.00 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [71]:
# Layer-by-layer overview of the LSTM model with output shapes and parameter counts.
model_LSTM.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 9, 100)            7284000   
                                                                 
 lstm_1 (LSTM)               (None, 128)               117248    
                                                                 
 dense_7 (Dense)             (None, 72840)             9396360   
                                                                 
Total params: 16797608 (64.08 MB)
Trainable params: 16797608 (64.08 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [72]:
from tensorflow import keras

# Persist both trained networks as HDF5 files in the working directory.
for trained_model, h5_path in ((model, "model.h5"), (model_LSTM, "model_LSTM.h5")):
    keras.models.save_model(trained_model, h5_path)


  keras.models.save_model(model, "model.h5")
  keras.models.save_model(model_LSTM, "model_LSTM.h5")


In [73]:
# Score the SimpleRNN model on the held-out generator and report loss, then accuracy.
loss, accuracy = model.evaluate(test_data_generator)
print("Evaulation for RNN model:", loss, accuracy, sep="\n")

Evaulation for RNN model:
6.536467552185059
0.4504719078540802


In [74]:
# Score the LSTM model on the held-out generator and report loss, then accuracy.
loss_LSTM, accuracy_LSTM = model_LSTM.evaluate(test_data_generator)
print("Evaulation for LSTM model:", loss_LSTM, accuracy_LSTM, sep="\n")

Evaulation for LSTM model:
6.827205657958984
0.3848962187767029


In [89]:
def predict_next_word(seed_text, num_of_words):
    """Extend ``seed_text`` with words predicted greedily by the SimpleRNN model.

    On every step the current text is tokenized and padded to ``mean`` ids, the
    model scores the vocabulary, and the highest-scoring word is appended.

    Args:
        seed_text: Starting text.
        num_of_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words. Generation stops early
        if the model picks an index that has no word attached.
    """
    for _ in range(num_of_words):
        input_sequence = tokenizer.texts_to_sequences([seed_text])
        input_sequence = pad_sequences(input_sequence, maxlen=mean)
        # verbose=0: do not print a progress bar for every generated word.
        predictions = model.predict(input_sequence, verbose=0)

        predicted_word_index = int(predictions.argmax(axis=1)[0])
        # The output layer has one more unit than there are words (the +1 in
        # num_classes), so an index without an index_word entry can win; stop
        # there instead of raising KeyError.
        predicted_word = tokenizer.index_word.get(predicted_word_index)
        if predicted_word is None:
            break
        seed_text += ' ' + predicted_word
    return seed_text

In [90]:
def predict_next_word_LSTM(seed_text, num_of_words):
    """Extend ``seed_text`` with words predicted greedily by the LSTM model.

    On every step the current text is tokenized and padded to ``mean`` ids, the
    model scores the vocabulary, and the highest-scoring word is appended.

    Args:
        seed_text: Starting text.
        num_of_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words. Generation stops early
        if the model picks an index that has no word attached.
    """
    for _ in range(num_of_words):
        input_sequence = tokenizer.texts_to_sequences([seed_text])
        input_sequence = pad_sequences(input_sequence, maxlen=mean)
        # verbose=0: do not print a progress bar for every generated word.
        predictions = model_LSTM.predict(input_sequence, verbose=0)

        predicted_word_index = int(predictions.argmax(axis=1)[0])
        # The output layer has one more unit than there are words (the +1 in
        # num_classes), so an index without an index_word entry can win; stop
        # there instead of raising KeyError.
        predicted_word = tokenizer.index_word.get(predicted_word_index)
        if predicted_word is None:
            break
        seed_text += ' ' + predicted_word
    return seed_text

In [84]:
# Starting words from which both models generate a continuation.
seed_words = ["السعودية", "النصر", "قال", "علي", "الهلال"]

# RNN 

In [85]:
import random

# One SimpleRNN continuation per seed word; each gets a random length of 1 to 9 words.
samples = {
    seed: predict_next_word(seed, random.randint(1, 9))
    for seed in seed_words
}





In [86]:
# Seed word next to the text generated from it, one row per seed.
pd.DataFrame(samples.items(), columns=["start", "predicted"])

Unnamed: 0,start,predicted
0,السعودية,السعودية مسوي مكان فاضي ايش يسوي السبت يعني هسه
1,النصر,النصر ي آجمل صباحات العمر حبا
2,قال,قال ريال ل فائز ماذا
3,علي,علي أبي طالب رضي الله عنه رسول الله
4,الهلال,الهلال ينتصر لكنه بعيد جدا


# LSTM

In [91]:
import random

# One LSTM continuation per seed word; each gets a random length of 1 to 9 words.
# This replaces the `samples` dict built in the RNN section.
samples = {
    seed: predict_next_word_LSTM(seed, random.randint(1, 9))
    for seed in seed_words
}



In [92]:
# Seed word next to the text generated from it, one row per seed.
pd.DataFrame(samples.items(), columns=["start", "predicted"])

Unnamed: 0,start,predicted
0,السعودية,السعودية الله الله أكبر لاحول ولاقوة بالله
1,النصر,النصر الهلال ايفون xr مقدم أحد الشخصيات الشخصيات
2,قال,قال يا الله
3,علي,علي الله الله وملائكته يصلون النبي ﷺ ﷺ
4,الهلال,الهلال إذن بكل بساطة إنتظار هدايا الحكم


In [94]:
# Side-by-side recap of both models: manually recorded training time plus the
# loss and accuracy measured in the evaluation cells.
summary_rows = (
    ("RNN :", "Time for training :  9.233 hours", loss, accuracy),
    ("LSTM :", "Time for training : 3.5667 hours", loss_LSTM, accuracy_LSTM),
)
for position, (title, timing, test_loss, test_accuracy) in enumerate(summary_rows):
    if position:
        print("#"*30)  # separator between the two reports
    print(title)
    print(timing)
    print("loss", test_loss)
    print("accuracy", test_accuracy)

RNN :
Time for training :  9.233 hours
loss 6.536467552185059
accuracy 0.4504719078540802
##############################
LSTM :
Time for training : 3.5667 hours
loss 6.827205657958984
accuracy 0.3848962187767029


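The RNN model performs best in this experiment: it reaches a lower test loss (6.54 vs. 6.83) and a higher test accuracy (0.45 vs. 0.38) than the LSTM model. The most likely reason is the amount of training rather than the architecture: the RNN was trained for the highest number of epochs (10), whereas the LSTM, which performs worst, was trained for the lowest number (3), so the two runs are not directly comparable.
If we increase the number of epochs, we expect better results from both models.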