Because the dataset is not stored on Google Drive, it is uploaded directly from the local machine to Colab.

In [2]:
import os

from google.colab import files

# Ask for an upload only when the dataset is not already in the working
# directory. files.upload() blocks on an interactive browser widget, so an
# unconditional call stalls every re-run of the notebook, and re-uploading a
# file that already exists may leave a renamed duplicate next to the original
# (NOTE(review): confirm Colab's duplicate-naming behaviour).
if os.path.exists("ar_reviews_100k.csv"):
    # Nothing was uploaded in this run; keep `uploaded` defined for later cells.
    uploaded = {}
else:
    uploaded = files.upload()


Saving ar_reviews_100k.csv to ar_reviews_100k.csv


Importing the required libraries

In [3]:
!pip install gensim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Conv1D, MaxPooling1D




Loading and Preprocessing the Dataset

In [4]:
import nltk
nltk.download('punkt')
# Newer NLTK releases load the tokenizer models from the 'punkt_tab' resource;
# on older releases this extra request is harmless (the downloader only
# reports that the package is unknown).
nltk.download('punkt_tab')
nltk.download('stopwords')

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import ISRIStemmer

# Loading the dataset (tab-separated file with a 'label' and a 'text' column)
df = pd.read_csv("ar_reviews_100k.csv", encoding='utf-8', sep='\t')

# Displaying the first few rows of the dataset so we understand its structure
print("Sample of the dataset:")
print(df.head())

# Printing the column names in the dataset
print("\nColumn names in the dataset:")
print(df.columns)

# Handling missing values BEFORE encoding the labels: once the labels are
# integer-encoded a missing label is no longer NaN, so dropna() could not
# remove it afterwards.
df = df.dropna()

# Encoding the sentiment labels as integers (one id per distinct label string)
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Text cleaning: remove every character that is neither a word character nor
# whitespace. regex=True is required explicitly: recent pandas versions treat
# the pattern as a literal string by default, which would remove nothing.
# The raw string avoids invalid-escape warnings for \w and \s.
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)

# Text tokenization: each review becomes a list of tokens
df['text'] = df['text'].apply(word_tokenize)

# Removing stopwords
stop_words = set(stopwords.words('arabic'))
df['text'] = df['text'].apply(lambda x: [word for word in x if word.lower() not in stop_words])

# Stemming every remaining token with the ISRI stemmer
stemmer = ISRIStemmer()
df['text'] = df['text'].apply(lambda x: [stemmer.stem(word) for word in x])

# Splitting the dataset into training and testing sets (80% / 20%, fixed seed)
train_data, test_data, train_labels, test_labels = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# Displaying the shape of the training and testing sets
print("\nShape of training data:", train_data.shape)
print("Shape of testing data:", test_data.shape)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Sample of the dataset:
      label                                               text
0  Positive  ممتاز نوعا ما . النظافة والموقع والتجهيز والشا...
1  Positive  أحد أسباب نجاح الإمارات أن كل شخص في هذه الدول...
2  Positive  هادفة .. وقوية. تنقلك من صخب شوارع القاهرة الى...
3  Positive  خلصنا .. مبدئيا اللي مستني ابهار زي الفيل الاز...
4  Positive  ياسات جلوريا جزء لا يتجزأ من دبي . فندق متكامل...

Column names in the dataset:
Index(['label', 'text'], dtype='object')


  df['text'] = df['text'].str.replace('[^\w\s]', '')



Shape of training data: (79999,)
Shape of testing data: (20000,)


Tokenizing the text and padding the sequences


In [5]:
# Tokenizing the text: the vocabulary is learned from the training reviews only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data)

# Converting the token lists to sequences of integer word ids
train_sequences = tokenizer.texts_to_sequences(train_data)
test_sequences = tokenizer.texts_to_sequences(test_data)

# Padding the sequences.
# The padded length is derived from the training sequences only, so that no
# statistic of the test set influences the shape of the model input.
max_len = max(len(seq) for seq in train_sequences)

# Zeros are appended after the tokens (padding='post'). A test sequence longer
# than max_len is cut at its end (truncating='post') so that the retained
# tokens stay aligned with the start of the review, matching the padding side.
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post', truncating='post')

Building and Training the RNN Model

In [None]:
# RNN Model: embedding -> two stacked bidirectional LSTMs -> dense classifier head

# Choose the output layer and loss from the number of classes the label
# encoder produced instead of hard-coding a binary head: a single sigmoid unit
# with binary cross-entropy cannot represent integer labels above 1.
# With two classes this is exactly the original sigmoid / binary_crossentropy setup.
num_classes = len(label_encoder.classes_)
if num_classes > 2:
    output_units = num_classes
    output_activation = 'softmax'
    loss_name = 'sparse_categorical_crossentropy'  # labels are integer class ids
else:
    output_units = 1
    output_activation = 'sigmoid'
    loss_name = 'binary_crossentropy'

model_rnn = Sequential()
# +1 because word ids start at 1; index 0 is the padding value
model_rnn.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_len))
# The first recurrent layer returns the full sequence so it can feed the second one
model_rnn.add(Bidirectional(LSTM(64, return_sequences=True)))
model_rnn.add(Bidirectional(LSTM(64)))
model_rnn.add(Dense(64, activation='relu'))
model_rnn.add(Dropout(0.5))
model_rnn.add(Dense(output_units, activation=output_activation))

# Compiling the model
model_rnn.compile(optimizer='adam', loss=loss_name, metrics=['accuracy'])

# Printing the model summary
print("RNN Model Summary:")
model_rnn.summary()

# Training the model; the last 20% of the training data is held out for validation
history_rnn = model_rnn.fit(train_padded, train_labels, epochs=5, batch_size=32, validation_split=0.2)


Building and Training the CNN Model (Conv1D followed by an LSTM layer)

In [None]:
# CNN Model: embedding -> 1D convolution + max pooling -> LSTM -> classifier head
# (a convolutional front-end whose pooled feature sequence is summarised by an LSTM)

# Choose the output layer and loss from the number of classes the label
# encoder produced instead of hard-coding a binary head: a single sigmoid unit
# with binary cross-entropy cannot represent integer labels above 1.
# With two classes this is exactly the original sigmoid / binary_crossentropy setup.
num_classes = len(label_encoder.classes_)
if num_classes > 2:
    output_units = num_classes
    output_activation = 'softmax'
    loss_name = 'sparse_categorical_crossentropy'  # labels are integer class ids
else:
    output_units = 1
    output_activation = 'sigmoid'
    loss_name = 'binary_crossentropy'

model_cnn = Sequential()
# +1 because word ids start at 1; index 0 is the padding value
model_cnn.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_len))
model_cnn.add(Conv1D(128, 5, activation='relu'))
model_cnn.add(MaxPooling1D(pool_size=4))
model_cnn.add(LSTM(100))
model_cnn.add(Dense(output_units, activation=output_activation))

# Compiling the model
model_cnn.compile(optimizer='adam', loss=loss_name, metrics=['accuracy'])

# Printing the model summary
print("CNN Model Summary:")
model_cnn.summary()

# Training the model; the last 20% of the training data is held out for validation
history_cnn = model_cnn.fit(train_padded, train_labels, epochs=5, batch_size=32, validation_split=0.2)


Evaluating Both Models on the Test Set

In [None]:
# Score the RNN model on the held-out test split; evaluate() yields the loss
# followed by the accuracy metric the model was compiled with.
rnn_scores = model_rnn.evaluate(test_padded, test_labels)
loss, accuracy = rnn_scores
print(f'RNN Model - Test Accuracy: {accuracy * 100:.2f}%')

# Score the CNN model the same way; `loss` and `accuracy` are reused, so after
# this cell they hold the CNN results.
cnn_scores = model_cnn.evaluate(test_padded, test_labels)
loss, accuracy = cnn_scores
print(f'CNN Model - Test Accuracy: {accuracy * 100:.2f}%')

