In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('/content/CompanyReviews.csv')
df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)
pd.set_option("display.max_columns",None)
df.head()

Unnamed: 0.1,Unnamed: 0,review_description,rating,company
0,0,رائع,1,talbat
1,1,برنامج رائع جدا يساعد على تلبيه الاحتياجات بشك...,1,talbat
2,2,التطبيق لا يغتح دائما بيعطيني لا يوجد اتصال با...,-1,talbat
3,3,لماذا لا يمكننا طلب من ماكدونالدز؟,-1,talbat
4,4,البرنامج بيظهر كل المطاعم و مغلقه مع انها بتكو...,-1,talbat


In [3]:
import re

# Regex: Arabic Unicode block
arabic_pattern = re.compile(r'[\u0600-\u06FF]')

# Keep rows that DO NOT contain Arabic
df_no_arabic = df[~df["review_description"].apply(lambda x: bool(arabic_pattern.search(str(x))))].reset_index(drop=True)

print(df_no_arabic.head())


   Unnamed: 0       review_description  rating company
0          44                        👎      -1  talbat
1          67  Wo 🙌🙌💋💋💋💋💋💋💙💙💙❤❤💖💖💖👍👍👍👌       1  talbat
2          73                3w2d32d32       0  talbat
3          79                       👍👏       1  talbat
4          81                      ugg       1  talbat


In [4]:
df = df[df["review_description"].apply(lambda x: bool(arabic_pattern.search(str(x))))].reset_index(drop=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38100 entries, 0 to 38099
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          38100 non-null  int64 
 1   review_description  38100 non-null  object
 2   rating              38100 non-null  int64 
 3   company             38100 non-null  object
dtypes: int64(2), object(2)
memory usage: 1.2+ MB


In [6]:
df.duplicated().sum()

0

In [7]:
df = df[df['rating'] != 0].reset_index(drop=True)
df['rating'].value_counts(), df['company'].value_counts()

(rating
  1    22349
 -1    13874
 Name: count, dtype: int64,
 company
 talbat           29007
 swvl              4202
 telecom_egypt     1901
 venus              263
 Raya               252
 TMG                232
 elsewedy           140
 hilton              94
 capiter             70
 Ezz Steel           42
 nestle              17
 domty                3
 Name: count, dtype: int64)

In [8]:
# !pip install langdetect
# !pip install googletrans==4.0.0-rc1

In [9]:
# from langdetect import detect
# from googletrans import Translator

# def translate_if_not_arabic(text):
#     try:
#         if detect(text) != "ar":
#             translator = Translator()
#             text = translator.translate(text, src="auto", dest="ar").text
#     except:
#         pass
#     return text

In [10]:
def normalize_franco(text):
    FRANCO_DICT = {
        "2": "أ", "3": "ع", "4": "غ", "5": "خ",
        "6": "ط", "7": "ح", "8": "ق", "9": "ص"
    }
    for k, v in FRANCO_DICT.items():
        text = re.sub(k, v, text)
    return text

In [11]:
def normalize_arabic(text):
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub('ؤ', 'و', text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub(r'(.)\1+', r'\1\1', text)
    return text

In [12]:
# !pip install pyarabic
# !pip install emoji

In [13]:
import pyarabic.araby as araby
import string, emoji

def clean_text(text):
    text = araby.strip_tashkeel(text)
    text = re.sub(r"[A-Za-z0-9]", " ", text)  # Latin + digits
    text = re.sub(rf"[{string.punctuation}]", " ", text)  # punctuation
    text = emoji.replace_emoji(text, replace="")  # remove emojis
    text = re.sub(r"\s+", " ", text).strip()  # extra spaces
    return text

In [14]:
import nltk
from nltk.corpus import stopwords

def remove_stopwords(tokens):
    nltk.download("stopwords", quiet=True)
    stop_words = set(stopwords.words("arabic"))
    tokens = [w for w in tokens if w not in stop_words]
    return tokens

In [15]:
# !pip install qalsadi

In [16]:
from nltk.stem.isri import ISRIStemmer

def stem_tokens(tokens):
    stemmer = ISRIStemmer()
    return [stemmer.stem(w) for w in tokens]

In [17]:
def arabic_preprocess(text, translate=True, use_stemming=True):
    # if not isinstance(text, str):
    #     return ""

    # if translate:
    #     text = translate_if_not_arabic(text)

    text = normalize_franco(text)
    text = normalize_arabic(text)
    text = clean_text(text)

    tokens = text.split()
    tokens = remove_stopwords(tokens)

    if use_stemming:
        tokens = stem_tokens(tokens)

    return tokens

In [18]:
df["review_description"] = df["review_description"].astype(str).apply(arabic_preprocess)
df.head()

Unnamed: 0.1,Unnamed: 0,review_description,rating,company
0,0,[رءع],1,talbat
1,1,"[رنمج, رءع, جدا, سعد, علي, لبه, حيج, شكل, سرع]",1,talbat
2,2,"[طبق, غتح, داء, يعط, وجد, تصل, شبك, انه, الن, ...",-1,talbat
3,3,"[لمذ, يمك, طلب, ماكدونالدز؟]",-1,talbat
4,4,"[رنمج, ظهر, طعم, غلق, انه, بتك, فتح, بقل, كده,...",-1,talbat


In [19]:
# !pip install scipy

In [20]:
# !pip install gensim

In [21]:
# import gensim.downloader as api

# model = api.load("word2vec-google-news-300")

In [22]:
# from gensim.models import Word2Vec
# w2v_model = Word2Vec(sentences=df["review_description"], vector_size=300, window=5, min_count=2)


In [23]:
def identity(x):
    return x

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    tokenizer=identity,
    preprocessor=identity,
    token_pattern=None,
    ngram_range=(1, 2),
    max_features=10000  # reduce overfitting
)

# Fit on your tokenized reviews
X_tfidf = tfidf.fit_transform(df["review_description"])

print("TF-IDF shape:", X_tfidf.shape)

TF-IDF shape: (36223, 10000)


In [24]:
from sklearn.model_selection import train_test_split

X = X_tfidf
y = df["rating"].replace(-1, 0).values
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=42)

X_train.shape,X_test.shape,y_train.shape,y_test.shape

((28978, 10000), (7245, 10000), (28978,), (7245,))

In [25]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers

NN = keras.Sequential([
    keras.layers.Dense(32, input_shape=(X_train.shape[1],), activation='relu',kernel_regularizer=regularizers.l2(0.001)),
    keras.layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    keras.layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

optimizer = keras.optimizers.Adam(learning_rate=0.01)
NN.compile(optimizer=optimizer,
           loss='binary_crossentropy',
           metrics=['accuracy'])

history = NN.fit(X_train, y_train, epochs=10, batch_size=32,
                 validation_data=(X_test, y_test))

Epoch 1/10
[1m906/906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - accuracy: 0.7781 - loss: 0.5956 - val_accuracy: 0.8443 - val_loss: 0.5127
Epoch 2/10
[1m906/906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.8610 - loss: 0.4975 - val_accuracy: 0.8446 - val_loss: 0.4918
Epoch 3/10
[1m906/906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.8606 - loss: 0.4868 - val_accuracy: 0.8522 - val_loss: 0.4883
Epoch 4/10
[1m906/906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.8657 - loss: 0.4649 - val_accuracy: 0.8511 - val_loss: 0.4826
Epoch 5/10
[1m906/906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.8654 - loss: 0.4697 - val_accuracy: 0.8392 - val_loss: 0.4902
Epoch 6/10
[1m906/906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step - accuracy: 0.8646 - loss: 0.4586 - val_accuracy: 0.8613 - val_loss: 0.4620
Epoch 7/10
[1m906

In [26]:
from sklearn.metrics import classification_report

y_pred_NN = NN.predict(X_test)

train_loss, train_accuracy = NN.evaluate(X_train, y_train, verbose=0)
test_loss, test_accuracy = NN.evaluate(X_test, y_test, verbose=0)

print(f'Train Accuracy: {train_accuracy:.4f},\nTrain Loss: {train_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f},\nTest Loss: {test_loss:.4f}')

y_pred = (y_pred_NN > 0.5).astype("int32")
print(classification_report(y_test, y_pred))


[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Train Accuracy: 0.8828,
Train Loss: 0.4120
Test Accuracy: 0.8581,
Test Loss: 0.4526
              precision    recall  f1-score   support

           0       0.80      0.84      0.82      2772
           1       0.90      0.87      0.88      4473

    accuracy                           0.86      7245
   macro avg       0.85      0.86      0.85      7245
weighted avg       0.86      0.86      0.86      7245



In [27]:
import pickle

with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

NN.save("arabic_sentiment_model.h5")




In [28]:
import pickle
import numpy as np
from tensorflow import keras

# Load saved TF-IDF and ANN model
with open("tfidf_vectorizer.pkl", "rb") as f:
    tfidf = pickle.load(f)

model = keras.models.load_model("arabic_sentiment_model.h5")

def predict_sentiment(review):
    tokens = arabic_preprocess(review)

    X_tfidf_review = tfidf.transform([tokens])

    # ANN expects dense array
    X_input = X_tfidf_review.toarray()

    # Predict probability
    prob = model.predict(X_input)[0][0]

    # Threshold 0.5 for binary classification
    predicted_class = 1 if prob >= 0.5 else 0

    mapping = {0: -1, 1: 1}
    return mapping[predicted_class]




In [29]:
# !pip install gradio

In [30]:
import gradio as gr

interface = gr.Interface(
    fn=predict_sentiment,
    inputs=gr.Textbox(lines=5, placeholder="أدخل تقييمك هنا ..."),
    outputs="text",
    title="Arabic Sentiment Analysis",
    description="أدخل مراجعة باللغة العربية ليقوم النموذج بتصنيفها على أنها سلبية (-1)، أو إيجابية (1)."
)

interface.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://88147cdf0ec53aa0b7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


