In [1]:
import numpy as np               # Import NumPy for numerical operations and arrays
import pandas as pd              # Import Pandas for data loading and handling tables (DataFrames)

from sklearn.compose import ColumnTransformer          # To apply different preprocessing to different columns
from sklearn.feature_extraction.text import TfidfVectorizer  # To convert text into numerical features using TF-IDF
from sklearn.linear_model import LogisticRegression    # Logistic Regression model for classification
from sklearn.metrics import accuracy_score, classification_report  # To evaluate the model performance
from sklearn.model_selection import GridSearchCV, train_test_split  # Train/test splitting and hyper-parameter grid search
from sklearn.naive_bayes import MultinomialNB          # Naive Bayes classifier used by the baseline and feature-engineering models
from sklearn.pipeline import Pipeline                  # To create a machine learning pipeline (steps in one line)
from sklearn.preprocessing import MinMaxScaler, StandardScaler  # To scale numerical features ([0, 1] range / mean=0, std=1)

Data Collection and Data Cleaning (Preprocessing)

In [4]:
# Load the raw reviews and clean them in one chain:
#   1. keep only the columns used downstream (Restaurant, Review, Rating)
#   2. drop rows that are missing any of those columns
#   3. parse Rating as a number (whitespace trimmed first; unparseable values become NaN)
#   4. drop rows whose Rating could not be parsed
#   5. (optional but useful) keep only ratings inside the 1-5 range
df = (
    pd.read_csv("Restaurant reviews.csv")[["Restaurant", "Review", "Rating"]]
    .dropna(subset=["Restaurant", "Review", "Rating"])
    .assign(
        Rating=lambda frame: pd.to_numeric(
            frame["Rating"].astype(str).str.strip(), errors="coerce"
        )
    )
    .dropna(subset=["Rating"])
    .loc[lambda frame: frame["Rating"].between(1, 5)]
)

print("Data Shape after cleaning:", df.shape)

Target Creation (Label Engineering)

In [22]:
# =================================================
# 2. CREATE SENTIMENT LABEL (BINARY)
# =================================================
# 0 = Negative (rating 1 or 2)
# 1 = Positive (rating 4 or 5)
# Rating == 3 is dropped because it is neutral and hurts accuracy.
# NOTE(review): the rule below labels every remaining rating below 4 as 0, so a
# fractional rating such as 3.5 (if the data contains any) counts as negative -
# confirm this is intended.

# .copy() detaches the filtered rows from the frame they were selected from, so
# adding the "Sentiment" column below does not raise SettingWithCopyWarning.
df = df[df["Rating"] != 3].copy()

df["Sentiment"] = np.where(df["Rating"] >= 4, 1, 0)

print(df["Sentiment"].value_counts())
print("Unique Sentiments:", df["Sentiment"].unique())

Sentiment
1    6268
0    2494
Name: count, dtype: int64
Unique Sentiments: [1 0]


In [23]:
# =================================================
# 3. TRAIN–TEST SPLIT (TEXT ONLY FOR NOW)
# =================================================

# Input is the raw review text; the target is the binary sentiment label (0 / 1).
X_text, y = df["Review"], df["Sentiment"]

# Hold out 20% of the rows for testing. The split is stratified on the label and
# uses a fixed seed so it is reproducible.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

print("Train size:", len(X_train_text))
print("Test size:", len(X_test_text))

Train size: 7009
Test size: 1753


In [25]:
# =================================================
# 4. BASELINE MODEL (TF-IDF + NAIVE BAYES)
# =================================================
# MultinomialNB comes from sklearn.naive_bayes and must be imported before this
# cell runs; on a fresh kernel a missing import raises NameError here.

baseline_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        stop_words="english",
        ngram_range=(1, 2),     # single words + bigrams such as "not good"
        max_features=5000
    )),
    ("model", MultinomialNB(alpha=1.0))
])

# Train the baseline model on the training reviews only
baseline_pipeline.fit(X_train_text, y_train)

# Predict on the held-out test set
y_pred_base = baseline_pipeline.predict(X_test_text)

# Overall accuracy on the test set
acc_base = accuracy_score(y_test, y_pred_base)

print("\n===== BASELINE MODEL (Naive Bayes) =====")
print("Accuracy:", acc_base)
print("\nClassification Report (Baseline):")
print(classification_report(y_test, y_pred_base))


===== BASELINE MODEL (Naive Bayes) =====
Accuracy: 0.91500285225328

Classification Report (Baseline):
              precision    recall  f1-score   support

           0       0.94      0.75      0.83       499
           1       0.91      0.98      0.94      1254

    accuracy                           0.92      1753
   macro avg       0.92      0.87      0.89      1753
weighted avg       0.92      0.92      0.91      1753



In [27]:
# =================================================
# 5. FEATURE ENGINEERING + TF-IDF + NAIVE BAYES
# =================================================


def build_review_features(reviews: pd.Series) -> pd.DataFrame:
    """Build the model input: the review text plus four numeric text features.

    Parameters
    ----------
    reviews : pd.Series
        Review texts. The index is preserved in the returned frame.

    Returns
    -------
    pd.DataFrame
        Columns:
        - "Review": the input values, unchanged
        - "review_length": number of characters
        - "word_count": number of whitespace-separated tokens
        - "exclamation_count": number of "!" characters
        - "capital_ratio": upper-case characters / total characters
          (0 for an empty string)
    """
    text = reviews.astype(str)  # convert once instead of once per feature
    return pd.DataFrame(
        {
            "Review": reviews,
            "review_length": text.str.len(),
            "word_count": text.str.split().str.len(),
            "exclamation_count": text.str.count("!"),
            "capital_ratio": text.apply(
                lambda s: sum(1 for ch in s if ch.isupper()) / len(s) if s else 0
            ),
        }
    )


# Turn the train/test text back into DataFrames that also carry the new columns
X_train_fe = build_review_features(X_train_text)
X_test_fe = build_review_features(X_test_text)

num_features = ["review_length", "word_count", "exclamation_count", "capital_ratio"]

# ColumnTransformer: TF-IDF for the text + numeric features scaled to [0, 1]
fe_preprocessor = ColumnTransformer(
    transformers=[
        ("tfidf", TfidfVectorizer(
            stop_words="english",
            ngram_range=(1, 2),
            max_features=5000
        ), "Review"),
        # MinMaxScaler (instead of passthrough) keeps the numeric columns
        # non-negative, as MultinomialNB expects. clip=True also keeps held-out
        # values inside [0, 1] when they fall outside the range seen in training.
        ("num", MinMaxScaler(clip=True), num_features)
    ]
)

fe_pipeline = Pipeline([
    ("features", fe_preprocessor),
    ("model", MultinomialNB(alpha=1.0))
])

# Train the feature-engineering model
fe_pipeline.fit(X_train_fe, y_train)

# Predict with the feature-engineering model
y_pred_fe = fe_pipeline.predict(X_test_fe)

# Accuracy for the feature-engineering model
# NOTE(review): the recorded run shows the same accuracy and classification
# report as the TF-IDF-only baseline, so these four features appear to add no
# signal on this data - worth confirming before keeping them.
acc_fe = accuracy_score(y_test, y_pred_fe)

print("\n===== TF-IDF + FEATURE ENGINEERING (Naive Bayes + Scaled Numeric) =====")
print("Accuracy:", acc_fe)
print("\nClassification Report (FE):")
print(classification_report(y_test, y_pred_fe))



===== TF-IDF + FEATURE ENGINEERING (Naive Bayes + Scaled Numeric) =====
Accuracy: 0.91500285225328

Classification Report (FE):
              precision    recall  f1-score   support

           0       0.94      0.75      0.83       499
           1       0.91      0.98      0.94      1254

    accuracy                           0.92      1753
   macro avg       0.92      0.87      0.89      1753
weighted avg       0.92      0.92      0.91      1753



Feature Engineering Model

In [None]:
# Count the positive reviews (Sentiment == 1) per restaurant, then take the
# first five rows of the count table.
positive_counts = df.loc[df["Sentiment"] == 1, "Restaurant"].value_counts()
top_5_positive = positive_counts.iloc[:5]

print("Top 5 Restaurants by Positive Reviews:")
print(top_5_positive)

In [None]:
import matplotlib.pyplot as plt

# Number of positive reviews (Sentiment == 1) per restaurant
positive_counts = df[df["Sentiment"] == 1]["Restaurant"].value_counts()

# Keep only the top 5
top_5_positive = positive_counts.head(5)

# Pastel colour palette
colors = ["#66b3ff", "#99ff99", "#ffcc99", "#ff9999", "#c2c2f0"]

# Pull only the first (largest) slice outwards. plt.pie requires one explode
# value per slice, so the list is sized from the data rather than hard-coded
# to 5; this keeps the cell working when fewer than five restaurants qualify.
explode = [0.05 if position == 0 else 0 for position in range(len(top_5_positive))]

# Draw the chart
plt.figure(figsize=(9,9))
plt.pie(
    top_5_positive.values,
    labels=top_5_positive.index,
    autopct="%1.1f%%",
    startangle=140,
    colors=colors,
    explode=explode,
    shadow=True,
    wedgeprops={"edgecolor":"black", "linewidth":1}
)

plt.title("Top 5 Restaurants by Positive Reviews", fontsize=16, fontweight="bold")
plt.tight_layout()
plt.show()

In [None]:
# Count the negative reviews (Sentiment == 0) per restaurant, then take the
# first five rows of the count table.
negative_counts = df.loc[df["Sentiment"] == 0, "Restaurant"].value_counts()
worst_5_negative = negative_counts.iloc[:5]

print(" Worst 5 Restaurants by Negative Reviews:")
print(worst_5_negative)

In [None]:
import matplotlib.pyplot as plt

# Red-toned palette for the negative-review chart
colors = ["#ff4d4d", "#ff9999", "#ff6666", "#ffb3b3", "#ff8080"]

# Highlight the first slice (the restaurant with the most negative reviews).
# plt.pie requires one explode value per slice, so the list is sized from the
# data rather than hard-coded to 5; this keeps the cell working when fewer than
# five restaurants qualify.
explode = [0.05 if position == 0 else 0 for position in range(len(worst_5_negative))]

plt.figure(figsize=(9,9))
plt.pie(
    worst_5_negative.values,
    labels=worst_5_negative.index,
    autopct="%1.1f%%",
    startangle=140,
    colors=colors,
    explode=explode,
    shadow=True,
    wedgeprops={"edgecolor":"black", "linewidth":1}
)

plt.title("Worst 5 Restaurants by Negative Reviews", fontsize=16, fontweight="bold")
plt.tight_layout()
plt.show()