In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB      # ✅ مهم
from sklearn.preprocessing import MinMaxScaler      # ✅ لو تستخدمينه في FE
from sklearn.metrics import accuracy_score, classification_report

In [7]:
df = pd.read_csv("Restaurant reviews.csv")  
# Load the dataset from a CSV file into a DataFrame

df = df[["Restaurant", "Review", "Rating"]].copy()  
# Keep only the important columns (Restaurant, Review, Rating)

df.dropna(subset=["Restaurant", "Review", "Rating"], inplace=True)  
# Remove rows that have missing values in these columns

df["Rating"] = df["Rating"].astype(str).str.strip()  
# Convert Rating to string and remove any extra spaces

df["Rating"] = pd.to_numeric(df["Rating"], errors="coerce")  
# Convert Rating to numeric values, and force invalid values to NaN

df.dropna(subset=["Rating"], inplace=True)  
# Remove rows where Rating could not be converted to a number

# (اختياري لكن مفيد) نتأكد أن الرِيتينغ بين 1 و 5
df = df[(df["Rating"] >= 1) & (df["Rating"] <= 5)]

print("Data Shape after cleaning:", df.shape)

Data Shape after cleaning: (9954, 3)


In [8]:
# =================================================
# 2. CREATE SENTIMENT LABEL (BINARY)
# =================================================
# 0 = Negative (1 أو 2)
# 1 = Positive (4 أو 5)
# نحذف Rating = 3 لأنه محايد ويخبّص الدقة

df = df[df["Rating"] != 3]

df["Sentiment"] = np.where(df["Rating"] >= 4, 1, 0)

print(df["Sentiment"].value_counts())
print("Unique Sentiments:", df["Sentiment"].unique())

Sentiment
1    6268
0    2494
Name: count, dtype: int64
Unique Sentiments: [1 0]


In [9]:
# =================================================
# 3. TRAIN–TEST SPLIT (TEXT ONLY FOR NOW)
# =================================================

X_text = df["Review"]          # النص
y = df["Sentiment"]            # 0 / 1

X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train size:", X_train_text.shape[0])
print("Test size:", X_test_text.shape[0])

Train size: 7009
Test size: 1753


In [10]:
# =================================================
# 4. BASELINE MODEL (TF-IDF + NAIVE BAYES)
# =================================================

baseline_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        stop_words="english",
        ngram_range=(1, 2),     # كلمات + bigrams مثل "not good"
        max_features=5000
    )),
    ("model", MultinomialNB(alpha=1.0))
])

# Train the baseline model
baseline_pipeline.fit(X_train_text, y_train)

# Predict on test set
y_pred_base = baseline_pipeline.predict(X_test_text)

# Accuracy
acc_base = accuracy_score(y_test, y_pred_base)

print("\n===== BASELINE MODEL (Naive Bayes) =====")
print("Accuracy:", acc_base)
print("\nClassification Report (Baseline):")
print(classification_report(y_test, y_pred_base))


===== BASELINE MODEL (Naive Bayes) =====
Accuracy: 0.91500285225328

Classification Report (Baseline):
              precision    recall  f1-score   support

           0       0.94      0.75      0.83       499
           1       0.91      0.98      0.94      1254

    accuracy                           0.92      1753
   macro avg       0.92      0.87      0.89      1753
weighted avg       0.92      0.92      0.91      1753



In [11]:
# =================================================
# 5. IMPROVEMENT: TF-IDF + Logistic Regression + GridSearch (L1 / L2)
# =================================================

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Pipeline: TF-IDF → Logistic Regression
logreg_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        stop_words="english",
        ngram_range=(1, 2),
        max_features=5000
    )),
    ("model", LogisticRegression(
        max_iter=5000,
        solver="liblinear"   # يدعم L1 و L2
    ))
])

# GridSearch على نوع الـ penalty و قيمة C
param_grid = {
    "model__penalty": ["l1", "l2"],         # L1 vs L2
    "model__C": [0.01, 0.1, 1, 10]          # قوة الانتظام (regularization)
}

grid = GridSearchCV(
    estimator=logreg_pipeline,
    param_grid=param_grid,
    cv=5,                 # 5-fold cross-validation
    scoring="accuracy",
    n_jobs=-1,
    verbose=1
)

# Train with GridSearch
grid.fit(X_train_text, y_train)

print("\n===== GRIDSEARCH RESULTS (Logistic Regression) =====")
print("Best Params:", grid.best_params_)
print("Best CV Score:", grid.best_score_)

# Use best model on test set
best_logreg_model = grid.best_estimator_

y_pred_grid = best_logreg_model.predict(X_test_text)
acc_grid = accuracy_score(y_test, y_pred_grid)

print("\n===== BEST LOGISTIC MODEL ON TEST SET =====")
print("Accuracy:", acc_grid)
print("\nClassification Report (Best Logistic):")
print(classification_report(y_test, y_pred_grid))

Fitting 5 folds for each of 8 candidates, totalling 40 fits

===== GRIDSEARCH RESULTS (Logistic Regression) =====
Best Params: {'model__C': 10, 'model__penalty': 'l2'}
Best CV Score: 0.9279498748092101

===== BEST LOGISTIC MODEL ON TEST SET =====
Accuracy: 0.9235596120935539

Classification Report (Best Logistic):
              precision    recall  f1-score   support

           0       0.90      0.83      0.86       499
           1       0.93      0.96      0.95      1254

    accuracy                           0.92      1753
   macro avg       0.91      0.89      0.90      1753
weighted avg       0.92      0.92      0.92      1753

