In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [11]:
df = pd.read_csv("Restaurant reviews.csv")
df

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures,7514
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5,"1 Review , 2 Followers",5/25/2019 15:54,0,2447.0
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5,"3 Reviews , 2 Followers",5/25/2019 14:20,0,
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5,"2 Reviews , 3 Followers",5/24/2019 22:54,0,
3,Beyond Flavours,Swapnil Sarkar,Soumen das and Arun was a great guy. Only beca...,5,"1 Review , 1 Follower",5/24/2019 22:11,0,
4,Beyond Flavours,Dileep,Food is good.we ordered Kodi drumsticks and ba...,5,"3 Reviews , 2 Followers",5/24/2019 21:37,0,
...,...,...,...,...,...,...,...,...
9995,Chinese Pavilion,Abhishek Mahajan,Madhumathi Mahajan Well to start with nice cou...,3,"53 Reviews , 54 Followers",6/5/2016 0:08,0,
9996,Chinese Pavilion,Sharad Agrawal,This place has never disappointed us.. The foo...,4.5,"2 Reviews , 53 Followers",6/4/2016 22:01,0,
9997,Chinese Pavilion,Ramandeep,"Bad rating is mainly because of ""Chicken Bone ...",1.5,"65 Reviews , 423 Followers",6/3/2016 10:37,3,
9998,Chinese Pavilion,Nayana Shanbhag,I personally love and prefer Chinese Food. Had...,4,"13 Reviews , 144 Followers",5/31/2016 17:22,0,


In [12]:
df.isnull().sum()

Restaurant       0
Reviewer        38
Review          45
Rating          38
Metadata        38
Time            38
Pictures         0
7514          9999
dtype: int64

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Restaurant  10000 non-null  object 
 1   Reviewer    9962 non-null   object 
 2   Review      9955 non-null   object 
 3   Rating      9962 non-null   object 
 4   Metadata    9962 non-null   object 
 5   Time        9962 non-null   object 
 6   Pictures    10000 non-null  int64  
 7   7514        1 non-null      float64
dtypes: float64(1), int64(1), object(6)
memory usage: 625.1+ KB


In [14]:
# الاحتفاظ فقط بالأعمدة المهمة
df = df[["Restaurant","Review", "Rating"]]

# حذف القيم الناقصة
df = df.dropna(subset=["Restaurant","Review", "Rating"])

# تنظيف النص من الفراغات
df["Rating"] = df["Rating"].astype(str).str.strip()

In [15]:
df.columns

Index(['Restaurant', 'Review', 'Rating'], dtype='object')

In [16]:
# ================================
# 4️⃣ إنشاء تصنيف (Sentiment) من النص
# ================================

def create_sentiment(review):
    positive_words = ["good", "great", "excellent", "amazing", "love", "nice", "perfect", "best"]
    negative_words = ["bad", "terrible", "awful", "poor", "hate", "worst", "dirty"]

    review = review.lower()

    pos_count = sum(word in review for word in positive_words)
    neg_count = sum(word in review for word in negative_words)

    if pos_count >= neg_count:
        return 1   # Positive
    else:
        return 0   # Negative

# إنشاء عمود التصنيف الجديد
df["Sentiment"] = df["Review"].apply(create_sentiment)

# التأكد من عدد كل فئة
print(df["Sentiment"].value_counts())


Sentiment
1    9107
0     848
Name: count, dtype: int64


In [23]:
X = df["Review"]     # Features 
y = df["Sentiment"] #Target

In [24]:
#split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [25]:
# إنشاء Pipeline تجمع بين معالجة النص + المودل
pipeline = Pipeline([
    
    # الخطوة الأولى: تحويل النص إلى أرقام باستخدام TF-IDF
    ("tfidf", TfidfVectorizer(stop_words="english")),
    
    # الخطوة الثانية: مودل Logistic Regression للتصنيف
    ("model", LogisticRegression(max_iter=1000))
])

In [26]:
pipeline.fit(X_train, y_train)

In [27]:
#predict
y_pred = pipeline.predict(X_test)

In [138]:
#eva
print("Accuracy:", accuracy_score(y_test, y_pred)) 
print(classification_report(y_test, y_pred))

Accuracy: 0.9472626820693119
              precision    recall  f1-score   support

           0       0.91      0.47      0.62       182
           1       0.95      1.00      0.97      1809

    accuracy                           0.95      1991
   macro avg       0.93      0.73      0.79      1991
weighted avg       0.95      0.95      0.94      1991



In [28]:
# test new predictions

new_reviews = [
    "The food was amazing and the service was great",
    "The place was dirty and the food was bad"
]

predictions = pipeline.predict(new_reviews)

for review, pred in zip(new_reviews, predictions):
    print(review, "=>", "Positive" if pred == 1 else "Negative")

The food was amazing and the service was great => Positive
The place was dirty and the food was bad => Negative


In [29]:
df.isnull().sum()

Restaurant    0
Review        0
Rating        0
Sentiment     0
dtype: int64

In [30]:
# ================================
# ✅ FEATURE ENGINEERING
# ================================

df["review_length"] = df["Review"].apply(len)
df["word_count"] = df["Review"].apply(lambda x: len(x.split()))
df["exclamation_count"] = df["Review"].apply(lambda x: x.count("!"))

# نسبة الأحرف الكبيرة
df["capital_ratio"] = df["Review"].apply(
    lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0
)

print(df.head())


        Restaurant                                             Review Rating  \
0  Beyond Flavours  The ambience was good, food was quite good . h...      5   
1  Beyond Flavours  Ambience is too good for a pleasant evening. S...      5   
2  Beyond Flavours  A must try.. great food great ambience. Thnx f...      5   
3  Beyond Flavours  Soumen das and Arun was a great guy. Only beca...      5   
4  Beyond Flavours  Food is good.we ordered Kodi drumsticks and ba...      5   

   Sentiment  review_length  word_count  exclamation_count  capital_ratio  
0          1            222          41                  0       0.031532  
1          1            144          27                  0       0.041667  
2          1            189          31                  0       0.047619  
3          1            148          28                  0       0.033784  
4          1            160          27                  0       0.050000  


In [31]:
X = df[["Review", "review_length", "word_count", "exclamation_count", "capital_ratio"]]
y = df["Sentiment"]

In [32]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [33]:
# ================================
# ✅ PIPELINE مع FEATURE ENGINEERING
# ================================

preprocessor = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(stop_words="english"), "Review"),
        ("num", StandardScaler(), ["review_length", "word_count", "exclamation_count", "capital_ratio"])
    ]
)

pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("model", LogisticRegression(max_iter=1000))
])
