In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('Dataset .csv')

# Handle missing values: drop rows with any missing values
df_clean = df.dropna()

# Show the number of rows before and after cleaning
print("Original shape:", df.shape)
print("After dropping missing values:", df_clean.shape)

# Display the first few rows of the cleaned data
df_clean.head()

Original shape: (9551, 21)
After dropping missing values: (9542, 21)


Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


In [2]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical variables using LabelEncoder
categorical_cols = df_clean.select_dtypes(include='object').columns
le = LabelEncoder()
for col in categorical_cols:
    df_clean.loc[:, col] = le.fit_transform(df_clean[col].astype(str))

# Display the first few rows of the encoded data
df_clean.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,3742,162,73,8677,171,172,121.027535,14.565443,920,...,0,1,0,0,0,3,4.8,0,1,314
1,6304287,3167,162,73,6047,592,600,121.014101,14.553708,1111,...,0,1,0,0,0,3,4.5,0,1,591
2,6300002,2892,162,75,4676,308,314,121.056831,14.581404,1671,...,0,1,0,0,0,4,4.4,1,5,270
3,6318506,4700,162,75,8682,860,873,121.056475,14.585318,1126,...,0,0,0,0,0,4,4.9,0,1,365
4,6314302,5515,162,75,8681,860,873,121.057508,14.58445,1122,...,0,1,0,0,0,4,4.8,0,1,229


In [3]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = df_clean.drop('Aggregate rating', axis=1)
y = df_clean['Aggregate rating']

# Split the data: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show the shape of the splits
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (7633, 20)
Testing set shape: (1909, 20)


In [4]:
from sklearn.ensemble import RandomForestClassifier

# For classification, convert ratings to integer classes if needed
y_train_class = y_train.round().astype(int)
y_test_class = y_test.round().astype(int)

# Initialize and train the Random Forest Classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train_class)

# Predict on the test set
y_pred = clf.predict(X_test)

In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Evaluate the model's performance
accuracy = accuracy_score(y_test_class, y_pred)
precision = precision_score(y_test_class, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test_class, y_pred, average='weighted', zero_division=0)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print("\nClassification Report:\n", classification_report(y_test_class, y_pred, zero_division=0))

Accuracy: 0.98
Precision: 0.98
Recall: 0.98

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       431
           2       1.00      0.64      0.78        61
           3       0.97      1.00      0.99       732
           4       0.99      0.98      0.98       646
           5       0.70      0.82      0.75        39

    accuracy                           0.98      1909
   macro avg       0.93      0.89      0.90      1909
weighted avg       0.98      0.98      0.98      1909



In [6]:
import numpy as np

# Add cuisine labels back to test set for analysis
# (Assumes 'Cuisines' was encoded; get original mapping)
# If you have the original df, you can use it directly:
X_test_with_cuisine = X_test.copy()
X_test_with_cuisine['Cuisines'] = df.loc[X_test.index, 'Cuisines']
X_test_with_cuisine['True_Rating'] = y_test_class
X_test_with_cuisine['Predicted_Rating'] = y_pred

# Analyze accuracy per cuisine
cuisine_groups = X_test_with_cuisine.groupby('Cuisines')
cuisine_accuracy = {}

for cuisine, group in cuisine_groups:
    acc = np.mean(group['True_Rating'] == group['Predicted_Rating'])
    cuisine_accuracy[cuisine] = acc

# Show top and bottom cuisines by accuracy
sorted_acc = sorted(cuisine_accuracy.items(), key=lambda x: x[1], reverse=True)
print("Cuisine-wise accuracy (top 5):")
for cuisine, acc in sorted_acc[:5]:
    print(f"{cuisine}: {acc:.2f}")

print("\nCuisine-wise accuracy (bottom 5):")
for cuisine, acc in sorted_acc[-5:]:
    print(f"{cuisine}: {acc:.2f}")

# Identify challenges or biases
print("\nAnalysis:")
print("- Cuisines with fewer samples may have lower accuracy due to insufficient data.")
print("- If certain cuisines always get the same predicted rating, the model may be biased or underfitting.")
print("- Check class distribution per cuisine for imbalance.")

Cuisine-wise accuracy (top 5):
Afghani, Mughlai, Chinese: 1.00
American: 1.00
American, BBQ, Steak: 1.00
American, Bar Food: 1.00
American, Breakfast: 1.00

Cuisine-wise accuracy (bottom 5):
Kebab, Turkish Pizza: 0.00
North Indian, Chinese, Mexican, Lebanese, Italian, Fast Food: 0.00
North Indian, Chinese, Street Food, Fast Food: 0.00
North Indian, Street Food, Chinese, South Indian: 0.00
Thai, European, Mexican, North Indian, Chinese, Cafe: 0.00

Analysis:
- Cuisines with fewer samples may have lower accuracy due to insufficient data.
- If certain cuisines always get the same predicted rating, the model may be biased or underfitting.
- Check class distribution per cuisine for imbalance.
