In [1]:
# Task 1: Predict Restaurant Ratings
# Internship at Cognifyz Technologies - Machine Learning Intern

# Objective: Predict the aggregate rating of a restaurant using features such as votes, cost, cuisine, etc.

# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Step 2: Load dataset
# The file name really does contain a space before the extension; it is kept
# verbatim so the notebook keeps reading the same file.
DATA_PATH = "Dataset .csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3.0,4.8,Dark Green,Excellent,314.0
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3.0,4.5,Dark Green,Excellent,591.0
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4.0,4.4,Green,Very Good,270.0
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4.0,4.9,Dark Green,Excellent,365.0
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4.0,4.8,Dark Green,Excellent,229.0


In [3]:
# Step 3: Drop columns that are not used as model inputs
identifier_cols = ['Restaurant ID', 'Restaurant Name']
location_cols = ['Address', 'Locality', 'Locality Verbose', 'Longitude', 'Latitude']
# In the preview above these labels move with the rating (4.8 -> "Excellent",
# 4.4 -> "Very Good"), so they presumably encode the target and would leak it.
# NOTE(review): confirm on the full dataset.
rating_label_cols = ['Rating color', 'Rating text']

df_clean = df.drop(columns=identifier_cols + location_cols + rating_label_cols)

In [4]:
# Step 4: Handle missing values
# Rows without a target value cannot be used for training or evaluation, so
# they are removed; a missing cuisine is kept as its own 'Unknown' category.
# Both steps return new DataFrames, so no column is ever assigned on a filtered
# slice of another frame (the pattern that triggers SettingWithCopyWarning).
df_clean = (
    df_clean
    .dropna(subset=['Aggregate rating'])
    .assign(Cuisines=lambda frame: frame['Cuisines'].fillna('Unknown'))
)

df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4410 entries, 0 to 4409
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Country Code          4410 non-null   int64  
 1   City                  4410 non-null   object 
 2   Cuisines              4410 non-null   object 
 3   Average Cost for two  4410 non-null   float64
 4   Currency              4410 non-null   object 
 5   Has Table booking     4410 non-null   object 
 6   Has Online delivery   4410 non-null   object 
 7   Is delivering now     4410 non-null   object 
 8   Switch to order menu  4410 non-null   object 
 9   Price range           4410 non-null   float64
 10  Aggregate rating      4410 non-null   float64
 11  Votes                 4410 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 447.9+ KB


In [5]:
# Step 5: Define features and target
target_col = 'Aggregate rating'

# Columns routed to the one-hot encoder. 'Country Code' is stored as an
# integer but is listed here, so it is encoded as a category, not a magnitude.
categorical_cols = [
    'Country Code',
    'City',
    'Cuisines',
    'Currency',
    'Has Table booking',
    'Has Online delivery',
    'Is delivering now',
    'Switch to order menu',
]
# Columns routed to the imputer + scaler.
numerical_cols = [
    'Average Cost for two',
    'Price range',
    'Votes',
]

y = df_clean[target_col]
X = df_clean.drop(columns=[target_col])

In [6]:
# Step 6: Preprocessing pipelines
# Numeric columns: fill gaps with the column mean, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_steps = [
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# The transformer names ('num', 'cat') and the 'onehot' step name are looked
# up again in the feature-importance cell, so they must stay as they are.
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)


In [11]:
# Step 7: Split data
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Step 8: Build and train model
# Preprocessing lives inside the pipeline, so the imputer, scaler and encoder
# are fitted on the training rows only.
forest = RandomForestRegressor(random_state=42)
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', forest),
]
model = Pipeline(steps=pipeline_steps)

model.fit(X_train, y_train)

In [9]:
# Step 9: Model evaluation
# Score the fitted pipeline on the held-out rows.
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(
    f"Mean Squared Error: {mse:.4f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)

Mean Squared Error: 0.0991
R² Score: 0.9490


In [10]:
# Step 10: Feature importance analysis
fitted_preprocessor = model.named_steps['preprocessor']
rf_model = model.named_steps['regressor']

# Read the output column names from the fitted ColumnTransformer instead of
# concatenating the numeric and one-hot names by hand. The hand-built list is
# only correct while the transformers stay in ('num', 'cat') order and every
# input column survives preprocessing; if that changes, importances would be
# attached to the wrong labels without any error.
# get_feature_names_out() prefixes each name with "<transformer>__"; the
# prefix is stripped so labels read as plain column / category names.
# NOTE(review): needs a scikit-learn version in which every pipeline step
# implements get_feature_names_out — confirm for the target environment.
all_features = np.array(
    [
        name.split('__', 1)[-1]
        for name in fitted_preprocessor.get_feature_names_out()
    ],
    dtype=object,
)
importances = rf_model.feature_importances_

feature_importance_df = (
    pd.DataFrame({'Feature': all_features, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

feature_importance_df.head(10)

Unnamed: 0,Feature,Importance
2,Votes,0.933664
0,Average Cost for two,0.008306
79,City_New Delhi,0.003437
3,Country Code_1,0.002968
49,City_Gurgaon,0.00245
1134,Currency_Indian Rupees(Rs.),0.002226
5,Country Code_30,0.001799
1,Price range,0.001671
1131,Currency_Brazilian Real(R$),0.001512
866,"Cuisines_North Indian, Chinese",0.00126
