In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [8]:
df = pd.read_excel("CodeCraft ML Competition Dataset.xlsx", sheet_name='Sheet1')


In [4]:
df

Unnamed: 0,Restaurant,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time
0,ID_6321,"FTI College, Law College Road, Pune","Fast Food, Rolls, Burger, Salad, Wraps",₹200,₹50,3.5,12,4,30 minutes
1,ID_2882,"Sector 3, Marathalli","Ice Cream, Desserts",₹100,₹50,3.5,11,4,30 minutes
2,ID_1595,Mumbai Central,"Italian, Street Food, Fast Food",₹150,₹50,3.6,99,30,65 minutes
3,ID_5929,"Sector 1, Noida","Mughlai, North Indian, Chinese",₹250,₹99,3.7,176,95,30 minutes
4,ID_6123,"Rmz Centennial, I Gate, Whitefield","Cafe, Beverages",₹200,₹99,3.2,521,235,65 minutes
...,...,...,...,...,...,...,...,...,...
11089,ID_8067,"BTM Layout 1, Electronic City","Tibetan, Chinese, Continental, Momos",₹250,₹50,4.2,326,189,30 minutes
11090,ID_4620,"Sector 14, Noida",Fast Food,₹100,₹50,3.6,36,16,30 minutes
11091,ID_3392,Majestic,"South Indian, Chinese, North Indian",₹100,₹50,3.5,45,18,30 minutes
11092,ID_4115,"Sector 3, Marathalli",North Indian,₹100,₹50,3.1,24,9,30 minutes


In [14]:
df.info

<bound method DataFrame.info of       Restaurant                             Location  \
0        ID_6321  FTI College, Law College Road, Pune   
1        ID_2882                 Sector 3, Marathalli   
2        ID_1595                       Mumbai Central   
3        ID_5929                      Sector 1, Noida   
4        ID_6123   Rmz Centennial, I Gate, Whitefield   
...          ...                                  ...   
11088     ID_944   Rmz Centennial, I Gate, Whitefield   
11089    ID_8067        BTM Layout 1, Electronic City   
11090    ID_4620                     Sector 14, Noida   
11091    ID_3392                             Majestic   
11092    ID_4115                 Sector 3, Marathalli   

                                     Cuisines  Average_Cost  Minimum_Order  \
0      Fast Food, Rolls, Burger, Salad, Wraps         200.0           50.0   
1                         Ice Cream, Desserts         100.0           50.0   
2             Italian, Street Food, Fast Food    

In [16]:
df.shape

(8782, 9)

In [9]:
df.head()


Unnamed: 0,Restaurant,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time
0,ID_6321,"FTI College, Law College Road, Pune","Fast Food, Rolls, Burger, Salad, Wraps",₹200,₹50,3.5,12,4,30 minutes
1,ID_2882,"Sector 3, Marathalli","Ice Cream, Desserts",₹100,₹50,3.5,11,4,30 minutes
2,ID_1595,Mumbai Central,"Italian, Street Food, Fast Food",₹150,₹50,3.6,99,30,65 minutes
3,ID_5929,"Sector 1, Noida","Mughlai, North Indian, Chinese",₹250,₹99,3.7,176,95,30 minutes
4,ID_6123,"Rmz Centennial, I Gate, Whitefield","Cafe, Beverages",₹200,₹99,3.2,521,235,65 minutes


In [10]:
df = df[df['Average_Cost'].str.contains(r'^\₹?\d+', na=False)]
df = df[df['Minimum_Order'].str.contains(r'^\₹?\d+', na=False)]

In [11]:
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df['Average_Cost'] = df['Average_Cost'].str.replace('₹', '').str.replace(',', '').astype(float)
df['Minimum_Order'] = df['Minimum_Order'].str.replace('₹', '').str.replace(',', '').astype(float)
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')
df['Reviews'] = pd.to_numeric(df['Reviews'], errors='coerce')
df['Delivery_Time'] = df['Delivery_Time'].str.extract('(\d+)').astype(float)

  df['Delivery_Time'] = df['Delivery_Time'].str.extract('(\d+)').astype(float)


In [12]:
df.dropna(subset=[
    'Location', 'Cuisines', 'Average_Cost', 'Minimum_Order', 'Votes',
    'Reviews', 'Delivery_Time', 'Rating'
], inplace=True)

In [13]:
df_selected = df[['Location', 'Cuisines', 'Average_Cost', 'Minimum_Order', 
                  'Votes', 'Reviews', 'Delivery_Time', 'Rating']]

In [17]:
df_encoded = pd.get_dummies(df_selected, columns=['Location', 'Cuisines'])

In [18]:
X = df_encoded.drop(columns='Rating')
y = df_encoded['Rating']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [20]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}

In [21]:
grid = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, 
                    cv=3, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)

In [22]:
# Evaluate best model
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

In [23]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)



In [24]:
print("✅ Best Hyperparameters:", grid.best_params_)
print("📉 RMSE:", rmse)
print("📈 R² Score:", r2)

✅ Best Hyperparameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
📉 RMSE: 0.25276958672388217
📈 R² Score: 0.6329133761726309
