# **Task: Predictive Modeling**

### **Build a regression model to predict the aggregate rating of a restaurant based on available features.**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from google.colab import files
# Upload the CSV file. The upload is requested exactly once: calling
# files.upload() twice forces a second prompt and stores a duplicate copy
# of the file (e.g. "Dataset  (1).csv").
uploaded = files.upload()

# `uploaded` is keyed by filename; if it is empty (presumably because the
# upload dialog was cancelled) fail with a clear message instead of an
# opaque IndexError.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the dataset CSV.")

# Get the name of the (first) uploaded file
filename = next(iter(uploaded))

# Read the CSV file
df = pd.read_csv(filename)

# Check if the dataset is loaded correctly
print(df.head())

Saving Dataset .csv to Dataset .csv


Saving Dataset .csv to Dataset  (1).csv
   Restaurant ID         Restaurant Name  Country Code              City  \
0        6317637        Le Petit Souffle           162       Makati City   
1        6304287        Izakaya Kikufuji           162       Makati City   
2        6300002  Heat - Edsa Shangri-La           162  Mandaluyong City   
3        6318506                    Ooma           162  Mandaluyong City   
4        6314302             Sambo Kojin           162  Mandaluyong City   

                                             Address  \
0  Third Floor, Century City Mall, Kalayaan Avenu...   
1  Little Tokyo, 2277 Chino Roces Avenue, Legaspi...   
2  Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...   
3  Third Floor, Mega Fashion Hall, SM Megamall, O...   
4  Third Floor, Mega Atrium, SM Megamall, Ortigas...   

                                     Locality  \
0   Century City Mall, Poblacion, Makati City   
1  Little Tokyo, Legaspi Village, Makati City   
2  Edsa Shangri-La,

**Drop unnecessary columns**

In [2]:
# Columns removed from the frame before modelling.
columns_to_drop = [
    'Restaurant ID',
    'Restaurant Name',
    'Address',
    'Locality',
    'Locality Verbose',
    'Rating color',
    'Rating text',
    'Currency',
    'Switch to order menu',
]
df = df.drop(labels=columns_to_drop, axis="columns")

**Convert categorical variables to numerical**

**Convert Yes/No categorical features to 1/0**

In [3]:
# Recode the three Yes/No flag columns as 1/0.
# Any value other than "Yes"/"No" becomes NaN under Series.map.
yes_no_to_int = {"Yes": 1, "No": 0}
for flag_column in ("Has Table booking", "Has Online delivery", "Is delivering now"):
    df[flag_column] = df[flag_column].map(yes_no_to_int)

**Handle missing values**

In [7]:
# Drop every row that contains at least one missing value (no separate
# check is performed; rows are removed directly).
df = df.dropna(axis=0, how="any")

**Select features and target variable**

In [8]:
# The rating column is the regression target; all remaining columns are features.
target_column = 'Aggregate rating'
y = df[target_column]                   # Target
x = df.drop(columns=[target_column])    # Features

**Split data into training and testing sets**

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so
# the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

**Encode Categorical Data**

In [10]:
# Inspect the training-feature dtypes to see which columns are still
# non-numeric (object) and therefore need encoding.
print(x_train.dtypes)

Country Code              int64
City                     object
Longitude               float64
Latitude                float64
Cuisines                 object
Average Cost for two      int64
Has Table booking         int64
Has Online delivery       int64
Is delivering now         int64
Price range               int64
Votes                     int64
dtype: object


In [11]:
# One-hot encode the training features; drop_first removes one reference
# level per categorical column.
x_train = pd.get_dummies(x_train, drop_first=True)

# Encode the test features WITHOUT drop_first. Dropping the first level
# independently in each split can remove a different category in test than
# in train; a test row holding that category would then have all-zero
# dummies for a column that exists in training and be silently treated as
# the training reference level. The subsequent align(join='left') keeps only
# the training columns, so the extra test columns are discarded there.
x_test = pd.get_dummies(x_test)

In [12]:
# Align the encoded test frame to the training columns: join='left' keeps
# the columns of x_train, and columns missing from x_test are filled with 0.
x_train, x_test = x_train.align(x_test, join='left', axis=1, fill_value=0)

**Train a Linear Regression Model**

In [13]:
# Fit an ordinary least-squares model on the encoded training split.
lr_model = LinearRegression()
lr_model.fit(X=x_train, y=y_train)

**Make Predictions & Evaluate the Model**

In [14]:
from sklearn.metrics import mean_squared_error, r2_score
# Predict on the held-out rows and score the linear model with MSE and R2.
y_pred_lr = lr_model.predict(X=x_test)

mse_lr, r2_lr = (
    metric(y_test, y_pred_lr) for metric in (mean_squared_error, r2_score)
)

print(f"Linear Regression - MSE: {mse_lr}, R2 Score: {r2_lr}")

Linear Regression - MSE: 1.910967390374182, R2 Score: 0.16552815745407523


**Train a Random Forest model**

In [17]:
from sklearn.ensemble import RandomForestRegressor
# Fit a 100-tree random forest; random_state makes the fit reproducible.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
rf_model.fit(X=x_train, y=y_train)

**Make Predictions and Evaluate the Random Forest model**

In [19]:
# Predict on the held-out rows and score the random forest with MSE and R2.
y_pred_rf = rf_model.predict(X=x_test)

mse_rf, r2_rf = (
    metric(y_test, y_pred_rf) for metric in (mean_squared_error, r2_score)
)

print(f"Random Forest - MSE: {mse_rf}, R2_score:{r2_rf}")

Random Forest - MSE: 0.09075449345206917, R2_score:0.9603697740988446


### **Split the dataset into training and testing sets and evaluate the model's performance using appropriate metrics.**

In [20]:
import pandas as pd
from google.colab import files
# Upload the CSV file. The upload is requested exactly once: calling
# files.upload() twice forces a second prompt and stores a duplicate copy
# of the file (e.g. "Dataset  (3).csv").
uploaded = files.upload()

# `uploaded` is keyed by filename; if it is empty (presumably because the
# upload dialog was cancelled) fail with a clear message instead of an
# opaque IndexError.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the dataset CSV.")

# Get the name of the (first) uploaded file
filename = next(iter(uploaded))

# Read the CSV file
df = pd.read_csv(filename)

# Check if the dataset is loaded correctly
print(df.head())

Saving Dataset .csv to Dataset  (2).csv


Saving Dataset .csv to Dataset  (3).csv
   Restaurant ID         Restaurant Name  Country Code              City  \
0        6317637        Le Petit Souffle           162       Makati City   
1        6304287        Izakaya Kikufuji           162       Makati City   
2        6300002  Heat - Edsa Shangri-La           162  Mandaluyong City   
3        6318506                    Ooma           162  Mandaluyong City   
4        6314302             Sambo Kojin           162  Mandaluyong City   

                                             Address  \
0  Third Floor, Century City Mall, Kalayaan Avenu...   
1  Little Tokyo, 2277 Chino Roces Avenue, Legaspi...   
2  Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...   
3  Third Floor, Mega Fashion Hall, SM Megamall, O...   
4  Third Floor, Mega Atrium, SM Megamall, Ortigas...   

                                     Locality  \
0   Century City Mall, Poblacion, Makati City   
1  Little Tokyo, Legaspi Village, Makati City   
2  Edsa Shangri-La,

**Drop Unnecessary Columns**

In [21]:
# Columns removed from the frame before modelling.
columns_to_drop = [
    'Restaurant ID',
    'Restaurant Name',
    'Address',
    'Locality',
    'Locality Verbose',
    'Rating color',
    'Rating text',
    'Currency',
    'Switch to order menu',
]
df = df.drop(labels=columns_to_drop, axis="columns")

**Convert Categorical features to numerical**

**Convert Yes/No categorical features to 1/0**

In [22]:
# Recode the three Yes/No flag columns as 1/0.
# Any value other than "Yes"/"No" becomes NaN under Series.map.
yes_no_to_int = {"Yes": 1, "No": 0}
for flag_column in ("Has Table booking", "Has Online delivery", "Is delivering now"):
    df[flag_column] = df[flag_column].map(yes_no_to_int)

**Handle Missing Values**

In [23]:
# Drop every row that contains at least one missing value.
df = df.dropna(axis=0, how="any")

**Select Features (x) and Target Variable (y)**

In [24]:
# The rating column is the regression target; all remaining columns are features.
target_column = 'Aggregate rating'
y = df[target_column]                   # Target
x = df.drop(columns=[target_column])    # Features

**Split Data into Training and Testing Sets**

In [25]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing with a fixed seed.
# NOTE(review): this split is computed again after one-hot encoding in a
# later cell, so the frames produced here are overwritten before training.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

print("Dataset split successfully!")

Dataset split successfully!


**Convert Categorical Variables to Numeric**

In [26]:
# Preview the object-dtype feature columns, i.e. the ones that still need
# to be converted to numeric values.
print(x.select_dtypes(include=['object']).head())

               City                          Cuisines
0       Makati City        French, Japanese, Desserts
1       Makati City                          Japanese
2  Mandaluyong City  Seafood, Asian, Filipino, Indian
3  Mandaluyong City                   Japanese, Sushi
4  Mandaluyong City                  Japanese, Korean


In [27]:
# Convert categorical columns into numeric dummy columns. The whole feature
# frame is encoded before splitting, so both splits share the same columns.
x = pd.get_dummies(data=x, drop_first=True)

In [28]:
# Re-split the now fully numeric feature frame (same 80/20 ratio and seed
# as before, so the same rows land in each split).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Confirm that every training column is numeric or boolean after encoding.
print(x_train.dtypes)

Country Code                                  int64
Longitude                                   float64
Latitude                                    float64
Average Cost for two                          int64
Has Table booking                             int64
                                             ...   
Cuisines_Western, Asian, Cafe                  bool
Cuisines_Western, Fusion, Fast Food            bool
Cuisines_World Cuisine                         bool
Cuisines_World Cuisine, Mexican, Italian       bool
Cuisines_World Cuisine, Patisserie, Cafe       bool
Length: 1972, dtype: object


**Replace Any Remaining NaN (Missing Values) with 0**

In [30]:
# Replace any NaN still present in either feature split with 0.
x_train, x_test = [split.fillna(0) for split in (x_train, x_test)]

**Train a Linear Regression Model**

In [37]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary least-squares model on the encoded training split.
lr_model = LinearRegression()
lr_model.fit(X=x_train, y=y_train)

**Make Predictions with the Model**

In [39]:
# Predict ratings for the held-out test rows.
y_pred = lr_model.predict(X=x_test)

**Evaluate Model Performance**

In [40]:
from sklearn.metrics import mean_squared_error, r2_score
# Score the test predictions: mean squared error and coefficient of determination.
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

for report_line in (f"Mean Squared Error: {mse}", f"R2 Score: {r2}"):
    print(report_line)

Mean Squared Error: 1.910967390370041
R2 Score: 0.16552815745588345


### **Experiment with different algorithms (e.g., linear regression, decision trees, random forest) and compare their performance.**

In [41]:
import pandas as pd
from google.colab import files
# Upload the CSV file. The upload is requested exactly once: calling
# files.upload() twice forces a second prompt and stores a duplicate copy
# of the file (e.g. "Dataset  (5).csv").
uploaded = files.upload()

# `uploaded` is keyed by filename; if it is empty (presumably because the
# upload dialog was cancelled) fail with a clear message instead of an
# opaque IndexError.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the dataset CSV.")

# Get the name of the (first) uploaded file
filename = next(iter(uploaded))

# Read the CSV file
df = pd.read_csv(filename)

# Check if the dataset is loaded correctly
print(df.head())

Saving Dataset .csv to Dataset  (4).csv


Saving Dataset .csv to Dataset  (5).csv
   Restaurant ID         Restaurant Name  Country Code              City  \
0        6317637        Le Petit Souffle           162       Makati City   
1        6304287        Izakaya Kikufuji           162       Makati City   
2        6300002  Heat - Edsa Shangri-La           162  Mandaluyong City   
3        6318506                    Ooma           162  Mandaluyong City   
4        6314302             Sambo Kojin           162  Mandaluyong City   

                                             Address  \
0  Third Floor, Century City Mall, Kalayaan Avenu...   
1  Little Tokyo, 2277 Chino Roces Avenue, Legaspi...   
2  Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...   
3  Third Floor, Mega Fashion Hall, SM Megamall, O...   
4  Third Floor, Mega Atrium, SM Megamall, Ortigas...   

                                     Locality  \
0   Century City Mall, Poblacion, Makati City   
1  Little Tokyo, Legaspi Village, Makati City   
2  Edsa Shangri-La,

**Drop Unnecessary Columns**

In [42]:
# Columns removed from the frame before modelling.
columns_to_drop = [
    'Restaurant ID',
    'Restaurant Name',
    'Address',
    'Locality',
    'Locality Verbose',
    'Rating color',
    'Rating text',
    'Currency',
    'Switch to order menu',
]
df = df.drop(labels=columns_to_drop, axis="columns")

**Convert categorical variables to numerical**

**Convert Yes/No categorical features to 1/0**

In [43]:
# Recode the three Yes/No flag columns as 1/0.
# Any value other than "Yes"/"No" becomes NaN under Series.map.
yes_no_to_int = {"Yes": 1, "No": 0}
for flag_column in ("Has Table booking", "Has Online delivery", "Is delivering now"):
    df[flag_column] = df[flag_column].map(yes_no_to_int)

**Handle Missing Values**

In [44]:
# Drop every row that contains at least one missing value.
df = df.dropna(axis=0, how="any")

**Select Features (x) and Target Variable (y)**

In [45]:
# The rating column is the regression target; all remaining columns are features.
target_column = 'Aggregate rating'
y = df[target_column]                   # Target
x = df.drop(columns=[target_column])    # Features

**Split Data into Training and Testing Sets**

In [46]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so
# the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

**Convert Categorical Variables to Numeric**

In [47]:
# Inspect the training-feature dtypes to see which columns are still
# non-numeric (object) and therefore need encoding.
print(x_train.dtypes)

Country Code              int64
City                     object
Longitude               float64
Latitude                float64
Cuisines                 object
Average Cost for two      int64
Has Table booking         int64
Has Online delivery       int64
Is delivering now         int64
Price range               int64
Votes                     int64
dtype: object


In [48]:
# One-hot encode the training features; drop_first removes one reference
# level per categorical column.
x_train = pd.get_dummies(x_train, drop_first=True)

# Encode the test features WITHOUT drop_first. Dropping the first level
# independently in each split can remove a different category in test than
# in train; a test row holding that category would then have all-zero
# dummies for a column that exists in training and be silently treated as
# the training reference level. The subsequent align(join='left') keeps only
# the training columns, so the extra test columns are discarded there.
x_test = pd.get_dummies(x_test)

In [49]:
# Align the encoded test frame to the training columns: join='left' keeps
# the columns of x_train, and columns missing from x_test are filled with 0.
x_train, x_test = x_train.align(x_test, join='left', axis=1, fill_value=0)

**Train Multiple Regression Models**

In [50]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Candidate regressors, keyed by the display name used in later reports.
# Seeds are fixed on the tree-based models so their fits are reproducible.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
])

# Fit each candidate on the same training split, in insertion order.
for name in models:
    model = models[name]
    model.fit(x_train, y_train)
    print(f"{name} training complete!")

Linear Regression training complete!
Decision Tree training complete!
Random Forest training complete!


In [51]:
# Collect each fitted model's test-set predictions under the model's name.
predictions = {
    name: model.predict(x_test)
    for name, model in models.items()
}

**Evaluate the Models**

In [52]:
from sklearn.metrics import mean_squared_error, r2_score

# Report MSE and R2 on the held-out rows for every model, rounded to 3 decimals.
for name in predictions:
    y_pred = predictions[name]
    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    print(f"{name}: MSE = {mse:.3f}, R2 Score = {r2:.3f}")

Linear Regression: MSE = 1.911, R2 Score = 0.166
Decision Tree: MSE = 0.164, R2 Score = 0.929
Random Forest: MSE = 0.091, R2 Score = 0.960


## **Compare Results**

**Linear Regression: MSE = 1.911, R2 Score = 0.166**

**Decision Tree: MSE = 0.164, R2 Score = 0.929**

**Random Forest: MSE = 0.091, R2 Score = 0.960**

# **Akankshya Sharma**