In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [8]:
# Read the restaurant dataset from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer='Dataset.csv')

By extracting additional features and encoding categorical variables, we enhanced our dataset. We then defined our features and target variable, followed by splitting the dataset into training and testing sets, ensuring robust data preprocessing.

In [None]:
# Extract additional features.
# `.str.len()` is vectorized; `fillna('')` makes a missing name/address count
# as length 0 instead of raising TypeError the way `apply(len)` does on NaN.
data['Restaurant Name Length'] = data['Restaurant Name'].fillna('').str.len()
data['Address Length'] = data['Address'].fillna('').str.len()

# Encode the Yes/No flags as 1/0 (anything other than 'Yes', including a
# missing value, becomes 0 — same result as the element-wise lambda).
data['Has Table Booking'] = (data['Has Table booking'] == 'Yes').astype(int)
data['Has Online Delivery'] = (data['Has Online delivery'] == 'Yes').astype(int)

# One-hot encode categorical columns.
# NOTE(review): 'Rating color' and 'Rating text' look like they may be derived
# from 'Aggregate rating' (the target) — confirm, and drop them if so to
# avoid target leakage.
categorical_cols = ['Country Code', 'City', 'Cuisines', 'Rating color', 'Rating text', 'Currency']
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Define features and target variable.
# After dropping the target, the identifier and the free-text columns, keep
# only numeric/boolean columns: the raw 'Has Table booking' and
# 'Has Online delivery' text columns (already encoded above) and any other
# text columns still in the frame would otherwise leave strings in X, which
# scikit-learn estimators cannot fit on.
X = (
    data
    .drop(columns=['Aggregate rating', 'Restaurant ID', 'Restaurant Name', 'Address', 'Locality', 'Locality Verbose'])
    .select_dtypes(include=['number', 'bool'])
)
y = data['Aggregate rating']

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data preprocessing complete.")

Data preprocessing complete.


We reload the dataset, drop every non-numeric column, and split the remaining data into training and testing sets. We then define a helper that trains and evaluates a regression model, which is used below with Linear Regression, Decision Tree, and Random Forest to predict restaurant ratings.

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings

# Silence every warning raised from here on in this session
warnings.filterwarnings("ignore")

# Start again from the CSV on disk
data = pd.read_csv('Dataset.csv')

# Report the columns whose dtype is neither float nor int
non_numeric_cols = list(data.select_dtypes(exclude=[float, int]).columns)
print("Columns with non-numeric values:")
print(non_numeric_cols)

# Keep only the numeric columns
data = data.drop(columns=non_numeric_cols)

# Target is the aggregate rating; every remaining column is a feature.
# NOTE(review): 'Restaurant ID' stays in X here although the earlier
# preprocessing cell drops it — confirm whether that is intended.
y = data['Aggregate rating']
X = data.drop(columns=['Aggregate rating'])

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Helper shared by all regression experiments below
def train_and_evaluate_model(model):
    """Fit ``model`` on the training split and score it on the test split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` variables rather than taking them as arguments.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    tuple
        ``(mse, r2)``: the mean squared error and the R2 score of the
        predictions on ``X_test`` compared with ``y_test``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return (
        mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
    )


Columns with non-numeric values:
['Restaurant Name', 'City', 'Address', 'Locality', 'Locality Verbose', 'Cuisines', 'Currency', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 'Rating color', 'Rating text']


After training a Linear Regression model, we evaluated its performance, achieving a Mean Squared Error (MSE) of 1.58 and an R2 Score of 0.31, which gives a modest baseline for predicting restaurant ratings.

In [None]:
# Fit the Linear Regression model and score it on the held-out test split
linear_regression = LinearRegression()
mse_lr, r2_lr = train_and_evaluate_model(model=linear_regression)

# Report the test-set metrics, rounded to two decimals
print(f"Linear Regression - MSE: {mse_lr:.2f}, R2 Score: {r2_lr:.2f}")

Linear Regression - MSE: 1.58, R2 Score: 0.31


Training a Decision Tree Regression model yielded an MSE of 0.15 and an R2 Score of 0.93, highlighting its strong predictive power for restaurant ratings.

In [None]:
# Training and evaluating Decision Tree Regression model.
# random_state is fixed (same seed as the train/test split) so the reported
# metrics are reproducible instead of changing from run to run.
decision_tree = DecisionTreeRegressor(random_state=42)
mse_dt, r2_dt = train_and_evaluate_model(decision_tree)

# Print the performance metrics
print(f"Decision Tree Regression - MSE: {mse_dt:.2f}, R2 Score: {r2_dt:.2f}")

Decision Tree Regression - MSE: 0.15, R2 Score: 0.93


Finally, we trained a Random Forest Regression model, which achieved an impressive MSE of 0.08 and an R2 Score of 0.97, making it the most accurate model for predicting restaurant ratings among those tested.

In [None]:
# Training and evaluating Random Forest Regression model.
# random_state is fixed (same seed as the train/test split) so the
# bootstrap sampling and feature selection, and therefore the reported
# metrics, are reproducible across runs.
random_forest = RandomForestRegressor(random_state=42)
mse_rf, r2_rf = train_and_evaluate_model(random_forest)

# Print the performance metrics
print(f"Random Forest Regression - MSE: {mse_rf:.2f}, R2 Score: {r2_rf:.2f}")

Random Forest Regression - MSE: 0.08, R2 Score: 0.97
