## Building the Final Model

Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import logging
import pickle

Load and Split data into training and testing

In [2]:
# Load the CSV file into a pandas DataFrame
data = pd.read_csv('zomato_clean.csv')

# Split the data into training and testing sets with stratified sampling
train_data, test_data = train_test_split(data, test_size=0.01, random_state=42)

# Save the training and testing sets to separate CSV files
train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

Set unwanted columns

In [3]:
unwanted_columns = ['name', 'type', 'dish_liked']

Write functions for preprocess data, create pipline and train model

In [4]:
# Add logging statement
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def preprocess_data(data, unwanted_columns):
    """
    Preprocess the data.

    Parameters:
    - data (pandas.DataFrame): The data to be preprocessed.
    - unwanted_columns (list): List of unwanted columns to drop.

    Returns:
    - pandas.DataFrame: The preprocessed data.
    """
    # Drop unwanted columns
    data.drop(unwanted_columns, axis=1, inplace=True)

    # Drop null values
    data.dropna(inplace=True)
    data.reset_index(drop=True, inplace=True)

    return data

def create_pipeline():
    """
    Create the pipeline for training an Extra Trees Regressor.

    Returns:
    - sklearn.pipeline.Pipeline: The pipeline for training the model.
    """
    # Define the columns for different transformations
    numeric_columns = ['votes']
    binary_columns = ['online_order', 'book_table']
    categorical_columns = ['location', 'rest_type', 'cuisines']

    # Create pipeline for preprocessing numeric features
    numeric_transformer = Pipeline([
        ('scaler', StandardScaler())
    ])

    # Create pipeline for preprocessing binary features
    binary_transformer = Pipeline([
        ('encoder', OrdinalEncoder())
    ])

    # Create pipeline for preprocessing categorical features
    categorical_transformer = Pipeline([
        ('encoder', OrdinalEncoder())
    ])

    # Combine the transformers using ColumnTransformer
    preprocessor = ColumnTransformer([
        ('numeric_preprocess', numeric_transformer, numeric_columns),
        ('binary_preprocess', binary_transformer, binary_columns),
        ('categorical_preprocess', categorical_transformer, categorical_columns)
    ])

    # Create the final pipeline with preprocessor and Extra Trees Regressor
    pipeline = Pipeline([
        ('preprocess', preprocessor),
        ('regressor', ExtraTreesRegressor(n_estimators=120))
    ])

    return pipeline

def train_model(X, Y):
    """
    Train the Extra Trees Regressor model.

    Parameters:
    - X (pandas.DataFrame): The input features.
    - Y (pandas.Series): The target variable.
    - test_size (float): The proportion of the dataset to include in the test split.
    - random_state (int): Random seed for reproducibility.

    Returns:
    - sklearn.pipeline.Pipeline: The trained model.
    """
    # Create the pipeline
    pipeline = create_pipeline()

    # Fit the pipeline to the training data
    pipeline.fit(X, Y)

    return pipeline

Create and train the model

In [8]:
preprocessed_data

Unnamed: 0,online_order,book_table,rate,votes,location,rest_type,cuisines,cost_for_2
0,Yes,No,4.0,465,Jayanagar,Casual Dining,"South Indian, North Indian, Chinese, Street Food",600.0
1,No,No,3.8,43,BTM,Quick Bites,South Indian,200.0
2,Yes,No,3.8,93,Kaggadasapura,"Takeaway, Delivery",Kerala,300.0
3,Yes,No,3.6,216,Thippasandra,Casual Dining,"North Indian, South Indian, Chinese, Seafood",550.0
4,Yes,No,4.1,3007,Koramangala 1st Block,Cafe,"Cafe, Burger, Italian, Salad",1000.0
...,...,...,...,...,...,...,...,...
23021,Yes,No,3.6,74,Koramangala 5th Block,Casual Dining,Bengali,700.0
23022,No,No,4.3,74,Residency Road,Cafe,"Cafe, Japanese",400.0
23023,Yes,No,3.9,443,Frazer Town,Casual Dining,"Biryani, North Indian, Mughlai",600.0
23024,No,No,3.8,177,JP Nagar,Quick Bites,"North Indian, Mangalorean, Chinese",400.0


In [9]:
preprocessed_data.dtypes

online_order     object
book_table       object
rate            float64
votes             int64
location         object
rest_type        object
cuisines         object
cost_for_2      float64
dtype: object

In [5]:
# Add logging statement
logging.info("Loading and preprocessing data...")

# Load the data
data = pd.read_csv('train_data.csv')

# Splitting data
X = data.drop(['rate'], axis=1)
Y = data['rate']

# Preprocess the data
preprocessed_data = preprocess_data(data, unwanted_columns)

# Train the model
model = train_model(X, Y)

# Save the trained model
pickle.dump(model, open('model.pkl', 'wb'))

# Add logging statement
logging.info("Model trained and saved.")

2023-07-16 18:37:01,511 - INFO - Loading and preprocessing data...
2023-07-16 18:37:09,365 - INFO - Model trained and saved.


Test the model

In [6]:
# Load the saved model
pipeline = pickle.load(open('model.pkl', 'rb'))

# Load new data for prediction
new_data = pd.read_csv('test_data.csv')

# Preprocess the new data using the pipeline
preprocessed_new_data = preprocess_data(new_data, unwanted_columns)

# Predict on the new data
predictions = pipeline.predict(preprocessed_new_data)

# Calculate evaluation metrics
mse = mean_squared_error(Y, pipeline.predict(X))
mae = mean_absolute_error(Y, pipeline.predict(X))
r2 = r2_score(Y, pipeline.predict(X))

# Save the predictions to a file
preprocessed_new_data['predictions'] = predictions
preprocessed_new_data.to_csv('predictions.csv', index=False)

# Add logging statement
logging.info("Predictions saved to 'predictions.csv'.")

# Print evaluation metrics
logging.info(f"Mean Squared Error (MSE): {mse}")
logging.info(f"Mean Absolute Error (MAE): {mae}")
logging.info(f"R-squared (R2) Score: {r2}")

# Export evaluation metrics and model name to a report file
report = f"Model: Extra Trees Regressor\n\n"
report += f"Mean Squared Error (MSE): {mse}\n"
report += f"Mean Absolute Error (MAE): {mae}\n"
report += f"R-squared (R2) Score: {r2}\n"

with open('report.txt', 'w') as file:
    file.write(report)

# Add logging statement
logging.info("Report saved to 'report.txt'.")

2023-07-16 18:38:57,193 - INFO - Predictions saved to 'predictions.csv'.
2023-07-16 18:38:57,195 - INFO - Mean Squared Error (MSE): 0.0001609802345275289
2023-07-16 18:38:57,198 - INFO - Mean Absolute Error (MAE): 0.0017049406869671607
2023-07-16 18:38:57,199 - INFO - R-squared (R2) Score: 0.9991145481276679
2023-07-16 18:38:57,204 - INFO - Report saved to 'report.txt'.
