# Restaurant Revenue Prediction


In [None]:
# import required libraries
import pickle
import re
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import uniform, randint
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

**Data Preprocessing**

In [None]:
# Load the raw restaurant dataset from a CSV file in the working directory
df = pd.read_csv('restaurant_data.csv')

In [None]:
# Preview the first five rows
df.head()

Unnamed: 0,Name,Location,Cuisine,Rating,Seating Capacity,Average Meal Price,Marketing Budget,Social Media Followers,Chef Experience Years,Number of Reviews,Avg Review Length,Ambience Score,Service Quality Score,Parking Availability,Weekend Reservations,Weekday Reservations,Revenue
0,Restaurant 0,Rural,Japanese,4.0,38,73.98,2224,23406,13,185,161.924906,1.3,7.0,Yes,13,4,638945.52
1,Restaurant 1,Downtown,Mexican,3.2,76,28.11,4416,42741,8,533,148.759717,2.6,3.4,Yes,48,6,490207.83
2,Restaurant 2,Rural,Italian,4.7,48,48.29,2796,37285,18,853,56.849189,5.3,6.7,No,27,14,541368.62
3,Restaurant 3,Rural,Italian,4.4,34,51.55,1167,15214,13,82,205.433265,4.6,2.8,Yes,9,17,404556.8
4,Restaurant 4,Downtown,Japanese,4.9,88,75.98,3639,40171,9,78,241.681584,8.6,2.1,No,37,26,1491046.35


In [None]:
# List the column names
df.columns

Index(['Name', 'Location', 'Cuisine', 'Rating', 'Seating Capacity',
       'Average Meal Price', 'Marketing Budget', 'Social Media Followers',
       'Chef Experience Years', 'Number of Reviews', 'Avg Review Length',
       'Ambience Score', 'Service Quality Score', 'Parking Availability',
       'Weekend Reservations', 'Weekday Reservations', 'Revenue'],
      dtype='object')

In [None]:
# Drop the restaurant name column (a per-row label such as 'Restaurant 0').
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
df = df.drop(columns=['Name'], errors='ignore')


In [None]:
# Show the dtype and non-null count of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8368 entries, 0 to 8367
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Location                8368 non-null   object 
 1   Cuisine                 8368 non-null   object 
 2   Rating                  8368 non-null   float64
 3   Seating Capacity        8368 non-null   int64  
 4   Average Meal Price      8368 non-null   float64
 5   Marketing Budget        8368 non-null   int64  
 6   Social Media Followers  8368 non-null   int64  
 7   Chef Experience Years   8368 non-null   int64  
 8   Number of Reviews       8368 non-null   int64  
 9   Avg Review Length       8368 non-null   float64
 10  Ambience Score          8368 non-null   float64
 11  Service Quality Score   8368 non-null   float64
 12  Parking Availability    8368 non-null   object 
 13  Weekend Reservations    8368 non-null   int64  
 14  Weekday Reservations    8368 non-null   

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
df.describe()

Unnamed: 0,Rating,Seating Capacity,Average Meal Price,Marketing Budget,Social Media Followers,Chef Experience Years,Number of Reviews,Avg Review Length,Ambience Score,Service Quality Score,Weekend Reservations,Weekday Reservations,Revenue
count,8368.0,8368.0,8368.0,8368.0,8368.0,8368.0,8368.0,8368.0,8368.0,8368.0,8368.0,8368.0,8368.0
mean,4.008258,60.212835,47.896659,3218.2549,36190.621773,10.051984,523.010397,174.769974,5.521283,5.508772,29.491754,29.235301,656070.6
std,0.581474,17.399488,14.336767,1824.896053,18630.15333,5.516606,277.215127,71.99806,2.575442,2.586552,20.025415,20.004277,267413.7
min,3.0,30.0,25.0,604.0,5277.0,1.0,50.0,50.011717,1.0,1.0,0.0,0.0,184708.5
25%,3.5,45.0,35.49,1889.0,22592.5,5.0,277.0,113.311102,3.3,3.2,13.0,13.0,454651.4
50%,4.0,60.0,45.535,2846.5,32518.5,10.0,528.0,173.910079,5.5,5.6,27.0,26.0,604242.1
75%,4.5,75.0,60.3,4008.5,44566.25,15.0,764.25,237.406885,7.8,7.8,43.0,43.0,813094.2
max,5.0,90.0,76.0,9978.0,103777.0,19.0,999.0,299.984924,10.0,10.0,88.0,88.0,1531868.0


**Feature Encoding, Train/Test Split and Scaling**

In [None]:
# Collect the names of the object-dtype columns; these are the categorical
# features that get ordinal-encoded further down.
cat_cols = df.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Show which columns were picked up as categorical
cat_cols

['Location', 'Cuisine', 'Parking Availability']

In [None]:
# Initialize the encoder
encoder = OrdinalEncoder()

# Guard against re-running this cell: once df[cat_cols] holds numeric codes,
# fitting again would learn those codes instead of the original labels, and the
# pickle written below could no longer encode raw category strings.
if any(pd.api.types.is_numeric_dtype(df[col]) for col in cat_cols):
    raise RuntimeError(
        "Categorical columns are already encoded; reload the data before re-running this cell."
    )

# Fit on the raw categories and replace them with their ordinal codes
# (the encoder returns floats at this point)
df[cat_cols] = encoder.fit_transform(df[cat_cols])

#print the encoded data
print(df)

# Specify the file path where you want to save the encoder
filename = 'ordinal_encoder.pkl'

# Persist the fitted encoder so the same category-to-code mapping can be reused later
with open(filename, 'wb') as f:
    pickle.dump(encoder, f)

      Location  Cuisine  Rating  Seating Capacity  Average Meal Price  \
0          1.0      4.0     4.0                38               73.98   
1          0.0      5.0     3.2                76               28.11   
2          1.0      3.0     4.7                48               48.29   
3          1.0      3.0     4.4                34               51.55   
4          0.0      4.0     4.9                88               75.98   
...        ...      ...     ...               ...                 ...   
8363       2.0      2.0     3.4                54               34.85   
8364       1.0      2.0     3.7                49               36.88   
8365       0.0      3.0     4.7                88               46.87   
8366       1.0      0.0     3.1                31               44.53   
8367       1.0      4.0     4.0                33               71.07   

      Marketing Budget  Social Media Followers  Chef Experience Years  \
0                 2224                   23406    

In [None]:
# Convert the float ordinal codes to int.
# Cast the existing codes directly: calling encoder.fit_transform again here would
# refit the encoder on the numeric codes and discard the string categories it learned.
df[cat_cols] = df[cat_cols].astype(int)

In [None]:
# Re-check the dtypes after encoding the categorical columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8368 entries, 0 to 8367
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Location                8368 non-null   int64  
 1   Cuisine                 8368 non-null   int64  
 2   Rating                  8368 non-null   float64
 3   Seating Capacity        8368 non-null   int64  
 4   Average Meal Price      8368 non-null   float64
 5   Marketing Budget        8368 non-null   int64  
 6   Social Media Followers  8368 non-null   int64  
 7   Chef Experience Years   8368 non-null   int64  
 8   Number of Reviews       8368 non-null   int64  
 9   Avg Review Length       8368 non-null   float64
 10  Ambience Score          8368 non-null   float64
 11  Service Quality Score   8368 non-null   float64
 12  Parking Availability    8368 non-null   int64  
 13  Weekend Reservations    8368 non-null   int64  
 14  Weekday Reservations    8368 non-null   

In [None]:
# Revenue is the prediction target; every remaining column is a feature
y = df['Revenue']
X = df.drop(columns='Revenue')

In [None]:
# Split the data: hold out 20% of the rows as a test set.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Standardise the features: the scaler is fitted on the training rows only and
# the same statistics are then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Persist the fitted scaler to disk
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [None]:
# Show the first 34 rows of the encoded frame.
# NOTE(review): the output saved with this cell lists columns such as Menu_Price
# and Monthly_Revenue, which are not in this dataset; it looks stale. Re-run to refresh.
df.head(34)

Unnamed: 0,Menu_Price,Marketing_Spend,Cuisine_Type,Average_Customer_Spending,Promotions,Reviews,Monthly_Revenue
0,43.117635,12.663793,2,36.236133,0,45,350.91204
1,40.020077,4.577892,1,17.952562,0,36,221.319091
2,41.981485,4.652911,2,22.60042,1,91,326.529763
3,43.005307,4.416053,1,18.984098,1,59,348.190573
4,17.456199,3.475052,1,12.766143,1,30,185.009121
5,19.42767,13.114473,0,43.09995,1,10,399.867507
6,35.350343,16.960318,0,20.181622,0,86,496.653687
7,46.314645,14.486349,1,28.9211,1,1,417.158607
8,22.647863,15.841873,0,11.732611,0,26,352.148065
9,33.532267,8.095969,1,37.973579,0,98,272.793517


In [None]:

# Inspect the scaled training features
X_train

array([[-1.21227514, -0.86154742, -0.69393123, ..., -1.00779856,
        -0.27795463, -1.36263792],
       [ 1.22355551, -1.446634  , -1.55379863, ..., -1.00779856,
         0.56821537,  0.83245239],
       [-1.21227514,  0.30862574,  0.85383009, ...,  0.99226179,
         2.70852772, -0.96353059],
       ...,
       [-1.21227514, -1.446634  ,  1.02580357, ..., -1.00779856,
         1.61348419,  0.98211764],
       [ 1.22355551,  1.4787989 , -0.17801079, ..., -1.00779856,
        -1.12412463, -0.66420009],
       [-1.21227514,  0.30862574, -0.17801079, ...,  0.99226179,
         1.61348419,  0.38345665]])

**Model Selection**

In [None]:
# Candidate regressors, each paired with the hyper-parameter space to sample.
# scipy conventions used below:
#   uniform(loc, scale) draws from [loc, loc + scale]
#   randint(low, high)  draws integers from low up to high - 1
# Plain lists are sampled uniformly from their elements.
models = {
    # No hyper-parameters to tune for ordinary least squares
    'LinearRegression': {'model': LinearRegression(), 'params': {}},
    'Lasso': {
        'model': Lasso(random_state=42),
        'params': {'alpha': uniform(0.01, 100)},
    },
    'Ridge': {
        'model': Ridge(random_state=42),
        'params': {'alpha': uniform(0.01, 100)},
    },
    'ElasticNet': {
        'model': ElasticNet(random_state=42),
        'params': {'alpha': uniform(0.01, 100), 'l1_ratio': uniform(0.1, 0.9)},
    },
    'DecisionTreeRegressor': {
        'model': DecisionTreeRegressor(random_state=42),
        'params': {
            'max_depth': [None, 3, 5, 7, 10],
            'min_samples_split': randint(2, 11),
            'min_samples_leaf': randint(1, 5),
        },
    },
    'RandomForestRegressor': {
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'n_estimators': randint(50, 201),
            'max_depth': [None, 3, 5, 7, 10],
            'min_samples_split': randint(2, 11),
            'min_samples_leaf': randint(1, 5),
        },
    },
    'AdaBoostRegressor': {
        'model': AdaBoostRegressor(random_state=42),
        'params': {
            'n_estimators': randint(50, 201),
            'learning_rate': uniform(0.01, 1.0),
        },
    },
    'GradientBoostingRegressor': {
        'model': GradientBoostingRegressor(random_state=42),
        'params': {
            'n_estimators': randint(50, 201),
            'learning_rate': uniform(0.01, 0.2),
            'max_depth': randint(3, 8),
        },
    },
    'XGBRegressor': {
        'model': XGBRegressor(random_state=42),
        'params': {
            'n_estimators': randint(50, 201),
            'learning_rate': uniform(0.01, 0.2),
            'max_depth': randint(3, 8),
        },
    },
    'KNeighborsRegressor': {
        'model': KNeighborsRegressor(),
        'params': {
            'n_neighbors': randint(3, 10),
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan'],
        },
    },
}

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Function to calculate adjusted R^2 score
def adjusted_r2_score(r2, n, k):
    """Return the adjusted R^2 for a model with ``k`` predictors scored on ``n`` samples.

    Computes ``1 - (1 - r2) * (n - 1) / (n - k - 1)``.

    Parameters
    ----------
    r2 : float
        Plain R^2 score.
    n : int
        Number of samples the score was computed on.
    k : int
        Number of predictors (features).

    Returns
    -------
    float
        The adjusted R^2.

    Raises
    ------
    ValueError
        If ``n - k - 1 <= 0``; the adjustment is undefined there (the bare formula
        would divide by zero or flip the sign of the penalty).
    """
    residual_dof = n - k - 1
    if residual_dof <= 0:
        raise ValueError(
            f"adjusted R^2 is undefined for n={n}, k={k}: need n > k + 1"
        )
    return 1 - (1 - r2) * (n - 1) / residual_dof

In [None]:
# Function to evaluate a model
def evaluate_model(model, params, X_train, y_train, X_test, y_test,
                   n_iter=10, cv=5, random_state=42):
    """Tune ``model`` with a randomized search, then score the winner on the test split.

    Candidates are ranked by cross-validated negative mean squared error on the
    training data. The best estimator is then used to predict ``X_test`` and its
    MSE, R^2 and adjusted R^2 are printed.

    Parameters
    ----------
    model : estimator
        Unfitted regressor to tune.
    params : dict
        Parameter distributions / lists passed to ``RandomizedSearchCV``.
    X_train, X_test : 2-D array-like
        Feature matrices; ``X_test.shape`` supplies n and k for the adjusted R^2.
    y_train, y_test : array-like
        Targets. ``y_train`` may be a pandas object or a NumPy array; it is
        flattened to 1-D before fitting.
    n_iter : int, default 10
        Number of parameter settings sampled (10 is also scikit-learn's default).
    cv : int, default 5
        Number of cross-validation folds.
    random_state : int, default 42
        Seed for the parameter sampling.

    Returns
    -------
    tuple
        ``(best_model, adj_r2)``: the best estimator found by the search and its
        adjusted R^2 on the test split.
    """
    search = RandomizedSearchCV(
        model,
        params,
        n_iter=n_iter,
        cv=cv,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        random_state=random_state,
    )
    # np.asarray(...).ravel() flattens a Series, a single-column frame or an ndarray alike
    search.fit(X_train, np.asarray(y_train).ravel())

    best_model = search.best_estimator_
    y_pred = best_model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    n_samples, n_features = X_test.shape[0], X_test.shape[1]
    adj_r2 = adjusted_r2_score(r2, n_samples, n_features)

    mse = mean_squared_error(y_test, y_pred)

    print(f"Model: {best_model}")
    print(f"Best parameters: {search.best_params_}")
    print(f"Mean Squared Error: {mse}")
    print(f"R^2 Score: {r2}")
    print(f"Adjusted R^2 Score: {adj_r2}")
    print("=======================================")

    return best_model, adj_r2

**Model Training**

In [None]:
# Results keyed by model name: adjusted R² on the test split and the tuned estimator
adjusted_r2_scores = dict()
all_models = dict()

for model_name, model_info in models.items():
    print(f"Evaluating and Training {model_name}...")
    best_model, adj_r2 = evaluate_model(
        model_info['model'],
        model_info['params'],
        X_train,
        y_train,
        X_test,
        y_test,
    )
    adjusted_r2_scores[model_name], all_models[model_name] = adj_r2, best_model

    # Persist the tuned estimator as <model_name>.pkl in the working directory
    with open(f"{model_name}.pkl", 'wb') as file:
        pickle.dump(best_model, file)
    print(f"Model {model_name} saved to {model_name}.pkl")

Evaluating and Training LinearRegression...
Model: LinearRegression()
Best parameters: {}
Mean Squared Error: 3189399729.3168807
R^2 Score: 0.9554999472529242
Adjusted R^2 Score: 0.9550973532895912
Model LinearRegression saved to LinearRegression.pkl
Evaluating and Training Lasso...
Model: Lasso(alpha=95.08143064099163, random_state=42)
Best parameters: {'alpha': 95.08143064099163}
Mean Squared Error: 3186529953.3945503
R^2 Score: 0.955539987759212
Adjusted R^2 Score: 0.9551377560441264
Model Lasso saved to Lasso.pkl
Evaluating and Training Ridge...
Model: Ridge(alpha=5.818361216819946, random_state=42)
Best parameters: {'alpha': 5.818361216819946}
Mean Squared Error: 3188719747.551108
R^2 Score: 0.9555094346884958
Adjusted R^2 Score: 0.9551069265584159
Model Ridge saved to Ridge.pkl
Evaluating and Training ElasticNet...
Model: ElasticNet(alpha=2.0684494295802445, l1_ratio=0.9729188669457949,
           random_state=42)
Best parameters: {'alpha': 2.0684494295802445, 'l1_ratio': 0.97291