In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor 
from sklearn.metrics import accuracy_score, confusion_matrix , roc_curve, auc,r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
df = pd.read_csv('data/complete_eda_csv.csv')

In [3]:
X = df.drop(columns=['rate'],axis=1)
Y = df['rate']

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30687 entries, 0 to 30686
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   online_order     30687 non-null  object 
 1   book_table       30687 non-null  object 
 2   rate             30687 non-null  float64
 3   votes            30687 non-null  int64  
 4   location         30687 non-null  object 
 5   cuisines         30687 non-null  int64  
 6   costing          30687 non-null  int64  
 7   listed_in(type)  30687 non-null  object 
dtypes: float64(1), int64(3), object(4)
memory usage: 1.9+ MB


In [5]:
from sklearn.impute import SimpleImputer ## Handling Missing Values
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling
#from sklearn.preprocessing import LabelEncoder # Label Encoding
from sklearn.preprocessing import OneHotEncoder
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [6]:
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer()),
        ('scaler', StandardScaler())
    ]
)

cat_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('onehotencoder',OneHotEncoder(sparse_output=False, handle_unknown='ignore', categories='auto'))
    ]
)

In [7]:
numerical_cols

Index(['votes', 'cuisines', 'costing'], dtype='object')

In [7]:
preprocessor=ColumnTransformer(
    [
        ('num_pipeline',num_pipeline,numerical_cols),
        ('cat_pipeline',cat_pipeline,categorical_cols)
    ]
)

In [8]:
X = preprocessor.fit_transform(X)

In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)
X_train.shape,X_test.shape

((21480, 107), (9207, 107))

Create a Evaluation function

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [11]:
def eval_model(true,predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mean_squared_error(true,predicted))
    r2_square = r2_score(true,predicted)
    return mae,rmse,r2_square

In [12]:
models={
    'LinearRegression':LinearRegression(),
    'DecisionTreeRegressor':DecisionTreeRegressor(),
    'RandomForestRegressor':RandomForestRegressor(),
    'SVR':SVR()
}

model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    model_train_mae ,  model_train_rmse,model_train_r2 = eval_model(y_train,y_train_pred)
    model_test_mae , model_test_rmse,model_test_r2 = eval_model(y_test,y_test_pred)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))

    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))

    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')
    


LinearRegression
Model performance for Training set
- Root Mean Squared Error: 0.2750
- Mean Absolute Error: 0.2140
- R2 Score: 0.2909
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.2734
- Mean Absolute Error: 0.2128
- R2 Score: 0.2921


DecisionTreeRegressor
Model performance for Training set
- Root Mean Squared Error: 0.0260
- Mean Absolute Error: 0.0041
- R2 Score: 0.9937
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.1828
- Mean Absolute Error: 0.0769
- R2 Score: 0.6836


RandomForestRegressor
Model performance for Training set
- Root Mean Squared Error: 0.0575
- Mean Absolute Error: 0.0313
- R2 Score: 0.9690
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.1447
- Mean Absolute Error: 0.0808
- R2 Score: 0.8018


SVR
Model performance for Training set
- Root Mean Squared Error: 0.2285
- Mean Absolute Error: 0.1700
- R2 Score: 0.5105
----------

In [14]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
2,RandomForestRegressor,0.80182
1,DecisionTreeRegressor,0.683584
3,SVR,0.456721
0,LinearRegression,0.292071
