# Selecting the best model with Best Parameters

In [7]:
# import libraeies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# importing the Regressor Alogorithems
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRFRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

# import the metics
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

# improt the gridsearchcv
from sklearn.model_selection import GridSearchCV,train_test_split
# import the preprocessor
from sklearn.preprocessing import StandardScaler,MinMaxScaler,LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [8]:
# load the dataset
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [9]:
# split the data into X and y
X = df.drop('tip',axis=1)
y = df['tip']

# encode the categorical variables
le = LabelEncoder()
X['sex'] = le.fit_transform(df['sex'])
X['smoker'] = le.fit_transform(df['smoker'])
X['day'] = le.fit_transform(df['day'])
X['time'] = le.fit_transform(df['time'])

In [12]:
# train test split 
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# here make a dictionary for all the model with thier name
models = {
    'SVR':SVR(),
    'DecisionTreeRegressor':DecisionTreeRegressor(),
    'RandomForestRegressor':RandomForestRegressor(),
    'GradientBoostingRegressor':GradientBoostingRegressor(),
    'XGBRFRegressor':XGBRFRegressor(),
    'KNeighborsRegressor':KNeighborsRegressor(),
    'LinearRegression':LinearRegression()
}

# train and predict each model with evaluation metris as well make for loop to itrate over models
model_scores = []
for name,model in models.items():
    model.fit(X_train,y_train)
    
    # make predictions of each model
    y_pred = model.predict(X_test)
    metric = mean_absolute_error(y_test,y_pred)
    model_scores.append((name,metric))
    
    
    
sorted_models = sorted(model_scores,key=lambda x:x[1],reverse=False)
for model in sorted_models:
    print('mean_absolute_error for',f'{model[0]} is {model[1]:.2f}')


mean_absolute_error for SVR is 0.57
mean_absolute_error for LinearRegression is 0.67
mean_absolute_error for GradientBoostingRegressor is 0.73
mean_absolute_error for KNeighborsRegressor is 0.73
mean_absolute_error for XGBRFRegressor is 0.75
mean_absolute_error for RandomForestRegressor is 0.76
mean_absolute_error for DecisionTreeRegressor is 0.83


------
# Hyperparameter Tuning

In [17]:
models = {
    'SVR':(SVR(), {'kernel': ['rbf', 'poly', 'sigmoid']}),
    'DecisionTreeRegressor': (DecisionTreeRegressor(), {'max_depth':[None,5,10]}),
    'RandomForestRegressor':(RandomForestRegressor(),{'n_estimators' : [10,100]}),
    'GradientBoostingRegressor':(GradientBoostingRegressor(), {'n_estimators' : [10,100]}),
    'XGBRFRegressor':(XGBRFRegressor(), {'n_estimators' : [10,100]}),
    'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
    'LinearRegression':(LinearRegression(), {})
}

# train and predict each model with evaluation metrics for loop for each model
for name,(model,params) in models.items():
    # create a pipline
    pipeline = GridSearchCV(model,params,cv=5)
    
    # fit the pipline on training data
    pipeline.fit(X_train,y_train)
    # predict
    y_pred = pipeline.predict(X_test)
    
    # printing the evaluation metrics
    print(name,'MSE',mean_squared_error(y_test,y_pred))
    print(name,'MAE',mean_absolute_error(y_test,y_pred))
    print(name,'R2',r2_score(y_test,y_pred))
    print('\n')

SVR MSE 1.460718141299992
SVR MAE 0.8935334948775431
SVR R2 -0.1686013018011976


DecisionTreeRegressor MSE 0.8774153020453993
DecisionTreeRegressor MAE 0.7189481629481629
DecisionTreeRegressor R2 0.298051667053291


RandomForestRegressor MSE 0.9313885863265319
RandomForestRegressor MAE 0.7715326530612246
RandomForestRegressor R2 0.2548720497882624


GradientBoostingRegressor MSE 0.8106801524004932
GradientBoostingRegressor MAE 0.7657809818712309
GradientBoostingRegressor R2 0.35144101065487676


XGBRFRegressor MSE 0.8832473346171744
XGBRFRegressor MAE 0.7501231595448086
XGBRFRegressor R2 0.29338593403962554


KNeighborsRegressor MSE 0.6640950568462677
KNeighborsRegressor MAE 0.6203721488595437
KNeighborsRegressor R2 0.4687117753876745


LinearRegression MSE 0.6948129686287711
LinearRegression MAE 0.6703807496461157
LinearRegression R2 0.4441368826121931


