<a href="https://colab.research.google.com/github/ABHAY7238/Road-to-Data-Scientist-/blob/main/Best_model_hyperparameter_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## SELECTING THE BEST MODEL WITH THE BEST HYPERPARAMETERS

In [None]:
#import libraries
import  pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

#libraries for train_test_split data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , LabelEncoder

#import regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score , mean_absolute_error , mean_squared_error
from xgboost import XGBRegressor
#import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV

#import preprocessor
from sklearn.preprocessing import StandardScaler , MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline




In [None]:
#load the 'tips' example dataset through seaborn into a pandas DataFrame
df = sns.load_dataset('tips')
#preview the first rows to see which columns are available
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [None]:
#inspect column dtypes and non-null counts before any preprocessing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [None]:
#replace each categorical column with integer codes, in place on df
#(the single encoder is refit per column, so afterwards it only remembers 'time')
encoder = LabelEncoder()
for column in ('sex', 'smoker', 'day', 'time'):
    df[column] = encoder.fit_transform(df[column])


In [None]:
#the tip amount is the regression target; every other column is a feature
y = df['tip']
X = df.drop(columns = 'tip')

#hold out 20% of the rows for testing, with a fixed seed so the split is repeatable
X_train , X_test , y_train , y_test = train_test_split(
    X ,
    y ,
    test_size = 0.2 ,
    random_state = 42 ,
)

In [None]:
#candidate regressors keyed by name, all with their default hyperparameters
#(XGBRegressor is imported above but intentionally left out of this comparison)
models = dict(
    LinearRegression = LinearRegression(),
    SVR = SVR(),
    DecisionTreeRegressor = DecisionTreeRegressor(),
    RandomForestRegressor = RandomForestRegressor(),
    KNeighborsRegressor = KNeighborsRegressor(),
    AdaBoostRegressor = AdaBoostRegressor(),
    GradientBoostingRegressor = GradientBoostingRegressor(),
)

#fit each candidate on the training split and record its MAE on the test split
model_scores = []
for name , estimator in models.items():
    estimator.fit(X_train , y_train)
    test_predictions = estimator.predict(X_test)
    test_mae = mean_absolute_error(y_test , test_predictions)
    model_scores.append((name , test_mae))

#rank the candidates from lowest to highest error (lower MAE is better) and report them
sorted_models = list(model_scores)
sorted_models.sort(key = lambda entry : entry[1])
for name , test_mae in sorted_models:
    print(f"mean_Absolute_error for {name} is {test_mae: .2f}")

mean_Absolute_error for SVR is  0.57
mean_Absolute_error for LinearRegression is  0.67
mean_Absolute_error for KNeighborsRegressor is  0.73
mean_Absolute_error for GradientBoostingRegressor is  0.73
mean_Absolute_error for RandomForestRegressor is  0.74
mean_Absolute_error for DecisionTreeRegressor is  0.86
mean_Absolute_error for AdaBoostRegressor is  0.91


**HYPERPARAMETER TUNING**

In [None]:
  #map each model name to a (estimator, parameter grid) pair for GridSearchCV;
  #an empty grid means the estimator is only evaluated with its default settings
  models  = {
    'LinearRegression' : (LinearRegression(), {}),
    'SVR' : (SVR() , {'kernel': ['rbf' , 'poly' , 'sigmoid'] , 'C' : [0.1 , 1 , 10] , 'gamma' : [1,0.1,0.001]}),
    #max_depth must be an int or the Python object None (no depth limit);
    #the string 'NONE' is not a valid value and makes every fit for that candidate fail
    'DecisionTreeRegressor' : (DecisionTreeRegressor() , {'max_depth' : [None,5,10] , 'splitter' : ['best' , 'random']}),
    'RandomForestRegressor' : (RandomForestRegressor() , {'n_estimators' : [10,100], 'max_depth' : [5,10] , 'min_samples_split' : [2,5]}),
    #odd neighbour counts from 3 to 99
    'KNeighborsRegressor' : (KNeighborsRegressor() , {'n_neighbors' : np.arange(3,100,2)}),
    'AdaBoostRegressor' : (AdaBoostRegressor() , {'n_estimators' : [10,100,1000]}),
    'GradientBoostingRegressor' : (GradientBoostingRegressor() , {'n_estimators' : [10,100,1000]})
  }

In [None]:
#tune every model with a 5-fold grid search on the training split, then report
#how the best parameter combination performs on the held-out test split
for model_name , (model , param_grid) in models.items():
    #NOTE: despite its name, `pipeline` holds a GridSearchCV object, not a sklearn Pipeline
    pipeline = GridSearchCV(model , param_grid , cv = 5)
    pipeline.fit(X_train , y_train)

    #predict on the test split with the fitted grid search
    y_pred = pipeline.predict(X_test)

    #report inside the loop so every model is printed, not only the last one iterated
    print(model_name , 'best params : ' , pipeline.best_params_)
    print(model_name , 'MSE : ' , mean_squared_error(y_test , y_pred))
    print(model_name , 'R2_score' , r2_score(y_test , y_pred))
    print(model_name , 'MAE' , mean_absolute_error(y_test , y_pred))
    #blank separator between models ('\n' is the newline escape; '/n' printed literally)
    print('\n')