# Select the best model with Best Hyperparameters

In [6]:
# import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# train test split the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# import regression algorithms 
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error , r2_score

# import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV

# import preprocessors
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# load the 'tips' example dataset through seaborn into a DataFrame named df
df = sns.load_dataset('tips')

In [3]:
# preview the first rows of the dataset to check how the columns look
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# list the column names available in the dataset
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

# Regression Tasks

In [5]:
# Separate the target ('tip') from the predictors (every other column)
y = df['tip']
X = df.drop(columns='tip')

# Replace each categorical column with integer codes.
# One encoder object is re-fitted per column, so after the loop `le`
# only remembers the mapping of the last column ('time').
le = LabelEncoder()
for column in ('sex', 'smoker', 'day', 'time'):
    X[column] = le.fit_transform(X[column])

In [65]:
%%time
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate regressors with default settings, keyed by their class name
models = {
    estimator_cls.__name__: estimator_cls()
    for estimator_cls in (
        LinearRegression,
        SVR,
        DecisionTreeRegressor,
        RandomForestRegressor,
        KNeighborsRegressor,
        GradientBoostingRegressor,
        XGBRegressor,
    )
}

# Fit every candidate on the training split and record its error on the test split
model_scores = []
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    metric = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, metric))

# Rank the candidates from the lowest (best) to the highest error and report them
sorted_models = sorted(model_scores, key=lambda entry: entry[1])
for model_name, error in sorted_models:
    print("Mean Absolute error for", f"{model_name} is {error: .2f}")

Mean Absolute error for SVR is  0.57
Mean Absolute error for LinearRegression is  0.67
Mean Absolute error for XGBRegressor is  0.67
Mean Absolute error for KNeighborsRegressor is  0.73
Mean Absolute error for GradientBoostingRegressor is  0.73
Mean Absolute error for RandomForestRegressor is  0.79
Mean Absolute error for DecisionTreeRegressor is  0.89
CPU times: total: 719 ms
Wall time: 810 ms


In [11]:
%%time
# Same 80/20 split as before (seed 42), so the rows in each split are repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Default-configured regressors to compare, keyed by their class name
regressor_classes = (
    LinearRegression,
    SVR,
    DecisionTreeRegressor,
    RandomForestRegressor,
    KNeighborsRegressor,
    GradientBoostingRegressor,
    XGBRegressor,
)
models = {regressor.__name__: regressor() for regressor in regressor_classes}

# Train each regressor, predict on the held-out rows and keep (name, MSE) pairs
model_scores = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metric = mean_squared_error(y_test, y_pred)
    model_scores.append((name, metric))

# Smallest squared error first: the top line of the printout is the best model
sorted_models = sorted(model_scores, key=lambda pair: pair[1])
for model_name, error in sorted_models:
    print("Mean Squared error for", f"{model_name} is {error: .2f}")

Mean Squared error for SVR is  0.54
Mean Squared error for LinearRegression is  0.69
Mean Squared error for XGBRegressor is  0.74
Mean Squared error for GradientBoostingRegressor is  0.80
Mean Squared error for KNeighborsRegressor is  0.84
Mean Squared error for RandomForestRegressor is  0.94
Mean Squared error for DecisionTreeRegressor is  1.40
CPU times: total: 750 ms
Wall time: 828 ms


# Assignment: Using the Diamonds dataset, find the best model according to each of the metrics used above

In [14]:
# load the 'diamonds' example dataset through seaborn for the assignment
diamonds = sns.load_dataset('diamonds')

---

# Hyperparameter tuning:

In [44]:
%%time
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Each entry maps a display name to (estimator, hyperparameter grid to search).
# An empty grid means "evaluate the estimator with its default settings only".
models = {
    'LinearRegression': (LinearRegression(), {}),
    'SVR': (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid']}),
    'DecisionTreeRegressor': (DecisionTreeRegressor(), {'max_depth': [None, 5, 10]}),
    'RandomForestRegressor': (RandomForestRegressor(), {'n_estimators': [10, 100]}),
    'KNeighborsRegressor': (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
    'GradientBoostingRegressor': (GradientBoostingRegressor(), {'n_estimators': [10, 100]}),
    'XGBRegressor': (XGBRegressor(), {'n_estimators': [10, 100]}),
}

# Metrics reported for every model, in print order: (label, scoring function)
reported_metrics = (
    ('MSE: ', mean_squared_error),
    ('R2: ', r2_score),
    ('MAE: ', mean_absolute_error),
)

for name, (model, params) in models.items():
    # `pipeline` is a GridSearchCV object (not an sklearn Pipeline): it tries every
    # combination in `params` with 5-fold cross-validation on the training data
    pipeline = GridSearchCV(model, params, cv=5).fit(X_train, y_train)

    # predict on the held-out test rows with the fitted search object
    y_pred = pipeline.predict(X_test)

    # print the performance metrics, followed by a blank separator
    for label, metric_fn in reported_metrics:
        print(name, label, metric_fn(y_test, y_pred))
    print('\n')

LinearRegression MSE:  0.6948129686287711
LinearRegression R2:  0.4441368826121931
LinearRegression MAE:  0.6703807496461157


SVR MSE:  1.460718141299992
SVR R2:  -0.1686013018011976
SVR MAE:  0.8935334948775431


DecisionTreeRegressor MSE:  0.8774153020453993
DecisionTreeRegressor R2:  0.298051667053291
DecisionTreeRegressor MAE:  0.7189481629481629


RandomForestRegressor MSE:  0.9767159926530623
RandomForestRegressor R2:  0.21860929344752578
RandomForestRegressor MAE:  0.7788734693877551


KNeighborsRegressor MSE:  0.6640950568462677
KNeighborsRegressor R2:  0.4687117753876745
KNeighborsRegressor MAE:  0.6203721488595437


GradientBoostingRegressor MSE:  0.8106801524004931
GradientBoostingRegressor R2:  0.3514410106548769
GradientBoostingRegressor MAE:  0.7657809818712309


XGBRegressor MSE:  0.6624107100882575
XGBRegressor R2:  0.4700592836840687
XGBRegressor MAE:  0.6549163442728472


CPU times: total: 7.5 s
Wall time: 6.05 s


In [45]:
%%time 
# Create a dictionary of models to evaluate, each paired with the hyperparameter grid to search.
# This cell reuses the X_train / X_test / y_train / y_test split created by an earlier cell.
models = {
          'LinearRegression' : (LinearRegression(), {}),
          'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid'], 'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01], 'epsilon': [0.1, 0.01, 0.001]}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10], 'splitter': ['random']}),
          'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100, 1000], 'max_depth': [None, 5, 10]}),
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2), 'weights': ['uniform', 'distance']}),
          # 'squared_error' and 'absolute_error' are the current scikit-learn names for the
          # losses formerly spelled 'ls' and 'lad'; recent releases reject the old spellings,
          # which would make every candidate using them fail during the grid search.
          'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'], 'n_estimators': [10, 100, 1000]}),
          'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100, 1000], 'learning_rate': [0.1, 0.01, 0.001]})
}

# train and predict each model with evaluation metrics, looping over the models

for name, (model, params) in models.items():
    # create the grid search (note: this is a GridSearchCV object, not an sklearn Pipeline);
    # every combination in `params` is evaluated with 5-fold cross-validation
    pipeline = GridSearchCV(model, params, cv=5)

    # fit the grid search on the training data
    pipeline.fit(X_train, y_train)

    # make predictions on the held-out test data with the fitted search object
    y_pred = pipeline.predict(X_test)

    # print the performance metrics
    print(name, 'MSE:' , mean_squared_error(y_test, y_pred))
    print(name, 'R2:' , r2_score(y_test, y_pred))
    print(name, 'MAE:' , mean_absolute_error(y_test, y_pred))


LinearRegression MSE: 0.6948129686287711
LinearRegression R2: 0.4441368826121931
LinearRegression MAE: 0.6703807496461157


# Assignment: How do you get the best parameters of each model? Add this inside the for loop in the code above. How do you then get the best model out of it?

# Solution

---

# Add preprocessor inside the pipeline

# Classifier

In [None]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# don't show warnings (note: this silences every warning for the rest of the session)
import warnings
warnings.filterwarnings('ignore')

# load the Iris dataset
# (this rebinds X and y, which earlier cells used for the tips regression data)
iris = load_iris()
X = iris.data
y = iris.target

# create a dictionary of classifiers to evaluate, keyed by display name.
# The dictionary is called `classifiers` (plural) so that the loop variable
# `classifier` below does not overwrite the dictionary it is iterating over.
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier()
}

# perform k-fold cross-validation and calculate the mean accuracy;
# shuffling with a fixed seed keeps the five folds the same on every run
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for name, classifier in classifiers.items():
    # one score per fold, then averaged into a single number per classifier
    scores = cross_val_score(classifier, X, y, cv=kfold)
    accuracy = np.mean(scores)
    print("Classifier:", name)
    print("Mean Accuracy:", accuracy)
    print()

# Main Assignment:

 Write the complete code to select the best regressor and the best classifier for the diamonds dataset `(if you have a high-end machine, you can use the whole dataset; otherwise use the sample dataset provided in the link)`. Alternatively, you can use the Tips dataset for the regression task and the Iris dataset for the classification task.

 You have to try all suitable models with a range of possible hyperparameters, compare the tuned models with each other, and select the best model for the given dataset.

 Your code should be complete and properly explained: each and every step of the code should be commented clearly enough for a layman to follow.

 Your code should also save the best model in the pickle file.

 You should also write the code to load the pickle file and use it for prediction in the last snippet of the code.

 Submit your assignment to the discord inbox.