# Selecting the best model with Best hyperparameters

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# train test split the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# import regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

#import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV

# import preprocessors
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# load dataset
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

# Regression Tasks

In [4]:
# select features and variables
X = df.drop('tip', axis=1)
y = df['tip']

# label encode categorical variables
le = LabelEncoder()
X['sex'] = le.fit_transform(X['sex'])
X['smoker'] = le.fit_transform(X['smoker'])
X['day'] = le.fit_transform(X['day'])
X['time'] = le.fit_transform(X['time'])

### Evaluation on the basis of Mean Absolute Error:

In [5]:
%%time
# split the data into train and test data with 80% training dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a dictionaries of list of models to evaluate performance
models = { 
          'LinearRegression' : LinearRegression(),
          'SVR' : SVR(),
          'DecisionTreeRegressor' : DecisionTreeRegressor(),
          'RandomForestRegressor' : RandomForestRegressor(),
          'KNeighborsRegressor' : KNeighborsRegressor(),
          'GradientBoostingRegressor' : GradientBoostingRegressor(),
          'XGBRegressor' : XGBRegressor()          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models
model_scores = []

for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = model.predict(X_test)
    metric = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, metric))
    
    # # print the performing metric
    # print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    # print(name, 'R2: ', r2_score(y_test, y_pred))
    # print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    # print('\n')
    
# selecting the best model from all above models with evaluation metrics sorting method
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('Mean Absolute error for', f"{model[0]} is {model[1]: .2f}") 

Mean Absolute error for SVR is  0.57
Mean Absolute error for LinearRegression is  0.67
Mean Absolute error for XGBRegressor is  0.67
Mean Absolute error for KNeighborsRegressor is  0.73
Mean Absolute error for GradientBoostingRegressor is  0.73
Mean Absolute error for RandomForestRegressor is  0.76
Mean Absolute error for DecisionTreeRegressor is  0.98
CPU times: total: 641 ms
Wall time: 449 ms


### Evaluation on the basis of R_squared Score:

In [6]:
%%time
# split the data into train and test data with 80% training dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a dictionaries of list of models to evaluate performance
models = { 
          'LinearRegression' : LinearRegression(),
          'SVR' : SVR(),
          'DecisionTreeRegressor' : DecisionTreeRegressor(),
          'RandomForestRegressor' : RandomForestRegressor(),
          'KNeighborsRegressor' : KNeighborsRegressor(),
          'GradientBoostingRegressor' : GradientBoostingRegressor(),
          'XGBRegressor' : XGBRegressor()          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = model.predict(X_test)
    metric = r2_score(y_test, y_pred)
    model_scores.append((name, metric))
    
    # # print the performing metric
    # print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    # print(name, 'R2: ', r2_score(y_test, y_pred))
    # print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    # print('\n')
    
# selecting the best model from all above models with evaluation metrics sorting method
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=True)
for model in sorted_models:
    print('R_squared Score', f"{model[0]} is {model[1]: .2f}") 

R_squared Score SVR is  0.57
R_squared Score LinearRegression is  0.44
R_squared Score XGBRegressor is  0.41
R_squared Score GradientBoostingRegressor is  0.35
R_squared Score KNeighborsRegressor is  0.33
R_squared Score RandomForestRegressor is  0.25
R_squared Score DecisionTreeRegressor is -0.09
CPU times: total: 547 ms
Wall time: 467 ms


### Evaluation on the basis of Mean Squared Error:

In [7]:
%%time
# split the data into train and test data with 80% training dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a dictionaries of list of models to evaluate performance
models = { 
          'LinearRegression' : LinearRegression(),
          'SVR' : SVR(),
          'DecisionTreeRegressor' : DecisionTreeRegressor(),
          'RandomForestRegressor' : RandomForestRegressor(),
          'KNeighborsRegressor' : KNeighborsRegressor(),
          'GradientBoostingRegressor' : GradientBoostingRegressor(),
          'XGBRegressor' : XGBRegressor()          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = model.predict(X_test)
    metric = mean_squared_error(y_test, y_pred)
    model_scores.append((name, metric))
    
    # # print the performing metric
    # print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    # print(name, 'R2: ', r2_score(y_test, y_pred))
    # print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    # print('\n')

# selecting the best model from all above models with evaluation metrics sorting method
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('Mean Squared error for', f"{model[0]} is {model[1]: .2f}") 

Mean Squared error for SVR is  0.54
Mean Squared error for LinearRegression is  0.69
Mean Squared error for XGBRegressor is  0.74
Mean Squared error for GradientBoostingRegressor is  0.81
Mean Squared error for KNeighborsRegressor is  0.84
Mean Squared error for RandomForestRegressor is  0.96
Mean Squared error for DecisionTreeRegressor is  1.15
CPU times: total: 547 ms
Wall time: 476 ms


# Hyperparameter Tuning

In [8]:
%%time
# Create a dictionaries of list of models to evaluate performance with hyperparameters
models = { 
          'LinearRegression' : (LinearRegression(),{}),
          'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid']}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10]}),
          'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100]}),
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
          'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'n_estimators': [10, 100]}),
          'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100]}),          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models
for name, (model, params) in models.items():
    # create a pipline
    pipeline = GridSearchCV(model, params, cv=5)
    
    # fit the pipeline
    pipeline.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = pipeline.predict(X_test)
    
      
    # print the performing metric
    print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    print(name, 'R2: ', r2_score(y_test, y_pred))
    print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    print('\n')

LinearRegression MSE:  0.6948129686287711
LinearRegression R2:  0.4441368826121931
LinearRegression MAE:  0.6703807496461157


SVR MSE:  1.460718141299992
SVR R2:  -0.1686013018011976
SVR MAE:  0.8935334948775431


DecisionTreeRegressor MSE:  0.8774153020453993
DecisionTreeRegressor R2:  0.298051667053291
DecisionTreeRegressor MAE:  0.718948162948163


RandomForestRegressor MSE:  0.8920837073469396
RandomForestRegressor R2:  0.2863166738016345
RandomForestRegressor MAE:  0.765187755102041


KNeighborsRegressor MSE:  0.6640950568462677
KNeighborsRegressor R2:  0.4687117753876745
KNeighborsRegressor MAE:  0.6203721488595437


GradientBoostingRegressor MSE:  0.8106801524004932
GradientBoostingRegressor R2:  0.35144101065487676
GradientBoostingRegressor MAE:  0.7657809818712309


XGBRegressor MSE:  0.6624107100882575
XGBRegressor R2:  0.4700592836840687
XGBRegressor MAE:  0.6549163442728472


CPU times: total: 4.55 s
Wall time: 6.66 s


### To get best parameters of each model, using a for loop

In [9]:
# Create a dictionaries of list of models to evaluate performance with hyperparameters
models = { 
          'LinearRegression' : (LinearRegression(),{}),
          'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid']}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10]}),
          'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100]}),
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
          'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'n_estimators': [10, 100]}),
          'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100]}),         
          }

# initialize the variables to track the best model and its performance
best_model = None
best_mse = float('inf')
best_r2 = -float('inf')
best_mae = float('inf')

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models 
for name, (model, params) in models.items():
    # create a pipline
    pipeline = GridSearchCV(model, params, cv=5)
    
    # fit the pipeline
    pipeline.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = pipeline.predict(X_test)

    # calulate the evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
      
    # print the performing metric
    print(name, 'MSE: ', mse)
    print(name, 'R2: ', r2)
    print(name, 'MAE: ',mae)
    print('\n')

    # check if this model has better performance
    if mse < best_mse:
        best_model = pipeline
        best_mse = mse
        best_r2 = r2
        best_mae = mae

    # print the best model's performance metrics
    print('Best Model :', best_model.best_estimator_)
    print('Best MSE:', best_mse)
    print('Best MAE:', best_mae)
    print('Best R2:', best_r2)

LinearRegression MSE:  0.6948129686287711
LinearRegression R2:  0.4441368826121931
LinearRegression MAE:  0.6703807496461157


Best Model : LinearRegression()
Best MSE: 0.6948129686287711
Best MAE: 0.6703807496461157
Best R2: 0.4441368826121931
SVR MSE:  1.460718141299992
SVR R2:  -0.1686013018011976
SVR MAE:  0.8935334948775431


Best Model : LinearRegression()
Best MSE: 0.6948129686287711
Best MAE: 0.6703807496461157
Best R2: 0.4441368826121931
DecisionTreeRegressor MSE:  0.8774153020453993
DecisionTreeRegressor R2:  0.298051667053291
DecisionTreeRegressor MAE:  0.7189481629481629


Best Model : LinearRegression()
Best MSE: 0.6948129686287711
Best MAE: 0.6703807496461157
Best R2: 0.4441368826121931
RandomForestRegressor MSE:  0.9284110248979601
RandomForestRegressor R2:  0.25725415353794623
RandomForestRegressor MAE:  0.7654408163265306


Best Model : LinearRegression()
Best MSE: 0.6948129686287711
Best MAE: 0.6703807496461157
Best R2: 0.4441368826121931
KNeighborsRegressor MSE:  0.6

# Add preprocessor inside the pipeline

In [13]:
# Make a preprocessor
preprocessor = ColumnTransformer(
    transformers=[('numeric_scaling', StandardScaler(), ['total_bill', 'size'])],
    remainder='passthrough'
)

# Create a dictionary of models to evaluate performance with hyperparameters
models = { 
    'LinearRegression': (LinearRegression(), {}),
    'SVR': (SVR(), {'model__kernel': ['rbf', 'poly', 'sigmoid']}),
    'DecisionTreeRegressor': (DecisionTreeRegressor(), {'model__max_depth': [None, 5, 10]}),
    'RandomForestRegressor': (RandomForestRegressor(), {'model__n_estimators': [10, 100]}),
    'KNeighborsRegressor': (KNeighborsRegressor(), {'model__n_neighbors': np.arange(3, 100, 2)}),
    'GradientBoostingRegressor': (GradientBoostingRegressor(), {'model__n_estimators': [10, 100]}),
    'XGBRegressor': (XGBRegressor(), {'model__n_estimators': [10, 100]}),
}

# Train and predict each model with evaluation metrics using a for loop to iterate over the models
for name, (model, params) in models.items():
    # Create a pipeline with preprocessor
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])   
    
    # Make a grid search to tune the hyperparameters
    grid_search = GridSearchCV(pipeline, params, cv=5)
    
    # Fit the pipeline
    grid_search.fit(X_train, y_train)
    
    # Make predictions from each model
    y_pred = grid_search.predict(X_test)
    
    # Print the performance metrics
    print(f"{name} MSE: {mean_squared_error(y_test, y_pred)}")
    print(f"{name} R2: {r2_score(y_test, y_pred)}")
    print(f"{name} MAE: {mean_absolute_error(y_test, y_pred)}\n")


LinearRegression MSE: 0.6948129686287682
LinearRegression R2: 0.4441368826121954
LinearRegression MAE: 0.6703807496461148

SVR MSE: 0.6213485529050293
SVR R2: 0.502909762487858
SVR MAE: 0.6122661870970085

DecisionTreeRegressor MSE: 0.8774153020453993
DecisionTreeRegressor R2: 0.298051667053291
DecisionTreeRegressor MAE: 0.7189481629481629

RandomForestRegressor MSE: 0.9640500142857159
RandomForestRegressor R2: 0.2287423084284268
RandomForestRegressor MAE: 0.7683673469387756

KNeighborsRegressor MSE: 0.7266209120685312
KNeighborsRegressor R2: 0.4186899445203298
KNeighborsRegressor MAE: 0.6984580498866213

GradientBoostingRegressor MSE: 0.8106801524004931
GradientBoostingRegressor R2: 0.3514410106548769
GradientBoostingRegressor MAE: 0.7657809818712309

XGBRegressor MSE: 0.6624107100882575
XGBRegressor R2: 0.4700592836840687
XGBRegressor MAE: 0.6549163442728472



# Classifiers

In [14]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# dont show warnings
import warnings
warnings.filterwarnings('ignore')

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Create a dictionary of classifiers to evaluate
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier()
}

# Perform k-fold cross-validation and calculate the mean accuracy
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for name, classifier in classifiers.items():
    scores = cross_val_score(classifier, X, y, cv=kfold)
    accuracy = np.mean(scores)
    print("Classifier:", name)
    print("Mean Accuracy:", accuracy)
    print()

Classifier: Logistic Regression
Mean Accuracy: 0.9733333333333334

Classifier: Decision Tree
Mean Accuracy: 0.9533333333333335

Classifier: Random Forest
Mean Accuracy: 0.9600000000000002

Classifier: SVM
Mean Accuracy: 0.9666666666666668

Classifier: KNN
Mean Accuracy: 0.9733333333333334

