# Hyperparameter Tuning with GridSearchCV

Now you have your training data and you have the algorithm that you want to use,

but how do you know what design of this algorithm is the best ?

that's exactly what hyperparameter turning gave us


In [575]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import GridSearchCV , train_test_split

from sklearn.metrics import accuracy_score, precision_score,recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



In [547]:
titanic_df= pd.read_csv("Data/titanic_processed.csv")
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,2,0,17.0,0,0,12.0,True,False,False
1,0,1,1,36.0,1,0,78.85,False,False,True
2,0,3,0,14.5,1,0,14.4542,True,False,False
3,1,2,0,42.0,0,0,13.0,False,False,True
4,1,2,0,41.0,0,1,19.5,False,False,True


In [548]:
X= titanic_df.drop("Survived",axis=1)
Y= titanic_df["Survived"]

x_train, x_test, y_train, y_test= train_test_split(X,Y,test_size=0.2,random_state=42)


In [549]:
def summarize_classification(y_test,y_pred):

    acc= accuracy_score(y_test,y_pred,normalize=True)
    num_acc= accuracy_score(y_test,y_pred,normalize=False)

    prec= precision_score(y_test,y_pred)
    recall= recall_score(y_test,y_pred)

    print("Test Data Count: ", len(y_test))
    print("Accuracy Count: ",num_acc)
    print("Accuracy Score: ",acc)
    print("Precision Score: ",prec)
    print("Recall Score: ",recall)
    print()
    return {acc,prec,recall}

## Pruning for Perfection

In [568]:

parameters= {
    'criterion': ['gini', 'entropy'],
    'max_depth':[2,4,6,8,10],
   
    }

grid_seach= GridSearchCV(DecisionTreeClassifier(),parameters,cv=3,return_train_score=True)
grid_seach.fit(x_train,y_train)

grid_seach.best_params_


invalid value encountered in cast



{'criterion': 'entropy', 'max_depth': 6}

In [573]:
for i in range(10):
    print("parameters", grid_seach.cv_results_["params"][i])
    print("Mean Test Score", grid_seach.cv_results_['mean_test_score'][i])
    print("Rank", grid_seach.cv_results_['rank_test_score'][i])
    print("------------")

parameters {'criterion': 'gini', 'max_depth': 2}
Mean Test Score 0.7908010767659891
Rank 6
------------
parameters {'criterion': 'gini', 'max_depth': 4}
Mean Test Score 0.8066462452427364
Rank 4
------------
parameters {'criterion': 'gini', 'max_depth': 6}
Mean Test Score 0.8083635013459576
Rank 2
------------
parameters {'criterion': 'gini', 'max_depth': 8}
Mean Test Score 0.7855750487329435
Rank 8
------------
parameters {'criterion': 'gini', 'max_depth': 10}
Mean Test Score 0.7784646802190661
Rank 9
------------
parameters {'criterion': 'entropy', 'max_depth': 2}
Mean Test Score 0.7908010767659891
Rank 6
------------
parameters {'criterion': 'entropy', 'max_depth': 4}
Mean Test Score 0.803137473312912
Rank 5
------------
parameters {'criterion': 'entropy', 'max_depth': 6}
Mean Test Score 0.8188990996008539
Rank 1
------------
parameters {'criterion': 'entropy', 'max_depth': 8}
Mean Test Score 0.8066462452427365
Rank 3
------------
parameters {'criterion': 'entropy', 'max_depth': 10}

In [569]:

decision_tree_model= DecisionTreeClassifier(\
    max_depth=grid_seach.best_params_['max_depth'],criterion=grid_seach.best_params_['criterion'],
   
    ).fit(x_train,y_train)

In [570]:
y_pred=decision_tree_model.predict(x_test)

In [572]:
dt_scores=summarize_classification(y_test,y_pred)

Test Data Count:  143
Accuracy Count:  110.0
Accuracy Score:  0.7692307692307693
Precision Score:  0.7954545454545454
Recall Score:  0.5932203389830508



## Taming the Forest

In [576]:
parameters= {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
    }

grid_seach= GridSearchCV(RandomForestClassifier(),parameters,cv=3,return_train_score=True)
grid_seach.fit(x_train,y_train)

grid_seach.best_params_



324 fits failed out of a total of 648.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
324 fits failed with the following error:
Traceback (most recent call last):
  File "f:\Project\AI\Build_ML_Solutions_with_sklearn\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "f:\Project\AI\Build_ML_Solutions_with_sklearn\.venv\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "f:\Project\AI\Build_ML_Solutions_with_sklearn\.venv\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "f:\Project\AI\Build_ML_Solutions_with_sklearn\.venv\Lib\site-pa

{'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 100}

In [578]:
randomFromst_model=RandomForestClassifier(n_estimators=grid_seach.best_params_['n_estimators'],
                                          max_depth=grid_seach.best_params_['max_depth'],
                                          min_samples_leaf=grid_seach.best_params_['min_samples_leaf'],
                                          min_samples_split=grid_seach.best_params_['min_samples_leaf'],
                                          max_features=grid_seach.best_params_['max_features']
                                          ).fit(x_train,y_train)

In [579]:
y_pred_rf=randomFromst_model.predict(x_test)

In [580]:
rf_scores=summarize_classification(y_test,y_pred_rf)

Test Data Count:  143
Accuracy Count:  111.0
Accuracy Score:  0.7762237762237763
Precision Score:  0.7755102040816326
Recall Score:  0.6440677966101694



## Hyperparameter Optimization for Logistic Regression

In [555]:
parameters= {
    'penalty':['l1','l2'],
    'C':[0.1,0.4,0.8,1,2,5]
    
}

grid_seach= GridSearchCV(LogisticRegression(solver='liblinear'),parameters,cv=3,return_train_score=True,verbose=1)
grid_seach.fit(x_train,y_train)

grid_seach.best_params_

Fitting 3 folds for each of 12 candidates, totalling 36 fits


{'C': 2, 'penalty': 'l1'}

In [556]:
for i in range(10):
    print("parameters", grid_seach.cv_results_["params"][i])
    print("Mean Test Score", grid_seach.cv_results_['mean_test_score'][i])
    print("Rank", grid_seach.cv_results_['rank_test_score'][i])
    print("------------")

parameters {'C': 0.1, 'penalty': 'l1'}
Mean Test Score 0.7662211083263714
Rank 12
------------
parameters {'C': 0.1, 'penalty': 'l2'}
Mean Test Score 0.785547201336675
Rank 11
------------
parameters {'C': 0.4, 'penalty': 'l1'}
Mean Test Score 0.8013552399517311
Rank 10
------------
parameters {'C': 0.4, 'penalty': 'l2'}
Mean Test Score 0.8048918592778241
Rank 5
------------
parameters {'C': 0.8, 'penalty': 'l1'}
Mean Test Score 0.806618397846468
Rank 3
------------
parameters {'C': 0.8, 'penalty': 'l2'}
Mean Test Score 0.801364522417154
Rank 9
------------
parameters {'C': 1, 'penalty': 'l1'}
Mean Test Score 0.8048640118815557
Rank 7
------------
parameters {'C': 1, 'penalty': 'l2'}
Mean Test Score 0.8048825768124014
Rank 6
------------
parameters {'C': 2, 'penalty': 'l1'}
Mean Test Score 0.8101457347071381
Rank 1
------------
parameters {'C': 2, 'penalty': 'l2'}
Mean Test Score 0.8031003434512206
Rank 8
------------


In [557]:

logistic_model= LogisticRegression(solver='liblinear',
    penalty=grid_seach.best_params_['penalty'],C=grid_seach.best_params_["C"]
    ).fit(x_train,y_train)

In [558]:
y_perd_lr= logistic_model.predict(x_test)

In [559]:
lr_scores=summarize_classification(y_test,y_perd_lr)

Test Data Count:  143
Accuracy Count:  111.0
Accuracy Score:  0.7762237762237763
Precision Score:  0.7547169811320755
Recall Score:  0.6779661016949152



In [560]:
type(lr_scores)

set

## Comparing Models Performance

In [584]:
import pandas as pd
import plotly.express as px

# Define models and their scores
models = ['Logistic Regression', 'Decision Tree', 'Random Forest']
metrics = ['Accuracy', 'Precision', 'Recall']

# Scores for each model


# Create the DataFrame
data = {
    'Model': (
        ['Logistic Regression'] * len(metrics) +
        ['Decision Tree'] * len(metrics) +
        ['Random Forest'] * len(metrics)
    ),
    'Metric': metrics * len(models),
    'Score': list(lr_scores) + list(dt_scores) + list(rf_scores)
}

df = pd.DataFrame(data)

# Plotting
fig = px.bar(df, x='Metric', y='Score', color='Model',
             barmode='group', title='Model Comparison: Logistic Regression vs Decision Tree vs Random Forest',
             labels={'Score': 'Scores', 'Metric': 'Metrics'})

# Show the plot
fig.show()
