In [1]:
import pandas as pd

In [2]:
# read iris dataset
iris = pd.read_csv( r"C:\Users\acreddy\Desktop\abc\LogisticReg-Binary_multiclass\data\iris.csv")

In [3]:
iris.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [6]:
# split target and independent variables
X= iris.iloc[:, :-1]
y= iris.iloc[:, -1]

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# split the data in to train and test datasets
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.25, random_state=42)

In [9]:
X_train.shape,   y_train.shape

((112, 4), (112,))

In [10]:
X_test.shape, y_test.shape

((38, 4), (38,))

In [57]:
from sklearn.preprocessing import LabelEncoder

In [58]:
# initilize the lable encoder
encoder= LabelEncoder()

In [70]:
# fit_transform the y_train
y_train_encoded= encoder.fit_transform(y_train)

In [71]:
y_test_encoded= encoder.transform(y_test)

In [72]:
# to know which class assigned to which label
label_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))

In [73]:
label_mapping

{'setosa': 0, 'versicolor': 1, 'virginica': 2}

# Hyper Param tuning using Combination of Random and GridSearchCV

In [80]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# from sklearn.pipeline import Pipeline
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [81]:
# create a pipeline with label-encoding the target and logistic reg model.
# model= Pipeline([
#     ("label_encoder", LabelEncoder()),
#     ("log_reg", LogisticRegression())
# ])
# this pipeline doesnt work as label_encoder doesnt work as StandardScaler() we need to explicitly encode outside 
#this pipeline or use ColumnTransformer method

In [82]:
# initialize LogisticRegression
log_reg= LogisticRegression()

In [83]:
# Define the params of logisitc regression in param distribution for randomized search cv.
param_dist_random= {
    "C": np.logspace(-4, 4, 20),
    "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
    "penalty": ["l1", "l2"],
    "multi_class": ["ovr", "multinomial"],
    "tol": [1e-4, 1e-3, 1e-2], # model will stop once the slopes values changes below than this difference.
    "max_iter": [50,100,200,500]
}

In [87]:
# Perform Randomized Search
random_search = RandomizedSearchCV(
    log_reg,
    param_distributions=param_dist_random,
    n_iter=10,  # Number of random samples
    cv=5,       # Cross-validation folds
    scoring="accuracy", # its classification problem so i have chosen accuracy we can chose f1_score, precision..etc.
    random_state=42
)

In [88]:
# Fit the random_search model to the train data
random_search.fit(X_train, y_train_encoded)

RandomizedSearchCV(cv=5, estimator=LogisticRegression(),
                   param_distributions={'C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
                                        'max_iter': [50, 100, 200, 500],
                                        'multi_class': ['ovr', 'multinomial'],
                                        'penalty': ['l1', 'l2'],
                                        'solver': ['newton-cg', 'lbfgs',
                                                   'liblinear', 'sag', 'saga'],
                                        'tol': [0.0001, 0.001, 0.01]},
                   random_state=42, scoring='accuracy')

In [93]:
# Get the best hyperparameters from Randomized Search
best_params_random_search = random_search.best_params_

In [94]:
best_params_random_search

{'tol': 0.0001,
 'solver': 'sag',
 'penalty': 'l2',
 'multi_class': 'ovr',
 'max_iter': 100,
 'C': 78.47599703514607}

In [95]:
# Define hyperparameter search space for Grid Search around the best parameters from Randomized Search
grid_param_grid = {
    "tol": [best_params_random_search["tol"]],
    "solver": [best_params_random_search["solver"]],
    "penalty": [best_params_random_search["penalty"]],
    "multi_class": [best_params_random_search["multi_class"]],
    "max_iter": np.arange(50, 500, 50), # i have taken all the best params from random_search_cv. except this one
    "C": [best_params_random_search["C"]],
}

In [97]:
# Perform Grid Search around the best parameters from Randomized Search
random_grid_search = GridSearchCV(
    log_reg, 
    param_grid=grid_param_grid,
    cv=5,       # Cross-validation folds
    scoring="accuracy",
    verbose=1
)

In [99]:
# Fit the grid_Search model to the train data
random_grid_search.fit(X_train, y_train_encoded)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [78.47599703514607],
                         'max_iter': array([ 50, 100, 150, 200, 250, 300, 350, 400, 450]),
                         'multi_class': ['ovr'], 'penalty': ['l2'],
                         'solver': ['sag'], 'tol': [0.0001]},
             scoring='accuracy', verbose=1)

In [100]:
# Get the best hyperparameters from Grid Search
best_params_grid = random_grid_search.best_params_

In [101]:
best_params_grid

{'C': 78.47599703514607,
 'max_iter': 100,
 'multi_class': 'ovr',
 'penalty': 'l2',
 'solver': 'sag',
 'tol': 0.0001}

In [102]:
# Compare the results
print("Best Hyperparameters from Randomized Search:", best_params_random)
print("Best Hyperparameters from Grid Search:", best_params_grid)
print("Best R-squared Score: ", random_grid_search.best_score_)

Best Hyperparameters from Randomized Search: {'tol': 0.0001, 'solver': 'sag', 'penalty': 'l2', 'multi_class': 'ovr', 'max_iter': 100, 'C': 78.47599703514607}
Best Hyperparameters from Grid Search: {'C': 78.47599703514607, 'max_iter': 100, 'multi_class': 'ovr', 'penalty': 'l2', 'solver': 'sag', 'tol': 0.0001}
Best R-squared Score:  0.9553359683794467


In [103]:
# Evaluate on the test set
best_model_grid = random_grid_search.best_estimator_
y_pred= best_model_grid.predict(X_test)

In [104]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [105]:
# lets see the metrics
accuracy = accuracy_score(y_test_encoded, y_pred)
conf_matrix = confusion_matrix(y_test_encoded, y_pred)
classification_report = classification_report(y_test_encoded, y_pred)

In [106]:
accuracy

1.0

In [107]:
conf_matrix

array([[15,  0,  0],
       [ 0, 11,  0],
       [ 0,  0, 12]], dtype=int64)

In [108]:
print(classification_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        11
           2       1.00      1.00      1.00        12

    accuracy                           1.00        38
   macro avg       1.00      1.00      1.00        38
weighted avg       1.00      1.00      1.00        38

