<a href="https://colab.research.google.com/github/Fliptoss/ML_stuff/blob/main/GridSearch_and_RandomizedSearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [4]:
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score
)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from scipy.stats import uniform, loguniform
import time
from sklearn.datasets import make_classification

In [7]:
# Build a synthetic classification dataset: 1000 samples with 20 features,
# of which 15 are informative and 5 are redundant; seeded for repeatability.
X, y = make_classification(
    n_samples=1000, n_features=20,
    n_informative=15, n_redundant=5,  # 15 + 5 accounts for all 20 features
    n_clusters_per_class=1,
    random_state=42,
)

In [9]:
# Hold out 20% of the rows as a test set; stratify on y so both splits keep
# the same class proportions. The seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [10]:
# Display the generated feature matrix as the cell output.
X

array([[ 4.30608428,  6.58096869,  1.77636995, ..., -3.24487545,
         1.36578601,  4.03867557],
       [ 0.45429637, -1.01040476,  2.83570949, ...,  2.31966612,
         0.91989385, -2.43404866],
       [ 1.6020016 ,  4.75816061, -0.218513  , ...,  5.15717427,
         2.93540845,  3.45559073],
       ...,
       [-3.09273872, -1.92753842,  1.01951391, ..., -5.25415056,
        -6.02040514, -0.39299206],
       [ 0.69607833, -1.34298216, -2.01546885, ..., -0.16622791,
        -2.46167483,  4.07868872],
       [ 1.11039027,  0.17517092, -0.67808558, ...,  0.66990967,
         0.35022463,  3.93925633]])

In [11]:
# Display the generated class labels (0/1) as the cell output.
y

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0,

In [24]:
# Two-step model: standardize the features, then fit logistic regression.
# The step names ('scaler', 'logistic') are the prefixes used by the
# hyper-parameter keys further down, e.g. 'logistic__C'.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(max_iter=1000, random_state=42)),
])

In [31]:
# Naive single grid over every C / penalty / solver / l1_ratio combination.
#
# Keys must follow the '<step>__<param>' convention (double underscore) so the
# search can route each value to the 'logistic' step of the pipeline; the
# single-underscore spelling is rejected as an invalid parameter name.
#
# NOTE: this grid still crosses settings that cannot be fitted together
# (elasticnet is only supported by the saga solver) and sweeps l1_ratio even
# for l1/l2 penalties where it has no effect. Splitting it into one grid per
# penalty family avoids those wasted/failed candidates.
param_grid = {
    'logistic__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'logistic__penalty': ['l1', 'l2', 'elasticnet'],
    'logistic__solver': ['liblinear', 'saga'],
    'logistic__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]  ## only used with elasticnet
}

In [32]:
# l1_ratio only matters for the elasticnet penalty, so the search space is
# split into two grids: one for plain l1/l2 and one for elasticnet.
grid_l1_l2 = dict(
    logistic__C=[0.001, 0.01, 0.1, 1, 10, 100],
    logistic__penalty=['l1', 'l2'],
    logistic__solver=['liblinear', 'saga'],
)

grid_elasticnet = dict(
    logistic__C=[0.001, 0.01, 0.1, 1, 10, 100],
    logistic__penalty=['elasticnet'],
    logistic__solver=['saga'],  # saga is the only solver that supports elasticnet
    logistic__l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
)

In [33]:
# Collect both grids (l1/l2 and elasticnet) into a single list.
# NOTE(review): the GridSearchCV cell that follows is given grid_l1_l2 only,
# so this list - and therefore the elasticnet grid - is not used by any of the
# visible cells. Confirm whether the search was meant to receive param_grids.
param_grids = [grid_l1_l2, grid_elasticnet]

In [34]:
# Record the start time, then configure an exhaustive search over the l1/l2
# grid: 5-fold cross-validation, accuracy scoring, all available cores.
start_time = time.time()
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=grid_l1_l2,  # the l1/l2 grid only
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

In [35]:
# Start the timer in the same cell as the fit so that grid_time measures only
# the search itself - not object construction or the idle time between running
# two separate cells - and stays correct when this cell is re-run on its own.
# perf_counter() is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
grid_search.fit(X_train, y_train)
grid_time = time.perf_counter() - start_time

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [36]:
# Report the grid search outcome: elapsed time, the winning parameter
# combination, and its cross-validation score.
print(
    f"Grid Search completed in {grid_time:.2f} seconds",
    f"Best parameters: {grid_search.best_params_}",
    f"Best cross-validation score: {grid_search.best_score_:.4f}",
    sep="\n",
)

Grid Search completed in 6.74 seconds
Best parameters: {'logistic__C': 0.1, 'logistic__penalty': 'l2', 'logistic__solver': 'liblinear'}
Best cross-validation score: 0.9750


### RandomizedSearchCV

In [37]:
# Search space for the randomized search: C is sampled from a log-uniform
# distribution between 0.001 and 100, while penalty and solver are drawn from
# the listed choices.
param_distributions = dict(
    logistic__C=loguniform(0.001, 100.0),
    logistic__penalty=['l1', 'l2'],
    logistic__solver=['liblinear', 'saga'],
)

In [38]:
# Record the start time, then configure a randomized search that samples 50
# parameter settings from param_distributions: 5-fold cross-validation,
# accuracy scoring, all available cores, seeded for repeatable sampling.
start_time = time.time()
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=50,  # how many parameter settings are sampled
    scoring='accuracy',
    cv=5,
    random_state=42,
    verbose=1,
    n_jobs=-1,
)



In [39]:
# Start the timer in the same cell as the fit so that random_time measures only
# the search itself - not object construction or the idle time between running
# two separate cells - and stays correct when this cell is re-run on its own.
# perf_counter() is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
random_search.fit(X_train, y_train)
random_time = time.perf_counter() - start_time

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [40]:
# Report the randomized search outcome: elapsed time, the winning parameter
# combination, and its cross-validation score.
print(
    f"Randomized Search completed in {random_time:.2f} seconds",
    f"Best parameters: {random_search.best_params_}",
    f"Best cross-validation score: {random_search.best_score_:.4f}",
    sep="\n",
)

Randomized Search completed in 20.77 seconds
Best parameters: {'logistic__C': np.float64(0.4108318894699929), 'logistic__penalty': 'l1', 'logistic__solver': 'saga'}
Best cross-validation score: 0.9750


In [41]:
# Show the wall-clock time of both searches, one per line.
print(
    f"Grid Search time: {grid_time:.2f} seconds",
    f"Randomized Search time: {random_time:.2f} seconds",
    sep="\n",
)

Grid Search time: 6.74 seconds
Randomized Search time: 20.77 seconds


In [42]:
# Relative wall-clock difference of the randomized search versus the grid
# search, as a percentage of the grid search time. A negative value means the
# randomized search was slower, so report that explicitly instead of printing
# a negative "time saved".
# NOTE: the two searches do not evaluate the same number of candidates
# (24 grid points vs. n_iter=50 samples here), so this is not a like-for-like
# speed comparison.
time_delta_pct = (grid_time - random_time) / grid_time * 100
if time_delta_pct >= 0:
    print(f"Time saved with Randomized Search: {time_delta_pct:.1f}%")
else:
    print(f"Randomized Search took {-time_delta_pct:.1f}% longer than Grid Search")

Time saved with Randomized Search: -208.2%


### Test set performance

In [43]:
# Predict the held-out test set with each fitted search object
# (grid search first, then randomized search).
grid_pred, random_pred = (
    grid_search.predict(X_test),
    random_search.predict(X_test),
)

In [44]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of each search's predictions against the true labels.
grid_accuracy = accuracy_score(y_true=y_test, y_pred=grid_pred)
random_accuracy = accuracy_score(y_true=y_test, y_pred=random_pred)

In [45]:
# Print the test-set accuracy of both searches under a heading
# (the leading newline leaves a blank line before the heading).
print(
    "\nTest Set Performance:",
    f"Grid Search Accuracy: {grid_accuracy:.4f}",
    f"Randomized Search Accuracy: {random_accuracy:.4f}",
    sep="\n",
)


Test Set Performance:
Grid Search Accuracy: 0.9900
Randomized Search Accuracy: 0.9850
