Title: GridSearchCV & RandomizedSearchCV

Task 1: GridSearchCV for Decision Trees<br>
Use GridSearchCV to tune max_depth and min_samples_split in Decision Tree for Iris.

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report

# Load Iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create Decision Tree classifier
dt = DecisionTreeClassifier(random_state=42)

# Define parameter grid for GridSearchCV
param_grid = {
    'max_depth': [2, 3, 4, 5, 6, None],
    'min_samples_split': [2, 5, 10, 15]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters found:", grid_search.best_params_)

# Predict on test data using best estimator
best_dt = grid_search.best_estimator_
y_pred = best_dt.predict(X_test)

# Evaluation
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters found: {'max_depth': 4, 'min_samples_split': 10}
Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        19
  versicolor       1.00      1.00      1.00        13
   virginica       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



Task 2: RandomizedSearchCV for Random Forest<br>
Apply RandomizedSearchCV to optimize hyperparameters of Random Forest for customer churn.

In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import classification_report
from scipy.stats import randint

# --- Simulate sample customer churn dataset ---
# Replace this part with your real dataset loading
np.random.seed(42)
X = np.random.rand(1000, 10)  # 1000 samples, 10 features
y = np.random.choice([0, 1], size=1000, p=[0.7, 0.3])  # Binary churn: 0=no churn, 1=churn

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define Random Forest model
rf = RandomForestClassifier(random_state=42)

# Define hyperparameter space for RandomizedSearchCV
param_dist = {
    'n_estimators': randint(50, 300),
    'max_depth': randint(3, 20),
    'min_samples_split': randint(2, 15),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['auto', 'sqrt', 'log2', None]
}

# Setup RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,  # number of random trials
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Best parameters found
print("Best parameters:", random_search.best_params_)

# Predict and evaluate on test set
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test)
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END max_depth=9, max_features=None, min_samples_leaf=8, min_samples_split=14, n_estimators=70; total time=   0.6s
[CV] END max_depth=9, max_features=None, min_samples_leaf=8, min_samples_split=14, n_estimators=70; total time=   0.5s
[CV] END max_depth=9, max_features=None, min_samples_leaf=8, min_samples_split=14, n_estimators=70; total time=   0.5s
[CV] END max_depth=9, max_features=None, min_samples_leaf=8, min_samples_split=14, n_estimators=70; total time=   0.6s
[CV] END max_depth=9, max_features=None, min_samples_leaf=8, min_samples_split=14, n_estimators=70; total time=   0.4s
[CV] END max_depth=9, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=124; total time=   0.4s
[CV] END max_depth=9, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=124; total time=   0.4s
[CV] END max_depth=9, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=124; total

40 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
 

Best parameters: {'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 9, 'min_samples_split': 6, 'n_estimators': 183}
              precision    recall  f1-score   support

           0       0.68      1.00      0.81       203
           1       0.00      0.00      0.00        97

    accuracy                           0.68       300
   macro avg       0.34      0.50      0.40       300
weighted avg       0.46      0.68      0.55       300



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Task 3: Fine-Tuning SVR with GridSearchCV<br>
Use GridSearchCV to find best parameters for Support Vector Regression on housing data.

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load dataset (California Housing as Boston Housing is deprecated)
data = fetch_california_housing()
X, y = data.data, data.target

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create pipeline: scaling + SVR
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

# Parameter grid for GridSearchCV
param_grid = {
    'svr__kernel': ['linear', 'rbf', 'poly'],
    'svr__C': [0.1, 1, 10, 100],
    'svr__gamma': ['scale', 'auto'],
    'svr__degree': [2, 3, 4]  # only relevant for 'poly' kernel
}

# GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# Fit grid search
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters:", grid_search.best_params_)

# Predict on test set
y_pred = grid_search.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Test MSE: {mse:.4f}")
print(f"Test R^2 score: {r2:.4f}")


Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END svr__C=0.1, svr__degree=2, svr__gamma=scale, svr__kernel=linear; total time=   6.7s
[CV] END svr__C=0.1, svr__degree=2, svr__gamma=scale, svr__kernel=linear; total time=   6.9s
[CV] END svr__C=0.1, svr__degree=2, svr__gamma=scale, svr__kernel=linear; total time=   6.6s
[CV] END svr__C=0.1, svr__degree=2, svr__gamma=scale, svr__kernel=linear; total time=   6.5s
[CV] END svr__C=0.1, svr__degree=2, svr__gamma=scale, svr__kernel=linear; total time=   6.9s
[CV] END svr__C=0.1, svr__degree=2, svr__gamma=scale, svr__kernel=rbf; total time=   8.4s
[CV] END svr__C=0.1, svr__degree=2, svr__gamma=scale, svr__kernel=rbf; total time=   8.0s
[CV] END svr__C=0.1, svr__degree=2, svr__gamma=scale, svr__kernel=rbf; total time=   7.8s
[CV] END svr__C=0.1, svr__degree=2, svr__gamma=scale, svr__kernel=rbf; total time=   7.8s
[CV] END svr__C=0.1, svr__degree=2, svr__gamma=scale, svr__kernel=rbf; total time=   8.1s
[CV] END svr__C=0.1, sv