In [3]:
import pandas as pd

In [4]:
cars= pd.read_csv("auto_mpg_preprocessed.csv")

In [5]:
cars.head(2)

Unnamed: 0,symboling,wheel_base,length,width,height,curb_weight,num_of_cylinders,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,88.6,168.8,64.1,48.8,2548,4,130,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,88.6,168.8,64.1,48.8,2548,4,130,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0


In [6]:
X= cars.iloc[:, :-1]
y= cars.iloc[:, -1]

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.25, random_state=42)

In [9]:
len(X_train), len(y_train)

(153, 153)

In [10]:
len(X_test), len(y_test)

(52, 52)

# Hyper Param Tuning using GridSearchCV

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge

In [12]:
# create a pipeline with Scaling and LinearReg model, so that i can call pipeline together.
model = Pipeline([
    ("scaler", StandardScaler()),
    ("regression", Ridge())
])

In [13]:
# Define the param grid for Standard scaler and Ridge Reg model
param_grid = {
    "regression__alpha": [0.1, 1.0, 10.0], # regularization strength
    "scaler__with_mean": [True],  # always center the data
    "scaler__with_std": [True],   # always scale the data, i,e both combine standardize the data
    "regression__fit_intercept": [True, False],
}

Note: 

    1. Here I wanted standard scaling happen all the time so thats why i have passed True for both with_mean and with_std.
       I have used the Ridge as we have hyper param alpha in it. You can use Simple Linear Reg or Lasso as well.
    
    2. We can use normalize instead of StandardScaler(), in this case you dont need pipeline as in the linear regression 
       itself has the param normalize. You just have to pass "normalize: [True, Pass] in param_grid.
       
    3. I have given double underscore as pram_grid expects such notation. ex: scaler__with_mean. when you didnt define 
       pipeline  you can simply give "fit_intercept": [Tru, False] and "noramlize": [Tru, False]
       
    4. GridSearchCV will try all the combinations like regression_alpha: 0.1, scaler__with_mean: True, Scaler__with_std: 
       False regression__fit_ointercept: True ....etc its will all the combinations.  Thats why i just kept True for both 
       mean and std as I wanted stadardscaler should happen all the time. 
     

In [14]:
# Initialize GridSearchCV with R-squared as the scoring metric and definied param_grid and crossvalidation=5
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2', verbose=1, return_train_score=True)

Note: I have passed scoring metric = r2. You can give  "neg_mean_squared_error" also. As GridSearchCV maximises the 
    scoring metric. So if we want a good score we must reduce the error so we must pass that negative term to loss function

In [15]:
# fit the train data to grid_Search
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('regression', Ridge())]),
             param_grid={'regression__alpha': [0.1, 1.0, 10.0],
                         'regression__fit_intercept': [True, False],
                         'scaler__with_mean': [True],
                         'scaler__with_std': [True]},
             return_train_score=True, scoring='r2', verbose=1)

In [16]:
# Print the best parameters and corresponding R-squared score
print("Best Parameters: ", grid_search.best_params_)
print("Best R-squared Score: ", grid_search.best_score_)

Best Parameters:  {'regression__alpha': 10.0, 'regression__fit_intercept': True, 'scaler__with_mean': True, 'scaler__with_std': True}
Best R-squared Score:  0.7120270930412264


In [17]:
# Evaluate on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [18]:
from sklearn.metrics import r2_score, mean_squared_error

In [19]:
r2 = r2_score(y_test, y_pred)

In [20]:
r2

0.7709018468949935

Note: The R-squared score (0.68) obtained during cross-validation gives an estimate of how well the model performs on average across different validation sets from the training data.
The R-squared score (0.79) on the test set gives an indication of how well the model generalizes to new, unseen data.

In [21]:
#grid_search.cv_results_

# Hyper Param Tuning using RandomizedSearchCV

In [22]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

In [23]:
# Define the param grid for Standard scaler and Ridge Reg model
param_dist = {
    "regression__alpha": np.logspace(-3, 3, 100),  # Randomly sample from a logarithmic scale.
    "scaler__with_mean": [True],  # always center the data.
    "scaler__with_std": [True],   # always scale the data, i,e both combine standardize the data.
    "regression__fit_intercept": [True, False],
}

Note: 
    
      1. In GridSearchCV exhaustively searches through all possible combinations of hyperparameter values provided in 
         the param_grid. Where as RandomizedSearchCV has n_iter and it will all the given number combinations in the n_iter.
        
      2. In grid search, you manually specify a finite set of values to try. However, when using randomized search,
         you can't try every possible value. By using np.logspace(-3, 3, 100), you are essentially saying 
        "try values from 10^(-3) to 10^(3)" with 100 samples in between. This allows you to cover a broad range of potential
         values without explicitly specifying each one.
        

In [24]:
# Initialize RandomizedSearchCV with neg_mean_squared_error as the scoring metric.
# I have used neg_mean_squared_error instead r2. You can use r2 if you want.
random_search = RandomizedSearchCV(model, param_distributions= param_dist, n_iter= 10, cv= 5, 
                                   scoring= "neg_mean_squared_error", verbose= 1, random_state=42)


In [25]:
# fit the train data to random_search
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('regression', Ridge())]),
                   param_distributions={'regression__alpha': array([1.00000000e-03, 1.14975700e-03, 1.32194115e-03, 1.51991108e-03,
       1.74752840e-03, 2.00923300e-03, 2.31012970e-03, 2.65608778e-03,
       3.05385551e-03, 3.51119173e-03, 4.03701726e-03, 4.64158883e-03,
       5.33669923e-03, 6.13...
       1.23284674e+02, 1.41747416e+02, 1.62975083e+02, 1.87381742e+02,
       2.15443469e+02, 2.47707636e+02, 2.84803587e+02, 3.27454916e+02,
       3.76493581e+02, 4.32876128e+02, 4.97702356e+02, 5.72236766e+02,
       6.57933225e+02, 7.56463328e+02, 8.69749003e+02, 1.00000000e+03]),
                                        'regression__fit_intercept': [True,
                                                                      False],
                                        'scaler__with_mean': [True],
              

In [26]:
# Print the best parameters and corresponding R-squared score
print("Best Parameters: ", random_search.best_params_)
print("Best negative mean squared error: ", random_search.best_score_)

Best Parameters:  {'scaler__with_std': True, 'scaler__with_mean': True, 'regression__fit_intercept': True, 'regression__alpha': 61.35907273413176}
Best negative mean squared error:  -17206989.057663754


In [27]:
# Evaluate on the test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

In [28]:
r2_random_cv = r2_score(y_test, y_pred)

In [29]:
r2_random_cv

0.7395536735631596

# Combination of RandomizedSearchCV and GridSearchCV

In [30]:
from scipy.stats import uniform

In [31]:
# Define the param grid for random_search defining Standard scaler and Ridge Reg params.
param_dist_random = {
    "regression__alpha": uniform(0.1, 10),  # uniform dist for alpha, i,e take alpha from uniform dist.
    "scaler__with_mean": [True],  # always center the data.
    "scaler__with_std": [True],   # always scale the data, i,e both combine standardize the data.
    "regression__fit_intercept": [True, False],
}

In [32]:
# define model
model = Pipeline([
    ("scaler", StandardScaler()),
    ("regression", Ridge())
])

In [33]:
# Perform Randomized Search
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist_random,
    n_iter=10,  # Number of random samples
    cv=5,       # Cross-validation folds
    scoring="r2",
    random_state=42
)

In [34]:
# Fit the random_search model to the train data
random_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('regression', Ridge())]),
                   param_distributions={'regression__alpha': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000002B4A707E880>,
                                        'regression__fit_intercept': [True,
                                                                      False],
                                        'scaler__with_mean': [True],
                                        'scaler__with_std': [True]},
                   random_state=42, scoring='r2')

In [35]:
# Get the best hyperparameters from Randomized Search
best_params_random = random_search.best_params_

In [36]:
best_params_random

{'regression__alpha': 6.086584841970366,
 'regression__fit_intercept': True,
 'scaler__with_mean': True,
 'scaler__with_std': True}

In [37]:
# Define hyperparameter search space for Grid Search around the best parameters from Randomized Search
param_grid_grid = {
    "regression__alpha": [best_params_random["regression__alpha"] - 1.0, 
                             best_params_random["regression__alpha"], best_params_random["regression__alpha"] + 1.0],
    "scaler__with_mean": [best_params_random["scaler__with_mean"]],# Anyhow i kept mean and std constant as True i can take those values from random_search as it is.
    "scaler__with_std": [best_params_random["scaler__with_std"]],
    "regression__fit_intercept": [True, False],  # since its not a constant like only True or False, so i am not taking from random_search 
}


In [38]:
# Perform Grid Search around the best parameters from Randomized Search
random_grid_search = GridSearchCV(
                        model, 
                        param_grid=param_grid_grid,
                        cv=5,       # Cross-validation folds
                        scoring="r2",
                        verbose=1
                        )

In [39]:
# Fit the grid_Search model to the train data
random_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('regression', Ridge())]),
             param_grid={'regression__alpha': [5.086584841970366,
                                               6.086584841970366,
                                               7.086584841970366],
                         'regression__fit_intercept': [True, False],
                         'scaler__with_mean': [True],
                         'scaler__with_std': [True]},
             scoring='r2', verbose=1)

In [40]:
# Get the best hyperparameters from Grid Search
best_params_grid = random_grid_search.best_params_

In [41]:
best_params_grid

{'regression__alpha': 7.086584841970366,
 'regression__fit_intercept': True,
 'scaler__with_mean': True,
 'scaler__with_std': True}

In [42]:
# Compare the results
print("Best Hyperparameters from Randomized Search:", best_params_random)
print("Best Hyperparameters from Grid Search:", best_params_grid)
print("Best R-squared Score: ", random_grid_search.best_score_)

Best Hyperparameters from Randomized Search: {'regression__alpha': 6.086584841970366, 'regression__fit_intercept': True, 'scaler__with_mean': True, 'scaler__with_std': True}
Best Hyperparameters from Grid Search: {'regression__alpha': 7.086584841970366, 'regression__fit_intercept': True, 'scaler__with_mean': True, 'scaler__with_std': True}
Best R-squared Score:  0.7116310175135241


In [43]:
# Evaluate on the test set
best_model_grid = grid_search.best_estimator_
y_pred= best_model_grid.predict(X_test)

In [44]:
r2_random_grid_cv = r2_score(y_test, y_pred)

In [45]:
r2_random_grid_cv

0.7709018468949935

# Bayesian Optimization for Hyper Param Tuning

In [46]:
from skopt import BayesSearchCV


In [47]:
# Define the parameter search space for Bayesian Optimization
param_space_bayes = {
    "regression__alpha": np.logspace(-3, 3, 100),  # Randomly sample from a logarithmic scale.
    "scaler__with_mean": [True],  # always center the data.
    "scaler__with_std": [True],   # always scale the data, i,e both combine standardize the data.
    "regression__fit_intercept": [True, False],
    
}

In [48]:
# Initialize BayesSearchCV
bayes_search = BayesSearchCV(model, param_space_bayes, n_iter=25, cv=5, scoring="r2", verbose=1)


In [49]:
# Fit the bayes model for train data
bayes_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


BayesSearchCV(cv=5,
              estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                        ('regression', Ridge())]),
              n_iter=25, scoring='r2',
              search_spaces={'regression__alpha': array([1.00000000e-03, 1.14975700e-03, 1.32194115e-03, 1.51991108e-03,
       1.74752840e-03, 2.00923300e-03, 2.31012970e-03, 2.65608778e-03,
       3.05385551e-03, 3.51119173e-03, 4.03701726e-03, 4.64158883e-03,
       5.33669...
       7.05480231e+01, 8.11130831e+01, 9.32603347e+01, 1.07226722e+02,
       1.23284674e+02, 1.41747416e+02, 1.62975083e+02, 1.87381742e+02,
       2.15443469e+02, 2.47707636e+02, 2.84803587e+02, 3.27454916e+02,
       3.76493581e+02, 4.32876128e+02, 4.97702356e+02, 5.72236766e+02,
       6.57933225e+02, 7.56463328e+02, 8.69749003e+02, 1.00000000e+03]),
                             'regression__fit_intercept': [True, False],
                             'scaler__with_mean': [True],
                             'scale

In [50]:
# Print the best parameters and corresponding R-squared score
print("Best Parameters: ", bayes_search.best_params_)
print("Best R-squared Score: ", bayes_search.best_score_)

Best Parameters:  OrderedDict([('regression__alpha', 40.37017258596558), ('regression__fit_intercept', True), ('scaler__with_mean', True), ('scaler__with_std', True)])
Best R-squared Score:  0.7154874558608962


In [51]:
# Evaluate on the test set
best_model = bayes_search.best_estimator_
y_pred = best_model.predict(X_test)

In [52]:
r2_bayes = r2_score(y_test, y_pred)

In [53]:
r2_bayes

0.7470632755710174