In [3]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
import scipy
from sklearn.decomposition import PCA
from sklearn.linear_model import BayesianRidge
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
import hyperopt
from hyperopt import *
from hyperopt import fmin, tpe, hp, space_eval
import matplotlib.pyplot as plt
%matplotlib inline 


<div class="alert alert-block alert-info">
<b>Loading the data:</b> We load the data from the mentioned path
</div>

In [7]:
path_of_input_file = r'D:\kaggle_trials\AirQualityUCI\AirQualityUCI.xlsx'
df                 = pd.read_excel(path_of_input_file)
df.head(2)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487


<div class="alert alert-block alert-info">
<b>Preprocessing data :</b> We separate out the numerical and categorical columns from the data to be used for scaling and encoding respectively 
</div>

In [9]:
cols_needed           = list(df.columns)
cols_needed           = cols_needed[2:len(cols_needed)-1]

possible_numeric_cols = list(df._get_numeric_data().columns)

categorical_columns   = list(set(cols_needed)- set(possible_numeric_cols))

numerical_columns     = []
for i in range(len(possible_numeric_cols)):
    col_name  = possible_numeric_cols[i]
    if len(df[col_name].unique())<10:
        categorical_columns.append(col_name)
    else:
        numerical_columns.append(col_name)

<div class="alert alert-block alert-info">
<b>Missing value Treatment:</b> We impute the numerical missing values with their respective means and the categorical values with their modes.
</div>

In [11]:
for i in range(len(categorical_columns)):
    df[categorical_columns[i]] = df[categorical_columns[i]].fillna(df[categorical_columns[i]].mode()[0])
mean_impute_dict    ={}
for i in range(len(numerical_columns)):
    mean_impute_dict[numerical_columns[i]] = np.nanmean(np.float_(df[numerical_columns[i]].values))
for i in range(len(numerical_columns)):
    df[numerical_columns[i]]   = df[numerical_columns[i]].fillna(mean_impute_dict[numerical_columns[i]])

<div class="alert alert-block alert-info">
<b>Scaling and Encoding:</b> We scale and one hot encode the data to get the matrix we need for calculations
</div>

In [13]:
ohe                  = OneHotEncoder()
scalar               = MinMaxScaler()
encoded_matrix       = ohe.fit_transform(df[categorical_columns])
scaled_matrix        = scalar.fit_transform(df[numerical_columns])
X_complete_matrix    = scipy.sparse.hstack((encoded_matrix,scaled_matrix)).A
Y                    = scalar.fit_transform(df[['AH']])


<div class="alert alert-block alert-info">
<b>Train Test Split :</b> We split the data to train and test set 
</div>

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_complete_matrix, Y, test_size=0.2, random_state=42)

<div class="alert alert-block alert-info">
<b>Parameter Tuning and setting Grid for parameters:</b> We set up the grid for parameter tuning and then tune the parameters to get the optimal list of parameters to use
</div>

In [15]:
Bayesian_ridge_grid  = {'n_iter' : hp.choice('n_iter',range(300,600)),
                        'alpha_1': hp.uniform('alpha_1',0.0,1.0),
                        'alpha_2': hp.uniform('alpha_2',0.0,1.0),
                        'lambda_1': hp.uniform('lambda_1',0.0,1.0),
                        'lambda_2': hp.uniform('lambda_2',0.0,1.0),
                     }

In [18]:
def hyperopt_train_test(params):
    reg = BayesianRidge(**params)
    return cross_val_score(reg, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, Bayesian_ridge_grid, algo=tpe.suggest, max_evals=100, trials=trials)
best_parameters = space_eval(Bayesian_ridge_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|████████████████████████████████████████████████| 100/100 [00:02<00:00, 34.63it/s, best loss: -0.9999999999995861]
The best parameter tuned on training set is given by :-  {'alpha_1': 0.7822556011911279, 'alpha_2': 0.0014753500365261268, 'lambda_1': 0.12542036284480557, 'lambda_2': 0.8048114623228889, 'n_iter': 333}


<div class="alert alert-block alert-info">
<b>Implementing the model:</b> We now implement the model with tuned parameters and get the R^2 score
</div>

In [21]:
model = BayesianRidge(**best_parameters)
model.fit(X_train, y_train)

BayesianRidge(alpha_1=0.7822556011911279, alpha_2=0.0014753500365261268,
              compute_score=False, copy_X=True, fit_intercept=True,
              lambda_1=0.12542036284480557, lambda_2=0.8048114623228889,
              n_iter=333, normalize=False, tol=0.001, verbose=False)

In [22]:
y_pred = model.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred,y_test))

The coefficient of determination is:-  0.9999999999999205
