In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
import scipy
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
import hyperopt
from hyperopt import *
from hyperopt import fmin, tpe, hp, space_eval
import matplotlib.pyplot as plt
%matplotlib inline 


<div class="alert alert-block alert-info">
<b>Loading the data:</b> We load the data from the mentioned path
</div>

In [21]:
path_of_input_file = r'D:\kaggle_trials\weatherww2\Summary of Weather.csv'
cols2read = ['STA','WindGustSpd','MaxTemp','MinTemp','MeanTemp'
             ,'Snowfall','PoorWeather','YR','MO','DA','DR',
             'SPD','MAX','MIN','MEA']

df                 = pd.read_csv(path_of_input_file,usecols= cols2read)
df.head(2)

Unnamed: 0,STA,WindGustSpd,MaxTemp,MinTemp,MeanTemp,Snowfall,PoorWeather,YR,MO,DA,DR,SPD,MAX,MIN,MEA
0,10001,,25.555556,22.222222,23.888889,0,,42,7,1,,,78.0,72.0,75.0
1,10001,,28.888889,21.666667,25.555556,0,,42,7,2,,,84.0,71.0,78.0


<div class="alert alert-block alert-info">
<b>Preprocessing data :</b> We separate out the numerical and categorical columns from the data to be used for scaling and encoding respectively 
</div>

In [22]:
cols_needed           = list(df.columns)
cols_needed           = cols_needed[:len(cols_needed)-11]
cols_needed.remove('MaxTemp')
possible_numeric_cols = list(df._get_numeric_data().columns)
possible_numeric_cols.remove('MaxTemp')
categorical_columns   = list(set(cols_needed)- set(possible_numeric_cols))

numerical_columns     = []
for i in range(len(possible_numeric_cols)):
    col_name  = possible_numeric_cols[i]
    if len(df[col_name].unique())<10:
        categorical_columns.append(col_name)
    else:
        numerical_columns.append(col_name)

<div class="alert alert-block alert-info">
<b>Missing value Treatment:</b> We impute the numerical missing values with their respective means and the categorical values with their modes.
</div>

In [23]:
for i in range(len(categorical_columns)):
    df[categorical_columns[i]] = df[categorical_columns[i]].fillna(df[categorical_columns[i]].mode()[0])
mean_impute_dict    ={}
for i in range(len(numerical_columns)):
    mean_impute_dict[numerical_columns[i]] = np.nanmean(np.float_(df[numerical_columns[i]].values))
for i in range(len(numerical_columns)):
    df[numerical_columns[i]]   = df[numerical_columns[i]].fillna(mean_impute_dict[numerical_columns[i]])

<div class="alert alert-block alert-info">
<b>Scaling and Encoding:</b> We scale and one hot encode the data to get the matrix we need for calculations
</div>

In [24]:
ohe                  = OneHotEncoder()
scalar               = MinMaxScaler()
encoded_matrix       = ohe.fit_transform(df[categorical_columns])
scaled_matrix        = scalar.fit_transform(df[numerical_columns])
X_complete_matrix    = scipy.sparse.hstack((encoded_matrix,scaled_matrix)).A
Y                    = scalar.fit_transform(df[['MaxTemp']])


<div class="alert alert-block alert-info">
<b>Train Test Split :</b> We split the data to train and test set 
</div>

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_complete_matrix, Y, test_size=0.2, random_state=42)

<div class="alert alert-block alert-info">
<b>Parameter Tuning and setting Grid for parameters:</b> We set up the grid for parameter tuning and then tune the parameters to get the optimal list of parameters to use
</div>

In [27]:
SGDR_grid   = {'loss' : hp.choice('loss',['squared_loss','huber','epsilon_insensitive','squared_epsilon_insensitive']),
               'penalty' : hp.choice('penalty',['l2','l1','elasticnet','none']),
                'alpha': hp.uniform('alpha',0.0,1.0),
                'learning_rate': hp.choice('learning_rate',['constant','optimal','invscaling','adaptive'])
                        
                     }

In [28]:
def hyperopt_train_test(params):
    reg = SGDRegressor(**params)
    return cross_val_score(reg, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, SGDR_grid, algo=tpe.suggest, max_evals=100, trials=trials)
best_parameters = space_eval(SGDR_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|████████████████████████████████████████████████| 100/100 [03:09<00:00,  1.90s/it, best loss: -0.9939970686605907]
The best parameter tuned on training set is given by :-  {'alpha': 0.055081188680586174, 'learning_rate': 'adaptive', 'loss': 'squared_loss', 'penalty': 'none'}


<div class="alert alert-block alert-info">
<b>Implementing the model:</b> We now implement the model with tuned parameters and get the R^2 score
</div>

In [29]:
model = SGDRegressor(**best_parameters)
model.fit(X_train, y_train)

SGDRegressor(alpha=0.055081188680586174, average=False, early_stopping=False,
             epsilon=0.1, eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='adaptive', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='none', power_t=0.25,
             random_state=None, shuffle=True, tol=0.001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [30]:
y_pred = model.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred,y_test))

The coefficient of determination is:-  0.9947724644478078
