In [12]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
import scipy
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression,Lars,ElasticNet,Lasso,Ridge,OrthogonalMatchingPursuit,Lasso
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
import hyperopt
from hyperopt import *
from hyperopt import fmin, tpe, hp, space_eval
import matplotlib.pyplot as plt
%matplotlib inline 


<div class="alert alert-block alert-info">
<b>Loading the data:</b> We load the data from the mentioned path
</div>

In [2]:
path_of_input_file = r'D:\kaggle_trials\house-prices-advanced-regression-techniques\train.csv'
df                 = pd.read_csv(path_of_input_file)
df.head(4)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000


<div class="alert alert-block alert-info">
<b>Categorical and Numerical Columns Identification:</b> We identify categorical and numerical columns from the data. We do set a threshold that if any categorical value is classified as numerical, then it has to be classified back to categorical if the number of distinct values of that column in the dataframe is less than 10 
</div>

In [3]:
cols_needed           = list(df.columns)
cols_needed           = cols_needed[:len(cols_needed)-1]

possible_numeric_cols = list(df._get_numeric_data().columns)
possible_numeric_cols.remove('Id')

categorical_columns   = list(set(cols_needed)- set(possible_numeric_cols))

numerical_columns     = []
for i in range(len(possible_numeric_cols)):
    col_name  = possible_numeric_cols[i]
    if len(df[col_name].unique())<10:
        categorical_columns.append(col_name)
    else:
        numerical_columns.append(col_name)

<div class="alert alert-block alert-info">
<b>Missing Value Treatment:</b> We impute the categorical missing values with their mode and the numerical missing values with their mean
</div>

In [4]:
for i in range(len(categorical_columns)):
    df[categorical_columns[i]] = df[categorical_columns[i]].fillna(df[categorical_columns[i]].mode()[0])
mean_impute_dict    ={}
for i in range(len(numerical_columns)):
    mean_impute_dict[numerical_columns[i]] = np.nanmean(np.float_(df[numerical_columns[i]].values))
for i in range(len(numerical_columns)):
    df[numerical_columns[i]]   = df[numerical_columns[i]].fillna(mean_impute_dict[numerical_columns[i]])

<div class="alert alert-block alert-info">
<b>Encoding and Feature Scaling:</b> We do the one hot encoding of categorical values and scale(by using MinMaxScaler) the numerical values to get the final feature matrix X. Subseqently, we consider the SalePrice column to be our target variable
</div>

In [5]:
ohe                  = OneHotEncoder()
scalar               = MinMaxScaler()
encoded_matrix       = ohe.fit_transform(df[categorical_columns])
scaled_matrix        = scalar.fit_transform(df[numerical_columns])
X_complete_matrix    = scipy.sparse.hstack((encoded_matrix,scaled_matrix)).A
Y                    = scalar.fit_transform(df[['SalePrice']])


<div class="alert alert-block alert-info">
<b>Train Test split:</b> We perform train test split on the data
</div>

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_complete_matrix, Y, test_size=0.33, random_state=42)

<div class="alert alert-block alert-info">
<b>Orthogonal MatchinG Pursuit Model:</b> We fit OMP model on the data to get the results
</div>

In [10]:
orth_reg                = OrthogonalMatchingPursuit()
orth_reg.fit(X_train,y_train)
y_pred_omp              = orth_reg.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred_omp,y_test))

The coefficient of determination is:-  1.0


<div class="alert alert-block alert-info">
<b>LARS:</b> We fit a LARS model on the data to get the results
</div>

In [11]:
lars_reg                = Lars()
lars_reg.fit(X_train,y_train)
y_pred_lars             = lars_reg.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred_lars,y_test))

The coefficient of determination is:-  1.0


<div class="alert alert-block alert-info">
<b>Lasso model:</b> We fit a LASSO model on the data to get the results
</div>

In [19]:
lasso_reg_grid  = {'alpha' : hp.uniform('alpha',0.6,5),
                      'precompute' : hp.choice('precompute',[True,False])
                     }
def hyperopt_train_test(params):
    reg = Lasso(**params)
    return cross_val_score(reg, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, lasso_reg_grid, algo=tpe.suggest, max_evals=30, trials=trials)
best_parameters = space_eval(lasso_reg_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

lasso_reg                = Lasso(**best_parameters)
lasso_reg.fit(X_train,y_train)
y_pred_lasso             = lasso_reg.predict(X_test)
print('The coefficient of determination is:- ',round(float(r2_score(y_pred_lasso,y_test)),3))

100%|████████████████████████████████████████████████| 30/30 [00:04<00:00,  6.45it/s, best loss: 0.0022020148876461163]
The best parameter tuned on training set is given by :-  {'alpha': 2.7498496465651137, 'precompute': True}
The coefficient of determination is:-  0.0


<div class="alert alert-block alert-info">
<b>Ridge Fit:</b> We fit a Ridge regression model on the data to get the results
</div>

In [20]:
ridge_reg_grid  = {'alpha' : hp.uniform('alpha',0.01,5),
                      'solver'          : hp.choice('solver',['auto','svd','cholesky','lsqr','sparse_cg','sag','saga'])
                     }
def hyperopt_train_test(params):
    reg = Ridge(**params)
    return cross_val_score(reg, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, ridge_reg_grid, algo=tpe.suggest, max_evals=30, trials=trials)
best_parameters = space_eval(ridge_reg_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

ridge_reg = Ridge(**best_parameters)
ridge_reg.fit(X_train, y_train)

y_pred = ridge_reg.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred,y_test))

100%|██████████████████████████████████████████████████| 30/30 [00:15<00:00,  6.63it/s, best loss: -0.9024268516838202]
The best parameter tuned on training set is given by :-  {'alpha': 0.06713438499242544, 'solver': 'sparse_cg'}
The coefficient of determination is:-  0.9554844749812081


<div class="alert alert-block alert-info">
<b>Elastic Net Fit:</b> We fit an Elastic NET model on the data to get the results
</div>

In [21]:
elastic_net_grid  = {'alpha' : hp.uniform('alpha',0.01,5),
                      'l1_ratio': hp.uniform('l1_ratio',0.0,1.0),
                     'precompute':hp.choice('precompute',[True,False])
                     }
def hyperopt_train_test(params):
    reg = ElasticNet(**params,random_state=19)
    return cross_val_score(reg, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, elastic_net_grid, algo=tpe.suggest, max_evals=30, trials=trials)
best_parameters = space_eval(elastic_net_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)
model = ElasticNet(**best_parameters,random_state=19)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred,y_test))

100%|████████████████████████████████████████████████| 30/30 [00:03<00:00,  7.53it/s, best loss: 0.0022020148876461163]
The best parameter tuned on training set is given by :-  {'alpha': 2.7594581327744745, 'l1_ratio': 0.5878606959326904, 'precompute': True}
The coefficient of determination is:-  0.0


<div class="alert alert-block alert-info">
<b>Conclusions :</b> We can clearly see that OMP and LARS gave very good R^2 score and Ridge regression was close to 1 as well. 
</div>