In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.ensemble import GradientBoostingRegressor
#from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.linear_model import LassoCV

In [2]:
train = pd.read_csv('data/train_cleaned.csv')
test = pd.read_csv('data/test_cleaned.csv')

In [3]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1.0,0.471285,0.446945,-0.078896,-0.13327,0.064238,0.244717,0.701291,0.311112,0.02618,...,-0.069393,0.064907,0.420422,0.171671,-0.190481,-1.909325,-0.928852,0.292103,0.406244,12.247699
1,2.0,0.785189,0.446945,0.572719,0.113413,0.064238,0.244717,0.701291,0.311112,0.02618,...,-0.069393,0.064907,0.420422,0.171671,-0.190481,-0.279901,1.328846,0.292103,0.406244,12.109016
2,3.0,0.471285,0.446945,0.062541,0.420049,0.064238,0.244717,-1.016637,0.311112,0.02618,...,-0.069393,0.064907,0.420422,0.171671,-0.190481,0.920928,-0.928852,0.292103,0.406244,12.317171
3,4.0,-0.470425,0.446945,-0.329561,0.103317,0.064238,0.244717,-1.016637,0.311112,0.02618,...,-0.069393,0.064907,0.420422,0.171671,-0.190481,-1.909325,-0.176286,0.292103,-2.200862,11.849405
4,5.0,0.471285,0.446945,0.726089,0.878431,0.064238,0.244717,-1.016637,0.311112,0.02618,...,-0.069393,0.064907,0.420422,0.171671,-0.190481,1.537684,-0.928852,0.292103,0.406244,12.42922


In [4]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,0.502726,-3.596122,0.623823,0.561787,0.06426,0.269555,0.703009,0.305442,0.03705,...,2.844271,-0.063967,0.042791,-0.919649,0.167895,-0.188227,0.162759,-1.687806,0.296667,0.393203
1,1462,0.502726,0.463776,0.66141,0.970407,0.06426,0.269555,-1.090156,0.305442,0.03705,...,-0.324992,-0.063967,0.042791,0.434304,-7.997418,7.33323,0.162759,-1.687806,0.296667,0.393203
2,1463,0.292017,0.463776,0.388069,0.908413,0.06426,0.269555,-1.090156,0.305442,0.03705,...,-0.324992,-0.063967,0.042791,-0.919649,0.167895,-0.188227,-1.100777,-1.687806,0.296667,0.393203
3,1464,0.292017,0.463776,0.547237,0.257857,0.06426,0.269555,-1.090156,0.305442,0.03705,...,-0.324992,-0.063967,0.042791,0.434304,0.167895,-0.188227,0.162759,-1.687806,0.296667,0.393203
4,1465,-0.1294,0.463776,-1.245577,-1.116969,0.06426,0.269555,-1.090156,-1.477119,0.03705,...,2.963846,-0.063967,0.042791,0.434304,0.167895,-0.188227,-2.665809,-1.687806,0.296667,0.393203


In [5]:
kf = KFold(n_splits=10, shuffle=True, random_state=0)

# Extracting data from Dataframes to numpy arrays

In [6]:
y = train['SalePrice'].values
x_train = train.drop(columns=['Id','SalePrice']).values
x_test = test.drop(columns='Id').values

# Lasso model

In [7]:
lasso_model = Lasso()
parameters_lasso = {'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}

In [8]:
lasso_reg = GridSearchCV(lasso_model, 
                         parameters_lasso, 
                         cv=kf, 
                         scoring='neg_mean_squared_error',
                         refit=True,
                         n_jobs=-1)
lasso_reg.fit(x_train,y);

In [9]:
print('Lasso best score: ', lasso_reg.best_score_)

Lasso best score:  -0.019695435092210604


In [10]:
print('Lasso best parameters: ', lasso_reg.best_params_)

Lasso best parameters:  {'alpha': 0.001}


In [11]:
lasso_model = lasso_reg.best_estimator_
lasso_predictions = lasso_model.predict(x_test)
lasso_predictions = np.expm1(lasso_predictions)

# XGB Model

In [12]:
xgb_model = GradientBoostingRegressor()
parameters_xgb = {'n_estimators': np.arange(10,x_train.shape[1],20),
              'max_depth': np.arange(2, 5, 1),
              'max_features': ['auto', 'sqrt', 1/3]}

In [13]:
xgb_reg = GridSearchCV(xgb_model, 
                   parameters_xgb, 
                   cv=kf, 
                   scoring='neg_mean_squared_error',
                   refit=True,
                   n_jobs=-1)
xgb_reg.fit(x_train,y);

In [14]:
print('XGB best score: ', xgb_reg.best_score_)

XGB best score:  -0.01748680547822181


In [15]:
print('XGB best parameters: ', xgb_reg.best_params_)

XGB best parameters:  {'max_depth': 4, 'max_features': 'auto', 'n_estimators': 70}


In [16]:
xgb_model = xgb_reg.best_estimator_
xgb_predictions = xgb_model.predict(x_test)
xgb_predictions = np.expm1(xgb_predictions)

# Blending models' predictions and saving results

In [17]:
predictions = (lasso_predictions + xgb_predictions) / 2
predictions

array([118430.26883592, 149850.56854487, 184761.58575159, ...,
       169537.25902658, 121408.01549731, 242989.7595048 ])

In [18]:
predictions = pd.DataFrame({'Id' : test['Id'], 'SalePrice' : predictions})
predictions.to_csv('data/submission.csv', index=False)