In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.ensemble import GradientBoostingRegressor
#from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.linear_model import LassoCV

In [2]:
train = pd.read_csv('data/train_cleaned.csv')
test = pd.read_csv('data/test_cleaned.csv')

In [3]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1.0,0.466502,0.446945,-0.055606,-0.13327,0.064238,0.243025,0.701291,0.306146,0.02618,...,-0.069393,0.063305,0.420422,0.175481,-0.190481,-1.909325,-0.929403,0.294331,0.406244,12.247699
1,2.0,0.785216,0.446945,0.596835,0.113413,0.064238,0.243025,0.701291,0.306146,0.02618,...,-0.069393,0.063305,0.420422,0.175481,-0.190481,-0.279901,0.569137,0.294331,0.406244,12.109016
2,3.0,0.466502,0.446945,0.08601,0.420049,0.064238,0.243025,-1.016637,0.306146,0.02618,...,-0.069393,0.063305,0.420422,0.175481,-0.190481,0.920928,-0.929403,0.294331,0.406244,12.317171
3,4.0,-1.12707,0.446945,-0.306589,0.103317,0.064238,0.243025,-1.016637,0.306146,0.02618,...,-0.069393,0.063305,0.420422,0.175481,-0.190481,-1.909325,-0.180133,0.294331,-2.200862,11.849405
4,5.0,0.466502,0.446945,0.750399,0.878431,0.064238,0.243025,-1.016637,0.306146,0.02618,...,-0.069393,0.063305,0.420422,0.175481,-0.190481,1.537684,-0.929403,0.294331,0.406244,12.42922


In [4]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,0.806893,-3.596122,0.64048,0.561787,0.06426,0.264086,0.703009,0.310969,0.03705,...,2.844271,-0.063967,0.043675,-0.919649,0.176882,-0.188227,0.162759,-1.704618,0.295066,0.393203
1,1462,0.806893,0.463776,0.678094,0.970407,0.06426,0.264086,-1.090156,0.310969,0.03705,...,-0.324992,-0.063967,0.043675,0.434304,-8.722107,7.33323,0.162759,-1.704618,0.295066,0.393203
2,1463,0.477497,0.463776,0.404558,0.908413,0.06426,0.264086,-1.090156,0.310969,0.03705,...,-0.324992,-0.063967,0.043675,-0.919649,0.176882,-0.188227,-1.100777,-1.704618,0.295066,0.393203
3,1464,0.477497,0.463776,0.563839,0.257857,0.06426,0.264086,-1.090156,0.310969,0.03705,...,-0.324992,-0.063967,0.043675,0.434304,0.176882,-0.188227,0.162759,-1.704618,0.295066,0.393203
4,1465,-0.181296,0.463776,-1.230253,-1.116969,0.06426,0.264086,-1.090156,-3.100336,0.03705,...,2.963846,-0.063967,0.043675,0.434304,0.176882,-0.188227,-2.665809,-1.704618,0.295066,0.393203


In [5]:
kf = KFold(n_splits=10, shuffle=True, random_state=0)

# Extracting data from Dataframes to numpy arrays

In [6]:
y = train['SalePrice'].values
x_train = train.drop(columns=['Id','SalePrice']).values
x_test = test.drop(columns='Id').values

# Lasso model

In [7]:
lasso_model = Lasso()
parameters_lasso = {'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}

In [8]:
lasso_reg = GridSearchCV(lasso_model, 
                         parameters_lasso, 
                         cv=kf, 
                         scoring='neg_mean_squared_error',
                         refit=True,
                         n_jobs=-1)
lasso_reg.fit(x_train,y);

In [9]:
print('Lasso best score: ', lasso_reg.best_score_)

Lasso best score:  -0.019741121739801377


In [10]:
print('Lasso best parameters: ', lasso_reg.best_params_)

Lasso best parameters:  {'alpha': 0.01}


In [11]:
lasso_model = lasso_reg.best_estimator_
lasso_predictions = lasso_model.predict(x_test)
lasso_predictions = np.expm1(lasso_predictions)

# XGB Model

In [12]:
xgb_model = GradientBoostingRegressor()
parameters_xgb = {'n_estimators': np.arange(10,x_train.shape[1],20),
              'max_depth': np.arange(2, 5, 1),
              'max_features': ['auto', 'sqrt', 1/3]}

In [13]:
xgb_reg = GridSearchCV(xgb_model, 
                   parameters_xgb, 
                   cv=kf, 
                   scoring='neg_mean_squared_error',
                   refit=True,
                   n_jobs=-1)
xgb_reg.fit(x_train,y);

In [14]:
print('XGB best score: ', xgb_reg.best_score_)

XGB best score:  -0.016882422120667092


In [15]:
print('XGB best parameters: ', xgb_reg.best_params_)

XGB best parameters:  {'max_depth': 4, 'max_features': 'auto', 'n_estimators': 70}


In [16]:
xgb_model = xgb_reg.best_estimator_
xgb_predictions = xgb_model.predict(x_test)
xgb_predictions = np.expm1(xgb_predictions)

# Blending models' predictions and saving results

In [17]:
predictions = (lasso_predictions + xgb_predictions) / 2
predictions

array([118920.79905943, 156125.82479365, 181337.66860331, ...,
       169037.76638142, 119701.84966713, 242036.58909756])

In [18]:
predictions = pd.DataFrame({'Id' : test['Id'], 'SalePrice' : predictions})
predictions.to_csv('data/submission.csv', index=False)