In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os 
from math import sqrt

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

import xgboost as xgb

%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


# Read data

In [3]:
df = pd.read_csv('train_one_hot.csv')
feature, target = df, df.pop('SalePrice')

In [4]:
feature.head()

Unnamed: 0.1,Unnamed: 0,Id,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,...,"GarageYrBlt_age_class_(-200.314, -168.6]","GarageYrBlt_age_class_(-168.6, -137.2]","GarageYrBlt_age_class_(-137.2, -105.8]","GarageYrBlt_age_class_(-105.8, -74.4]","GarageYrBlt_age_class_(-74.4, -43]","GarageYrBlt_age_class_(-43, -11.6]","GarageYrBlt_age_class_(-11.6, 19.8]","GarageYrBlt_age_class_(19.8, 51.2]","GarageYrBlt_age_class_(51.2, 82.6]","GarageYrBlt_age_class_(82.6, 114]"
0,0,1,4.189655,9.04204,7,5,2003,2003,196.0,4,...,0,0,0,0,0,0,1,0,0,0
1,1,2,4.394449,9.169623,6,8,1976,1976,0.0,3,...,0,0,0,0,0,0,0,1,0,0
2,2,3,4.234107,9.328212,7,5,2001,2002,162.0,4,...,0,0,0,0,0,0,1,0,0,0
3,3,4,4.110874,9.164401,7,5,1915,1970,0.0,3,...,0,0,0,0,0,0,1,0,0,0
4,4,5,4.442651,9.565284,8,5,2000,2000,350.0,4,...,0,0,0,0,0,0,1,0,0,0


In [5]:
feature = feature.drop('Id',axis=1)
feature = feature.drop('Unnamed: 0', axis=1)

In [6]:
feature.shape

(1460, 314)

# 1. Fit Lasso model

In [7]:
ls2 = Lasso(alpha=0.001)
ls2.fit(feature, target)
predictL3 = ls2.predict(feature)

print('Lasso Regression Model:\n')
print('MSE =', sqrt(mean_squared_error(target, predictL3)))

Lasso Regression Model:

MSE = 0.11534953161995387


# 2. Fit Ridge

In [8]:
rg2 = Ridge(alpha=10)
rg2.fit(feature, target)
predictR2 = rg2.predict(feature)

print('Ridge Regression Model:')
print('MSE =', sqrt(mean_squared_error(target, predictR2)))

Ridge Regression Model:
MSE = 0.105979732900822


# 3. Fit ElasticNet

In [9]:
en2 = ElasticNet(alpha=0.01,l1_ratio=0.01)
en2.fit(feature, target)
predictE4 = en2.predict(feature)

print('ElasticNet Regression Model:\n')
print('MSE =', sqrt(mean_squared_error(target, predictE4)))

ElasticNet Regression Model:

MSE = 0.10961564970466742


# Merge output

In [10]:
step1_target = pd.DataFrame({'Lasso':ls2.predict(feature),
                            'Ridge':rg2.predict(feature),
                            'ElasticNet':en2.predict(feature)})

In [11]:
step1_target.shape

(1460, 3)

# Model Ensembling

In [12]:
xgbr = xgb.XGBRegressor(colsample_bytree=0.5,learning_rate=0.1,max_depth=5,min_child_weight=1,subsample=0.6)
xgbr.fit(step1_target, target)
predict_xgb = xgbr.predict(step1_target)

print('XGBoosting Regression Model:\n')
print('MSE =', sqrt(mean_squared_error(target, predict_xgb)))

XGBoosting Regression Model:

MSE = 0.08657098890954962
Training Score: 0.9530


# Predict data

In [13]:
test_data = pd.read_csv('test_one_hot.csv')
test_id = test_data['Id']
test_data = test_data[test_data.columns[2:]]

In [14]:
null_check = test_data.isnull().sum()
null_check[null_check>0]

Series([], dtype: int64)

In [15]:
test_step1_target = pd.DataFrame({'Lasso':ls2.predict(test_data),
                            'Ridge':rg2.predict(test_data),
                            'ElasticNet':en2.predict(test_data)})

In [16]:
predict_price = xgbr.predict(test_step1_target)
predict_price = np.expm1(predict_price)

In [17]:
submit = pd.DataFrame({'Id':test_id,'SalePrice':predict_price})
submit.to_csv('submit.csv', index=False)