In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
warnings.filterwarnings('ignore')
%matplotlib inline

In [54]:
# Read Data/train.csv (path relative to the notebook's working directory)
# into the `data` frame that every following cell works on.
train_csv_path = 'Data/train.csv'
data = pd.read_csv(train_csv_path)

In [55]:
# Per-column missing-value report: absolute count ('Total') and the fraction
# of rows that are null ('Percent'), both sorted from most to least missing.
null_mask = data.isnull()
null_counts = null_mask.sum()

total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)

missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
PoolQC,1453,0.995205
MiscFeature,1406,0.963014
Alley,1369,0.937671
Fence,1179,0.807534
FireplaceQu,690,0.472603
LotFrontage,259,0.177397
GarageCond,81,0.055479
GarageType,81,0.055479
GarageYrBlt,81,0.055479
GarageFinish,81,0.055479


In [56]:
# Drop every column that has more than one missing value. The axis is given
# by keyword (`columns=`): the positional form `drop(labels, 1)` is deprecated
# in newer pandas releases and is no longer accepted by pandas 2.x.
sparse_columns = missing_data[missing_data['Total'] > 1].index
data = data.drop(columns=sparse_columns)

# Remove the rows whose 'Electrical' value is missing instead of dropping
# the whole column.
data = data.dropna(subset=['Electrical'])

# Sanity check: the largest per-column null count left in the frame
# (displayed as the cell output; 0 means nothing is missing any more).
data.isnull().sum().max()

0

In [57]:
# Rows removed from the training data, identified by their 'Id'.
# NOTE(review): presumably these are the two rows with the largest GrLivArea;
# to inspect the candidates, run
#     data.sort_values(by='GrLivArea', ascending=False)[:2]
# as the last expression of its own cell (anywhere else its result is
# silently discarded).
OUTLIER_IDS = [1299, 524]

# One drop for all listed Ids instead of one copy-pasted drop per Id.
outlier_rows = data.index[data['Id'].isin(OUTLIER_IDS)]
data = data.drop(index=outlier_rows)

In [58]:
# Replace the target with its natural log; predictions are mapped back with
# np.exp when the submission is built.
# NOTE: this cell is not idempotent - re-running it applies the log again.
log_sale_price = np.log(data['SalePrice'])
data['SalePrice'] = log_sale_price

In [59]:
# Replace GrLivArea with its natural log (the same transform is applied to
# the test data before prediction).
# NOTE: this cell is not idempotent - re-running it applies the log again.
log_living_area = np.log(data['GrLivArea'])
data['GrLivArea'] = log_living_area

In [60]:
# Flag rows with a basement (TotalBsmtSF > 0) and log-transform the basement
# area of those rows only; rows without a basement keep their value of 0.
has_bsmt = data['TotalBsmtSF'] > 0
data['HasBsmt'] = has_bsmt.astype('int64')

# Cast to float up front so the log values are not written into an integer
# column, and mask the non-basement rows before taking the log so that
# log(0) = -inf is never evaluated.
bsmt_area = data['TotalBsmtSF'].astype('float64')
data['TotalBsmtSF'] = bsmt_area.where(~has_bsmt, np.log(bsmt_area.where(has_bsmt)))

In [61]:
# One-hot encode the remaining non-numeric columns of the frame.
data = pd.get_dummies(data=data)

In [62]:
# Modelling frame: the target followed by the nine hand-picked features.
target_col = 'SalePrice'
feature_cols = [
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'TotalBsmtSF',
    'FullBath',
    'YearBuilt',
    'YearRemodAdd',
    'Fireplaces',
    'BsmtFinSF1',
]
train = data[[target_col] + feature_cols]

In [63]:
# Preview the first five rows of the modelling frame.
train.head(n=5)

Unnamed: 0,SalePrice,OverallQual,GrLivArea,GarageCars,TotalBsmtSF,FullBath,YearBuilt,YearRemodAdd,Fireplaces,BsmtFinSF1
0,12.247694,7,7.444249,2,6.75227,2,2003,2003,0,706
1,12.109011,6,7.140453,2,7.140453,2,1976,1976,1,978
2,12.317167,7,7.487734,2,6.824374,2,2001,2002,1,486
3,11.849398,7,7.448334,3,6.628041,1,1915,1970,1,216
4,12.429216,8,7.695303,3,7.04316,2,2000,2000,1,655


In [64]:
# Fixed seed so the train / hold-out split (and every number derived from it)
# is reproducible across kernel restarts.
SEED = 0

# Target and feature matrix; object-typed columns are excluded from X.
y = train['SalePrice']
X = train.drop(columns=['SalePrice']).select_dtypes(exclude=['object'])

# `.values` replaces `DataFrame.as_matrix()`, which has been removed from
# pandas; it is also what the later cells of this notebook already use.
train_X, test_X, train_y, test_y = train_test_split(
    X.values, y.values, test_size=0.25, random_state=SEED)

# Fit the imputer on the training part only, then apply the fitted
# statistics to the hold-out part.
# NOTE(review): newer scikit-learn releases provide this as
# sklearn.impute.SimpleImputer - confirm against the installed version.
my_imputer = Imputer()
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)

In [65]:
# Gradient-boosted regressor with up to 1000 trees; training stops early once
# the evaluation-set score has not improved for 5 consecutive rounds.
# NOTE(review): the early-stopping set is the same hold-out split that is
# scored afterwards, so that score is likely optimistic - consider a separate
# validation split.
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
early_stopping_set = [(test_X, test_y)]
my_model.fit(
    train_X,
    train_y,
    eval_set=early_stopping_set,
    early_stopping_rounds=5,
    verbose=False,
)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [66]:
# Predict the (log-scale) target for the hold-out rows.
holdout_features = test_X
predictions = my_model.predict(holdout_features)

In [67]:
# Hold-out mean squared error, measured on the log-transformed SalePrice.
# scikit-learn metrics take (y_true, y_pred) in that order; MSE itself is
# symmetric, so the value is unchanged, but the documented order keeps the
# call correct if it is ever swapped for an asymmetric metric.
mean_squared_error(test_y, predictions)

0.015858926748498457

In [68]:
# Read Data/test.csv (path relative to the notebook's working directory);
# these are the rows the submission is built for.
test_csv_path = 'Data/test.csv'
test_data = pd.read_csv(test_csv_path)

In [69]:
# Apply the same feature transforms that were applied to the training data.
test_data['GrLivArea'] = np.log(test_data['GrLivArea'])

# Flag rows with a basement (TotalBsmtSF > 0) and log-transform the basement
# area of those rows only; rows with 0 or a missing value are left as they are.
test_has_bsmt = test_data['TotalBsmtSF'] > 0
test_data['HasBsmt'] = test_has_bsmt.astype('int64')

# Mask the non-basement rows before taking the log so that log(0) = -inf is
# never evaluated, and work on a float copy so dtypes are not mixed on write.
test_bsmt_area = test_data['TotalBsmtSF'].astype('float64')
test_data['TotalBsmtSF'] = test_bsmt_area.where(
    ~test_has_bsmt, np.log(test_bsmt_area.where(test_has_bsmt)))

# Same nine features, in the same order, as the training feature matrix.
test = test_data[['OverallQual','GrLivArea','GarageCars','TotalBsmtSF','FullBath','YearBuilt','YearRemodAdd','Fireplaces','BsmtFinSF1']]

In [70]:
# Fill the remaining gaps in the test features with 0, then show the
# per-column null counts to confirm nothing is missing any more.
# NOTE(review): the training features go through `my_imputer` while the test
# features are zero-filled - confirm this difference is intended.
test = test.fillna(value=0)
total = test.isnull().sum().sort_values(ascending=False)
missing_data = total.to_frame(name='Total')
missing_data

Unnamed: 0,Total
BsmtFinSF1,0
Fireplaces,0
YearRemodAdd,0
YearBuilt,0
FullBath,0
TotalBsmtSF,0
GarageCars,0
GrLivArea,0
OverallQual,0


In [71]:
# Convert the test features to a plain NumPy array (note that `test` stops
# being a DataFrame here) and predict the log-scale SalePrice.
test = np.asarray(test)
pred = my_model.predict(test)

In [72]:
# Ids of the test rows as a NumPy array, in the same row order as `pred`.
i_d = test_data['Id'].values

In [73]:
# Add a leading axis of length 1 to both arrays; the later cells flatten them
# into single columns with reshape(-1, 1).
# NOTE: not idempotent - each re-run of this cell adds another leading axis.
pred = np.expand_dims(pred, axis=0)
i_d = np.expand_dims(i_d, axis=0)

In [74]:
# Preview the predictions as a single column on the original price scale
# (np.exp undoes the log applied to SalePrice before training).
# reshape returns a new array and does not modify its input, so a bare
# `i_d.reshape(-1, 1)` statement ahead of this line has no effect and is
# omitted.
np.exp(pred).reshape(-1, 1)

array([[123111.086],
       [144699.42 ],
       [174915.06 ],
       ...,
       [164434.92 ],
       [117229.445],
       [244926.   ]], dtype=float32)

In [75]:
# Two-column array: Id next to the predicted price on the original scale.
# Stacking integer ids with float predictions yields a float array; the Id
# column is cast back to int64 once the DataFrame has been built.
id_column = i_d.reshape(-1, 1)
price_column = np.exp(pred).reshape(-1, 1)
submission = np.hstack((id_column, price_column))

In [76]:
# Wrap the stacked array in a DataFrame with the two submission columns.
submission_columns = ['Id', 'SalePrice']
sub = pd.DataFrame(data=submission, columns=submission_columns)

In [77]:
# The Id column became float when it was stacked with the predictions;
# restore it to int64 and preview the first five rows.
sub['Id'] = sub['Id'].astype('int64')
sub.head(n=5)

Unnamed: 0,Id,SalePrice
0,1461,123111.085938
1,1462,144699.421875
2,1463,174915.0625
3,1464,180689.796875
4,1465,198328.015625


In [78]:
# Write the submission next to the notebook, with a header row and without
# the DataFrame index. `to_csv` returns None when it is given a path, so its
# result is not bound to a name; `index` takes a boolean, hence False.
sub.to_csv('submission.csv', index=False, header=True)