Trial for Boston House Pricing

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

train = pd.read_csv("./temp_dataset/train.csv")
test = pd.read_csv("./temp_dataset/test.csv")
submission = pd.read_csv("./temp_dataset/sample_submission.csv")

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [6]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [9]:
y = train['SalePrice']
X = train.drop(['Id', 'SalePrice'], axis=1)
X_test = test.drop(['Id'], axis=1)
X = X.fillna(0)
X_test = X_test.fillna(0)

X = pd.get_dummies(X)
X_test = pd.get_dummies(X_test)

# Align columns
X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)


In [10]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)


In [11]:
predictions = model.predict(X_test)
submission['SalePrice'] = predictions
submission.to_csv('solution.csv', index=False)


In [12]:
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X, y)
submission['SalePrice'] = xgb_model.predict(X_test)
submission.to_csv('solution_xgb.csv', index=False)


In [None]:
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor

stacked_model = StackingRegressor(estimators=[
    ('xgb', xgb.XGBRegressor()),
    ('lgb', lgb.LGBMRegressor()),
    ('cat', CatBoostRegressor(verbose=0))
], final_estimator=lgb.LGBMRegressor())

stacked_model.fit(X, y)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001982 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3489
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 202
[LightGBM] [Info] Start training from score 180921.195890
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002053 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3203
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 196
[LightGBM] [Info] Start training from score 180717.091610
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001944 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is

In [14]:
submission['SalePrice'] = stacked_model.predict(X_test)
submission.to_csv('solution_stacked.csv', index=False)

