In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [3]:
from xgboost import XGBRegressor

In [4]:
# Load the competition's training and test tables from the local dataset folder.
X_full, X_test_full = map(pd.read_csv, (
    'dataset/Housing Prices Competition/train.csv',
    'dataset/Housing Prices Competition/test.csv',
))


In [5]:
# Inspect which columns the test table provides.
X_test_full.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [6]:
# Preview the first rows of the training table.
X_full.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [7]:
# Summarise each column's dtype and non-null count to spot missing values.
X_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [9]:
# Names of the columns stored as int64 or float64 (this includes SalePrice).
numeric_col = (
    X_full
    .select_dtypes(include=['int64', 'float64'])
    .columns
)

In [10]:
# Correlation of every numeric column with the sale price: build the full
# correlation matrix, then keep only the SalePrice row.
SalePrice_corr = (
    X_full.loc[:, numeric_col]
    .corr()
    .loc['SalePrice']
)

In [11]:
# Columns whose correlation with SalePrice is at least 0.5, strongest first.
# The series is sorted once and the sorted result is filtered, rather than
# sorting it a second time just to build the boolean mask.
best_col = (
    SalePrice_corr
    .sort_values(ascending=False)
    .loc[lambda corr: corr >= 0.5]
    .index
)

In [12]:
# SalePrice is the target, not a predictor, so remove it from the selected
# columns. errors='ignore' keeps this cell safe to re-run: without it a second
# execution raises KeyError because the label has already been dropped.
best_col = best_col.drop('SalePrice', errors='ignore')

In [13]:
# Check the dtypes of the selected feature columns.
X_full[best_col].dtypes

OverallQual     int64
GrLivArea       int64
GarageCars      int64
GarageArea      int64
TotalBsmtSF     int64
1stFlrSF        int64
FullBath        int64
TotRmsAbvGrd    int64
YearBuilt       int64
YearRemodAdd    int64
dtype: object

In [14]:
# Modelling inputs: the selected columns as predictors, SalePrice as target.
# Copies are taken so later edits to X / X_test never touch the loaded tables.
features = best_col
y = X_full['SalePrice']
X = X_full.loc[:, features].copy()
X_test = X_test_full.loc[:, features].copy()

In [15]:
# Keep 80% of the rows for training and hold the rest out for validation;
# the fixed random_state makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=.8, random_state=0
)

In [16]:
# Sanity-check the row/column counts of the two splits (one shape per line).
print(X_train.shape, X_valid.shape, sep='\n')

(1168, 10)
(292, 10)


In [40]:
# Candidate XGBoost regressors that differ only in n_estimators; the learning
# rate and n_jobs settings are shared by all of them.
models = [
    XGBRegressor(n_estimators=rounds, learning_rate=.05, n_jobs=4)
    for rounds in (100, 250, 500, 750, 1000)
]

# Keep the individual names available alongside the list.
model_1, model_2, model_3, model_4, model_5 = models

In [55]:
# Validation sweep over n_estimators (0 to 1500 in steps of 50) at a fixed
# learning rate. Each setting trains a fresh model on the training split and
# prints its mean absolute error on the validation split. The n_estimators=0
# entry is a model with no trees and serves as a baseline.
#
# Requires `mean_absolute_error` from sklearn.metrics (imported in the first cell).
#
# Best settings noted from earlier runs (validation MAE):
#   n_estimators=600,  learning_rate=0.01    18557.75782587757
#   n_estimators=150,  learning_rate=0.05    18557.75782587757
#   n_estimators=1300, learning_rate=0.005   18481.781918878423
# NOTE(review): the first two entries carry the identical MAE; one of them may
# be a copy-paste slip -- confirm before relying on it.
for i in range(0, 1001 + 500, 50):
    model = XGBRegressor(n_estimators=i,
                         learning_rate=.005,
                         n_jobs=4)
    model.fit(X_train, y_train)
    # Use a dedicated name so the sweep never overwrites `preds`, which the
    # submission cells expect to hold test-set predictions.
    valid_preds = model.predict(X_valid)
    print(f'n_estimators {i} :\t{mean_absolute_error(y_valid, valid_preds)}')

n_estimators 0 :	181369.88356164383
n_estimators 50 :	46107.115100599316
n_estimators 100 :	38081.30067422945
n_estimators 150 :	32147.55361729452
n_estimators 200 :	27694.03427333048
n_estimators 250 :	24464.408096104453
n_estimators 300 :	22349.276434075342
n_estimators 350 :	20761.130645333906
n_estimators 400 :	19759.489324700342
n_estimators 450 :	19101.555971746577
n_estimators 500 :	18731.281383775684
n_estimators 550 :	18588.653735017124
n_estimators 600 :	18570.71500428082
n_estimators 650 :	18595.703312285958
n_estimators 700 :	18631.796875
n_estimators 750 :	18680.07662671233
n_estimators 800 :	18646.78718964041
n_estimators 850 :	18597.326519691782
n_estimators 900 :	18545.81319563356
n_estimators 950 :	18513.090566138697
n_estimators 1000 :	18506.078753745718
n_estimators 1050 :	18486.669721211474
n_estimators 1100 :	18492.275444135274
n_estimators 1150 :	18503.786186322774
n_estimators 1200 :	18491.34500749144
n_estimators 1250 :	18485.213679901542
n_estimators 1300 :	184

In [57]:
# Train the configuration recorded as best in the validation sweep
# (1300 estimators at learning rate 0.005) on the training split, then
# predict sale prices for the test-set rows.
best_model = XGBRegressor(learning_rate=.005, n_estimators=1300, n_jobs=4)
best_model.fit(X=X_train, y=y_train)
preds = best_model.predict(X_test)

In [58]:
# Submission table: the test-set Id column next to its predicted SalePrice.
output = pd.DataFrame({'Id': X_test_full['Id'], 'SalePrice': preds})

In [59]:
# Display the submission table for a quick visual check.
output

Unnamed: 0,Id,SalePrice
0,1461,125485.890625
1,1462,149376.187500
2,1463,165415.531250
3,1464,181873.156250
4,1465,208386.640625
...,...,...
1454,2915,77970.390625
1455,2916,83654.164062
1456,2917,156476.390625
1457,2918,103008.617188


In [60]:
# Write the predictions to a CSV file, leaving out the DataFrame index.
output.to_csv('submission_xgboost_ver_1.csv', index=False)