In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
import optuna

In [6]:
train=pd.read_csv('train.csv')
numeric_train_list=['LotFrontage', 'LotArea','YearBuilt','YearRemodAdd','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','TotRmsAbvGrd','Fireplaces','GarageYrBlt','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal','MoSold','YrSold','BedroomAbvGr','KitchenAbvGr']
categorical_train_list=['MSSubClass','MSZoning','Street','Alley','LotShape','LandContour','Utilities','LotConfig','LandSlope','Neighborhood','Condition1','Condition2','BldgType','HouseStyle','OverallQual','OverallCond','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','ExterQual','ExterCond','Foundation','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','Heating','HeatingQC','CentralAir','Electrical','KitchenQual','Functional','FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond','PavedDrive','PoolQC','Fence','MiscFeature','SaleType','SaleCondition']
target_list=['SalePrice']
numeric_train=train.loc[:,numeric_train_list]
categorical_train=train[categorical_train_list]
target=train[target_list]
# As there are too many missing values in PoolQC, Fence
categorical_train=categorical_train.drop(['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'],axis=1)
# we can classify the missing one into one category
categorical_train.loc[categorical_train.GarageType.isna(),['GarageType','GarageFinish','GarageCond','GarageQual']]='no_garage'
numeric_train.loc[numeric_train.GarageYrBlt.isna(),'GarageYrBlt']=numeric_train.GarageYrBlt.min()
numeric_train=numeric_train.drop('LotFrontage',axis=1)
numeric_train.loc[numeric_train.MasVnrArea.isna(),:]=0
categorical_train['Electrical']=categorical_train['Electrical'].ffill()
categorical_train.loc[:,[name for name in categorical_train.columns if 'Bsmt' in name]]=categorical_train.loc[:,[name for name in categorical_train.columns if 'Bsmt' in name]].fillna('No_Bsmt')
categorical_train=categorical_train.astype('category')

In [7]:
cat_df_onehot=pd.get_dummies(categorical_train,drop_first=True,dtype='float')

In [8]:
keep_list=abs(pd.concat([numeric_train,target],axis=1).corr()['SalePrice'])>0.05
numeric_train=numeric_train.loc[:,keep_list]

In [9]:
df=pd.concat([numeric_train,cat_df_onehot,target],axis=1)

In [10]:
test=pd.read_csv('test.csv')
numeric_train_list=['LotFrontage', 'LotArea','YearBuilt','YearRemodAdd','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','TotRmsAbvGrd','Fireplaces','GarageYrBlt','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal','MoSold','YrSold','BedroomAbvGr','KitchenAbvGr']
categorical_train_list=['MSSubClass','MSZoning','Street','Alley','LotShape','LandContour','Utilities','LotConfig','LandSlope','Neighborhood','Condition1','Condition2','BldgType','HouseStyle','OverallQual','OverallCond','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','ExterQual','ExterCond','Foundation','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','Heating','HeatingQC','CentralAir','Electrical','KitchenQual','Functional','FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond','PavedDrive','PoolQC','Fence','MiscFeature','SaleType','SaleCondition']
numeric_test=test.loc[:,numeric_train_list]
categorical_test=test.loc[:,categorical_train_list]
# As there are too many missing values in PoolQC, Fence
categorical_test=categorical_test.drop(['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'],axis=1)
# we can classify the missing one into one category
categorical_test.loc[categorical_test.GarageType.isna(),['GarageType','GarageFinish','GarageCond','GarageQual']]='no_garage'
numeric_test.loc[numeric_test.GarageYrBlt.isna(),'GarageYrBlt']=numeric_test.GarageYrBlt.min()
numeric_test=numeric_test.drop('LotFrontage',axis=1)
numeric_test.loc[numeric_test.MasVnrArea.isna(),:]=0
categorical_test['Electrical']=categorical_test['Electrical'].ffill()
categorical_test.loc[:,[name for name in categorical_test.columns if 'Bsmt' in name]]=categorical_test.loc[:,[name for name in categorical_test.columns if 'Bsmt' in name]].fillna('No_Bsmt')
categorical_test=categorical_test.astype('category')

In [11]:
numeric_test=numeric_test.loc[:,keep_list]

In [12]:
cat_test_onehot=pd.get_dummies(categorical_test,drop_first=True,dtype='float')

In [13]:
need_added=[i for i in cat_df_onehot.columns if i not in cat_test_onehot]

In [14]:
need_drop=[i for i in cat_test_onehot.columns if i not in cat_df_onehot]

In [15]:
drop_col=list(set([item.split('_')[0] for item in (need_drop+need_added)])) 

In [16]:
categorical_train=categorical_train.drop(drop_col,axis=1)
categorical_test=categorical_test.drop(drop_col,axis=1)

In [17]:
cat_df_onehot=pd.get_dummies(categorical_train,drop_first=True,dtype='float')
cat_test_onehot=pd.get_dummies(categorical_test,drop_first=True,dtype='float')

In [19]:
len(cat_df_onehot.columns), len(cat_test_onehot.columns)

(154, 154)

In [20]:
df=pd.concat([cat_df_onehot,numeric_train,target],axis=1)
df_test=pd.concat([cat_test_onehot,numeric_test],axis=1)

In [21]:
X=df.drop('SalePrice',axis=1).values
y=df.SalePrice.values

In [None]:
def objective(trial, X=X, y=y):
    """
    A function to train a model using different hyperparamerters combinations provided by Optuna.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.4)

    params = {
        'max_depth': trial.suggest_int('max_depth', 6, 15),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000, 100),
        'eta': trial.suggest_float("eta", 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'gamma': trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 10),
        'grow_policy': trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0)
    }

    reg = xgb.XGBRegressor(**params)
    reg.fit(X_train, y_train,
            eval_set=[(X_valid, y_valid)], eval_metric='rmse',
            verbose=False)
    return mean_squared_error(y_valid, reg.predict(X_valid), squared=False)

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials = 10)

# Showing optimization results
print('Number of finished trials:', len(study.trials))
print('Best trial parameters:', study.best_trial.params)
print('Best score:', study.best_value)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
xgbr_model=xgb.XGBRegressor(max_depth=12, subsample= 0.4705396827126213, n_estimators=1900, eta= 0.007281420490092281, alpha= 0.294862259973789, reg_lambda= 7.810905751631526e-05, gamma= 0.00010468340602977493, min_child_weight= 4, grow_policy= 'depthwise', colsample_bytree= 0.4304997219138718)
xgbr_model.fit(X_train,y_train)
y_pred=xgbr_model.predict(X_test)
print(f'xgbr_model_train_score:{xgbr_model.score(X_train,y_train)}')
print(f'xgbr_model_test_score:{xgbr_model.score(X_test,y_test)}')
print(f'xgbr_model_r2_score:{r2_score(y_test,y_pred)}')

xgbr_model_train_score:0.9949309006730995
xgbr_model_test_score:0.8857444275999518
xgbr_model_r2_score:0.8857444275999518


In [23]:
X_test2=df_test.values
result=pd.Series(xgbr_model.predict(X_test2))
result_df=pd.concat([test.Id,result],axis=1)
result_df.columns=['Id','SalePrice']
result_df.Id=result_df.Id.astype('Int32')
result_df.to_csv('result.csv',index=False)