In [3]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 200)

from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_log_error, r2_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

In [2]:
hp_train = pd.read_csv('train.csv', index_col='Id')
hp_test = pd.read_csv('test.csv', index_col='Id')

In [3]:
X = hp_train.drop('SalePrice', axis=1)
y = hp_train['SalePrice']

In [4]:
# Separete numeric and categorical columns
num_col = X.dtypes[X.dtypes != 'object'].index.to_list()
cat_col = X.dtypes[X.dtypes == 'object'].index.to_list()

In [5]:
# Get all columns with missing values
num_col_mis_val = X[num_col].dtypes[X[num_col].isna().sum()>0].index.to_list()
cat_col_mis_val = X[cat_col].dtypes[X[cat_col].isna().sum()>0].index.to_list()

In [6]:
X[num_col_mis_val] = SimpleImputer(strategy='mean').fit_transform(X[num_col_mis_val])
X[cat_col_mis_val] = X[cat_col_mis_val].fillna('Missing')

In [8]:
X[num_col] = RobustScaler().fit_transform(X[num_col])
X[cat_col] = X[cat_col].apply(lambda x: pd.factorize(x)[0])
y = np.log1p(y)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [43]:
'''parameters = {
    "n_estimators": [500, 750, 1000, 1500, 2000], 
    "learning_rate": [0.01, 0.02, 0.05], 
    "max_depth": [6, 8], 
    "subsample": [0.3, 0.5, 0.7]
}

grid = GridSearchCV(XGBRegressor(objective='reg:squarederror'), parameters)
grid.fit(X_train, y_train)

print(grid.best_params_)'''

{'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 2000, 'subsample': 0.7}


In [38]:
xgb_model = XGBRegressor(objective='reg:squarederror', n_jobs=8, learning_rate=0.01, max_depth=6, n_estimators=1500,
subsample=0.7, early_stopping_rounds=10).fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

In [39]:
print("-----")
print("* Training set")
y_pred = xgb_model.predict(X_train)
print(f"R2: {r2_score(y_train, y_pred):.2%}")
print(f"RMSE: {mean_squared_log_error(y_train, y_pred, squared=False):.5f}")

print("-----")
print("* Validation set")
y_pred = xgb_model.predict(X_test)
print(f"R2: {r2_score(y_test, y_pred):.2%}")
print(f"RMSE: {mean_squared_log_error(y_test, y_pred, squared=False):.5f}")

-----
* Training set
R2: 99.02%
RMSE: 0.00308
-----
* Validation set
R2: 89.42%
RMSE: 0.00959
