In [9]:
import numpy as np

from scipy.stats import uniform, randint, loguniform
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBRegressor

from fit_util import *

In [12]:
# Load the raw training data and run it through the project's preprocessing.
# `load_data` and `DataTransformer` come from `fit_util` (star import above).
train_df = load_data()
transformer = DataTransformer()
train_df = transformer.fit_transform(train_df)

# Separate the target column from the feature matrix.
y = train_df['SalePrice']
X = train_df.drop(columns='SalePrice')

# Hyper-parameter search runs on a small subsample (at most 300 rows, drawn
# without replacement). The fixed seed makes the subsample identical across
# "Restart & Run All"; `min(...)` avoids an error on frames shorter than 300 rows.
X_train = X.sample(n=min(300, len(X)), random_state=0)
# Explicit label-based lookup keeps targets aligned with the sampled rows.
y_train = y.loc[X_train.index]

In [13]:
# Search space for the randomized hyper-parameter search.
#
# `max_depth` and `n_estimators` are pinned to a single value each. The dict
# literal used to list both keys twice; Python keeps only the last occurrence,
# so the wider candidate lists were never searched. They are removed here to
# make the effective search space explicit (behaviour is unchanged).
parameters = {
    "learning_rate": loguniform(5e-3, 1e-1),
    "max_depth": [185],
    "n_estimators": [7374],
    "lambda": loguniform(1e-4, 10),
    "min_child_weight": loguniform(0.1, 55),
    "gamma": loguniform(1e-4, 10),
}

# `random_state` fixes which candidates are sampled from the distributions
# above, so the search result is reproducible from a fresh kernel.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score rather than the RMSE reported later - confirm
# that this is intended.
grid_model = RandomizedSearchCV(
    XGBRegressor(seed=0),
    parameters,
    cv=3,
    n_jobs=-1,
    random_state=0,
)
grid_model.fit(X_train, y_train);

In [14]:
# Show the best hyper-parameter combination found by the randomized search;
# these values are unpacked into the final model in the next cell.
print(grid_model.best_params_)

{'gamma': 0.0005233793595180316, 'lambda': 5.6085074951963945, 'learning_rate': 0.021792504938199782, 'max_depth': 185, 'min_child_weight': 9.653704437705056, 'n_estimators': 7374}


In [15]:
# Reload the training data and fit a fresh transformer on it; the earlier
# `X_train` / `y_train` (the search subsample) are overwritten below.
train_df = load_data()
transformer = DataTransformer()
train_df = transformer.fit_transform(train_df)

# Features are every column except the target.
X = train_df.drop(columns='SalePrice')
y = train_df['SalePrice']

# Hold out 20% of the rows for evaluation; the fixed `random_state` keeps the
# split stable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=98987,
)

# Final model: the best parameters from the search, fitted on the training split.
gs_model = XGBRegressor(seed=0, **grid_model.best_params_)
gs_model.fit(X_train, y_train);

In [16]:
# Report the fitted model's error on both splits, training split first.
for label, features, target in (
    ("train ", X_train, y_train),
    ("test ", X_test, y_test),
):
    evaluate(gs_model, features, target, label)

train RMSE: 0.010515217474039724
test RMSE: 0.12974631238370893


In [17]:
# Hand the fitted transformer and model to `submission` (from `fit_util`);
# presumably this scores/produces the competition submission - confirm in fit_util.
# NOTE(review): the recorded run printed an RMSLE of ~0.386 here versus a
# hold-out RMSE of ~0.130 in the previous cell - verify both metrics are on the
# same scale and that `obj_to_num=True` matches the preprocessing used for training.
submission(transformer=transformer, gs_model=gs_model, obj_to_num=True)

RMSLE submission: 0.38565207831952475
