In [9]:
import pandas as pd
import numpy as np

from scipy.stats import uniform, randint, loguniform
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from lightgbm import LGBMRegressor

from fit_util import *

In [10]:
train_df = load_data()
transformer = DataTransformer()
train_df = transformer.fit_transform(train_df)

y = train_df['SalePrice']
X = train_df.drop(columns='SalePrice', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=98987)

In [11]:
parameters = {
    "learning_rate": loguniform(5e-3,  1e-1),
    "max_depth": [10, 100, 200, 500, 1000, 3000, 4000, 5000],
    "n_estimators": [1, 10, 20, 100, 1000, 5000, 7000, 9000, 10000],
    "min_child_samples": [1, 2, 5, 10, 15, 20],
    "subsample_freq": [3, 5, 7, 10, 12, 15],
    "reg_lambda" : loguniform(1e-4, 10)
    }

grid_model = RandomizedSearchCV(LGBMRegressor(seed=0), parameters, cv=3, n_jobs=-1)
grid_model.fit(X_train, y_train);

In [12]:
print(grid_model.best_params_)

{'learning_rate': 0.09859074954616912, 'max_depth': 100, 'min_child_samples': 2, 'n_estimators': 10000, 'reg_lambda': 0.2698962582576065, 'subsample_freq': 15}


Evaluating on train set

In [13]:
gs_model = LGBMRegressor(**grid_model.best_params_, seed=0)
gs_model.fit(X_train, y_train);

In [14]:
evaluate(gs_model, X_train, y_train, "train ")
evaluate(gs_model, X_test, y_test, "test ")

train RMSE: 0.0011844375790452304
test RMSE: 0.13309106169047022


In [15]:
submission(transformer=transformer, gs_model=gs_model, obj_to_num=True)

RMSLE submission: 0.3837947671336009
