In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from data_processing import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from data_transformar import *
import lightgbm as lgb
from sklearn.decomposition import PCA
from catboost import CatBoostRegressor, Pool

# Silence every warning for the rest of the session (this also hides library deprecation notices).
warnings.simplefilter("ignore")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Reload edited local modules automatically so changes are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Use seaborn's "darkgrid" style for subsequent plots.
sns.set(style="darkgrid")

In [12]:
from sklearn.metrics import mean_squared_error

# Reference SalePrice values: cheat_score() and the final scoring cell compare log1p of this
# column against model predictions.
cheat = pd.read_csv("./result-with-best.csv")

def rmsle(y, y_pred):
    """Return the square root of the mean squared error between ``y`` and ``y_pred``.

    No log transform is applied here: callers in this notebook pass values that
    are already on the ``log1p`` scale, which is what makes the result an RMSLE.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def cheat_score(model, val):
    """Print the RMSLE of ``model``'s predictions on ``val`` against the reference prices.

    The reference is ``log1p`` of the ``SalePrice`` column of the module-level
    ``cheat`` frame. Nothing is returned.
    """
    predicted = model.predict(val)
    reference = np.log1p(cheat["SalePrice"])
    score = rmsle(predicted, reference)
    print("RMSLE sub: " + str(score))

def to_categorical(X):
    """Cast the object and category columns of ``X`` to pandas ``category`` dtype.

    ``X`` is modified in place and nothing is returned; columns of any other
    dtype are left untouched.
    """
    for name in X.columns:
        dtype = X[name].dtype
        # Guard clause: skip everything that is neither object nor category.
        if not (dtype == 'object' or dtype.name == 'category'):
            continue
        X[name] = X[name].astype('category')

def evaluate(model, X, y):
    """Print the RMSLE between ``model.predict(X)`` and ``y``; returns nothing."""
    score = rmsle(model.predict(X), y)
    print("RMSLE: " + str(score))

In [14]:
# Load the training data and drop the "Id" column before modelling.
data = pd.read_csv("./train.csv")
data = data.drop(columns=["Id"])

# remove_outliers comes from a local star import (not shown here); the index printed
# below this cell is presumably the rows it removed — TODO confirm.
data = remove_outliers(data)

# The target is modelled on the log1p scale, so an RMSE on y is an RMSLE on SalePrice.
y = np.log1p(data["SalePrice"])
X = data.drop(columns=["SalePrice"])

Int64Index([249, 313, 335, 523, 706, 1298], dtype='int64')


In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=98987,
)

In [16]:
# Keep an untransformed copy of the training features.
X_train_orig = X_train.copy()

# DataTransformer is defined in a local module (not shown); it is constructed with a StandardScaler.
transformer = DataTransformer(StandardScaler())

# NOTE(review): prepare() is opaque from here — presumably stateless cleaning applied
# identically to both splits; confirm it learns nothing from the data it is given.
X_train = transformer.prepare(X_train)
X_test = transformer.prepare(X_test)

# Fit on the training split only, then apply the fitted transformer to both splits.
transformer.fit(X_train)

X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

(1163, 24) (1163, 338)
(291, 24) (291, 338)


In [11]:
from hyperopt.pyll import scope
import lightgbm as lgb
from hyperopt import hp
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.base import clone
from hyperopt import fmin, tpe, hp, anneal, Trials

# Cross-validation settings read by objective() below.
folds = 4
shuffle = True

# Seed passed to the XGBoost models and to the hyperopt search.
random_state = 42

def objective(params):
    """Hyperopt objective: mean K-fold RMSLE of an XGBoost regressor.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded unchanged to ``xgb.XGBRegressor`` (one
        sample drawn from the hyperopt search space).

    Returns
    -------
    float
        Mean of the per-fold ``rmsle`` values; hyperopt minimises this.

    Notes
    -----
    Reads the notebook globals ``X_train``, ``y_train``, ``folds``, ``shuffle``
    and ``random_state``.
    """
    clf = xgb.XGBRegressor(random_state=random_state, **params)

    # Seed the splitter so every trial is scored on the same folds; otherwise the
    # fold assignment changes from trial to trial, which adds noise to the
    # objective and makes the search non-reproducible even with a seeded fmin.
    # KFold rejects a random_state when shuffle is False, so pass it only when shuffling.
    kf = KFold(
        n_splits=folds,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )
    loss_list = []

    for train_index, test_index in kf.split(X_train):
        X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

        # Fresh, unfitted copy of the estimator for each fold.
        clf_fold = clone(clf)
        # NOTE(review): the held-out fold doubles as the early-stopping eval_set, so
        # the fold score below is optimistic. Newer xgboost releases also expect
        # early_stopping_rounds on the estimator constructor rather than in fit();
        # the fit-time keyword is kept to match the version this notebook ran on.
        clf_fold.fit(
            X_train_fold,
            y_train_fold,
            early_stopping_rounds=15,
            eval_set=[(X_test_fold, y_test_fold)],
            verbose=False,
        )

        prediction = clf_fold.predict(X_test_fold)
        loss_list.append(rmsle(prediction, y_test_fold))

    return np.mean(loss_list)


# Search space handed to hyperopt; each key is forwarded to xgb.XGBRegressor by objective().
# hp.loguniform bounds are given in log space: (-5, 0) draws values between exp(-5) and 1,
# and (-5, 2) between exp(-5) and exp(2).
# scope.int casts the quniform draws to int before they reach objective().
space = {'n_estimators': scope.int(hp.quniform('n_estimators', 100, 100000, 1)),
         'max_depth': scope.int(hp.quniform('max_depth', 2, 30, 1)),
         'eta': hp.loguniform('eta', -5, 0),
         'colsample_bytree': hp.loguniform('colsample_bytree', -5, 0),
         'min_child_weight': hp.loguniform('min_child_weight', -5, 2),
         'alpha': hp.loguniform('alpha', -5, 0),
         'reg_lambda': hp.loguniform('reg_lambda', -5, 0),
         'subsample': hp.loguniform('subsample', -5, 0),
         }

# Records every evaluated trial of the search.
trials = Trials()

best = fmin(fn=objective,  # function to minimize
            space=space,
            algo=tpe.suggest,  # optimization algorithm; hyperopt selects its parameters automatically
            max_evals=100,  # maximum number of evaluations
            trials=trials,  # logging
            rstate=np.random.default_rng(random_state)  # fixed random state for reproducibility
            )

# `best` holds the raw sampled values, so the quniform parameters come back as floats
# (see the printed result); later cells cast them with int() before reuse.
print("best {}".format(best))

100%|██████████| 100/100 [11:44<00:00,  7.05s/trial, best loss: 0.12028953145159352]
best {'alpha': 0.0674727477206007, 'colsample_bytree': 0.9953230146860792, 'eta': 0.007149636020967451, 'max_depth': 22.0, 'min_child_weight': 0.31119724613862315, 'n_estimators': 97427.0, 'reg_lambda': 0.03359599334141284, 'subsample': 0.6425926905018116}


In [42]:
# Refit on the training split with the best hyperparameters found by the search;
# the quniform values in `best` are floats, hence the int() casts.
# NOTE(review): the search never set `booster`, while this model uses 'dart' — confirm intended.
xgb_model = xgb.XGBRegressor(
    booster= 'dart',
    random_state=random_state,
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    eta=best['eta'],
    colsample_bytree=best['colsample_bytree'],
    min_child_weight=best['min_child_weight'],
    alpha=best['alpha'],
    reg_lambda=best['reg_lambda'],
    subsample=best['subsample'],
    )

# NOTE(review): X_test drives early stopping here and is also the set scored below,
# so the second score is likely optimistic.
xgb_model.fit(X_train, y_train, early_stopping_rounds=15, eval_set=[(X_test, y_test)], verbose=False)

# Training-split score first, then held-out-split score.
evaluate(xgb_model, X_train, y_train)
evaluate(xgb_model, X_test, y_test)

RMSLE: 0.020185086449640757
RMSLE: 0.114825540235046


In [24]:
# Reload the complete training set so the final model can be fitted on every row.
data = pd.read_csv("./train.csv")
data = data.drop(columns=["Id"])

# NOTE(review): outlier removal is disabled for this final fit, unlike the cell that
# prepared the data for tuning — confirm this is intended.
#data = remove_outliers(data)

# Same log1p target as used during tuning.
y = np.log1p(data["SalePrice"])
X = data.drop(columns=["SalePrice"])

# A fresh transformer fitted on all training rows; it replaces the one fitted on the
# training split and is the instance later applied to test.csv.
transformer = DataTransformer(StandardScaler())
X = transformer.prepare(X)

transformer.fit(X)

X = transformer.transform(X)

# Positional indices of the object-dtype columns. Reading X.dtypes directly avoids
# building a full copy of the frame via X.loc[:, X.columns.values] just to inspect dtypes.
cat_features = np.where(X.dtypes == "object")[0]

(1460, 24) (1460, 342)


In [40]:
# Final model, fitted on the full training set with the tuned hyperparameters.
xgb_model = xgb.XGBRegressor(
    #booster= 'dart',
    random_state=random_state,
    # Fixed round count used instead of the tuned n_estimators (kept in the trailing comment).
    n_estimators= 1500,#int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    eta=best['eta'],
    colsample_bytree=best['colsample_bytree'],
    min_child_weight=best['min_child_weight'],
    alpha=best['alpha'],
    reg_lambda=best['reg_lambda'],
    subsample=best['subsample'],
)

# No eval_set or early stopping is passed for this fit.
xgb_model.fit(X, y, verbose=False)

# Scored on the same rows the model was fitted on: a training error, not a generalisation estimate.
evaluate(xgb_model, X, y)

RMSLE: 0.011127637999111832


In [41]:
# Load the competition test set; keep the ids aside and drop them from the features.
validation = pd.read_csv("./test.csv")
val_ids = validation["Id"]
validation = validation.drop(columns=["Id"])

# Reuse the transformer fitted earlier (no re-fit on the test set).
# NOTE(review): the meaning of transform's second positional argument (True) is defined
# in DataTransformer, which is not shown here — confirm.
validation = transformer.prepare(validation)
validation = transformer.transform(validation, True)

# Score the predictions against log1p of the reference SalePrice column.
# Assumes the rows of `cheat` are in the same order as test.csv — TODO confirm.
sub_predictions = xgb_model.predict(validation)
print("RMSLE submission: " + str(rmsle(sub_predictions, np.log1p(cheat["SalePrice"]))))
#evaluate(lbg_model, validation,  np.log1p(cheat["SalePrice"]))

(1459, 24) (1459, 342)
RMSLE submission: 0.1360159096943327
