In [411]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from data_processing import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from data_transformar import *
import lightgbm as lgb
from sklearn.decomposition import PCA
from catboost import CatBoostRegressor, Pool

# Silence every warning for the rest of the session. This also hides
# deprecation notices emitted by the ML libraries used below.
warnings.simplefilter("ignore")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# autoreload mode 2: re-import edited modules before each cell execution, so
# changes to the project helper modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Default seaborn theme for all plots.
sns.set(style="darkgrid")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [417]:
from sklearn.metrics import mean_squared_error

# Reference prices used to score predictions made for test.csv: later cells
# compare model output with log1p of this frame's "SalePrice" column.
# NOTE(review): presumably an earlier best submission - confirm its provenance.
cheat = pd.read_csv("./result-with-best.csv")

def rmsle(y, y_pred):
    """Return the root of the mean squared error between ``y`` and ``y_pred``.

    No logarithm is applied here. The notebook passes values that were already
    transformed with ``np.log1p``, which is what turns this RMSE into an RMSLE
    on the original price scale.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def cheat_score(model, val):
    """Print the error of ``model`` on ``val`` against the reference prices.

    Predictions for ``val`` are compared, via ``rmsle``, with ``np.log1p`` of
    the "SalePrice" column of the module-level ``cheat`` frame.
    """
    predictions = model.predict(val)
    reference = np.log1p(cheat["SalePrice"])
    print("RMSLE sub: " + str(rmsle(predictions, reference)))

def to_categorical(X):
    """Convert object and category columns of ``X`` to the pandas category dtype.

    The frame is modified in place and nothing is returned. Columns of any
    other dtype are left untouched.
    """
    for column in X.columns:
        dtype = X[column].dtype
        needs_cast = dtype == 'object' or dtype.name == 'category'
        if not needs_cast:
            continue
        X[column] = X[column].astype('category')

def evaluate(model, X, y):
    """Print the ``rmsle`` between ``model``'s predictions for ``X`` and ``y``."""
    score = rmsle(model.predict(X), y)
    print("RMSLE: " + str(score))

In [430]:
# Load the training file; the Id column is an identifier, not a feature.
data = pd.read_csv("./train.csv").drop(columns=["Id"])

# remove_outliers comes from one of the project star imports.
# NOTE(review): the index printed below presumably lists the dropped rows,
# and the meaning of the False flag is not visible here - confirm both.
data = remove_outliers(data, False)

# Features are everything except the price; the target is modelled as
# log1p(SalePrice).
X = data.drop(columns=["SalePrice"])
y = np.log1p(data["SalePrice"])

Int64Index([249, 313, 335, 523, 706, 1298], dtype='int64')


In [431]:
# Hold out 20% of the rows; the fixed seed keeps the split stable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=98987,
)

# Disabled alternative: remove outliers from the training part only.
#X_train, y_train = remove_outliers_split(X_train, y_train)

In [432]:
# Untouched copy of the raw training split (not referenced again in the cells
# visible in this notebook).
X_train_orig = X_train.copy()

# DataTransformer is a project helper pulled in by the star imports.
# NOTE(review): presumably it uses the StandardScaler on numeric columns -
# confirm against its implementation.
transformer = DataTransformer(StandardScaler())

# prepare() is applied to both splits before fitting.
X_train = transformer.prepare(X_train)
X_test = transformer.prepare(X_test)

# The transformer is fitted on the training split only.
transformer.fit(X_train)

# NOTE(review): the meaning of the second argument (False) is not visible
# here - check DataTransformer.transform.
X_train = transformer.transform(X_train, False)
X_test = transformer.transform(X_test, False)

In [433]:
# Inspect the transformed training features (the rendered frame is truncated).
X_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,TotalSF,Total_sqr_footage,Total_porch_sf
437,45.0,RM,50.000000,6000.0,Pave,,Reg,Lvl,Inside,Gtl,...,,,0.0,1,2009,WD,Normal,1788.0,904.0,105.0
1410,60.0,RL,79.000000,12420.0,Pave,,Reg,Lvl,Inside,Gtl,...,,,0.0,6,2009,WD,Normal,2784.0,2506.0,45.0
833,20.0,RL,100.000000,10004.0,Pave,,Reg,Lvl,Inside,Gtl,...,,,0.0,2,2009,WD,Normal,3032.0,2057.0,152.0
1177,50.0,RM,81.459225,3950.0,Pave,Grvl,Reg,Bnk,Inside,Gtl,...,,,0.0,12,2009,WD,Normal,2042.0,1692.0,116.0
723,50.0,RL,60.000000,8172.0,Pave,,Reg,Lvl,Inside,Gtl,...,,,0.0,5,2008,WD,Normal,2411.0,1470.0,156.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,20.0,RL,87.480748,6853.0,Pave,,IR1,Lvl,Inside,Gtl,...,,,0.0,6,2009,WD,Normal,2563.0,2301.0,217.0
1293,60.0,RL,78.000000,10140.0,Pave,,Reg,Lvl,Inside,Gtl,...,GdWo,,0.0,3,2006,WD,Normal,2496.0,1858.0,287.0
180,160.0,FV,37.252530,2117.0,Pave,,Reg,Lvl,Inside,Gtl,...,,,0.0,6,2007,WD,Normal,2281.0,1961.0,0.0
1295,20.0,RL,70.000000,8400.0,Pave,,Reg,Lvl,Inside,Gtl,...,GdWo,,0.0,11,2006,WD,Normal,2104.0,2068.0,356.0


In [434]:
# Cast text columns to the pandas "category" dtype (in place) so LightGBM can
# consume them.
to_categorical(X_train)
to_categorical(X_test)

# Positions of the categorical columns, passed to LightGBM as
# ``categorical_feature``. The lookup has to match the "category" dtype:
# to_categorical() has just converted every object column, so matching
# "object" here always produced an empty selection and the categorical
# columns were treated as plain numeric codes. Plain Python ints are used
# because LightGBM expects a list of int (indices) or str (names).
cat_features = [
    position
    for position, dtype in enumerate(X_train.dtypes)
    if dtype.name == "category"
]

In [435]:
from hyperopt.pyll import scope
import lightgbm as lgb
from hyperopt import hp
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.base import clone
from hyperopt import fmin, tpe, hp, anneal, Trials

folds = 4  # number of K-fold splits used inside objective()
shuffle = True  # shuffle the rows before they are split into folds

random_state = 42  # seed passed to the LightGBM models and to the hyperopt search

def objective(params):
    """Score one hyperopt trial: mean K-fold validation loss of a LightGBM regressor.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``lgb.LGBMRegressor`` sampled from the search
        space.

    Returns
    -------
    float
        Mean of the per-fold ``rmsle`` losses; hyperopt minimises this value.

    Notes
    -----
    Reads the notebook globals ``X_train``, ``y_train``, ``cat_features``,
    ``folds``, ``shuffle`` and ``random_state``.
    """
    clf = lgb.LGBMRegressor(random_state=random_state, **params)

    # Seed the splitter so every trial is scored on the same folds and a rerun
    # reproduces the search. scikit-learn rejects a random_state when
    # shuffle=False, hence the conditional.
    kf = KFold(n_splits=folds, shuffle=shuffle,
               random_state=random_state if shuffle else None)
    loss_list = []

    for train_index, test_index in kf.split(X_train):
        X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

        # The held-out fold is used both for early stopping and for the score,
        # so the reported loss is slightly optimistic.
        fit_params = {"early_stopping_rounds": 20,
                      "eval_metric": 'rmse',
                      "eval_set": [(X_test_fold, y_test_fold)],
                      'verbose': -1,
                      'categorical_feature': cat_features
                      }

        # Fresh, unfitted copy per fold so no state leaks between folds.
        clf_fold = clone(clf)
        clf_fold.fit(X_train_fold, y_train_fold, **fit_params)

        prediction = clf_fold.predict(X_test_fold)
        loss = rmsle(prediction, y_test_fold)

        loss_list.append(loss)

    score = np.mean(loss_list)

    return score


# Search space for the LightGBM hyperparameters sampled by hyperopt.
# - quniform(..., q=1) draws are floats, so scope.int casts them to int before
#   they reach LightGBM.
# - loguniform(label, -5, 0) draws exp(u) with u uniform on [-5, 0], i.e.
#   values between about 0.0067 and 1.
# NOTE(review): LightGBM only applies row subsampling when subsample_freq > 0,
# which is not set anywhere in this notebook, so the 'subsample' dimension may
# have no effect - confirm.
space = {'n_estimators': scope.int(hp.quniform('n_estimators', 100, 20000, 1)),
         'max_depth': scope.int(hp.quniform('max_depth', 2, 30, 1)),
         'num_leaves': scope.int(hp.quniform('num_leaves', 2, 40, 1)),
         'reg_alpha': hp.loguniform('reg_alpha', -5, 0),
         'reg_lambda': hp.loguniform('reg_lambda', -5, 0),
         'learning_rate': hp.loguniform('learning_rate', -5, 0),
         'subsample': hp.loguniform('subsample', -5, 0),
         'colsample_bytree': hp.loguniform('colsample_bytree', -5, 0),
         }
# Collects the result of every evaluated trial.
trials = Trials()

# Minimise objective() over the search space with the TPE algorithm.
# 50 evaluations are run; the seeded generator makes the sequence of sampled
# hyperparameters repeatable, and every trial is recorded in `trials`.
best = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(random_state),
)

print("best {}".format(best))

100%|██████████| 50/50 [01:11<00:00,  1.42s/trial, best loss: 0.11970353460726135]
best {'colsample_bytree': 0.25888178212037616, 'learning_rate': 0.08160717473914599, 'max_depth': 24.0, 'n_estimators': 7510.0, 'num_leaves': 6.0, 'reg_alpha': 0.2583432050369779, 'reg_lambda': 0.029082211206149948, 'subsample': 0.019754607872640946}


In [439]:
# Fit options. The hold-out split doubles as the early-stopping set, so the
# second score printed below is not a fully independent estimate.
fit_params = dict(
    early_stopping_rounds=20,
    eval_set=[(X_test, y_test)],
    eval_metric='rmse',
    verbose=-1,
    categorical_feature=cat_features,
)

# Hyperparameters of the best trial. hyperopt reports quniform draws as
# floats, so the integer-valued ones are cast back to int.
tuned_params = dict(
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    num_leaves=int(best['num_leaves']),
    reg_alpha=best['reg_alpha'],
    reg_lambda=best['reg_lambda'],
    learning_rate=best['learning_rate'],
    subsample=best['subsample'],
    colsample_bytree=best['colsample_bytree'],
)

lbg_model = lgb.LGBMRegressor(random_state=random_state, **tuned_params)
lbg_model.fit(X_train, y_train, **fit_params)

# Training score first, hold-out score second.
evaluate(lbg_model, X_train, y_train)
evaluate(lbg_model, X_test, y_test)

RMSLE: 0.0775345956950465
RMSLE: 0.11577414504800097


In [444]:
# Reload the complete training file for the final refit. Outlier removal is
# not applied here, unlike in the cell that prepared the tuning split.
data = pd.read_csv("./train.csv")
data = data.drop(columns=["Id"])

y = np.log1p(data["SalePrice"])
X = data.drop(columns=["SalePrice"])

# A fresh transformer is fitted on all training rows; the same instance is
# later applied to test.csv.
transformer = DataTransformer(StandardScaler())
X = transformer.prepare(X)

transformer.fit(X)

X = transformer.transform(X, False)

to_categorical(X)

# Positions of the categorical columns, passed to LightGBM as
# ``categorical_feature``. The lookup has to match the "category" dtype:
# to_categorical() has just converted every object column, so matching
# "object" here always produced an empty selection and the categorical
# columns were treated as plain numeric codes. Plain Python ints are used
# because LightGBM expects a list of int (indices) or str (names).
cat_features = [
    position
    for position, dtype in enumerate(X.dtypes)
    if dtype.name == "category"
]

In [467]:
# Final fit on the full training frame. No eval_set is supplied in this cell,
# so early stopping is not used and the number of boosting rounds is fixed.
fit_params = {
    "eval_metric": 'rmse',
    'verbose': -1,
    'categorical_feature': cat_features,
}
lbg_model = lgb.LGBMRegressor(random_state=random_state,
                              # 800 boosting rounds, stated once. LightGBM
                              # treats num_iterations as an alias of
                              # n_estimators and the alias takes precedence,
                              # so passing the tuned n_estimators next to
                              # num_iterations=800 only looked like it
                              # mattered.
                              n_estimators=800,
                              max_depth=int(best['max_depth']),
                              num_leaves=int(best['num_leaves']),
                              reg_alpha=best['reg_alpha'],
                              reg_lambda=best['reg_lambda'],
                              learning_rate=best['learning_rate'],
                              subsample=best['subsample'],
                              colsample_bytree=best['colsample_bytree'],
                              )
lbg_model.fit(X, y, **fit_params)

# In-sample score only: the model has seen every row it is evaluated on.
evaluate(lbg_model, X, y)

RMSLE: 0.055514938801319406


In [468]:
# Load test.csv and keep its Id column for the submission file built in a
# later cell.
validation = pd.read_csv("./test.csv")
val_ids = validation["Id"]
validation = validation.drop(columns=["Id"])

# Reuse the transformer fitted on the training data; it is not refitted here.
validation = transformer.prepare(validation)
validation = transformer.transform(validation, False)
to_categorical(validation)

# Predictions are on the log1p(SalePrice) scale, like the training target.
sub_predictions = lbg_model.predict(validation)
# Compared row by row with the reference prices in `cheat`.
# NOTE(review): assumes result-with-best.csv is in the same row order as
# test.csv - confirm.
print("RMSLE submission: " + str(rmsle(sub_predictions, np.log1p(cheat["SalePrice"]))))
#evaluate(lbg_model, validation,  np.log1p(cheat["SalePrice"]))

RMSLE submission: 0.12490467891858428


In [429]:
# Build the submission table: one row per test Id, with predictions mapped
# back from the log1p scale to prices via expm1.
d = dict(
    Id=val_ids.to_numpy(),
    SalePrice=np.expm1(sub_predictions),
)
df = pd.DataFrame(d)
df.to_csv('submission.csv', index=False)

df

Unnamed: 0,Id,SalePrice
0,1461,124840.857397
1,1462,165939.897319
2,1463,181642.320730
3,1464,191590.442564
4,1465,188917.423575
...,...,...
1454,2915,82359.158448
1455,2916,83041.141198
1456,2917,158942.735912
1457,2918,114650.118548
