In [372]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from xgboost import XGBRegressor

from data_processing import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from data_transformar import *
import lightgbm as lgb
from sklearn.decomposition import PCA
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor

# Silence every warning for the rest of the session. Note that this also hides
# deprecation and parameter warnings raised by the modelling libraries.
warnings.simplefilter("ignore")
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Load the autoreload extension and use mode 2, so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Use seaborn's "darkgrid" style for all subsequent plots.
sns.set(style="darkgrid")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [373]:
from sklearn.metrics import mean_squared_error

# Reference "SalePrice" values: the cells below compare log1p of this column
# with model predictions made on the frame loaded from test.csv.
# NOTE(review): the file name suggests these may come from an earlier best
# submission rather than true labels — confirm before trusting the scores.
answers = pd.read_csv("./result-with-best.csv")


def rmsle(y, y_pred):
    """Return the square root of the mean squared error of ``y`` vs ``y_pred``.

    No logarithm is applied here. The notebook passes values that are already
    log1p-transformed, which is what makes the result a root mean squared
    *log* error.
    """
    squared_error = mean_squared_error(y, y_pred)
    return np.sqrt(squared_error)


def answers_score(model):
    """Print the score of ``model`` on the notebook-level ``validation`` frame.

    Predictions are compared with ``log1p`` of the ``SalePrice`` column of the
    notebook-level ``answers`` frame.
    """
    predicted = model.predict(validation)
    reference = np.log1p(answers["SalePrice"])
    print("RMSLE sub: " + str(rmsle(predicted, reference)))


def evaluate(model, X, y):
    """Print the RMSLE between ``model``'s predictions for ``X`` and ``y``."""
    score = rmsle(model.predict(X), y)
    print("RMSLE: " + str(score))


def to_categorical(X):
    """Cast every object- or category-typed column of ``X`` to ``category``.

    The frame is modified in place and nothing is returned. Columns of any
    other dtype are left untouched.
    """
    for name in X.columns:
        dtype = X[name].dtype
        if not (dtype == 'object' or dtype.name == 'category'):
            continue
        X[name] = X[name].astype('category')

In [374]:
# Read the training set and drop its "Id" column in one step.
data = pd.read_csv("./train.csv").drop(columns=["Id"])
# Outlier removal is currently switched off:
#data = remove_outliers(data)

In [375]:
# Features: every column except the target.
X = data.drop(columns=["SalePrice"])
# Target: SalePrice in log1p space, matching how the scores are computed.
y = np.log1p(data["SalePrice"])

In [376]:
# Build three views of the training features with DataTransformer, which is
# not defined in this notebook.
# NOTE(review): judging by the result names, the second positional flag
# presumably toggles categorical encoding and the third toggles scaling —
# confirm against DataTransformer.fit_transform.
transformer = DataTransformer()
X_encoded = transformer.fit_transform(X, True)
X_cat = transformer.fit_transform(X, False)
X_scaled = transformer.fit_transform(X, True, True)

# Cast object columns of the non-encoded view to pandas "category" (in place)
# and remember their names for the LightGBM fit parameters.
to_categorical(X_cat)
cat_features = list(X_cat.select_dtypes(include=['category']).columns)

# Load the test set; keep the ids aside and drop them from the features.
validation = pd.read_csv("./test.csv")
val_ids = validation["Id"]
validation = validation.drop(columns=["Id"])

# NOTE(review): fit_transform is called again on the test frame, so the
# transformer is re-fitted on test data instead of reusing what was fitted on
# the training frame. A later cell calls fit() on train and transform() on
# test; consider the same pattern here — confirm DataTransformer's behaviour.
val_encoded = transformer.fit_transform(validation, True)
val_cat = transformer.fit_transform(validation, False)
val_scaled = transformer.fit_transform(validation, True, True)

# Same in-place category cast for the test view used by LightGBM / CatBoost.
to_categorical(val_cat)

In [233]:
# XGBoost hyper-parameters; the same dict is reused by the stacking cell.
xgb_params = {
    'lambda': 11.946656615633028,
    'learning_rate': 0.002119415669803155,
    'max_depth': 863,
    'n_estimators': 30000,
    'subsample': 0.1382402507540342,
}

# fit() hands back the fitted estimator, so construction and training are
# chained; the score is reported against the log1p reference prices.
xgb_model = XGBRegressor(**xgb_params, seed=0).fit(X_encoded, y)
evaluate(xgb_model, val_encoded, np.log1p(answers['SalePrice']))

RMSLE: 0.12300759445353592


In [383]:
# Second data-preparation variant: fit the transformer once on the training
# features, then apply it to both train and test.
# This cell reassigns data, y, X, transformer, cat_features, validation and
# val_ids. It does not touch X_encoded / X_cat / X_scaled or the val_* views
# created by the earlier preparation cell.
data = pd.read_csv("./train.csv")
data = data.drop(columns=["Id"])

#data = remove_outliers(data, True)

# Target in log1p space; features are everything except the target.
y = np.log1p(data["SalePrice"])
X = data.drop(columns=["SalePrice"])

# NOTE(review): DataTransformer is not defined in this notebook; presumably
# the constructor argument is the scaler it applies — confirm.
transformer = DataTransformer(StandardScaler())
X = transformer.prepare(X)

# Fit on the prepared training features only.
transformer.fit(X)

X = transformer.transform(X, False)

# In-place cast of object columns to pandas "category".
to_categorical(X)

cat_features = list(X.select_dtypes(include=['category']).columns)

# Load the test set; keep the ids aside and drop them from the features.
validation = pd.read_csv("./test.csv")
val_ids = validation["Id"]
validation = validation.drop(columns=["Id"])

# Apply the already-fitted transformer to the test frame (no re-fit here).
validation = transformer.prepare(validation)
validation = transformer.transform(validation, False)
to_categorical(validation)

In [414]:
# LightGBM hyper-parameters.
# Exactly one boosting-round setting is given. `n_estimators` and
# `num_iterations` are aliases of the same LightGBM parameter, so supplying
# both with different values leaves the effective round count to the library's
# alias resolution (and the resulting warning is hidden by the notebook's
# global warning filter). 2500 is the value that used to be passed explicitly
# to the constructor.
# NOTE(review): LightGBM only applies `subsample` when bagging is enabled via
# `subsample_freq` / `bagging_freq`; none is set here — confirm this is
# intended.
lgbm_params = {
    'colsample_bytree': 0.22187412027782807,
    'learning_rate': 0.013507845012253343,
    'max_depth': 25,
    'n_estimators': 2500,
    'num_leaves': 8,
    'reg_alpha': 0.4389524156775603,
    'reg_lambda': 0.6,
    'subsample': 0.05747795513890018,
}
# Keyword arguments forwarded to fit().
# `cat_features` is assigned in both data-preparation cells; the value used
# here is whichever of them ran last.
# NOTE(review): newer LightGBM releases no longer accept `verbose` in fit() —
# confirm against the installed version.
fit_params = {
    "eval_metric": 'rmse',
    'verbose': -1,
    'categorical_feature': cat_features
}

lgbm_model = lgb.LGBMRegressor(**lgbm_params)
lgbm_model.fit(X_cat, y, **fit_params)
# First line: fit on the training data itself; second line: score against the
# log1p reference prices for the test frame.
evaluate(lgbm_model, X_cat, y)
evaluate(lgbm_model, val_cat, np.log1p(answers['SalePrice']))

RMSLE: 0.06402969992112549
RMSLE: 0.12239091907121474


In [406]:
# Positional indices of the category-typed columns of X_cat, handed to
# CatBoost's Pool so those columns are treated as categorical.
categorical_features = np.where((X_cat.dtypes == "category").values)[0]

train_pool = Pool(data=X_cat, label=y, cat_features=categorical_features)

# Train quietly: no per-iteration log and no training plot.
cat_model = CatBoostRegressor(iterations=350)
cat_model.fit(train_pool, verbose=0, plot=False)

# Score on the training data, then against the log1p reference prices.
evaluate(cat_model, X_cat, y)
evaluate(cat_model, val_cat, np.log1p(answers['SalePrice']))

RMSLE: 0.07011095557812468
RMSLE: 0.12634768116178863


In [410]:
from sklearn.ensemble import StackingRegressor

# Stack three XGBoost base learners that are all built from the same
# xgb_params dict; only their labels differ.
stack_gen = StackingRegressor(
    estimators=tuple(
        (label, XGBRegressor(**xgb_params))
        for label in ('xgb1', 'xgb2', 'xgb3')
    )
)
stack_gen.fit(X_encoded, y)
evaluate(stack_gen, val_encoded, np.log1p(answers['SalePrice']))

RMSLE: 0.12320233371375372


In [404]:
def blend(models, X):
    """Average the predictions of several fitted models.

    Parameters
    ----------
    models : sequence
        Fitted estimators exposing a ``predict`` method.
    X : sequence
        One feature set per model; ``X[i]`` is passed to ``models[i]``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the individual prediction arrays.

    Raises
    ------
    ValueError
        If ``models`` is empty, or if ``models`` and ``X`` differ in length.
    """
    if len(models) == 0:
        # An empty blend would otherwise yield NaN from a 0 / 0 division.
        raise ValueError("blend() needs at least one model")
    if len(models) != len(X):
        # Without this check, surplus feature sets were silently ignored and
        # missing ones surfaced as a bare IndexError.
        raise ValueError(
            "blend() needs one feature set per model: got %d models and %d feature sets"
            % (len(models), len(X))
        )

    predictions = [model.predict(features) for model, features in zip(models, X)]
    return np.mean(predictions, axis=0)

In [423]:
# Equal-weight blend of the XGBoost and LightGBM predictions; each model gets
# the test view it was trained for.
blended_res = blend(
    models=[xgb_model, lgbm_model],
    X=[val_encoded, val_cat],
)
print("RMSLE: " + str(rmsle(blended_res, np.log1p(answers['SalePrice']))))

RMSLE: 0.11862587399633419


In [416]:
# Hand the blended predictions to save_res, which is not defined in this
# notebook (presumably provided by one of the star-imported local modules).
# NOTE(review): this cell's execution count (416) is lower than that of the
# blend cell above (423), so the saved output may predate the last blend —
# re-run after blending.
save_res(blended_res)