## Importing libraries


In [None]:
import gc  # garbage collector
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import matplotlib as mpl
from sklearn.model_selection import train_test_split
import matplotlib.pylab as pylab

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

import warnings

# NOTE(review): this silences every warning for the rest of the session, so
# library deprecation notices will not be visible while the notebook runs.
warnings.filterwarnings("ignore")

from helper import utility as ut
import importlib

# Re-import the helper module so edits made to it are picked up without
# restarting the kernel.
importlib.reload(ut)

from catboost import CatBoostRegressor, Pool
import lightgbm as lgb

## Load Dataset - Common for all Models


In [None]:
# Read the two property tables and the combined training table through the
# project loader; the three frames are shared by every model section below.
prop_2016, prop_2017, train = [
    ut.load_properties_data(csv_path)
    for csv_path in (
        "clean_data/prop_2016_clean.csv",
        "clean_data/prop_2017_clean.csv",
        "clean_data/train_combined.csv",
    )
]

# Column dtypes of the training table (displayed as the cell output).
train.dtypes

## Catboost Model


In [None]:
# Dropping columns which do not perform well when we input to the catboost model
catboost_features = ut.drop_features(train)
print("Number of features for CatBoost: {}".format(len(catboost_features.columns)))

# Columns CatBoost should treat as categorical rather than numeric.
categorical_features = [
    "airconditioningtypeid",
    "heatingorsystemtypeid",
    "propertylandusetypeid",
    "year",
    "month",
    "quarter",
    "buildingclasstypeid",
]

# CatBoost accepts categorical values only as integers or strings, so every
# categorical column that survived drop_features is cast to str.
for col in categorical_features:
    if col in catboost_features.columns:
        catboost_features[col] = catboost_features[col].astype("str")

# Preview goes last: a bare expression in the middle of a cell is evaluated
# and thrown away, so the earlier placement never displayed anything.
catboost_features.head(5)

In [None]:
# Prepare training and cross-validation data
catboost_label = train.logerror.astype(np.float32)
print(catboost_label.head())

# Transform to Numpy matrices
catboost_X = catboost_features.values
catboost_y = catboost_label.values

# Perform shuffled train/test split
X_train, X_val, y_train, y_val = train_test_split(
    catboost_X, catboost_y, test_size=0.2, random_state=99
)
ut.remove_outliers(X_train, y_train)

In [None]:
# Specify feature names and categorical features for CatBoost
categorical_indices = ut.get_categorical_indices(
    catboost_features, categorical_features
)

In [None]:
# Fit the submission model on the full data set minus outliers.
# ut.remove_outliers returns the filtered (X, y) pair (the LightGBM cells
# unpack its result); the bare call that used to be here discarded it, so the
# model was silently trained on unfiltered data. The filtered arrays get their
# own names so that re-running this cell never filters the same data twice.
catboost_X_fit, catboost_y_fit = ut.remove_outliers(catboost_X, catboost_y)

model = CatBoostRegressor(
    loss_function="MAE",
    eval_metric="MAE",
    nan_mode="Min",
    random_seed=99,
    iterations=1000,
    learning_rate=0.015,
    border_count=254,
    max_depth=6,
    random_strength=1,
    l2_leaf_reg=5,
    bagging_temperature=1,
    verbose=True,
)
model.fit(
    catboost_X_fit, catboost_y_fit, cat_features=categorical_indices, verbose=False
)

# Sanity check: score on a small portion of the dataset.
# X_val was split off catboost_X, so most of these rows were also used for
# fitting; treat the number as a smoke test, not a generalisation estimate.
print("sanity check score: {}".format(abs(model.predict(X_val) - y_val).mean()))

In [None]:
# Score both property tables with the single CatBoost model and write the
# submission CSV through the project helper.
file_name = "submission/final_catboost_single.csv"
submission, pred_2016, pred_2017 = ut.predict_and_generate_csv([model], prop_2016, prop_2017, file_name)

### Catboost with 4x ensemble


In [None]:
# Train multiple models
# NOTE(review): the section heading and the output file name say "4x" while
# `rounds` is 8 -- confirm which one is intended.
rounds = 8
base_seed = 99

# Hyperparameters shared by every ensemble member; only the seed varies.
catboost_params = dict(
    loss_function="MAE",
    eval_metric="MAE",
    nan_mode="Min",
    iterations=1000,
    learning_rate=0.015,
    border_count=254,
    max_depth=6,
    random_strength=1,
    l2_leaf_reg=5,
    bagging_temperature=1,
    verbose=True,
)

# ut.remove_outliers returns the filtered (X, y) pair (see how the LightGBM
# cells unpack it). Filtering here, into cell-local names, keeps this cell
# correct on its own and safe to re-run.
ensemble_X, ensemble_y = ut.remove_outliers(catboost_X, catboost_y)

models = []
for i in range(rounds):
    print("Start training model {}".format(i))
    # Each member needs its own seed: with one shared seed and identical data
    # every round fits the same configuration, so averaging the members would
    # add nothing over a single model. The LightGBM ensemble varies its seed
    # per member in the same way.
    model = CatBoostRegressor(random_seed=base_seed + i, **catboost_params)
    model.fit(ensemble_X, ensemble_y, cat_features=categorical_indices, verbose=True)
    models.append(model)

In [None]:
# Score both property tables with every ensemble member and export the
# combined result via the project helper.
file_name = 'submission/final_catboost_ensemble_x4.csv'
submission, pred_2016, pred_2017 = ut.predict_and_generate_csv(
    models, prop_2016, prop_2017, file_name
)

## Lightgbm Model


In [None]:
# Build the LightGBM feature frame with the same column-dropping helper used
# for the other models.
lightgbm_features = ut.drop_features(train)
print("Number of features for Lightgbm: {}".format(lightgbm_features.shape[1]))

# Preview the first rows (cell output).
lightgbm_features.head(5)

In [None]:
# Target: logerror as float32.
lightgbm_label = train["logerror"].astype(np.float32)
print(lightgbm_label.head())

# Plain NumPy views of features and target.
lightgbm_X, lightgbm_y = lightgbm_features.values, lightgbm_label.values

# Fix the global RNGs, then hold out 20% of the rows for validation.
np.random.seed(42)
random.seed(10)
X_train, X_val, y_train, y_val = train_test_split(
    lightgbm_X,
    lightgbm_y,
    test_size=0.2,
    random_state=99,
)

# Outliers are removed from the training part only; the validation part is
# left as split.
X_train, y_train = ut.remove_outliers(X_train, y_train)

print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))
print("X_val shape: {}".format(X_val.shape))
print("y_val shape: {}".format(y_val.shape))

In [None]:
# Categorical columns for LightGBM and their indices in lightgbm_features.
# This rebinds the notebook-wide `categorical_features` / `categorical_indices`
# names used by the CatBoost section.
categorical_features = [
    'airconditioningtypeid',
    'heatingorsystemtypeid',
    'propertylandusetypeid',
    'year',
    'month',
    'quarter',
    'buildingclasstypeid',
]
categorical_indices = ut.get_categorical_indices(
    lightgbm_features, categorical_features
)

In [None]:
# LightGBM training configuration, reused by the single model and by the
# seed-varied ensemble further down. No early-stopping entry is set.
params = {
    # --- task ---
    "objective": "regression",
    "metric": "mae",
    "num_threads": 4,  # set to number of real CPU cores for best performance
    # --- boosting schedule ---
    "boosting_type": "gbdt",
    "num_boost_round": 1250,
    "learning_rate": 0.003,  # shrinkage rate
    # --- tree growth ---
    "num_leaves": 127,  # max number of leaves in one tree (default 31)
    "min_data": 150,  # min_data_in_leaf
    "min_hessian": 0.001,  # min_sum_hessian_in_leaf (default 1e-3)
    "max_depth": -1,  # -1 means no depth limit (the default)
    "max_bin": 255,  # bins per feature; smaller -> less overfitting (default 255)
    "sub_feature": 0.5,  # feature_fraction; small values give very different submodels
    # --- row subsampling (faster training, less overfitting) ---
    "bagging_fraction": 0.7,
    "bagging_freq": 50,  # re-bag every k iterations
    # --- categorical handling ---
    "min_data_per_group": 100,  # minimal rows per categorical group (default 100)
    "cat_smooth": 15.0,  # damp noise in sparse categories (default 10.0)
    # --- regularization (default 0.0) ---
    "lambda_l1": 0.0,
    "lambda_l2": 0.0,
    # --- random seeds (noted by the author as the library defaults) ---
    "feature_fraction_seed": 2,
    "bagging_seed": 3,
}

### Lightgbm Single Model


In [None]:
# Wrap the train/validation splits as LightGBM datasets with named columns.
feature_names = list(lightgbm_features.columns)
lgb_train_set = lgb.Dataset(X_train, label=y_train, feature_name=feature_names)
lgb_valid_set = lgb.Dataset(X_val, label=y_val, feature_name=feature_names)

# Re-seed the global RNGs right before training.
np.random.seed(42)
random.seed(36)
model = lgb.train(
    params,
    lgb_train_set,
    valid_sets=[lgb_train_set, lgb_valid_set],
    valid_names=['train', 'val'],
    categorical_feature=categorical_indices,
)

# Mean absolute error (scaled by 100) on the train and validation splits.
train_mae = np.abs(model.predict(X_train) - y_train).mean()
val_mae = np.abs(model.predict(X_val) - y_val).mean()
print("Train score: {}".format(train_mae * 100))
print("Val score: {}".format(val_mae * 100))

In [None]:
# Feature-importance chart for the model trained above; features with zero
# importance are kept in the plot.
lgb.plot_importance(
    model,
    height=0.8,
    figsize=(12.5, 12.5),
    ignore_zero=False,
)

In [None]:
# Refit on all available training rows (after outlier removal) for submission.
# NOTE(review): lightgbm_X / lightgbm_y are rebound to the filtered arrays, so
# re-running this cell filters already-filtered data.
lightgbm_X, lightgbm_y = ut.remove_outliers(lightgbm_X, lightgbm_y)

lgb_train_set = lgb.Dataset(lightgbm_X, label=lightgbm_y, feature_name=feature_names)
print("lightgbm_X: {}".format(lightgbm_X.shape))
print("lightgbm_y: {}".format(lightgbm_y.shape))

np.random.seed(42)
random.seed(36)
model = lgb.train(
    params,
    lgb_train_set,
    categorical_feature=categorical_indices,
)

# Sanity check only: X_val comes from the same data the model was just fit on.
full_fit_mae = np.abs(model.predict(X_val) - y_val).mean()
print("score: {}".format(full_fit_mae * 100))

In [None]:
# Export predictions of the single LightGBM model for both property tables.
file_name = 'submission/final_lgb_single.csv'
submission, pred_2016, pred_2017 = ut.predict_and_generate_csv(
    [model], prop_2016, prop_2017, file_name
)

### Lightgbm Ensemble 5x Model


In [None]:
# NOTE(review): when the full-fit cell above has already run, this is a second
# outlier pass over the same arrays -- confirm ut.remove_outliers is
# idempotent, otherwise the ensemble trains on less data than the single model.
lightgbm_X, lightgbm_y = ut.remove_outliers(lightgbm_X, lightgbm_y)

lgb_train_set = lgb.Dataset(lightgbm_X, label=lightgbm_y, feature_name=feature_names)

# Fit `bags` models that differ only in the LightGBM seed.
bags = 5
models = []
for i in range(bags):
    print("Start training model {}".format(i))
    params["seed"] = i
    np.random.seed(42)
    random.seed(36)
    model = lgb.train(
        params,
        lgb_train_set,
        categorical_feature=categorical_indices,
    )
    models.append(model)

# Sanity check: per-model MAE (x100) on the held-out split from earlier.
for i, model in enumerate(models):
    member_mae = np.abs(model.predict(X_val) - y_val).mean()
    print("model {}: {}".format(i, member_mae * 100))

# Persist the members, then read them back from the checkpoint files so the
# prediction cell works from the saved artefacts.
ut.save_models(models, 'lightgbm')

checkpoint_paths = ['checkpoints/lightgbm_' + str(i) for i in range(bags)]
models = ut.load_lightgbm_models(checkpoint_paths)

In [None]:
# Score both property tables with the reloaded ensemble and write the CSV.
file_name = 'submission/final_lgb_ensemble_x5.csv'
submission, pred_2016, pred_2017 = ut.predict_and_generate_csv(
    models, prop_2016, prop_2017, file_name
)

## Stacking


In [None]:
# Read back the two single-model submissions that will be blended.
lgb_single, catboost_single = [
    pd.read_csv(csv_path)
    for csv_path in (
        'submission/final_lgb_single.csv',
        'submission/final_catboost_single.csv',
    )
]
print("Finished Loading the prediction results.")

def decimal_range(start, stop, increment):
    """Yield ``start, start + increment, ...`` up to and including ``stop``.

    Each value is computed as ``start + k * increment`` instead of by
    repeated addition, so binary floating-point error does not accumulate
    and the inclusive end point is not lost (repeated ``+= 0.1`` from 0.1
    reaches 0.30000000000000004, which fails ``<= 0.3``).

    Args:
        start: First value produced.
        stop: Inclusive upper bound.
        increment: Positive step between consecutive values.

    Yields:
        The successive values; nothing when ``start > stop``.

    Raises:
        ValueError: If ``increment`` is not positive while ``start <= stop``
            (the sequence would never terminate).
    """
    if start > stop:
        return
    if increment <= 0:
        raise ValueError("increment must be positive")

    # The small tolerance absorbs representation error in the quotient, e.g.
    # (0.3 - 0.1) / 0.1 evaluates to 1.9999999999999998 rather than 2.
    step_count = int((stop - start) / increment + 1e-9)
    for step in range(step_count + 1):
        yield start + step * increment

# Write one blended submission per CatBoost weight in 0.1 .. 0.9; LightGBM
# receives the complementary weight. Rows are combined by position/index, so
# both input files are expected to list parcels in the same order.
month_columns = ["201610", "201611", "201612", "201710", "201711", "201712"]

for weight in decimal_range(0.1, 0.9, 0.1):
    stack = pd.DataFrame({"ParcelId": lgb_single["ParcelId"]})
    for col in month_columns:
        catboost_part = weight * catboost_single[col]
        lgb_part = (1 - weight) * lgb_single[col]
        stack[col] = catboost_part + lgb_part

    print(stack.head())
    weight_tag = "{:.1f}".format(weight)
    stack.to_csv("submission/final_stack_catboostweight_" + weight_tag + ".csv", index=False)

## XGBoost Model


In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc


In [None]:
# Feature frame for XGBoost, built with the same column-dropping helper as
# the other models.
xgboost_features = ut.drop_features(train)
print("Number of features for Xgboost: {}".format(len(xgboost_features.columns)))

# Imputation: +/-inf becomes 0 and NaN becomes the column mean.
# The mean is taken over finite values only. Computing it while +/-inf is
# still present gives an infinite (or NaN) mean, so every NaN in such a column
# used to be filled with inf and then zeroed by the replace step.
finite_column_means = xgboost_features.replace([np.inf, -np.inf], np.nan).mean()
xgboost_features = (
    xgboost_features
    .replace([np.inf, -np.inf], 0)
    .fillna(finite_column_means)
)

# Preview only the first rows instead of dumping the whole frame.
xgboost_features.head(5)

In [None]:
# Prepare training and cross-validation data
xgboost_label = train.logerror.astype(np.float32)
print(xgboost_label.head())

# Transform to Numpy matrices
xgboost_X = xgboost_features.values
xgboost_y = xgboost_label.values

# NOTE: `train` is deliberately left untouched here. Casting its object
# columns to booleans at this point cannot influence xgboost_X / xgboost_y
# (both are already materialised above) and would overwrite the shared frame,
# breaking any re-run of the earlier model sections.

# Perform shuffled train/test split
X_train, X_val, y_train, y_val = train_test_split(
    xgboost_X, xgboost_y, test_size=0.2, random_state=99
)

# Outliers are removed from the training part only.
X_train, y_train = ut.remove_outliers(X_train, y_train)

print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))
print("X_val shape: {}".format(X_val.shape))
print("y_val shape: {}".format(y_val.shape))

In [None]:
# Indices of the categorical columns within xgboost_features. This reuses the
# `categorical_features` list defined in the LightGBM section.
# NOTE(review): none of the XGBoost cells below pass `categorical_indices` on.
categorical_indices = ut.get_categorical_indices(xgboost_features, categorical_features)

In [None]:
# Wrap the train / validation splits in XGBoost's DMatrix container.
d_train = xgb.DMatrix(data=X_train, label=y_train)
d_valid = xgb.DMatrix(data=X_val, label=y_val)


In [None]:
# XGBoost booster configuration.
# This rebinds the notebook-wide `params` name that held the LightGBM
# settings, so the LightGBM params cell must be re-run before retraining
# LightGBM.
# NOTE(review): 'reg:linear' and 'silent' look like legacy spellings
# ('reg:squarederror' / 'verbosity' in newer releases) -- confirm against the
# installed XGBoost version.
params = {
    'eta': 0.02,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'max_depth': 4,
    'silent': 1,
}

In [None]:
# DMatrix over the complete (imputed) training feature frame, without labels.
d_test = xgb.DMatrix(data=xgboost_features)

In [None]:
# Train with early stopping: both splits are evaluated every round (logged
# every 10 rounds), with a patience of 100 rounds and at most 5000 rounds.
watchlist = [
    (d_train, 'train'),
    (d_valid, 'val'),
]
clf = xgb.train(
    params,
    d_train,
    num_boost_round=5000,
    evals=watchlist,
    early_stopping_rounds=100,
    verbose_eval=10,
)



In [None]:
# In-sample predictions over the full training feature matrix.
# NOTE(review): `result` is not read by any later cell shown here.
result = clf.predict(data=d_test)


In [None]:
def _prepare_xgb_frame(properties):
    """Return the XGBoost-ready feature frame for one property table.

    Columns are reduced with ``ut.drop_features``; +/-inf becomes 0 and NaN
    becomes the column mean. The mean is computed over finite values only:
    with +/-inf still present it would itself be infinite (or NaN), so NaNs
    would be filled with inf and then zeroed instead of mean-imputed.

    Args:
        properties: Property DataFrame as loaded by the notebook.

    Returns:
        A new DataFrame with the imputed feature columns.
    """
    features = ut.drop_features(properties)
    finite_means = features.replace([np.inf, -np.inf], np.nan).mean()
    return features.replace([np.inf, -np.inf], 0).fillna(finite_means)


# Same preparation for both years; each table is imputed with its own means.
xgb16 = _prepare_xgb_frame(prop_2016)
xgb16DM = xgb.DMatrix(xgb16)

xgb17 = _prepare_xgb_frame(prop_2017)
xgb17DM = xgb.DMatrix(xgb17)

In [None]:
# Predict logerror for every property in the 2016 and 2017 tables.
result_2016, result_2017 = [
    clf.predict(year_matrix) for year_matrix in (xgb16DM, xgb17DM)
]

In [None]:
# Display the 2016 prediction array as the cell output (quick visual check).
result_2016

In [None]:
# Inspect the sample submission: for every column except ParcelId, print the
# type of the column label and then the column itself.
sub = pd.read_csv('data/sample_submission.csv')
prediction_columns = [name for name in sub.columns if name != 'ParcelId']
for c in prediction_columns:
    print(type(c))
    print(sub[c])

In [None]:
# Fill the sample-submission template with the XGBoost predictions: the three
# 2016 months get result_2016, the three 2017 months get result_2017.
# NOTE(review): values are assigned by position, which presumes prop_2016 /
# prop_2017 list parcels in the same order as the sample submission -- confirm.
file_name = 'submission/final_xgboost_single.csv'
sub = pd.read_csv('data/sample_submission.csv')

predictions_by_month = {
    '201610': result_2016,
    '201611': result_2016,
    '201612': result_2016,
    '201710': result_2017,
    '201711': result_2017,
    '201712': result_2017,
}
for c in sub.columns[sub.columns != 'ParcelId']:
    if c in predictions_by_month:
        sub[c] = predictions_by_month[c]

print('Writing csv ...')
# Four-decimal float formatting (credit: @inversion).
sub.to_csv(file_name, index=False, float_format='%.4f')