## Importing libraries


In [17]:
import gc  # garbage collector
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import matplotlib as mpl
from sklearn.model_selection import train_test_split
import matplotlib.pylab as pylab

pd.set_option("display.max_columns", None)

import warnings

warnings.filterwarnings("ignore")

from helper import utility as ut
import importlib

importlib.reload(ut)

from catboost import CatBoostRegressor, Pool
import lightgbm as lgb

## Load Dataset - Common for both Models


In [18]:
prop_2016 = ut.load_properties_data("clean_data/prop_2016_clean.csv")
prop_2017 = ut.load_properties_data("clean_data/prop_2017_clean.csv")
train = ut.load_properties_data("clean_data/train_combined.csv")

train.dtypes

parcelid                       int64
logerror                     float64
airconditioningtypeid          int64
architecturalstyletypeid       int64
basementsqft                 float64
                              ...   
avg_area_per_room            float64
derived_avg_area_per_room    float64
year                           int64
month                          int64
quarter                        int64
Length: 71, dtype: object

## Catboost Model


In [19]:
# Dropping columns which do not perform well when we input to the catboost model
catboost_features = ut.drop_features(train)
print("Number of features for CatBoost: {}".format(len(catboost_features.columns)))
catboost_features.head(5)

# Prepare feature list for catboost model
categorical_features = [
    "airconditioningtypeid",
    "heatingorsystemtypeid",
    "propertylandusetypeid",
    "year",
    "month",
    "quarter",
    "buildingclasstypeid",
]
for col in catboost_features.columns:
    if col in categorical_features:
        catboost_features[col] = catboost_features[col].astype("str")

Number of features for CatBoost: 55


In [20]:
# Prepare training and cross-validation data
catboost_label = train.logerror.astype(np.float32)
print(catboost_label.head())

# Transform to Numpy matrices
catboost_X = catboost_features.values
catboost_y = catboost_label.values

# Perform shuffled train/test split
X_train, X_val, y_train, y_val = train_test_split(
    catboost_X, catboost_y, test_size=0.2, random_state=99
)
ut.remove_outliers(X_train, y_train)

0    0.0276
1   -0.1684
2   -0.0040
3    0.0218
4   -0.0050
Name: logerror, dtype: float32
new_X: (131462, 55)
new_y: (131462,)


(array([['-1', 2.5, 3.0, ..., '0', '5', '2'],
        ['0', 2.0, 4.0, ..., '1', '6', '2'],
        ['0', 3.0, 4.0, ..., '0', '8', '3'],
        ...,
        ['-1', 3.0, 4.0, ..., '1', '9', '3'],
        ['-1', 2.5, 3.0, ..., '0', '4', '2'],
        ['0', 2.0, 4.0, ..., '1', '8', '3']], dtype=object),
 array([-0.001     , -0.02973739,  0.0315    , ..., -0.00443901,
         0.007     ,  0.02685545], dtype=float32))

In [21]:
# Specify feature names and categorical features for CatBoost
categorical_indices = ut.get_categorical_indices(
    catboost_features, categorical_features
)

[0, 13, 21, 52, 53, 54]


In [22]:
ut.remove_outliers(catboost_X, catboost_y)

model = CatBoostRegressor(
    loss_function="MAE",
    eval_metric="MAE",
    nan_mode="Min",
    random_seed=99,
    iterations=1000,
    learning_rate=0.015,
    border_count=254,
    max_depth=6,
    random_strength=1,
    l2_leaf_reg=5,
    bagging_temperature=1,
    verbose=True,
)
model.fit(catboost_X, catboost_y, cat_features=categorical_indices, verbose=False)

# Sanity check: score on a small portion of the dataset
print("sanity check score: {}".format(abs(model.predict(X_val) - y_val).mean()))

new_X: (164299, 55)
new_y: (164299,)
sanity check score: 0.06857669115730533


In [23]:
file_name = "submission/final_catboost_single.csv"
submission, pred_2016, pred_2017 = ut.predict_and_generate_csv(
    [model], prop_2016, prop_2017, file_name
)

Start model 0 (2016)
Start model 0 (2017)
Length of submission DataFrame: 2985217
Submission header:
   ParcelId  201610  201611  201612  201710  201711  201712
0  10754147 -0.0130 -0.0130 -0.0130 -0.0159 -0.0159 -0.0159
1  10759547 -0.0121 -0.0121 -0.0121 -0.0121 -0.0121 -0.0121
2  10843547  0.0040  0.0040  0.0040  0.0075  0.0075  0.0075
3  10859147  0.0277  0.0277  0.0277  0.0293  0.0293  0.0293
4  10879947  0.0105  0.0105  0.0105  0.0100  0.0100  0.0100


### Catboost with 4x ensemble


In [24]:
# Train multiple models
rounds = 8
models = []
for i in range(rounds):
    print("Start training model {}".format(i))
    model = CatBoostRegressor(
        loss_function="MAE",
        eval_metric="MAE",
        nan_mode="Min",
        random_seed=99,
        iterations=1000,
        learning_rate=0.015,
        border_count=254,
        max_depth=6,
        random_strength=1,
        l2_leaf_reg=5,
        bagging_temperature=1,
        verbose=True,
    )
    model.fit(catboost_X, catboost_y, cat_features=categorical_indices, verbose=True)
    models.append(model)

Start training model 0
0:	learn: 0.0688501	total: 80.4ms	remaining: 1m 20s
1:	learn: 0.0688426	total: 169ms	remaining: 1m 24s
2:	learn: 0.0688354	total: 274ms	remaining: 1m 31s
3:	learn: 0.0688278	total: 351ms	remaining: 1m 27s
4:	learn: 0.0688196	total: 419ms	remaining: 1m 23s
5:	learn: 0.0688108	total: 498ms	remaining: 1m 22s
6:	learn: 0.0688048	total: 618ms	remaining: 1m 27s
7:	learn: 0.0687976	total: 702ms	remaining: 1m 27s
8:	learn: 0.0687910	total: 786ms	remaining: 1m 26s
9:	learn: 0.0687832	total: 876ms	remaining: 1m 26s
10:	learn: 0.0687755	total: 955ms	remaining: 1m 25s
11:	learn: 0.0687703	total: 1.03s	remaining: 1m 25s
12:	learn: 0.0687629	total: 1.12s	remaining: 1m 25s
13:	learn: 0.0687561	total: 1.2s	remaining: 1m 24s
14:	learn: 0.0687495	total: 1.29s	remaining: 1m 24s
15:	learn: 0.0687423	total: 1.36s	remaining: 1m 23s
16:	learn: 0.0687360	total: 1.44s	remaining: 1m 23s
17:	learn: 0.0687310	total: 1.52s	remaining: 1m 22s
18:	learn: 0.0687249	total: 1.6s	remaining: 1m 22s


In [25]:

# Make predictions and export results
file_name = 'submission/final_catboost_ensemble_x4.csv'
submission, pred_2016, pred_2017 = ut.predict_and_generate_csv(models, prop_2016, prop_2017, file_name)


Start model 0 (2016)
Start model 0 (2017)
Start model 1 (2016)
Start model 1 (2017)
Start model 2 (2016)
Start model 2 (2017)
Start model 3 (2016)
Start model 3 (2017)
Start model 4 (2016)
Start model 4 (2017)
Start model 5 (2016)
Start model 5 (2017)
Start model 6 (2016)
Start model 6 (2017)
Start model 7 (2016)
Start model 7 (2017)
Length of submission DataFrame: 2985217
Submission header:
   ParcelId  201610  201611  201612  201710  201711  201712
0  10754147 -0.0130 -0.0130 -0.0130 -0.0159 -0.0159 -0.0159
1  10759547 -0.0121 -0.0121 -0.0121 -0.0121 -0.0121 -0.0121
2  10843547  0.0040  0.0040  0.0040  0.0075  0.0075  0.0075
3  10859147  0.0277  0.0277  0.0277  0.0293  0.0293  0.0293
4  10879947  0.0105  0.0105  0.0105  0.0100  0.0100  0.0100


## Lightgbm


In [26]:
# Lightgbm parameters
params = {}

params["objective"] = "regression"
params["metric"] = "mae"
params["num_threads"] = 4  # set to number of real CPU cores for best performance

params["boosting_type"] = "gbdt"
params["num_boost_round"] = 1250
params["learning_rate"] = 0.003  # shrinkage_rate
params[
    "early_stopping_rounds"
] = 30  # Early stopping based on validation set performance

# Control tree growing
params["num_leaves"] = 127  # max number of leaves in one tree (default 31)
params["min_data"] = 150  # min_data_in_leaf
params["min_hessian"] = 0.001  # min_sum_hessian_in_leaf (default 1e-3)
params["max_depth"] = -1  # limit the max depth of tree model, defult -1 (no limit)
params[
    "max_bin"
] = 255  # max number of bins that feature values are bucketed in (small -> less overfitting, default 255)
params[
    "sub_feature"
] = 0.5  # feature_fraction (small values => use very different submodels)

# Row subsampling (speed up training and alleviate overfitting)
params["bagging_fraction"] = 0.7
params["bagging_freq"] = 50  # perform bagging at every k iteration

# Constraints on categorical features
params[
    "min_data_per_group"
] = 100  # minimal number of data per categorical group (default 100)
params[
    "cat_smooth"
] = 15.0  # reduce effect of noises in categorical features, especially for those with few data (default 10.0)

# Regularization (default 0.0)
params["lambda_l1"] = 0.0
params["lambda_l2"] = 0.0

# Random seeds (keep default values)
params["feature_fraction_seed"] = 2
params["bagging_seed"] = 3

### Train Single Model


In [27]:
# Train Lightgbm
lgb_train_set = lgb.Dataset(X_train, label=y_train, feature_name=training_cleaned)
lgb_valid_set = lgb.Dataset(X_val, label=y_val, feature_name=training_cleaned)

np.random.seed(42)
random.seed(36)
model = lgb.train(
    params,
    lgb_train_set,
    valid_sets=[lgb_train_set, lgb_valid_set],
    valid_names=["train", "val"],
    categorical_feature=categorical_indexes,
)

# Evaluate on train and validation sets
print("Train score: {}".format(abs(model.predict(X_train) - y_train).mean() * 100))
print("Val score: {}".format(abs(model.predict(X_val) - y_val).mean() * 100))

NameError: name 'training_cleaned' is not defined

In [None]:
# Plot LightGBM feature importance
lgb.plot_importance(model, height=0.8, figsize=(12.5, 12.5), ignore_zero=False)

In [None]:
# Train LightGBM on all given training data (preparing for submission)
del params["early_stopping_rounds"]

lgb_X, lgb_y = remove_outliers(X, y, training_cleaned)

lgb_train_set = lgb.Dataset(lgb_X, label=lgb_y, feature_name=training_cleaned)

np.random.seed(42)
random.seed(36)
model = lgb.train(params, lgb_train_set, categorical_feature=categorical_indexes)

# Sanity check: make sure the model score is reasonable on a small portion of the data
print("score: {}".format(abs(model.predict(X_val) - y_val).mean() * 100))

In [None]:
file_name = "submission/final_lgb_single.csv"
submission, pred_2016, pred_2017 = predict_and_export(
    [model], prop_2016, prop_2017, file_name
)

### Train ensemble model


In [None]:
# Remove outliers (if any) from training data
lgb_X, lgb_y = remove_outliers(X, y, training_cleaned)

lgb_train_set = lgb.Dataset(lgb_X, label=lgb_y, feature_name=training_cleaned)

# Train multiple models
bags = 5
models = []
for i in range(bags):
    print("Start training model {}".format(i))
    params["seed"] = i
    np.random.seed(42)
    random.seed(36)
    model = lgb.train(params, lgb_train_set, categorical_feature=categorical_indexes)
    models.append(model)

# Sanity check (make sure scores on a small portion of the dataset are reasonable)
for i, model in enumerate(models):
    print("model {}: {}".format(i, abs(model.predict(X_val) - y_val).mean() * 100))

# Save the trained models to disk
save_models(models)

models = load_lightgbm_models(
    ["checkpoints/lgb_" + str(i) for i in range(5)]
)  # load pretrained models

In [None]:
# Make predictions and export results
file_name = "submission/final_lgb_ensemble_x5.csv"
submission, pred_2016, pred_2017 = predict_and_export(
    models, prop_2016, prop_2017, file_name
)

## Stacking


In [None]:
lgb_single = pd.read_csv("submission/final_lgb_single.csv")
catboost_x8 = pd.read_csv("submission/final_catboost_ensemble_x8.csv")
print("Finished Loading the prediction results.")

weight = 0.7
stack = pd.DataFrame()
stack["ParcelId"] = lgb_single["ParcelId"]
for col in ["201610", "201611", "201612", "201710", "201711", "201712"]:
    stack[col] = weight * catboost_x8[col] + (1 - weight) * lgb_single[col]

print(stack.head())
stack.to_csv("submission/final_stack.csv", index=False)