## Importing libraries


In [27]:
import gc  # garbage collector
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import matplotlib as mpl
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pylab as pylab
import optuna

pd.set_option("display.max_columns", None)

import warnings

warnings.filterwarnings("ignore")

from helper import utility as ut
import importlib

importlib.reload(ut)

from catboost import CatBoostRegressor, Pool
import lightgbm as lgb

## Load Dataset - Common for both Models


In [16]:
prop_2016 = ut.load_properties_data("clean_data/prop_2016_clean.csv")
prop_2017 = ut.load_properties_data("clean_data/prop_2017_clean.csv")
train = ut.load_properties_data("clean_data/train_combined.csv")

train.dtypes

parcelid                       int64
logerror                     float64
airconditioningtypeid          int64
architecturalstyletypeid       int64
basementsqft                 float64
                              ...   
avg_area_per_room            float64
derived_avg_area_per_room    float64
year                           int64
month                          int64
quarter                        int64
Length: 71, dtype: object

## Catboost Model


In [17]:
# Dropping columns which do not perform well when we input to the catboost model
catboost_features = ut.drop_features(train)
print("Number of features for CatBoost: {}".format(len(catboost_features.columns)))
catboost_features.head(5)

# Prepare feature list for catboost model
categorical_features = [
    "airconditioningtypeid",
    "heatingorsystemtypeid",
    "propertylandusetypeid",
    "year",
    "month",
    "quarter",
    "buildingclasstypeid",
]
for col in catboost_features.columns:
    if col in categorical_features:
        catboost_features[col] = catboost_features[col].astype("str")

Number of features for CatBoost: 55


In [18]:
# Prepare training and cross-validation data
catboost_label = train.logerror.astype(np.float32)
print(catboost_label.head())

# Transform to Numpy matrices
catboost_X = catboost_features.values
catboost_y = catboost_label.values

# Perform shuffled train/test split
X_train, X_val, y_train, y_val = train_test_split(
    catboost_X, catboost_y, test_size=0.2, random_state=99
)
ut.remove_outliers(X_train, y_train)

0    0.0276
1   -0.1684
2   -0.0040
3    0.0218
4   -0.0050
Name: logerror, dtype: float32
new_X: (131462, 55)
new_y: (131462,)


(array([['-1', 2.5, 3.0, ..., '0', '5', '2'],
        ['0', 2.0, 4.0, ..., '1', '6', '2'],
        ['0', 3.0, 4.0, ..., '0', '8', '3'],
        ...,
        ['-1', 3.0, 4.0, ..., '1', '9', '3'],
        ['-1', 2.5, 3.0, ..., '0', '4', '2'],
        ['0', 2.0, 4.0, ..., '1', '8', '3']], dtype=object),
 array([-0.001     , -0.02973739,  0.0315    , ..., -0.00443901,
         0.007     ,  0.02685545], dtype=float32))

In [19]:
# Specify feature names and categorical features for CatBoost
categorical_indices = ut.get_categorical_indices(
    catboost_features, categorical_features
)

[0, 13, 21, 52, 53, 54]


In [20]:
n = [i * 0.005 for i in range(1,int(1/0.005) + 1)]

# Printing the generated list
print(len(n))

200


In [21]:
ut.remove_outliers(catboost_X, catboost_y)


model = CatBoostRegressor(
        loss_function="MAE",
        eval_metric="MAE",
        nan_mode="Min",
        random_seed=99,
        iterations=1000,
        learning_rate=0.015,
        border_count=254,
        max_depth=6,
        random_strength=1,
        l2_leaf_reg=5,
        bagging_temperature=1,
        verbose=True,
    )
model.fit(catboost_X, catboost_y, cat_features=categorical_indices, verbose=False)

# Sanity check: score on a small portion of the dataset
print("sanity check score: {}".format(abs(model.predict(X_val) - y_val).mean()))


new_X: (164299, 55)
new_y: (164299,)
sanity check score: 0.06857669115730533


In [22]:
file_name = "submission/final_catboost_single.csv"
submission, pred_2016, pred_2017 = ut.predict_and_generate_csv(
    [model], prop_2016, prop_2017, file_name
)

Start model 0 (2016)
Start model 0 (2017)
Length of submission DataFrame: 2985217
Submission header:
   ParcelId  201610  201611  201612  201710  201711  201712
0  10754147 -0.0130 -0.0130 -0.0130 -0.0159 -0.0159 -0.0159
1  10759547 -0.0121 -0.0121 -0.0121 -0.0121 -0.0121 -0.0121
2  10843547  0.0040  0.0040  0.0040  0.0075  0.0075  0.0075
3  10859147  0.0277  0.0277  0.0277  0.0293  0.0293  0.0293
4  10879947  0.0105  0.0105  0.0105  0.0100  0.0100  0.0100


### Optuna for CatBoost Regressor

In [28]:
def objective(trial):
    # Define hyperparameters to be tuned
    params = {
        'iterations': trial.suggest_int('iterations', 10, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'depth': trial.suggest_int('depth', 4, 10),
    }

    model = CatBoostRegressor(**params, random_state=99, verbose=0)


    model.fit(X_train, y_train)

    predictions = model.predict(X_val)

    mse = mean_squared_error(y_val, predictions)

    print(f"Iteration {objective.iteration}: MSE = {mse}")
    objective.iteration += 1

    return mse


objective.iteration = 1
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

best_params = study.best_params
print("Best Hyperparameters:")
print(best_params)


[I 2023-11-17 15:50:33,703] A new study created in memory with name: no-name-d9aeb5b6-c00d-4d02-8bce-97c7f782c6fb
[I 2023-11-17 15:50:37,116] Trial 0 finished with value: 0.028460523013205168 and parameters: {'iterations': 222, 'learning_rate': 0.013833811467916934, 'depth': 9, 'l2_leaf_reg': 23.933481820332084, 'border_count': 34}. Best is trial 0 with value: 0.028460523013205168.


Iteration 1: MSE = 0.028460523013205168


[I 2023-11-17 15:50:38,617] Trial 1 finished with value: 0.028703450130051607 and parameters: {'iterations': 56, 'learning_rate': 0.018497443241885625, 'depth': 5, 'l2_leaf_reg': 60.21798415887893, 'border_count': 184}. Best is trial 0 with value: 0.028460523013205168.


Iteration 2: MSE = 0.028703450130051607


[I 2023-11-17 15:50:42,839] Trial 2 finished with value: 0.02810377943391018 and parameters: {'iterations': 193, 'learning_rate': 0.1989292856510661, 'depth': 9, 'l2_leaf_reg': 3.973009129260253, 'border_count': 174}. Best is trial 2 with value: 0.02810377943391018.


Iteration 3: MSE = 0.02810377943391018


[I 2023-11-17 15:51:14,983] Trial 3 finished with value: 0.028035724909303308 and parameters: {'iterations': 637, 'learning_rate': 0.1378795807413843, 'depth': 10, 'l2_leaf_reg': 66.30510178697193, 'border_count': 215}. Best is trial 3 with value: 0.028035724909303308.


Iteration 4: MSE = 0.028035724909303308


[I 2023-11-17 15:51:24,557] Trial 4 finished with value: 0.028598232366243304 and parameters: {'iterations': 260, 'learning_rate': 0.16254092269459602, 'depth': 9, 'l2_leaf_reg': 0.3843070737407729, 'border_count': 199}. Best is trial 3 with value: 0.028035724909303308.


Iteration 5: MSE = 0.028598232366243304


[I 2023-11-17 15:52:03,084] Trial 5 finished with value: 0.028098251860723428 and parameters: {'iterations': 812, 'learning_rate': 0.1631927256436783, 'depth': 8, 'l2_leaf_reg': 87.95385730490297, 'border_count': 47}. Best is trial 3 with value: 0.028035724909303308.


Iteration 6: MSE = 0.028098251860723428


[I 2023-11-17 15:52:39,805] Trial 6 finished with value: 0.028024589995816935 and parameters: {'iterations': 941, 'learning_rate': 0.11746060397489776, 'depth': 6, 'l2_leaf_reg': 91.00769890378733, 'border_count': 180}. Best is trial 6 with value: 0.028024589995816935.


Iteration 7: MSE = 0.028024589995816935


[I 2023-11-17 15:53:08,600] Trial 7 finished with value: 0.028245610323083103 and parameters: {'iterations': 404, 'learning_rate': 0.04780748689666327, 'depth': 8, 'l2_leaf_reg': 73.16120850393627, 'border_count': 175}. Best is trial 6 with value: 0.028024589995816935.


Iteration 8: MSE = 0.028245610323083103


[I 2023-11-17 15:53:25,679] Trial 8 finished with value: 0.028390843669591784 and parameters: {'iterations': 633, 'learning_rate': 0.019980544195989304, 'depth': 6, 'l2_leaf_reg': 86.48817469091452, 'border_count': 69}. Best is trial 6 with value: 0.028024589995816935.


Iteration 9: MSE = 0.028390843669591784


[I 2023-11-17 15:53:48,226] Trial 9 finished with value: 0.028028426647290043 and parameters: {'iterations': 215, 'learning_rate': 0.14635755927730512, 'depth': 10, 'l2_leaf_reg': 23.19419378235284, 'border_count': 131}. Best is trial 6 with value: 0.028024589995816935.


Iteration 10: MSE = 0.028028426647290043


[I 2023-11-17 15:54:10,956] Trial 10 finished with value: 0.028171461494876333 and parameters: {'iterations': 969, 'learning_rate': 0.08377055716765007, 'depth': 5, 'l2_leaf_reg': 99.53521058229754, 'border_count': 247}. Best is trial 6 with value: 0.028024589995816935.


Iteration 11: MSE = 0.028171461494876333


[I 2023-11-17 15:54:25,737] Trial 11 finished with value: 0.0280864780538058 and parameters: {'iterations': 421, 'learning_rate': 0.10780839901595604, 'depth': 7, 'l2_leaf_reg': 41.022188435035524, 'border_count': 113}. Best is trial 6 with value: 0.028024589995816935.


Iteration 12: MSE = 0.0280864780538058


[I 2023-11-17 15:54:44,270] Trial 12 finished with value: 0.028100373052852057 and parameters: {'iterations': 928, 'learning_rate': 0.11250535703146926, 'depth': 4, 'l2_leaf_reg': 44.15396958072259, 'border_count': 134}. Best is trial 6 with value: 0.028024589995816935.


Iteration 13: MSE = 0.028100373052852057


[I 2023-11-17 15:55:04,461] Trial 13 finished with value: 0.02806874625955379 and parameters: {'iterations': 633, 'learning_rate': 0.07588808358509005, 'depth': 7, 'l2_leaf_reg': 28.113008903184905, 'border_count': 102}. Best is trial 6 with value: 0.028024589995816935.


Iteration 14: MSE = 0.02806874625955379


[I 2023-11-17 15:55:09,032] Trial 14 finished with value: 0.028659620892544122 and parameters: {'iterations': 14, 'learning_rate': 0.13145471432088796, 'depth': 6, 'l2_leaf_reg': 54.710837605088095, 'border_count': 150}. Best is trial 6 with value: 0.028024589995816935.


Iteration 15: MSE = 0.028659620892544122


[I 2023-11-17 15:56:55,309] Trial 15 finished with value: 0.02806790371327651 and parameters: {'iterations': 783, 'learning_rate': 0.08053469268372235, 'depth': 10, 'l2_leaf_reg': 72.59076268008468, 'border_count': 143}. Best is trial 6 with value: 0.028024589995816935.


Iteration 16: MSE = 0.02806790371327651


[I 2023-11-17 15:57:15,030] Trial 16 finished with value: 0.028090712869679676 and parameters: {'iterations': 509, 'learning_rate': 0.14630644180411978, 'depth': 6, 'l2_leaf_reg': 33.07100006634849, 'border_count': 99}. Best is trial 6 with value: 0.028024589995816935.


Iteration 17: MSE = 0.028090712869679676


[I 2023-11-17 15:57:27,702] Trial 17 finished with value: 0.02814101527790735 and parameters: {'iterations': 349, 'learning_rate': 0.12102746616920224, 'depth': 4, 'l2_leaf_reg': 16.555840929452224, 'border_count': 220}. Best is trial 6 with value: 0.028024589995816935.


Iteration 18: MSE = 0.02814101527790735


[I 2023-11-17 15:58:12,181] Trial 18 finished with value: 0.028081083753572968 and parameters: {'iterations': 548, 'learning_rate': 0.09729932860683857, 'depth': 8, 'l2_leaf_reg': 48.05961227600944, 'border_count': 251}. Best is trial 6 with value: 0.028024589995816935.


Iteration 19: MSE = 0.028081083753572968


[I 2023-11-17 15:58:20,403] Trial 19 finished with value: 0.02832961846666891 and parameters: {'iterations': 136, 'learning_rate': 0.15930866451641415, 'depth': 5, 'l2_leaf_reg': 37.70158539677041, 'border_count': 158}. Best is trial 6 with value: 0.028024589995816935.


Iteration 20: MSE = 0.02832961846666891


[I 2023-11-17 15:58:36,532] Trial 20 finished with value: 0.02812007551605706 and parameters: {'iterations': 321, 'learning_rate': 0.12306353459551349, 'depth': 7, 'l2_leaf_reg': 49.93907265723219, 'border_count': 125}. Best is trial 6 with value: 0.028024589995816935.


Iteration 21: MSE = 0.02812007551605706


[I 2023-11-17 16:01:23,034] Trial 21 finished with value: 0.02809544610507777 and parameters: {'iterations': 794, 'learning_rate': 0.13847797266997722, 'depth': 10, 'l2_leaf_reg': 59.2830324775364, 'border_count': 213}. Best is trial 6 with value: 0.028024589995816935.


Iteration 22: MSE = 0.02809544610507777


[I 2023-11-17 16:02:04,866] Trial 22 finished with value: 0.028074946626427257 and parameters: {'iterations': 636, 'learning_rate': 0.1433890708566965, 'depth': 10, 'l2_leaf_reg': 66.12024885810715, 'border_count': 227}. Best is trial 6 with value: 0.028024589995816935.


Iteration 23: MSE = 0.028074946626427257


[I 2023-11-17 16:02:46,619] Trial 23 finished with value: 0.028022388899216034 and parameters: {'iterations': 866, 'learning_rate': 0.12398893893868002, 'depth': 10, 'l2_leaf_reg': 80.11420632515885, 'border_count': 186}. Best is trial 23 with value: 0.028022388899216034.


Iteration 24: MSE = 0.028022388899216034


[I 2023-11-17 16:03:03,751] Trial 24 finished with value: 0.028068836027293823 and parameters: {'iterations': 897, 'learning_rate': 0.09943235019631867, 'depth': 9, 'l2_leaf_reg': 82.76936686222247, 'border_count': 192}. Best is trial 23 with value: 0.028022388899216034.


Iteration 25: MSE = 0.028068836027293823


[I 2023-11-17 16:03:16,487] Trial 25 finished with value: 0.028072423131694828 and parameters: {'iterations': 996, 'learning_rate': 0.1194787998408225, 'depth': 8, 'l2_leaf_reg': 98.92846143863521, 'border_count': 166}. Best is trial 23 with value: 0.028022388899216034.


Iteration 26: MSE = 0.028072423131694828


[I 2023-11-17 16:03:44,369] Trial 26 finished with value: 0.028105245366155395 and parameters: {'iterations': 869, 'learning_rate': 0.17808912427301085, 'depth': 10, 'l2_leaf_reg': 80.95351565536532, 'border_count': 80}. Best is trial 23 with value: 0.028022388899216034.


Iteration 27: MSE = 0.028105245366155395


[I 2023-11-17 16:03:59,043] Trial 27 finished with value: 0.02806399731746235 and parameters: {'iterations': 733, 'learning_rate': 0.13015620709790165, 'depth': 9, 'l2_leaf_reg': 92.18145076171116, 'border_count': 198}. Best is trial 23 with value: 0.028022388899216034.


Iteration 28: MSE = 0.02806399731746235


[I 2023-11-17 16:04:07,861] Trial 28 finished with value: 0.02806226773304836 and parameters: {'iterations': 868, 'learning_rate': 0.10995121739284656, 'depth': 7, 'l2_leaf_reg': 77.63213135952446, 'border_count': 155}. Best is trial 23 with value: 0.028022388899216034.


Iteration 29: MSE = 0.02806226773304836


[I 2023-11-17 16:04:19,474] Trial 29 finished with value: 0.028086931593874757 and parameters: {'iterations': 701, 'learning_rate': 0.1465359107310636, 'depth': 9, 'l2_leaf_reg': 92.37041443352965, 'border_count': 127}. Best is trial 23 with value: 0.028022388899216034.


Iteration 30: MSE = 0.028086931593874757


[I 2023-11-17 16:04:21,951] Trial 30 finished with value: 0.028219564347506634 and parameters: {'iterations': 157, 'learning_rate': 0.12754302862022127, 'depth': 6, 'l2_leaf_reg': 19.38021875136853, 'border_count': 232}. Best is trial 23 with value: 0.028022388899216034.


Iteration 31: MSE = 0.028219564347506634


[I 2023-11-17 16:04:50,536] Trial 31 finished with value: 0.02809261862087938 and parameters: {'iterations': 552, 'learning_rate': 0.13862601950476502, 'depth': 10, 'l2_leaf_reg': 67.85930686285424, 'border_count': 208}. Best is trial 23 with value: 0.028022388899216034.


Iteration 32: MSE = 0.02809261862087938


[I 2023-11-17 16:05:22,943] Trial 32 finished with value: 0.028045577809359866 and parameters: {'iterations': 694, 'learning_rate': 0.15505463845291095, 'depth': 10, 'l2_leaf_reg': 77.10587218157261, 'border_count': 179}. Best is trial 23 with value: 0.028022388899216034.


Iteration 33: MSE = 0.028045577809359866


[I 2023-11-17 16:06:08,039] Trial 33 finished with value: 0.028071513847899306 and parameters: {'iterations': 943, 'learning_rate': 0.17466138145750693, 'depth': 10, 'l2_leaf_reg': 71.56319238309695, 'border_count': 186}. Best is trial 23 with value: 0.028022388899216034.


Iteration 34: MSE = 0.028071513847899306


[I 2023-11-17 16:06:26,193] Trial 34 finished with value: 0.0280611597509266 and parameters: {'iterations': 847, 'learning_rate': 0.13364859794327064, 'depth': 9, 'l2_leaf_reg': 62.54289645568916, 'border_count': 238}. Best is trial 23 with value: 0.028022388899216034.


Iteration 35: MSE = 0.0280611597509266


[I 2023-11-17 16:06:31,823] Trial 35 finished with value: 0.028152192581449884 and parameters: {'iterations': 236, 'learning_rate': 0.15124711131687743, 'depth': 9, 'l2_leaf_reg': 55.550118810411334, 'border_count': 203}. Best is trial 23 with value: 0.028022388899216034.


Iteration 36: MSE = 0.028152192581449884


[I 2023-11-17 16:06:33,697] Trial 36 finished with value: 0.028251935468251287 and parameters: {'iterations': 101, 'learning_rate': 0.1167687490112073, 'depth': 5, 'l2_leaf_reg': 8.287492126783583, 'border_count': 165}. Best is trial 23 with value: 0.028022388899216034.


Iteration 37: MSE = 0.028251935468251287


[I 2023-11-17 16:06:40,117] Trial 37 finished with value: 0.028045181914364076 and parameters: {'iterations': 447, 'learning_rate': 0.16929659942509, 'depth': 8, 'l2_leaf_reg': 83.83984165980246, 'border_count': 174}. Best is trial 23 with value: 0.028022388899216034.


Iteration 38: MSE = 0.028045181914364076


[I 2023-11-17 16:07:11,196] Trial 38 finished with value: 0.028120084960247784 and parameters: {'iterations': 769, 'learning_rate': 0.186176211162102, 'depth': 10, 'l2_leaf_reg': 76.90433360390838, 'border_count': 139}. Best is trial 23 with value: 0.028022388899216034.


Iteration 39: MSE = 0.028120084960247784


[I 2023-11-17 16:07:17,918] Trial 39 finished with value: 0.02810200224908139 and parameters: {'iterations': 318, 'learning_rate': 0.1594232897966545, 'depth': 9, 'l2_leaf_reg': 88.24573192113932, 'border_count': 190}. Best is trial 23 with value: 0.028022388899216034.


Iteration 40: MSE = 0.02810200224908139


[I 2023-11-17 16:07:26,687] Trial 40 finished with value: 0.02804298030884857 and parameters: {'iterations': 1000, 'learning_rate': 0.15134092383765624, 'depth': 6, 'l2_leaf_reg': 67.02206308466549, 'border_count': 217}. Best is trial 23 with value: 0.028022388899216034.


Iteration 41: MSE = 0.02804298030884857


[I 2023-11-17 16:07:35,548] Trial 41 finished with value: 0.028092814094061166 and parameters: {'iterations': 996, 'learning_rate': 0.1378072251631479, 'depth': 6, 'l2_leaf_reg': 67.38168774132, 'border_count': 223}. Best is trial 23 with value: 0.028022388899216034.


Iteration 42: MSE = 0.028092814094061166


[I 2023-11-17 16:07:42,665] Trial 42 finished with value: 0.028099761203931307 and parameters: {'iterations': 930, 'learning_rate': 0.15223259114780985, 'depth': 5, 'l2_leaf_reg': 62.91179241302973, 'border_count': 213}. Best is trial 23 with value: 0.028022388899216034.


Iteration 43: MSE = 0.028099761203931307


[I 2023-11-17 16:07:50,481] Trial 43 finished with value: 0.02815045554792436 and parameters: {'iterations': 844, 'learning_rate': 0.12475773573060331, 'depth': 6, 'l2_leaf_reg': 70.09267183669577, 'border_count': 244}. Best is trial 23 with value: 0.028022388899216034.


Iteration 44: MSE = 0.02815045554792436


[I 2023-11-17 16:08:00,124] Trial 44 finished with value: 0.0280779320272628 and parameters: {'iterations': 907, 'learning_rate': 0.16519347122334757, 'depth': 7, 'l2_leaf_reg': 74.4605148133258, 'border_count': 202}. Best is trial 23 with value: 0.028022388899216034.


Iteration 45: MSE = 0.0280779320272628


[I 2023-11-17 16:08:08,468] Trial 45 finished with value: 0.028087459567616727 and parameters: {'iterations': 958, 'learning_rate': 0.14320565826054815, 'depth': 6, 'l2_leaf_reg': 57.416710870891066, 'border_count': 172}. Best is trial 23 with value: 0.028022388899216034.


Iteration 46: MSE = 0.028087459567616727


[I 2023-11-17 16:08:14,039] Trial 46 finished with value: 0.028159644571848115 and parameters: {'iterations': 588, 'learning_rate': 0.11331015545070744, 'depth': 7, 'l2_leaf_reg': 53.32135298410331, 'border_count': 46}. Best is trial 23 with value: 0.028022388899216034.


Iteration 47: MSE = 0.028159644571848115


[I 2023-11-17 16:08:21,235] Trial 47 finished with value: 0.02808365575469817 and parameters: {'iterations': 473, 'learning_rate': 0.13114198158700346, 'depth': 8, 'l2_leaf_reg': 62.52048663836281, 'border_count': 218}. Best is trial 23 with value: 0.028022388899216034.


Iteration 48: MSE = 0.02808365575469817


[I 2023-11-17 16:08:26,663] Trial 48 finished with value: 0.028107633000633184 and parameters: {'iterations': 821, 'learning_rate': 0.10466350976537353, 'depth': 4, 'l2_leaf_reg': 80.33277618296705, 'border_count': 113}. Best is trial 23 with value: 0.028022388899216034.


Iteration 49: MSE = 0.028107633000633184


[I 2023-11-17 16:08:32,640] Trial 49 finished with value: 0.02808517140335897 and parameters: {'iterations': 739, 'learning_rate': 0.1472416360702783, 'depth': 5, 'l2_leaf_reg': 45.33447611896309, 'border_count': 235}. Best is trial 23 with value: 0.028022388899216034.


Iteration 50: MSE = 0.02808517140335897
Best Hyperparameters:
{'iterations': 866, 'learning_rate': 0.12398893893868002, 'depth': 10, 'l2_leaf_reg': 80.11420632515885, 'border_count': 186}


### Catboost with 4x ensemble


In [23]:
# Train multiple models
rounds = 8
models = []
for i in range(rounds):
    print("Start training model {}".format(i))
    model = CatBoostRegressor(
        loss_function="MAE",
        eval_metric="MAE",
        nan_mode="Min",
        random_seed=99,
        iterations=1000,
        learning_rate=0.015,
        border_count=254,
        max_depth=6,
        random_strength=1,
        l2_leaf_reg=5,
        bagging_temperature=1,
        verbose=True,
    )
    model.fit(catboost_X, catboost_y, cat_features=categorical_indices, verbose=True)
    models.append(model)

Start training model 0
0:	learn: 0.0688501	total: 101ms	remaining: 1m 40s
1:	learn: 0.0688426	total: 196ms	remaining: 1m 37s
2:	learn: 0.0688354	total: 286ms	remaining: 1m 34s
3:	learn: 0.0688278	total: 373ms	remaining: 1m 32s
4:	learn: 0.0688196	total: 472ms	remaining: 1m 33s
5:	learn: 0.0688108	total: 551ms	remaining: 1m 31s
6:	learn: 0.0688048	total: 651ms	remaining: 1m 32s
7:	learn: 0.0687976	total: 738ms	remaining: 1m 31s
8:	learn: 0.0687910	total: 833ms	remaining: 1m 31s
9:	learn: 0.0687832	total: 930ms	remaining: 1m 32s
10:	learn: 0.0687755	total: 1.02s	remaining: 1m 32s
11:	learn: 0.0687703	total: 1.13s	remaining: 1m 32s
12:	learn: 0.0687629	total: 1.23s	remaining: 1m 33s
13:	learn: 0.0687561	total: 1.33s	remaining: 1m 33s
14:	learn: 0.0687495	total: 1.43s	remaining: 1m 34s
15:	learn: 0.0687423	total: 1.53s	remaining: 1m 34s
16:	learn: 0.0687360	total: 1.64s	remaining: 1m 34s
17:	learn: 0.0687310	total: 1.76s	remaining: 1m 35s
18:	learn: 0.0687249	total: 1.92s	remaining: 1m 39s

KeyboardInterrupt: 

In [None]:
# Make predictions and export results
file_name = 'submission/final_catboost_ensemble_x4.csv'
submission, pred_2016, pred_2017 = ut.predict_and_generate_csv(models, prop_2016, prop_2017, file_name)

## Lightgbm Model


In [None]:
#Drop not useful columns
lightgbm_features = ut.drop_features(train)
print("Number of features for Lightgbm: {}".format(len(lightgbm_features.columns)))
lightgbm_features.head(5)

In [None]:
# Prepare training and cross-validation data
lightgbm_label = train.logerror.astype(np.float32)
print(lightgbm_label.head())

# Transform to Numpy matrices
lightgbm_X = lightgbm_features.values
lightgbm_y = lightgbm_label.values

# Perform shuffled train/test split
np.random.seed(42)
random.seed(10)
X_train, X_val, y_train, y_val = train_test_split(lightgbm_X, lightgbm_y, test_size=0.2 , random_state=99)

a,b=ut.remove_outliers(X_train, y_train)
X_train=a
y_train=b

print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))
print("X_val shape: {}".format(X_val.shape))
print("y_val shape: {}".format(y_val.shape))

In [None]:
# Specify feature names and categorical features for Lightgbm
categorical_features = ['airconditioningtypeid', 'heatingorsystemtypeid', 'propertylandusetypeid', 'year', 'month', 'quarter','buildingclasstypeid']
categorical_indices = ut.get_categorical_indices(lightgbm_features, categorical_features)

In [None]:
# Lightgbm parameters
params = {}

params["objective"] = "regression"
params["metric"] = "mae"
params["num_threads"] = 4  # set to number of real CPU cores for best performance

params["boosting_type"] = "gbdt"
params["num_boost_round"] = 1250
params["learning_rate"] = 0.003  # shrinkage_rate 
# params["early_stopping_rounds"] = 30  # Early stopping based on validation set performance 

# Control tree growing
params["num_leaves"] = 127  # max number of leaves in one tree (default 31)
params["min_data"] = 150  # min_data_in_leaf
params["min_hessian"] = 0.001  # min_sum_hessian_in_leaf (default 1e-3)
params["max_depth"] = -1  # limit the max depth of tree model, defult -1 (no limit)
params[
    "max_bin"
] = 255  # max number of bins that feature values are bucketed in (small -> less overfitting, default 255)
params[
    "sub_feature"
] = 0.5  # feature_fraction (small values => use very different submodels)

# Row subsampling (speed up training and alleviate overfitting)
params["bagging_fraction"] = 0.7
params["bagging_freq"] = 50  # perform bagging at every k iteration

# Constraints on categorical features
params[
    "min_data_per_group"
] = 100  # minimal number of data per categorical group (default 100)
params[
    "cat_smooth"
] = 15.0  # reduce effect of noises in categorical features, especially for those with few data (default 10.0)

# Regularization (default 0.0)
params["lambda_l1"] = 0.0
params["lambda_l2"] = 0.0

# Random seeds (keep default values)
params["feature_fraction_seed"] = 2
params["bagging_seed"] = 3

### Lightgbm Single Model


In [None]:
#Train Lightgbm
feature_names = [s for s in lightgbm_features.columns]
lgb_train_set = lgb.Dataset(X_train, label=y_train, feature_name=feature_names)
lgb_valid_set = lgb.Dataset(X_val, label=y_val, feature_name=feature_names)

np.random.seed(42)
random.seed(36)
model = lgb.train(params, lgb_train_set,
                valid_sets=[lgb_train_set, lgb_valid_set], valid_names=['train', 'val'],
                categorical_feature=categorical_indices)

# Evaluate on train and validation sets
print("Train score: {}".format(abs(model.predict(X_train) - y_train).mean() * 100))
print("Val score: {}".format(abs(model.predict(X_val) - y_val).mean() * 100))

In [None]:
# Plot LightGBM feature importance
lgb.plot_importance(model, height=0.8, figsize=(12.5, 12.5), ignore_zero=False)

In [None]:
# Train LightGBM on all given training data (preparing for submission)
#del params['early_stopping_rounds']

a,b=ut.remove_outliers(lightgbm_X,lightgbm_y)
lightgbm_X=a
lightgbm_y=b

lgb_train_set = lgb.Dataset(lightgbm_X, label=lightgbm_y, feature_name=feature_names)
print("lightgbm_X: {}".format(lightgbm_X.shape))
print("lightgbm_y: {}".format(lightgbm_y.shape))

np.random.seed(42)
random.seed(36)
model = lgb.train(params, lgb_train_set, categorical_feature=categorical_indices)

# Sanity check: make sure the model score is reasonable on a small portion of the data
print("score: {}".format(abs(model.predict(X_val) - y_val).mean() * 100))

In [None]:
file_name = 'submission/final_lgb_single.csv'
submission, pred_2016, pred_2017 = ut.predict_and_generate_csv([model], prop_2016, prop_2017, file_name)

### Lightgbm Ensemble 5x Model


In [None]:
a,b=ut.remove_outliers(lightgbm_X,lightgbm_y)
lightgbm_X=a
lightgbm_y=b

lgb_train_set = lgb.Dataset(lightgbm_X, label=lightgbm_y, feature_name=feature_names)

# Train multiple models
bags = 5
models = []
for i in range(bags):
    print("Start training model {}".format(i))
    params["seed"] = i
    np.random.seed(42)
    random.seed(36)
    model = lgb.train(params, lgb_train_set, categorical_feature=categorical_indices)
    models.append(model)

# Sanity check (make sure scores on a small portion of the dataset are reasonable)
for i, model in enumerate(models):
    print("model {}: {}".format(i, abs(model.predict(X_val) - y_val).mean() * 100))

# Save the trained models to disk
ut.save_models(models,'lightgbm')

models = ut.load_lightgbm_models(['checkpoints/lightgbm_' + str(i) for i in range(bags)])  # load pretrained models 

In [None]:
# Make predictions and export results
file_name = 'submission/final_lgb_ensemble_x5.csv'
submission, pred_2016, pred_2017 = ut.predict_and_generate_csv(models, prop_2016, prop_2017, file_name)

## Stacking


In [None]:
lgb_single = pd.read_csv('submission/final_lgb_single.csv')
catboost_single = pd.read_csv('submission/final_catboost_single.csv')
print("Finished Loading the prediction results.")

def decimal_range(start, stop, increment):
    while start <= stop: 
        yield start
        start += increment

for weight in decimal_range(0.1, 0.9, 0.1):
    #weight = 0.7
    stack = pd.DataFrame()
    stack["ParcelId"] = lgb_single["ParcelId"]
    for col in ["201610", "201611", "201612", "201710", "201711", "201712"]:
        stack[col] = weight * catboost_single[col] + (1 - weight) * lgb_single[col]

    print(stack.head())
    #stack.to_csv("submission/final_stack.csv", index=False)
    stack.to_csv("submission/final_stack_catboostweight_" + str("{:.1f}".format(weight)) + ".csv", index=False)