## Importing libraries

In [None]:
import gc #garbage collector
import numpy as np
import pandas as pd
import random 
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

from helper import utility
import importlib
importlib.reload(utility)

from catboost import CatBoostRegressor, Pool
import lightgbm as lgb


## Load Dataset

In [None]:
# Read the raw inputs: one properties table and one training table per year.
# The train files additionally pass ['transactiondate'] to load_data
# (presumably the columns to parse as dates -- confirm in helper/utility.py).
prop_2016 = utility.load_data('data/properties_2016.csv')
prop_2017 = utility.load_data('data/properties_2017.csv')
train_2016 = utility.load_data('data/train_2016_v2.csv', ['transactiondate'])
train_2017 = utility.load_data('data/train_2017.csv', ['transactiondate'])

# Attach each year's property attributes to that year's training rows,
# matching on parcelid.
# NOTE(review): the intent is a left join from the training rows, so that
# properties without a logerror are ignored; the join type itself is chosen
# inside utility.merge_data -- confirm there.
training_2016 = utility.merge_data(train_2016, prop_2016, 'parcelid')
training_2017 = utility.merge_data(train_2017, prop_2017, 'parcelid')

# Stack both years into single frames (with a fresh 0..n-1 index) so later
# stages can process them together.
training_all = pd.concat((training_2016, training_2017), ignore_index=True)
properties_all = pd.concat((prop_2016, prop_2017), ignore_index=True)

# Show the combined training frame as the cell output.
training_all
# properties_all.shape

## Preprocessing and Preparing for Training

In [None]:
# Run drop_features over the combined training frame, then display the result
# as the cell output.
training_cleaned = drop_features(training_all)
training_cleaned

In [None]:
# Compute the categorical-feature indices of the cleaned frame; the result is
# later passed to lgb.train(..., categorical_feature=categorical_indexes).
categorical_indexes = get_categorical_indices(training_cleaned)

In [None]:
# Feature matrix: the cleaned frame as a plain array.
X = training_cleaned.values

# Target vector: logerror from the un-cleaned frame, cast to float32.
# NOTE(review): X and y come from different frames; this assumes drop_features
# keeps the row count and order of training_all -- confirm.
y = training_all['logerror'].astype(np.float32).values

# Split into training and validation parts (split logic lives in prepare_training).
X_train, X_val, y_train, y_val = prepare_training(X, y)

## CatBoost

## LightGBM

In [None]:
# LightGBM training parameters, declared as a single literal.
# Key order matches the order in which the settings are grouped below.
params = {
    # Task definition
    'objective': 'regression',
    'metric': 'mae',
    'num_threads': 4,  # set to number of real CPU cores for best performance

    # Boosting schedule
    'boosting_type': 'gbdt',
    'num_boost_round': 1250,
    'learning_rate': 0.003,  # shrinkage_rate
    'early_stopping_rounds': 30,  # early stopping based on validation set performance

    # Control tree growing
    'num_leaves': 127,  # max number of leaves in one tree (default 31)
    'min_data': 150,  # min_data_in_leaf
    'min_hessian': 0.001,  # min_sum_hessian_in_leaf (default 1e-3)
    'max_depth': -1,  # limit the max depth of tree model, default -1 (no limit)
    'max_bin': 255,  # max number of bins that feature values are bucketed in (small -> less overfitting, default 255)
    'sub_feature': 0.5,  # feature_fraction (small values => use very different submodels)

    # Row subsampling (speed up training and alleviate overfitting)
    'bagging_fraction': 0.7,
    'bagging_freq': 50,  # perform bagging at every k iteration

    # Constraints on categorical features
    'min_data_per_group': 100,  # minimal number of data per categorical group (default 100)
    'cat_smooth': 15.0,  # reduce effect of noises in categorical features, especially for those with few data (default 10.0)

    # Regularization (default 0.0)
    'lambda_l1': 0.0,
    'lambda_l2': 0.0,

    # Random seeds (keep default values)
    'feature_fraction_seed': 2,
    'bagging_seed': 3,
}

### Train Single Model

In [None]:
# Train a single LightGBM model, monitoring both the training and the
# validation split (early stopping is configured through `params`).

# lgb.Dataset expects feature_name to be a list of column names (or 'auto'),
# not the DataFrame itself, so pass the column labels explicitly.
feature_names = list(training_cleaned.columns)
lgb_train_set = lgb.Dataset(X_train, label=y_train, feature_name=feature_names)
lgb_valid_set = lgb.Dataset(X_val, label=y_val, feature_name=feature_names)

# Fix the Python/NumPy RNG state before training for repeatable runs.
np.random.seed(42)
random.seed(36)
model = lgb.train(params, lgb_train_set,
                  valid_sets=[lgb_train_set, lgb_valid_set],
                  valid_names=['train', 'val'],
                  categorical_feature=categorical_indexes)

# Evaluate on train and validation sets: mean absolute error, scaled by 100.
print("Train score: {}".format(abs(model.predict(X_train) - y_train).mean() * 100))
print("Val score: {}".format(abs(model.predict(X_val) - y_val).mean() * 100))


In [None]:
# Draw the feature-importance chart of the trained model; features with zero
# importance are kept in the plot (ignore_zero=False).
lgb.plot_importance(
    model,
    figsize=(12.5, 12.5),
    height=0.8,
    ignore_zero=False,
)

In [None]:
# Train LightGBM on all given training data (preparing for submission).

# No validation set is passed to lgb.train below, so early stopping must be
# switched off. pop() with a default (instead of `del`) keeps this cell
# re-runnable after the key has already been removed.
params.pop('early_stopping_rounds', None)

lgb_X, lgb_y = remove_outliers(X, y, training_cleaned)

# lgb.Dataset expects feature_name to be a list of column names (or 'auto'),
# not the DataFrame itself, so pass the column labels explicitly.
lgb_train_set = lgb.Dataset(lgb_X, label=lgb_y,
                            feature_name=list(training_cleaned.columns))

# Fix the Python/NumPy RNG state before training for repeatable runs.
np.random.seed(42)
random.seed(36)
model = lgb.train(params, lgb_train_set, categorical_feature=categorical_indexes)

# Sanity check: make sure the model score is reasonable on a small portion of
# the data. X_val comes from the same X the model was trained on, so this is
# not a held-out estimate.
print("score: {}".format(abs(model.predict(X_val) - y_val).mean() * 100))

In [None]:
# Predict with the single model (wrapped in a one-element list) and write the
# submission file.
file_name = 'submission/final_lgb_single.csv'
submission, pred_2016, pred_2017 = predict_and_export(
    [model],
    prop_2016,
    prop_2017,
    file_name,
)

### Train Ensemble Model

In [None]:
# Train an ensemble of LightGBM models that differ only in their seed.

# No validation set is passed to lgb.train below, so make sure early stopping
# is off even when this cell runs without the "train on all data" cell.
params.pop('early_stopping_rounds', None)

# Remove outliers (if any) from training data
lgb_X, lgb_y = remove_outliers(X, y, training_cleaned)

# lgb.Dataset expects feature_name to be a list of column names (or 'auto'),
# not the DataFrame itself, so pass the column labels explicitly.
lgb_train_set = lgb.Dataset(lgb_X, label=lgb_y,
                            feature_name=list(training_cleaned.columns))

# Train multiple models
bags = 5
models = []
for i in range(bags):
    print("Start training model {}".format(i))
    params['seed'] = i  # the only setting that varies between ensemble members
    np.random.seed(42)
    random.seed(36)
    model = lgb.train(params, lgb_train_set, categorical_feature=categorical_indexes)
    models.append(model)

# Sanity check (make sure scores on a small portion of the dataset are reasonable)
for i, model in enumerate(models):
    print("model {}: {}".format(i, abs(model.predict(X_val) - y_val).mean() * 100))

# Save the trained models to disk
save_models(models)

# Load the models back from their checkpoints; the count follows `bags`
# rather than a second hard-coded 5.
models = load_lightgbm_models(['checkpoints/lgb_' + str(i) for i in range(bags)])

In [None]:
# Make predictions and export results
file_name = 'submission/final_lgb_ensemble_x5.csv'
submission, pred_2016, pred_2017 = predict_and_export(models, prop_2016, prop_2017, file_name)

## Stacking

In [None]:
# Blend two previously exported submissions into one stacked submission.
lgb_single = pd.read_csv('submission/final_lgb_single.csv')
catboost_x8 = pd.read_csv('submission/final_catboost_ensemble_x8.csv')
print("Finished Loading the prediction results.")

# Share of the blend given to the CatBoost predictions; LightGBM gets the rest.
weight = 0.7

# ParcelId is taken from the LightGBM file; the prediction columns are combined
# row by row on the frames' indexes.
stack = pd.DataFrame()
stack['ParcelId'] = lgb_single['ParcelId']
for col in ('201610', '201611', '201612', '201710', '201711', '201712'):
    catboost_part = weight * catboost_x8[col]
    lgb_part = (1 - weight) * lgb_single[col]
    stack[col] = catboost_part + lgb_part

print(stack.head())
stack.to_csv('submission/final_stack.csv', index=False)