### Importing Libraries

In [None]:
import gc #garbage collector
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import random 
import lightgbm as lgb

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

from helper import utility
import importlib
importlib.reload(utility)


### Importing Zillow Datasets

In [None]:
# Raw data sets provided: one properties table and one transactions ("train") table per year.
# NOTE(review): the second argument to load_data is presumably the list of columns
# to parse as dates — confirm against helper/utility.py.
prop_2016 = utility.load_data('data/properties_2016.csv')
prop_2017 = utility.load_data('data/properties_2017.csv')
train_2016 = utility.load_data('data/train_2016_v2.csv' , ['transactiondate'])
train_2017 = utility.load_data('data/train_2017.csv', ['transactiondate'])

In [None]:
# Combining each year's prop dataset with its corresponding train dataset on their parcelid.
# NOTE(review): the join type is decided inside utility.merge_data (not visible here).
# The stated intent is a left join from the transactions, so that properties
# without a logerror are ignored — confirm in helper/utility.py.
training_2016 = utility.merge_data(train_2016, prop_2016, 'parcelid')
training_2017 = utility.merge_data(train_2017, prop_2017, 'parcelid')



# Data across the 2 years are combined into one data frame for processing at later stages.
# ignore_index=True gives the combined frames a fresh 0..n-1 index.
training_all = pd.concat([training_2016, training_2017] , ignore_index=True)
properties_all = pd.concat([prop_2016, prop_2017], ignore_index=True) 


# Display the combined training frame as the cell output
training_all
#properties_all.shape


In [None]:
# Check and Drop any duplicates in the training dataset
# Duplicates are those which have the same parcelid and transactiondate
# NOTE(review): that definition is implemented in utility.check_duplicates /
# utility.drop_dups, which are not visible here — confirm in helper/utility.py.

training_all.shape
utility.check_duplicates(training_all)
# drop_dups returns the frame to keep, so the result is rebound to training_all
training_all = utility.drop_dups(training_all)

In [None]:
# Shape of the training frame after utility.drop_dups.
# The original author's observation: there are no duplicates in the dataset so far.
training_all.shape

### Looking at the Target Variable - logerror

In [None]:
# Target variable: the logerror column of the combined training frame.
# target_y is reused further down to derive the outlier thresholds.
target_y = training_all['logerror']


# Histogram (50 bins) to inspect the distribution of the target
target_y.hist(bins=50, figsize=(8,4))
plt.show()

In [None]:
# Summary statistics of the target variable
target_y.describe()

In [None]:
# Rows whose logerror lies more than 2.5 standard deviations from the mean
# are treated as outliers and removed from the training frame.
logerror_center = target_y.mean()
logerror_band = 2.5 * target_y.std()
upper_threshold = logerror_center + logerror_band
lower_threshold = logerror_center - logerror_band


# Keep only the rows strictly inside (lower_threshold, upper_threshold)
within_band = (
    (training_all['logerror'] < upper_threshold)
    & (training_all['logerror'] > lower_threshold)
)
training_all = training_all[within_band]
training_all.shape



### Data Preprocessing & Feature Engineering

In [None]:
# Adding Feature to the dataset
# Add Day, Month, Year and which quarter the transaction was done
# NOTE(review): the names of the added columns are defined in
# utility.add_dmy_feature (not visible here) — confirm they match the names
# listed in category_list and the ones set by transform_test_features.
training_all = utility.add_dmy_feature(training_all)
# Display the enriched frame as the cell output
training_all

In [None]:
# Report how much of the training data is missing
# (presumably per column; the output format is defined in utility.print_percent_missing)
utility.print_percent_missing(training_all)

In [None]:
# Drop the columns that utility.get_col_to_drop_missing reports for a 95%
# missing-value threshold, plus the ones flagged by
# utility.get_col_to_drop_non_unique.
# The constant is the single source of truth for the threshold: it used to be
# set to 0.97 while the call passed a hard-coded 0.95, so it was silently ignored.
MISSING_THRESHOLD = 0.95
col_to_drop = utility.get_col_to_drop_missing(training_all, MISSING_THRESHOLD)
col_to_drop += utility.get_col_to_drop_non_unique(training_all)

# Other columns to exclude to prepare for training dataset
exclude_list = ["parcelid", "logerror", 'propertyzoningdesc']

# Every column that is neither dropped nor excluded becomes a model feature;
# the column order of training_all is preserved.
remaining_col = [
    col for col in training_all.columns
    if col not in col_to_drop and col not in exclude_list
]
for col in remaining_col:
    print(col)

print('Number of columns left:', len(remaining_col))

In [None]:
# Dealing with Categorical Values
# Convert categorical values to 'category' type for some columns.
#
# A comma was missing after 'typeconstructiontypeid', so Python concatenated it
# with the next literal into the single bogus name
# 'typeconstructiontypeidbuildingclasstypeid'; neither real column was ever
# matched. Each name is now its own list element.
# NOTE(review): 'quarter', 'day', 'transaction_year' and 'transaction_day' must
# match the column names created by utility.add_dmy_feature — confirm.
category_list = [
    'heatingorsystemtypeid',
    'propertylandusetypeid',
    'storytypeid',
    'airconditioningtypeid',
    'architecturalstyletypeid',
    'typeconstructiontypeid',
    'buildingclasstypeid',
    'quarter',
    'day',
    'transaction_year',
    'transaction_day',
]

# Names in category_list that are absent from training_all are simply skipped.
# float_to_categorical is called for its effect on training_all (return value unused).
for col in training_all.columns:
    if col in category_list:
        utility.float_to_categorical(training_all, col)


In [None]:
# Convert float64 values to float32 values 
# for col in training_all.columns: 
#     if training_all[col].dtype.name == 'float64': 
#         training_all[col] = training_all[col].astype('float32')

In [None]:
# Inspect the column dtypes after the categorical conversion
training_all.dtypes

### Training models

In [None]:
# Drop unnecessary columns, only keep the remaining columns.
# All unwanted columns are removed in a single drop() call: the previous
# one-column-at-a-time loop copied the whole frame once per dropped column.
# drop() always returns a new frame, so training_cleaned never aliases
# training_all.
unwanted_cols = [col for col in training_all.columns if col not in remaining_col]
training_cleaned = training_all.drop(columns=unwanted_cols)

training_cleaned.head()
# for col in training_cleaned: 
#     training_cleaned[col] = training_cleaned[col].astype('category')
# training_cleaned.dtypes

In [None]:
# Positional indices, within training_cleaned, of the columns listed in category_list
category_index = [
    position
    for position, name in enumerate(training_cleaned.columns)
    if name in category_list
]
print(category_index)

In [None]:
# Saving LightGBM models
def save_models(models):
    """Write every model in *models* to ``checkpoints/lgb_<position>``.

    Each element must expose ``save_model(path)``; the position is the
    zero-based index of the model in the sequence. A summary line with the
    number of models is printed afterwards. Returns ``None``.
    """
    position = 0
    for booster in models:
        booster.save_model('checkpoints/lgb_' + str(position))
        position += 1
    print("Saved {} LightGBM models to files.".format(len(models)))

# Load LightGBM models from files
def load_models(paths):
    """Return a list with one ``lgb.Booster`` per model file in *paths*.

    The boosters are created with ``lgb.Booster(model_file=path)`` and are
    returned in the same order as *paths*.
    """
    return [lgb.Booster(model_file=model_path) for model_path in paths]

In [None]:
# Target vector: logerror of every row kept in training_all, as a NumPy array
lgb_label = training_all.logerror
lgb_y = lgb_label.values

# Feature matrix: every column of training_cleaned is ordinal-encoded,
# which also turns the DataFrame into a NumPy array
from sklearn.preprocessing import OrdinalEncoder
ord_enc = OrdinalEncoder()
lgb_X = ord_enc.fit_transform(training_cleaned)
#Transform to Numpy matrices 
#lgb_X = training_cleaned.values

# Shuffled 80/20 train/validation split; both RNGs are seeded right before it
np.random.seed(42)
random.seed(10)
X_train, X_val, y_train, y_val = train_test_split(lgb_X, lgb_y, test_size=0.2)

# Outlier examples (|logerror| > 0.4) are removed from the training part only;
# the validation part keeps them for proper cross-validation
outlier_threshold = 0.4
keep_rows = np.abs(y_train) <= outlier_threshold
X_train, y_train = X_train[keep_rows, :], y_train[keep_rows]

print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))
print("X_val shape: {}".format(X_val.shape))
print("y_val shape: {}".format(y_val.shape))

In [None]:
# LightGBM training parameters, written as one literal (same keys, same order)
params = {
    'objective': 'regression',
    'metric': 'mae',
    'num_threads': 4,  # set to number of real CPU cores for best performance

    'boosting_type': 'gbdt',
    'num_boost_round': 500,
    'learning_rate': 0.003,  # shrinkage_rate
    # 'early_stopping_rounds': 30,  # Early stopping based on validation set performance

    # Control tree growing
    'num_leaves': 127,  # max number of leaves in one tree (default 31)
    'min_data': 150,  # min_data_in_leaf
    'min_hessian': 0.001,  # min_sum_hessian_in_leaf (default 1e-3)
    'max_depth': -1,  # limit the max depth of tree model, default -1 (no limit)
    'max_bin': 255,  # max number of bins that feature values are bucketed in (small -> less overfitting, default 255)
    'sub_feature': 0.5,  # feature_fraction (small values => use very different submodels)

    # Row subsampling (speed up training and alleviate overfitting)
    'bagging_fraction': 0.7,
    'bagging_freq': 50,  # perform bagging at every k iteration

    # Constraints on categorical features
    'min_data_per_group': 100,  # minimal number of data per categorical group (default 100)
    'cat_smooth': 15.0,  # reduce effect of noises in categorical features, especially for those with few data (default 10.0)

    # Regularization (default 0.0)
    'lambda_l1': 0.0,
    'lambda_l2': 0.0,

    # Random seeds (keep default values)
    'feature_fraction_seed': 2,
    'bagging_seed': 3,
}

In [None]:
# Wrap the train / validation matrices for LightGBM, labelling columns with remaining_col
lgb_train_set = lgb.Dataset(X_train, label=y_train, feature_name=remaining_col)
lgb_valid_set = lgb.Dataset(X_val, label=y_val, feature_name=remaining_col)

# Re-seed both RNGs immediately before training
np.random.seed(42)
random.seed(36)
model = lgb.train(
    params,
    lgb_train_set,
    valid_sets=[lgb_train_set, lgb_valid_set],
    valid_names=['train', 'val'],
    categorical_feature=category_index,
)

# Evaluate on train and validation sets: mean absolute error, scaled by 100
train_mae = np.mean(np.abs(model.predict(X_train) - y_train))
val_mae = np.mean(np.abs(model.predict(X_val) - y_val))
print("Train score: {}".format(train_mae * 100))
print("Val score: {}".format(val_mae * 100))

In [None]:
# Plot the feature importances of the trained model.
# ignore_zero=False keeps features whose importance is zero in the chart.
lgb.plot_importance(model, height=0.8, figsize=(12.5, 12.5), ignore_zero=False)

In [None]:
# Sanity check: make sure the model score is reasonable.
# This is the mean absolute error (x100) on the validation split (X_val / y_val).
print("score: {}".format(abs(model.predict(X_val) - y_val).mean() * 100))

In [None]:
# Load the raw property tables again so that inference starts from fresh frames.
features_2016 = utility.load_data('data/properties_2016.csv')
features_2017 = utility.load_data('data/properties_2017.csv')

# Columns to exclude when preparing the inference frames.  Unlike the training
# exclude list, "parcelid" is kept because predict_and_export reads it to
# build the submission.
# Dedicated names are used on purpose: this cell used to overwrite the
# training-time `exclude_list` / `remaining_col`, and `remaining_col` is the
# feature_name list handed to lgb.Dataset, so re-running the training cell
# after this one would no longer match the training matrix.
inference_exclude = ["logerror", 'propertyzoningdesc']
inference_cols = {
    col for col in training_all.columns
    if col not in col_to_drop and col not in inference_exclude
}

# Columns removed from the inference frames even when they survived the filter above
badfeatures = ['hashottuborspa', 'propertycountylandusecode', 'propertyzoningdesc', 'fireplaceflag', 'taxdelinquencyflag']


def _prepare_inference_frame(frame):
    """Return *frame* reduced to the inference columns, with categorical columns converted.

    Columns that are not in ``inference_cols`` or that are listed in
    ``badfeatures`` are removed with a single ``drop`` call (one copy of the
    frame instead of one copy per dropped column).  Every remaining column
    named in ``category_list`` is then passed to
    ``utility.float_to_categorical``, which is called for its effect on the
    frame (its return value is not used, as in the training cell).
    """
    unwanted = [
        col for col in frame.columns
        if col not in inference_cols or col in badfeatures
    ]
    frame = frame.drop(columns=unwanted)
    for col in frame.columns:
        if col in category_list:
            utility.float_to_categorical(frame, col)
    return frame


features_2016 = _prepare_inference_frame(features_2016)
features_2017 = _prepare_inference_frame(features_2017)

In [None]:
"""
    Helper method that prepares 2016 and 2017 properties features for inference
"""
def transform_test_features(features_2016, features_2017):
    """Return copies of the two property frames with fixed date features added.

    The columns ``year``, ``month`` and ``quarter`` are set to constants:
    ``year`` is 0 for the 2016 frame and 1 for the 2017 frame, ``month`` is 10
    and ``quarter`` is 4 for both.

    The inputs are no longer modified: the previous version only aliased the
    arguments, so the new columns were written into the caller's DataFrames.
    ``DataFrame.assign`` returns a new frame instead (at the cost of one copy
    per frame).

    Parameters:
        features_2016: DataFrame with the 2016 property features.
        features_2017: DataFrame with the 2017 property features.

    Returns:
        Tuple ``(test_features_2016, test_features_2017)`` of new DataFrames.
    """
    # NOTE(review): the training frame gets its date columns from
    # utility.add_dmy_feature — confirm the names used here match them.
    # 11 & 12 lead to unstable results, probably due to the fact that there are
    # few training examples for them, hence month 10 / quarter 4 for both years
    test_features_2016 = features_2016.assign(year=0, month=10, quarter=4)
    test_features_2017 = features_2017.assign(year=1, month=10, quarter=4)

    return test_features_2016, test_features_2017

"""
    Helper method that makes predictions on the test set and exports results to csv file
    'models' is a list of models for ensemble prediction (len=1 means using just a single model)
"""
def predict_and_export(models, features_2016, features_2017, file_name):
    """Predict with every model, average the predictions and write the submission CSV.

    Parameters:
        models: list of models exposing ``predict``; their predictions are
            averaged (a single-element list means a single model is used).
        features_2016, features_2017: property feature frames; each must have
            a ``parcelid`` column.
        file_name: path of the CSV file to write.

    Returns:
        Tuple ``(submission, pred_2016, pred_2017)``: the merged submission
        DataFrame and the per-model prediction lists for each year.

    NOTE(review): the training matrix is produced by
    ``OrdinalEncoder.fit_transform`` while the frames here are passed to
    ``predict`` as received (including ``parcelid``), and
    ``predict_disable_shape_check=True`` suppresses the feature-count check.
    Confirm that the inference columns line up with the training features.
    """
    def fill_months(frame, mean_pred, first, second, third):
        # Same prediction, rounded to 4 decimals, for all three month columns
        frame[first] = [float(format(value, '.4f')) for value in mean_pred]
        frame[second] = frame[first]
        frame[third] = frame[first]

    # The parcel ids are captured before the feature frames are transformed
    submission_2016 = pd.DataFrame()
    submission_2016['ParcelId'] = features_2016.parcelid
    submission_2017 = pd.DataFrame()
    submission_2017['ParcelId'] = features_2017.parcelid

    test_features_2016, test_features_2017 = transform_test_features(features_2016, features_2017)

    pred_2016 = []
    pred_2017 = []
    for position, booster in enumerate(models):
        print("Start model {} (2016)".format(position))
        pred_2016.append(booster.predict(test_features_2016, predict_disable_shape_check=True))
        print("Start model {} (2017)".format(position))
        pred_2017.append(booster.predict(test_features_2017, predict_disable_shape_check=True))

    # Take average across all models
    mean_pred_2016 = np.mean(pred_2016, axis=0)
    mean_pred_2017 = np.mean(pred_2017, axis=0)

    fill_months(submission_2016, mean_pred_2016, '201610', '201611', '201612')
    fill_months(submission_2017, mean_pred_2017, '201710', '201711', '201712')

    # One row per parcel id present in both years
    submission = pd.merge(submission_2016, submission_2017, how='inner', on='ParcelId')

    print("Length of submission DataFrame: {}".format(len(submission)))
    print("Submission header:")
    print(submission.head())
    submission.to_csv(file_name, index=False)
    return submission, pred_2016, pred_2017

In [None]:
# Run inference with the single trained model and write the submission to lgb.csv
file_name = 'lgb.csv'
submission, pred_2016, pred_2017 = predict_and_export([model], features_2016, features_2017, file_name)