# Zillow's Home Value Prediction (Zestimate) #

## Load Packages ##

In [1]:
import numpy as np
import pandas as pd
import scipy

import os
import gc
import time

from catboost import Pool, CatBoostRegressor

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import mean_absolute_error
                                   
np.random.seed(0)

  return f(*args, **kwds)
  return f(*args, **kwds)


## Load Data ##

In [2]:
# Directory holding the pickled feature/label frames and the Kaggle sample
# submission file.  Set the ZILLOW_DATA_DIR environment variable to point at
# another location; when it is unset or empty the original path is used.
# os.path.join(..., "") guarantees a trailing separator, which matters because
# the other cells build file paths by plain concatenation (data_dir + 'x.pkl').
data_dir = os.path.join(
    os.environ.get("ZILLOW_DATA_DIR")
    or "/home/lee/Documents/Datasets for GitHub/kaggle_zillow_home_value_prediction/",
    "",
)

In [3]:
# Load the pre-split training / validation features and targets, in the same
# order as the names on the left-hand side.
(
    X_no_hash_labeled_train,
    X_no_hash_labeled_val,
    y_labeled_train,
    y_labeled_val,
) = (
    pd.read_pickle(data_dir + pickle_name)
    for pickle_name in (
        'X_no_hash_labeled_train.pkl',
        'X_no_hash_labeled_val.pkl',
        'y_labeled_train.pkl',
        'y_labeled_val.pkl',
    )
)

In [4]:
# Column labels of the training frame in positional order (iterating a
# DataFrame yields its column labels).
columns_all_rm_miss = tuple(X_no_hash_labeled_train)

# Columns regarded as flags.  Not referenced again in the visible cells; kept
# so the name stays available.
flag_features_set = {
    'fireplaceflag',
    'hashottuborspa',
    'pooltypeid10',
    'pooltypeid2',
    'pooltypeid7',
    'taxdelinquencyflag',
}

# Columns that CatBoost should treat as categorical.
categorical_features_set = {
    'airconditioningtypeid',
    'architecturalstyletypeid',
    'buildingclasstypeid',
    'decktypeid',
    'fips',
    'heatingorsystemtypeid',
    'propertycountylandusecode',
    'propertylandusetypeid',
    'propertyzoningdesc',
    'rawcensustractandblock',
    'censustractandblock',
    'regionidcounty',
    'regionidcity',
    'regionidzip',
    'regionidneighborhood',
    'typeconstructiontypeid',
    'assessmentyear',
    'taxdelinquencyyear',
    'transaction_year',
    'transaction_month',
}

# Positional indices (into columns_all_rm_miss) of the categorical columns
# that are actually present in the frame; passed to CatBoost as cat_features.
categorical_features_index_rm_miss = [
    icol
    for icol, col in enumerate(columns_all_rm_miss)
    if col in categorical_features_set
]

# The same selection expressed as column names, in frame order.
categorical_features_names_rm_miss = tuple(
    col for col in columns_all_rm_miss if col in categorical_features_set
)

In [5]:
# Wrap the frames in CatBoost Pool objects so the categorical column indices
# travel with the data.  Only the training pool carries labels; validation
# predictions are scored against y_labeled_val separately.
train_pool = Pool(
    data=X_no_hash_labeled_train,
    label=y_labeled_train,
    cat_features=categorical_features_index_rm_miss,
)
val_pool = Pool(
    data=X_no_hash_labeled_val,
    cat_features=categorical_features_index_rm_miss,
)

In [6]:
# Baseline regressor: MAE is both the optimised loss and the reported metric;
# every other hyperparameter is left at the library default.
reg_catboost = CatBoostRegressor(
    loss_function='MAE',
    eval_metric='MAE',
    verbose=False,
)

In [7]:
# Train the baseline regressor.  train_pool already bundles the features,
# the labels and the categorical column indices, so nothing else is passed.
reg_catboost.fit(train_pool)

In [8]:
# Score the baseline model: mean absolute error of its predictions on the
# training pool and on the validation pool (val_pool has no labels, so the
# true values come from y_labeled_val).
baseline_train_mae = mean_absolute_error(y_labeled_train, reg_catboost.predict(train_pool))
print("MAE in training: {:.8f}".format(baseline_train_mae))

baseline_val_mae = mean_absolute_error(y_labeled_val, reg_catboost.predict(val_pool))
print("MAE in validation: {:.8f}".format(baseline_val_mae))

MAE in training: 0.06544938
MAE in validation: 0.06836046


In [9]:
# Candidate values for an exhaustive grid search: 3 x 3 x 3 = 27 combinations.
# (The names `param_dist` / `random_search` are historical; the search below
# is GridSearchCV, not a randomized search.)
param_dist = dict(
    learning_rate=[0.01, 0.03, 0.1],
    depth=[3, 6, 9],
    l2_leaf_reg=[1, 4, 9],
)

# Evaluate every combination with 3-fold cross-validation, ranking by
# negated MAE (scikit-learn maximises the score).
reg_catboost_cv = CatBoostRegressor(
    loss_function='MAE',
    eval_metric='MAE',
    verbose=False,
)
random_search = GridSearchCV(
    estimator=reg_catboost_cv,
    param_grid=param_dist,
    scoring="neg_mean_absolute_error",
    cv=3,
)

# cat_features is forwarded to CatBoostRegressor.fit for each fold.
random_search.fit(
    X_no_hash_labeled_train,
    y_labeled_train,
    cat_features=categorical_features_index_rm_miss,
)

print("Best hyperparameters are: {}".format(random_search.best_params_))

Best hyperparameters are: {'depth': 6, 'l2_leaf_reg': 4, 'learning_rate': 0.01}


In [10]:
# Rebuild the regressor with the hyperparameters selected by the grid search
# and train it on the full training pool.
reg_catboost_cv = CatBoostRegressor(
    loss_function='MAE',
    eval_metric='MAE',
    verbose=False,
    **random_search.best_params_
)
reg_catboost_cv.fit(train_pool)

# Report MAE of the tuned model on the training and validation data.
tuned_train_mae = mean_absolute_error(y_labeled_train, reg_catboost_cv.predict(train_pool))
print("MAE in training: {:.8f}".format(tuned_train_mae))

tuned_val_mae = mean_absolute_error(y_labeled_val, reg_catboost_cv.predict(val_pool))
print("MAE in validation: {:.8f}".format(tuned_val_mae))

MAE in training: 0.06583512
MAE in validation: 0.06818038


## Make Submission File ##

In [12]:
# Load the Kaggle submission template: first row is the header, first column
# becomes the index.  Its index and columns define the submission layout.
sample_submission = pd.read_csv(
    data_dir + 'sample_submission.csv',
    header=0,
    index_col=0,
)

  mask |= (ar1 == a)


In [13]:
def make_submission_prediction(year, month, reg_name, x_test, index=None):
    """Predict on ``x_test`` and return the result as a one-column DataFrame.

    Args:
        year: Year part of the column label (e.g. ``2016``).
        month: Month part of the column label (e.g. ``10``).
        reg_name: Fitted model object exposing ``predict(x_test)`` (despite
            the parameter name, this is the model itself, not a string).
        x_test: Data passed unchanged to ``reg_name.predict``.
        index: Row labels for the returned frame.  Defaults to the index of
            the module-level ``sample_submission`` frame, which is what the
            original implementation always used.

    Returns:
        A ``pandas.DataFrame`` with a single column named
        ``str(year) + str(month)`` (e.g. ``"201610"``) holding the predictions.
    """
    if index is None:
        # Backward-compatible default: align with the submission template.
        index = sample_submission.index
    column_label = "{}{}".format(year, month)
    return pd.DataFrame(reg_name.predict(x_test), index=index, columns=[column_label])

In [15]:
# Fill the submission template: for each of the six requested periods, load
# that period's test features, predict with the tuned model and copy the
# predictions into the matching column of sample_submission.

# Loop-invariant: the categorical column names as a list usable for indexing.
categorical_columns = list(categorical_features_names_rm_miss)

for year in (2016, 2017):
    for month in (10, 11, 12):
        period = "{}{}".format(year, month)  # e.g. "201610"

        orig = pd.read_pickle(data_dir + "X_no_hash_test_" + period + ".pkl")

        # Cast the categorical columns to str before building the Pool.
        orig[categorical_columns] = orig[categorical_columns].astype(str)

        test_pool = Pool(orig, cat_features=categorical_features_index_rm_miss)

        y_pred = make_submission_prediction(year, month, reg_catboost_cv, test_pool)

        # The original cell created one module-level variable per period
        # (y_pred201610, ...) through exec(); the same names are still bound
        # here so any later cell that reads them keeps working.
        globals()["y_pred" + period] = y_pred

        # In-place update of the column named `period`, aligned on the index.
        sample_submission.update(y_pred)

        # Drop the large per-period objects before loading the next period.
        del orig, test_pool, y_pred

        gc.collect()

In [17]:
# Write the filled-in submission next to the input data, formatting every
# float with four decimal places.
sample_submission.to_csv(
    data_dir + 'CatBoost_no_hash_submit_to_kaggle.csv',
    float_format='%.4f',
)