# Zillow's Home Value Prediction (Zestimate) #

## Load Packages ##

In [2]:
import numpy as np
import pandas as pd

import gc
import time

import geopy.distance

from sklearn.neighbors import NearestNeighbors

from catboost import Pool, CatBoostRegressor

from sklearn.metrics import mean_absolute_error

np.random.seed(0)

  return f(*args, **kwds)
  return f(*args, **kwds)


## Load Raw Data ##

In [3]:
data_dir = "/home/lee/Documents/Datasets for GitHub/kaggle_zillow_home_value_prediction/"

In [4]:
df_no_hash_labeled_clean_train = pd.read_pickle(data_dir+'df_no_hash_labeled_clean_train.pkl')
# df_labeled_raw_train = pd.read_pickle(data_dir+'df_labeled_raw_train.pkl')
# df_labeled_raw_val = pd.read_pickle(data_dir+'df_labeled_raw_val.pkl')

# df_labeled_raw_train['latitude'] = df_labeled_raw_train['latitude'] / 1e6
# df_labeled_raw_train['longitude'] = df_labeled_raw_train['longitude'] / 1e6

# df_labeled_raw_val['latitude'] = df_labeled_raw_val['latitude'] / 1e6
# df_labeled_raw_val['longitude'] = df_labeled_raw_val['longitude'] / 1e6

## Construct Features ##

The price of a house is highly correlated to other houses near it. For each house in the training dataset, we identify $n$ nearest neighbors in terms of GPS coordinates. Then we average their tax values.

There is more than one way to implement this approach. The `geopy` package calculates distance but is computationally inhibitive. 

In [70]:
# def find_closest_houses(cord_input, df, n_neighbors=10):
#     # find the distance between given coordinates and all coordinates in the dataset
#     dist = df.apply(lambda row: geopy.distance.distance(cord_input, (row['latitude'], row['longitude'])).km, axis=1)
    
#     # find the nearest neighbors, remove the house at the given coordinates itself
#     closest_parcelid = dist.nsmallest(n_neighbors)
#     closest_parcelid = closest_parcelid[closest_parcelid > 0].copy()
    
#     # the average log error of these nearest neighbors
#     avg_log_error = df.loc[closest_parcelid.index, 'logerror'].mean()
    
#     gc.collect()
    
#     return avg_log_error

# start = time.time()
# candidate_prices = df_labeled_raw_train[['logerror', 'latitude', 'longitude']].dropna().copy()
# candidate_prices['closest_neighbors_logerror'] = candidate_prices.apply(lambda row: \
#                                                    find_closest_houses((row['latitude'], row['longitude']), \
#                                                                        candidate_prices, \
#                                                                        n_neighbors=20), axis=1)
# print("Took {:.8f} s".format(time.time() - start))

In [5]:
df_no_hash_labeled_clean_train.sort_values('transactiondate', inplace=True)
num_neighbors = 3
train_dates = df_no_hash_labeled_clean_train.transactiondate.unique()[1:]
# creating some columns that we will be populating

# df_labeled_raw_train['avg_nn_logerror'] = np.nan
df_no_hash_labeled_clean_train['avg_nn_taxvaluedollarcnt'] = np.nan
df_no_hash_labeled_clean_train['avg_nn_structuretaxvaluedollarcnt'] = np.nan

### Find Nearest Neighbors ###

In [6]:
train_int_id = df_no_hash_labeled_clean_train[df_no_hash_labeled_clean_train['latitude'].notnull() & df_no_hash_labeled_clean_train['longitude'].notnull()].reset_index()

for d in train_dates:
    previous_transactions = train_int_id.loc[train_int_id.transactiondate < d, ['longitude','latitude']]
    current_date_transactions = train_int_id.loc[train_int_id.transactiondate == d, ['longitude','latitude']]
    if previous_transactions.shape[0] >= num_neighbors: 
        nbrs = NearestNeighbors(n_neighbors=num_neighbors, algorithm='ball_tree')\
                .fit(previous_transactions)
        distances, indices = nbrs.kneighbors(current_date_transactions)
        indices = indices.reshape(-1, 1)

#         nearest_logerrors = train_int_id.loc[indices.squeeze(), 'logerror']
#         nearest_logerrors = nearest_logerrors.values.reshape(-1, num_neighbors)

        nearest_taxvaluedollarcnt = train_int_id.loc[indices.squeeze(), 'taxvaluedollarcnt']
        nearest_taxvaluedollarcnt = nearest_taxvaluedollarcnt.values.reshape(-1, num_neighbors)

        nearest_structuretaxvaluedollarcnt = train_int_id.loc[indices.squeeze(), 'structuretaxvaluedollarcnt']
        nearest_structuretaxvaluedollarcnt = nearest_structuretaxvaluedollarcnt.values.reshape(-1, num_neighbors)

#         train_int_id.loc[train_int_id.transactiondate==d, ['avg_nn_logerror']] = nearest_logerrors.mean(axis=1)
        train_int_id.loc[train_int_id.transactiondate == d, ['avg_nn_taxvaluedollarcnt']] = nearest_taxvaluedollarcnt.mean(axis=1)
        train_int_id.loc[train_int_id.transactiondate == d, ['avg_nn_structuretaxvaluedollarcnt']] = nearest_structuretaxvaluedollarcnt.mean(axis=1)
    else:
        pass

In [7]:
train_int_id.set_index('parcelid', inplace=True)

In [9]:
train_int_id_ms = df_no_hash_labeled_clean_train[df_no_hash_labeled_clean_train['latitude'].isnull() | df_no_hash_labeled_clean_train['longitude'].isnull()]

In [10]:
df_labeled_train = pd.concat([train_int_id, train_int_id_ms])
del train_int_id, train_int_id_ms

In [12]:
columns_all_rm_miss = tuple(df_labeled_train.drop(['logerror', 'transactiondate'], axis=1))

flag_features_set = set(('fireplaceflag', 'hashottuborspa', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7', \
                         'taxdelinquencyflag'))

categorical_features_set = set(('airconditioningtypeid', 'architecturalstyletypeid', \
                                'buildingclasstypeid', 'decktypeid', 'fips', 'heatingorsystemtypeid', \
                                'propertycountylandusecode', 'propertylandusetypeid', \
                                'propertyzoningdesc', 'rawcensustractandblock', 'censustractandblock', \
                                'regionidcounty', 'regionidcity', 'regionidzip', \
                                'regionidneighborhood', 'typeconstructiontypeid', 'assessmentyear', \
                                'taxdelinquencyyear', 'transaction_year', 'transaction_month'))

categorical_features_index_rm_miss = list(icol for icol, col in enumerate(columns_all_rm_miss) \
                                          if (col in categorical_features_set) == True)

categorical_features_names_rm_miss = tuple(col for col in columns_all_rm_miss \
                                           if (col in categorical_features_set) == True)

# del flag_features_set, continuous_features_set, categorical_features_set 

## Train Model ##

In [13]:
y_labeled_train = pd.read_pickle(data_dir+'y_labeled_train.pkl')
X_no_hash_labeled_train = df_labeled_train.drop(['logerror', 'transactiondate'], axis=1)

In [15]:
X_no_hash_labeled_train[list(categorical_features_names_rm_miss)] = X_no_hash_labeled_train[list(categorical_features_names_rm_miss)]\
                                                            .astype(str)
# initialize Pool
train_pool = Pool(X_no_hash_labeled_train, label=y_labeled_train, cat_features=categorical_features_index_rm_miss)

In [16]:
reg_catboost = CatBoostRegressor(loss_function='MAE', eval_metric='MAE', verbose=False)

In [17]:
# train the model
reg_catboost.fit(train_pool)

In [20]:
# make the prediction using the resulting model
print("MAE in training: {:.8f}".format(mean_absolute_error(y_labeled_train, reg_catboost.predict(train_pool))))

MAE in training: 0.06816847


The new features do not improve training MAE. Go back to the model without these features. 