In [43]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from tqdm import tqdm

> ## Data loading (the `transactiondate` column is parsed as a date)

In [44]:
# Training tables for each year; `transactiondate` is parsed into datetimes on load.
train_2016 = pd.read_csv(
    '../datasets/train_2016_v2.csv',
    parse_dates=['transactiondate'],
    low_memory=False,
)
train_2017 = pd.read_csv(
    '../datasets/train_2017.csv',
    parse_dates=['transactiondate'],
    low_memory=False,
)

# Property tables, one per year.
properties_2016 = pd.read_csv('../datasets/properties_2016.csv', low_memory=False)
properties_2017 = pd.read_csv('../datasets/properties_2017.csv', low_memory=False)

# Both test frames are read from the same sample submission file.
test_2016 = pd.read_csv('../datasets/sample_submission.csv', low_memory=False)
test_2017 = pd.read_csv('../datasets/sample_submission.csv', low_memory=False)

# The submission file names the key `ParcelId`; add a `parcelid` copy so the
# test frames share the key name used by the other tables.
for frame in (test_2016, test_2017):
    frame['parcelid'] = frame['ParcelId']

In [45]:
# similar to the1owl
def add_date_features(df):
    """Replace ``transactiondate`` with year, month and quarter columns.

    The frame is modified in place: ``transaction_year``,
    ``transaction_month`` and ``transaction_quarter`` are added and the
    ``transactiondate`` column is dropped.

    Args:
        df: DataFrame with a datetime-typed ``transactiondate`` column.

    Returns:
        The same DataFrame object that was passed in.
    """
    when = df["transactiondate"].dt
    df["transaction_year"] = when.year
    df["transaction_month"] = when.month
    df["transaction_quarter"] = when.quarter
    # The raw timestamp is removed once the derived columns exist.
    df.drop(columns=["transactiondate"], inplace=True)
    return df

In [46]:
# Swap `transactiondate` for year/month/quarter columns in each training frame.
train_2016 = add_date_features(train_2016)
train_2017 = add_date_features(train_2017)
# Left joins keep every training row, even when no property record matches.
train_2016 = train_2016.merge(properties_2016, how='left', on='parcelid')
train_2017 = train_2017.merge(properties_2017, how='left', on='parcelid')
# Stack both years into one training frame. `DataFrame.append` was removed in
# pandas 2.0; `pd.concat` with its defaults is the drop-in equivalent (each
# input keeps its own row labels, exactly as `append` did).
train_df = pd.concat([train_2016, train_2017])
# Optional outlier trimming on the target, currently disabled:
#train_df = train_df[train_df.logerror > -0.4]
#train_df = train_df[train_df.logerror < .42]
# Attach the matching year's property attributes to each test frame.
test_2016 = test_2016.merge(properties_2016, how='left', on='parcelid')
test_2017 = test_2017.merge(properties_2017, how='left', on='parcelid')
print("Train: ", train_df.shape)

Train:  (167888, 62)


# 0.a) Remove fields that are almost entirely missing

In [47]:
# A column is excluded when its share of missing values is strictly above
# this threshold.
missing_perc_thresh = 0.98
num_rows = train_df.shape[0]
# Share of null entries for every column, computed in one pass.
missing_share = train_df.isnull().sum() / float(num_rows)
exclude_missing = [
    c for c in train_df.columns
    if missing_share[c] > missing_perc_thresh
]
print("We exclude: %s" % exclude_missing)
print(len(exclude_missing))

We exclude: ['architecturalstyletypeid', 'basementsqft', 'buildingclasstypeid', 'decktypeid', 'finishedsquarefeet13', 'finishedsquarefeet6', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'storytypeid', 'typeconstructiontypeid', 'yardbuildingsqft26', 'fireplaceflag']
13


# 0.b) Remove fields that always hold the same value

In [48]:
# Exclude columns with exactly one distinct non-null value. `unique()` reports
# the missing marker as a value of its own, so one is subtracted whenever the
# column contains nulls.
exclude_unique = [
    c for c in train_df.columns
    if len(train_df[c].unique()) - int(train_df[c].isnull().any()) == 1
]
print("We exclude: %s" % exclude_unique)
print(len(exclude_unique))

We exclude: ['decktypeid', 'hashottuborspa', 'poolcnt', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7', 'storytypeid', 'fireplaceflag', 'taxdelinquencyflag']
9


# 1.a) Define training features

In [49]:
# Columns kept out of the feature set by hand:
#   - 'parcelid' and 'logerror' are for indexing/training only;
#   - 'propertyzoningdesc' holds codes such as 'LARS', 'SHCG', 'COR2YY',
#     'LNR2RPD-R3' whose meaning is not understood.
exclude_other = ['parcelid', 'logerror', 'propertyzoningdesc']

# Everything not ruled out by one of the three exclusion lists is a feature;
# the original column order of `train_df` is preserved.
excluded = set(exclude_missing) | set(exclude_other) | set(exclude_unique)
train_features = [c for c in train_df.columns if c not in excluded]
print("We use these for training: %s" % train_features)
print(len(train_features))

We use these for training: ['transaction_year', 'transaction_month', 'transaction_quarter', 'airconditioningtypeid', 'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr', 'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'finishedsquarefeet15', 'finishedsquarefeet50', 'fips', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet', 'propertycountylandusecode', 'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt', 'threequarterbathnbr', 'unitcnt', 'yardbuildingsqft17', 'yearbuilt', 'numberofstories', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount', 'taxdelinquencyyear', 'censustractandblock']
42


# 1.b) Define which of these training features are categorical

In [50]:
# A feature is treated as categorical when it has fewer distinct values than
# this threshold and its name carries none of the markers below.
cat_unique_thresh = 1000
# Name fragments that keep a column out of the categorical set.
numeric_markers = ('sqft', 'cnt', 'nbr', 'number')
# Positions (within `train_features`) of the categorical columns.
cat_feature_inds = [
    i for i, c in enumerate(train_features)
    if len(train_df[c].unique()) < cat_unique_thresh
    and not any(marker in c for marker in numeric_markers)
]

print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])

Cat features are: ['transaction_year', 'transaction_month', 'transaction_quarter', 'airconditioningtypeid', 'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertycountylandusecode', 'propertylandusetypeid', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'yearbuilt', 'assessmentyear', 'taxdelinquencyyear']


# 1.c) Fill missing values

In [51]:
# Missing values are replaced in place by -999, an out-of-range sentinel, in
# the training frame and in both test frames.
# NOTE(review): columns that contained NaN stay float after this fill; recent
# CatBoost releases appear to reject float-valued categorical features, so
# confirm against the installed version (cast to int/str if needed).
# Optional outlier trimming on the target, currently disabled:
#train_df = train_df[train_df.logerror > -0.4]
#train_df = train_df[train_df.logerror < .42]
for frame in (train_df, test_2016, test_2017):
    frame.fillna(-999, inplace=True)

# 2.a) Training time!

In [52]:
# Target column and feature matrix used to fit the models.
y_train = train_df['logerror']
X_train = train_df[train_features]
print(X_train.shape, y_train.shape)

(164481, 42) (164481,)


In [None]:
# Number of models to fit; they differ only in their random seed.
num_ensembles = 5
# Settings shared by every model in the ensemble.
# TODO(you): Use CV, tune hyperparameters
catboost_params = dict(
    iterations=200,
    learning_rate=0.03,
    depth=6,
    l2_leaf_reg=3,
    loss_function='MAE',
    eval_metric='MAE',
)
models = []
for seed in range(num_ensembles):
    print('Training ' + str(seed))
    model = CatBoostRegressor(random_seed=seed, **catboost_params)
    # `cat_feature_inds` are column positions within `X_train`.
    model.fit(X_train, y_train, cat_features=cat_feature_inds)
    models.append(model)
    
def predict_result(test, models, num_ensembles=None):
    """Average the predictions of several fitted models.

    Args:
        test: Feature data handed unchanged to each model's ``predict``.
        models: Non-empty iterable of objects exposing ``predict(test)``.
        num_ensembles: Accepted for backward compatibility and ignored. The
            mean is always taken over the models actually supplied, so a
            count that disagrees with ``len(models)`` can no longer skew
            the result.

    Returns:
        The element-wise mean of the individual model predictions.

    Raises:
        ValueError: If ``models`` is empty.
    """
    models = list(models)
    if not models:
        raise ValueError("predict_result needs at least one model")
    total = 0.0
    for model in models:
        total = total + model.predict(test)
    # Divide by the number of predictions that were summed above.
    return total / len(models)

Training 0
Training 1
Training 2
Training 3
Training 4


# 3.) Create submission

In [None]:
# The submission starts with the parcel identifier column only; one prediction
# column per label is added below.
submission = pd.DataFrame({'ParcelId': test_2016['parcelid']})

# https://www.kaggle.com/c/zillow-prize-1/discussion/33899, Oct,Nov,Dec
# Each submission label (e.g. '201610') maps to the date used for that column.
test_dates_2016 = {
    '2016%d' % month: pd.Timestamp('2016-%d-30' % month)
    for month in (10, 11, 12)
}
test_dates_2017 = {
    '2017%d' % month: pd.Timestamp('2017-%d-30' % month)
    for month in (10, 11, 12)
}

# Same procedure for both years: stamp the frame with the date, derive the
# date features (which removes `transactiondate` again), then predict.
for frame, dates in ((test_2016, test_dates_2016), (test_2017, test_dates_2017)):
    for label, test_date in dates.items():
        print("Predicting for: %s ... " % (label))
        frame['transactiondate'] = test_date
        frame = add_date_features(frame)
        submission[label] = predict_result(frame[train_features], models, num_ensembles)

# Version number used in the output file name.
submission_major = 1
submission.to_csv(
    'submission_%03d.csv' % (submission_major),
    float_format='%.4f',
    index=False)
print("Done! Good luck with submission #%d :)" % submission_major)

Predicting for: 201610 ... 
Predicting for: 201611 ... 


In [None]:
# NOTE(review): this cell originally ended in an unfinished expression
# (`model.]`), which is a SyntaxError. What was meant to be inspected is not
# recoverable from the notebook; printing each model's parameters keeps the
# loop valid until the intended call is filled in.
for model in models:
    print(model.get_params())

In [None]:
# Display the positions (within `train_features`) of the categorical features.
cat_feature_inds