In [1]:
import pandas as pd
import catboost
import numpy as np
from sklearn.decomposition import PCA
from tqdm import tqdm

In [2]:
train_2016 = pd.read_csv('../../train_2016_v2.csv', parse_dates=['transactiondate'], low_memory=False)
train_2017 = pd.read_csv('../../train_2017.csv', parse_dates=['transactiondate'], low_memory=False)
properties_2016 = pd.read_csv('../../properties_2016.csv', low_memory=False)
properties_2017 = pd.read_csv('../../properties_2017.csv', low_memory=False)
test_2016 = pd.read_csv('../../submission.csv', low_memory=False)
test_2017 = pd.read_csv('../../submission.csv', low_memory=False)
# field is named differently in submission
test_2016['parcelid'] = test_2016['ParcelId']
test_2017['parcelid'] = test_2017['ParcelId']

In [3]:
# similar to the1owl
def add_date_features(df):
    df["transaction_year"] = df["transactiondate"].dt.year
    df["transaction_month"] = df["transactiondate"].dt.month
    df["transaction_quarter"] = df["transactiondate"].dt.quarter
    df['sin_month'] = np.sin(df['transaction_month'] * np.pi/12)
    df['cos_month'] = np.sin(df['transaction_month'] * np.pi/12)
    df['sin_quarter'] = np.sin(df['transaction_quarter'] * np.pi/4)
    df['cos_quarter'] = np.sin(df['transaction_quarter'] * np.pi/4)
    df.drop(["transactiondate"], inplace=True, axis=1)
    return df

def add_other_features(df):
    df['num_missing'] = 0
    for column in df.columns:
        df['num_missing'] += df[column].isnull()
    return df

def add_geographic_features(property2016, property2017):
    property2016['year'] = 2016
    property2017['year'] = 2017
    complete = property2016.append(property2017)
    radian_lat = complete['latitude'] * np.pi/180
    radian_long = complete['longitude'] * np.pi/180
    earth_radius = 3959
    complete['x'] = (-earth_radius * np.cos(radian_lat) * np.sin(radian_long)).fillna(-999)
    complete['y'] = (earth_radius * np.sin(radian_lat)).fillna(-999)
    complete['z'] = (earth_radius * np.cos(radian_lat) * np.sin(radian_long)).fillna(-999)
    pca = PCA(n_components=2)
    rotated_latlong = pca.fit_transform(complete[['latitude', 'longitude']].fillna(-999))
    complete['latlong_pca0'] = rotated_latlong[:,0]
    complete['latlong_pca1'] = rotated_latlong[:,1]
    dropcols = ['year']
    return complete[complete.year == 2016].drop(dropcols, axis=1), complete[complete.year == 2017].drop(dropcols, axis=1)


In [4]:
train_2016 = add_date_features(train_2016)
train_2017 = add_date_features(train_2017)

In [5]:
properties_2016 = add_other_features(properties_2016)
properties_2017 = add_other_features(properties_2017)
properties_2016, properties_2017 = add_geographic_features(properties_2016, properties_2017)

In [6]:
train_2016 = train_2016.merge(properties_2016, how='left', on='parcelid')
train_2017 = train_2017.merge(properties_2017, how='left', on='parcelid')
train_df = train_2016.append(train_2017)
test_2016 = test_2016.merge(properties_2016, how='left', on='parcelid')
test_2017 = test_2017.merge(properties_2017, how='left', on='parcelid')
print("Train: ", train_df.shape)

Train:  (167888, 72)


In [7]:
missing_perc_thresh = 0.98
exclude_missing = []
num_rows = train_df.shape[0]
for c in train_df.columns:
    num_missing = train_df[c].isnull().sum()
    if num_missing == 0:
        continue
    missing_frac = num_missing / float(num_rows)
    if missing_frac > missing_perc_thresh and 'flag' not in c:
        exclude_missing.append(c)
print("We exclude: %s" % exclude_missing)
print(len(exclude_missing))

We exclude: ['architecturalstyletypeid', 'basementsqft', 'buildingclasstypeid', 'decktypeid', 'finishedsquarefeet13', 'finishedsquarefeet6', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'storytypeid', 'typeconstructiontypeid', 'yardbuildingsqft26']
12


In [8]:
# exclude where we only have one unique value :D
exclude_unique = []
for c in train_df.columns:
    num_uniques = len(train_df[c].unique())
    if train_df[c].isnull().sum() != 0:
        num_uniques -= 1
    if num_uniques == 1 and 'flag' not in c:
        exclude_unique.append(c)
print("We exclude: %s" % exclude_unique)
print(len(exclude_unique))

We exclude: ['decktypeid', 'hashottuborspa', 'poolcnt', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7', 'storytypeid']
7


In [9]:

exclude_other = ['parcelid', 'logerror', 'propertyzoningdesc', 'fireplacecnt'
                 , 'threequarterbathnbr', 'finishedfloor1squarefeet']
include_other = ['hashottuborspa']
# do not know what this is LARS, 'SHCG' 'COR2YY' 'LNR2RPD-R3' ?!?

train_features = []
for c in train_df.columns:
    if c not in exclude_missing \
       and c not in exclude_other and c not in exclude_unique:
        train_features.append(c)
train_features.append(include_other)
print("We use these for training: %s" % train_features)
print(len(train_features))

We use these for training: ['transaction_year', 'transaction_month', 'transaction_quarter', 'sin_month', 'cos_month', 'sin_quarter', 'cos_quarter', 'airconditioningtypeid', 'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'finishedsquarefeet15', 'finishedsquarefeet50', 'fips', 'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet', 'propertycountylandusecode', 'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt', 'unitcnt', 'yardbuildingsqft17', 'yearbuilt', 'numberofstories', 'fireplaceflag', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount', 'taxdelinquencyflag', 'taxdelinquencyyear', 'censustractandblock', 'num_missing', 'x', 'y', 'z', 'latlong_pca0', 'latlong_pca1', ['hashottuborspa']]
52


In [10]:
cat_feature_inds = []
cat_unique_thresh = 1000
for i, c in enumerate(train_features[:-1]):
    num_uniques = len(train_df[c].unique())
    if num_uniques < cat_unique_thresh \
       and not 'sqft' in c \
       and not 'cnt' in c \
       and not 'nbr' in c \
       and not 'cos' in c \
       and not 'sin' in c \
       and not 'number' in c \
       or 'cluster' in c \
       or 'id' in c \
       or 'census' in c \
       or 'code' in c \
       or 'desc' in c:
        cat_feature_inds.append(i)
        if(train_df[c].dtype == np.float64):
            train_df[c] = train_df[c].astype(str)
            test_2016[c] = test_2016[c].astype(str)
            test_2017[c] = test_2017[c].astype(str)
        
print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])

Cat features are: ['transaction_year', 'transaction_month', 'transaction_quarter', 'airconditioningtypeid', 'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertycountylandusecode', 'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'yearbuilt', 'fireplaceflag', 'assessmentyear', 'taxdelinquencyflag', 'taxdelinquencyyear', 'censustractandblock']


In [11]:
# some out of range int is a good choice
train_df.fillna(-999, inplace=True)
test_2016.fillna(-999, inplace=True)
test_2017.fillna(-999, inplace=True)

MemoryError: 

In [None]:
X_train = train_df[train_features[:-1]]
y_train = train_df.logerror
print(X_train.shape, y_train.shape)

In [None]:
from sklearn.utils import shuffle

In [None]:
x_train_shuff, y_train_shuff = shuffle(X_train, y_train, random_state=0)

In [None]:
from catboost import CatBoostRegressor

In [None]:
num_ensembles = 5
models = []
for i in range(num_ensembles):
    # TODO(you): Use CV, tune hyperparameters
    print('Training ' + str(i))
    random = pd.Series([i for i in range(51)])
    random = shuffle(random)
    cols = pd.Series(X_train.columns[random]).sample(20)
    fit_x = X_train[cols]
    new_cats = X_train.columns[cat_feature_inds]
    new_cats_inds = [cols.tolist().index(cat) for cat in new_cats if cat in set(cols)]
    model = CatBoostRegressor(
        depth=6, 
        l2_leaf_reg=3, 
        learning_rate=0.03,
        iterations=200,
        loss_function='MAE',
        eval_metric='MAE',
        random_seed=i)
    model.fit(
        fit_x, y_train,
        cat_features=new_cats_inds)
    models.append(model)