# CatBoost Method

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from catboost import CatBoostRegressor
from tqdm import tqdm
import gc
import datetime as dt

# Read the per-year property tables, then the per-year training labels.
print('Loading Properties ...')
properties2016, properties2017 = [
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ('properties_2016.csv', 'properties_2017.csv')
]

print('Loading Train ...')
# transactiondate is parsed up front because add_date_features uses the
# datetime accessor on that column.
train2016, train2017 = [
    pd.read_csv(csv_name, parse_dates=['transactiondate'], low_memory=False)
    for csv_name in ('train_2016_v2.csv', 'train_2017.csv')
]

def add_date_features(df):
    """Replace the ``transactiondate`` column with calendar features.

    Adds ``transaction_year``, ``transaction_month``, ``transaction_day`` and
    ``transaction_quarter`` (in that order) and then drops
    ``transactiondate``. The frame is modified in place and also returned.

    Args:
        df: DataFrame whose ``transactiondate`` column has a datetime dtype.

    Returns:
        The same DataFrame object that was passed in.
    """
    when = df["transactiondate"].dt
    for part in ("year", "month", "day", "quarter"):
        df["transaction_" + part] = getattr(when, part)
    df.drop("transactiondate", axis=1, inplace=True)
    return df

# Expand the transaction date of both training years into calendar features.
train2016, train2017 = add_date_features(train2016), add_date_features(train2017)

print('Loading Sample ...')
sample_submission = pd.read_csv('sample_submission.csv', low_memory=False)

print('Merge Train with Properties ...')
# Each training year is joined with the property table of the same year.
train2016 = train2016.merge(properties2016, how='left', on='parcelid')
train2017 = train2017.merge(properties2017, how='left', on='parcelid')

# Disabled experiment, kept for reference: blank out the 2016 tax* columns.
#print('Tax Features 2016  ...')
#train2016.iloc[:, train2016.columns.str.startswith('tax')] = np.nan

print('Concat Train 2016 & 2017 ...')
train_df = pd.concat([train2016, train2017], axis=0)

# The prediction frame follows the row order of the sample submission and is
# built from the 2016 property table.
test_properties = properties2016.rename(columns={'parcelid': 'ParcelId'})
test_df = sample_submission[['ParcelId']].merge(test_properties, how='left', on='ParcelId')

del test_properties
del properties2016, properties2017, train2016, train2017
gc.collect();

print('Remove missing data fields ...')

# Columns whose fraction of null rows exceeds this threshold are excluded.
missing_perc_thresh = 0.98
num_rows = train_df.shape[0]
exclude_missing = [
    c for c in train_df.columns
    if train_df[c].isnull().sum() / float(num_rows) > missing_perc_thresh
]
print("We exclude: %s" % len(exclude_missing))

del num_rows, missing_perc_thresh
gc.collect();

print ("Remove features with one unique value !!")
# A column is excluded when it holds exactly one distinct value once the
# missing-value entry (if the column has any nulls) is discounted.
exclude_unique = [
    c for c in train_df.columns
    if len(train_df[c].unique()) - int(train_df[c].isnull().any()) == 1
]
print("We exclude: %s" % len(exclude_unique))

print ("Define training features !!")
# Identifier, target and one column dropped by name.
exclude_other = ['parcelid', 'logerror','propertyzoningdesc']
# Union of every exclusion list, for a single membership test per column.
dropped = set(exclude_missing) | set(exclude_other) | set(exclude_unique)
train_features = [c for c in train_df.columns if c not in dropped]
print("We use these for training: %s" % len(train_features))

print ("Define categorial features !!")
cat_unique_thresh = 1000
# A column whose name contains any of these fragments is never treated as
# categorical, however few distinct values it has.
numeric_name_parts = ('sqft', 'cnt', 'nbr', 'number')
# Positions (within train_features) of the columns handed to CatBoost as
# categorical: fewer than cat_unique_thresh distinct values and no numeric
# name fragment.
cat_feature_inds = [
    ind for ind, name in enumerate(train_features)
    if len(train_df[name].unique()) < cat_unique_thresh
    and not any(part in name for part in numeric_name_parts)
]

print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])

print ("Replacing NaN values by -999 !!")
for frame in (train_df, test_df):
    frame.fillna(-999, inplace=True)
# Drop the loop alias so it does not keep a frame alive after later `del`s.
del frame

print ("Training time !!")
y_train = train_df['logerror']
X_train = train_df[train_features]
print(X_train.shape, y_train.shape)

# Give every test row a placeholder transaction date so the test frame gets
# the same date-derived columns as the training frame.
test_df['transactiondate'] = pd.Timestamp('2016-12-01') 
test_df = add_date_features(test_df)
X_test = test_df[train_features]
print(X_test.shape)

# Submission column label -> transaction date assumed when predicting it.
test_data = {'201610':'2016-10-15',
             '201611':'2016-11-15',
             '201612':'2016-12-15',
             '201710':'2017-10-15',
             '201711':'2017-11-15',
             '201712':'2017-12-15'}
# Predictions are accumulated into the columns read from the sample file.
submission = pd.read_csv('sample_submission.csv')

num_ensembles = 5
for seed in tqdm(range(num_ensembles)):
    # Identical hyper-parameters each round; only the random seed changes.
    model = CatBoostRegressor(
        iterations=630,
        learning_rate=0.03,
        depth=6,
        l2_leaf_reg=3,
        loss_function='MAE',
        eval_metric='MAE',
        random_seed=seed,
    )
    model.fit(X_train, y_train, cat_features=cat_feature_inds)

    for label, date in test_data.items():
        # Re-derive the date features for the date belonging to this column.
        test_df['transactiondate'] = pd.Timestamp(date) 
        test_df = add_date_features(test_df)
        X_test = test_df[train_features]
        submission[label] += model.predict(X_test)

# Turn the per-column sums into the ensemble average.
submission.iloc[:,1:7] /= num_ensembles
del train_df, test_df, X_train, y_train, X_test
gc.collect();

# XGBoost, OLS and Mean

In [None]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
import random

properties = pd.read_csv('properties_2017.csv')
train = pd.read_csv('train_2017.csv')
xgb_sub = pd.read_csv('sample_submission.csv')

##### PROCESS DATA FOR XGBOOST
print( "\nProcessing data for XGBoost ...")
for c in properties.columns:
    # Missing values become -1 before any encoding takes place.
    properties[c]=properties[c].fillna(-1)
    if properties[c].dtype == 'object':
        # Object columns are replaced by integer labels.
        lbl = LabelEncoder()
        lbl.fit(list(properties[c].values))
        properties[c] = lbl.transform(list(properties[c].values))

train_df = train.merge(properties, how='left', on='parcelid')
x_test = pd.merge(xgb_sub[['ParcelId']],properties, how='left', left_on='ParcelId', right_on='parcelid')

# Identifier, target and the unparsed transaction date are not model inputs.
non_feature_cols = ['parcelid', 'logerror', 'transactiondate']

# Build the feature matrix before reporting its shape; it is rebuilt below
# once the outliers have been removed.
x_train = train_df.drop(non_feature_cols, axis=1)
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))

# drop outliers: keep rows with -0.4 < logerror < 0.419
inlier_mask = (train_df.logerror > -0.4) & (train_df.logerror < 0.419)
train_df = train_df[inlier_mask]
x_train = train_df.drop(non_feature_cols, axis=1)
y_train = train_df["logerror"].values.astype(np.float32)
y_mean = np.mean(y_train)
# Give the test matrix exactly the training columns, in the same order.
x_test = x_test[x_train.columns]

print('After removing outliers:')     
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))

print("\nSetting up data for XGBoost ...")
# Parameters of the first booster (depth 5, with L1/L2 regularisation).
# NOTE(review): 'reg:linear' and 'silent' may be legacy spellings in the
# installed XGBoost release - confirm before upgrading.
xgb_params = {
    'eta': 0.037,
    'max_depth': 5,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,
    'alpha': 0.4,
    'base_score': y_mean,
    'silent': 1,
}

# Both boosters are trained on and predict from these two matrices.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

num_boost_rounds = 250
print("num_boost_rounds="+str(num_boost_rounds))

# Train on a copy of the parameter dict, leaving xgb_params itself untouched.
print( "\nTraining XGBoost ...")
model = xgb.train(dict(xgb_params), dtrain, num_boost_round=num_boost_rounds)

print( "\nPredicting with XGBoost ...")
xgb_pred1 = model.predict(dtest)

print( "\nFirst XGBoost predictions:" )
print( pd.DataFrame(xgb_pred1).head() )

##### RUN XGBOOST AGAIN
print("\nSetting up data for XGBoost ...")
# Parameters of the second booster: depth 6, smaller eta, and no explicit
# lambda/alpha entries.
xgb_params = {
    'eta': 0.033,
    'max_depth': 6,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'base_score': y_mean,
    'silent': 1,
}

num_boost_rounds = 150
print("num_boost_rounds="+str(num_boost_rounds))

# Reuses the dtrain / dtest matrices built for the first run.
print( "\nTraining XGBoost again ...")
model = xgb.train(dict(xgb_params), dtrain, num_boost_round=num_boost_rounds)

print( "\nPredicting with XGBoost again ...")
xgb_pred2 = model.predict(dtest)

print( "\nSecond XGBoost predictions:" )
print( pd.DataFrame(xgb_pred2).head() )

##### COMBINE XGBOOST RESULTS

# Fixed-weight blend of the two boosters: 80% first run, 20% second run.
xgb_pred = 0.8*xgb_pred1 + 0.2*xgb_pred2
#xgb_pred = xgb_pred1

print( "\nCombined XGBoost predictions:" )
print( pd.DataFrame(xgb_pred).head() )

# Only xgb_pred is needed from here on; release everything else.
del train_df, x_train, x_test, properties
del dtest, dtrain, xgb_pred1, xgb_pred2
gc.collect()

##### OLS
# Seed both the standard-library and the NumPy random generators.
random.seed(17)
np.random.seed(17)

# Inputs are re-read from disk; the training dates are parsed this time
# because get_features derives year/month/quarter from them.
properties = pd.read_csv("properties_2017.csv")
train = pd.read_csv("train_2017.csv", parse_dates=["transactiondate"])
xgb_ols = pd.read_csv("sample_submission.csv")
print(len(train),len(properties))

def get_features(df):
    """Derive date features for the OLS model and fill missing values.

    Adds ``transactiondate_year`` and ``transactiondate_month`` and overwrites
    ``transactiondate`` itself with the quarter number. These assignments are
    made on ``df`` directly; the returned frame is the result of filling every
    remaining missing value with ``-1.0``.

    Args:
        df: DataFrame with a ``transactiondate`` column that
            ``pd.to_datetime`` can parse.

    Returns:
        A DataFrame with the date features added and no missing values.
    """
    parsed = pd.to_datetime(df["transactiondate"]).dt
    df["transactiondate_year"] = parsed.year
    df["transactiondate_month"] = parsed.month
    # The original date column is reused to carry the quarter.
    df["transactiondate"] = parsed.quarter
    return df.fillna(-1.0)

train = pd.merge(train, properties, how='left', on='parcelid')
y = train['logerror'].values
test = pd.merge(xgb_ols, properties, how='left', left_on='ParcelId', right_on='parcelid')
properties = [] #memory

# Exclude object-typed columns plus the target and the identifier. The dtypes
# are iterated as (label, dtype) pairs: looking them up with integer keys on
# a label-indexed Series relies on a positional fallback that pandas has
# deprecated.
exc = [name for name, dtype in train.dtypes.items() if dtype == 'O'] + ['logerror','parcelid']
col = [c for c in train.columns if c not in exc]

train = get_features(train[col])
test['transactiondate'] = '2017-10-01' #should use the most common training date
test = get_features(test[col])

reg = LinearRegression(n_jobs=-1)
reg.fit(train, y); print('fit...')

# One prediction date per submission column, paired by position.
test_dates = ['2016-10-01','2016-11-01','2016-12-01','2017-10-01','2017-11-01','2017-12-01']
test_columns = ['201610','201611','201612','201710','201711','201712']

##### Combine xgb, mean and OLS
# Mix a small share of the mean training logerror into the XGBoost blend.
pred0 = (1-0.006)*xgb_pred + 0.006*np.mean(y)
for i, (test_date, test_column) in enumerate(zip(test_dates, test_columns)):
    # get_features re-derives year/month/quarter from the date set here.
    test['transactiondate'] = test_date
    pred = 0.08*reg.predict(get_features(test)) + (1-0.08)*pred0
    # Stored with six decimal places.
    xgb_ols[test_column] = [float(format(x, '.6f')) for x in pred]
    print('predict...', i)
    
del train,properties,test,y,pred0,pred,xgb_pred
gc.collect()

In [8]:
# Show the first rows of the xgb_ols prediction frame.
xgb_ols.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0.062555,0.062579,0.062604,0.062555,0.062579,0.062604
1,10759547,0.037543,0.037568,0.037593,0.037543,0.037568,0.037593
2,10843547,-0.065061,-0.065037,-0.065012,-0.065061,-0.065037,-0.065012
3,10859147,-0.010865,-0.01084,-0.010815,-0.010865,-0.01084,-0.010815
4,10879947,0.005878,0.005903,0.005928,0.005878,0.005903,0.005928


# LightGBM Method

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold

print('Loading data...')
properties2016, properties2017 = [
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ('properties_2016.csv', 'properties_2017.csv')
]
# transactiondate is left unparsed here; it is converted where it is used.
train2016, train2017 = [
    pd.read_csv(csv_name)
    for csv_name in ('train_2016_v2.csv', 'train_2017.csv')
]

sample_submission = pd.read_csv('sample_submission.csv', low_memory=False)

# Join each training year with the property table of the same year, then
# stack the two years into one frame.
train = pd.concat(
    [
        train2016.merge(properties2016, how='left', on='parcelid'),
        train2017.merge(properties2017, how='left', on='parcelid'),
    ],
    axis=0,
)
# The test frame follows the sample-submission row order and uses the 2017
# property table.
test = sample_submission[['ParcelId']].merge(
    properties2017.rename(columns={'parcelid': 'ParcelId'}),
    how='left', on='ParcelId')
del properties2016, properties2017, train2016, train2017
gc.collect();


print('Memory usage reduction...')
# Apply the same rescaling of coordinates and census id to both frames.
for frame in (train, test):
    frame[['latitude', 'longitude']] /= 1e6
    frame['censustractandblock'] /= 1e12
# Drop the loop alias so it does not keep `test` alive after it is deleted.
del frame

# Downcast the default-width integer and float columns of the test frame
# to 32-bit.
for column in test.columns:
    column_dtype = test[column].dtype
    if column_dtype == int:
        test[column] = test[column].astype(np.int32)
    elif column_dtype == float:
        test[column] = test[column].astype(np.float32)
      
        
print('Feature engineering...')
#train['year'] = pd.to_datetime(train['transactiondate']).dt.year
train['month'] = pd.to_datetime(train['transactiondate']).dt.month
train = train.drop('transactiondate', axis = 1)
from sklearn.preprocessing import LabelEncoder
non_number_columns = train.dtypes[train.dtypes == object].index.values

for column in non_number_columns:
    # Fit on train and test together so both frames share one mapping.
    train_test = pd.concat([train[column], test[column]], axis = 0)
    encoder = LabelEncoder().fit(train_test.astype(str))
    train[column] = encoder.transform(train[column].astype(str)).astype(np.int32)
    test[column] = encoder.transform(test[column].astype(str)).astype(np.int32)
    
# Everything after the first two columns, except the helper 'month' column.
feature_names = [feature for feature in train.columns[2:] if feature != 'month']

# Deviation of each calendar month's mean logerror from the overall mean.
# The result is later indexed as month_avgs[month - 1], so it is laid out on
# months 1..12 explicitly. A month missing from the training data gets a
# deviation of 0.0 instead of shifting every later month by one position.
overall_logerror_mean = train['logerror'].mean()
month_avgs = (
    (train.groupby('month')['logerror'].mean() - overall_logerror_mean)
    .reindex(range(1, 13), fill_value=0.0)
    .values
)
                             
print('Preparing arrays and throwing out outliers...')
X_test = test[feature_names].values
X_train = train[feature_names].values
y_train = train['logerror'].values

# The test frame is no longer needed once its array has been extracted.
del test
gc.collect();

# Append each training row's month-average deviation as a last feature column.
month_values = train['month'].values
month_avg_values = month_avgs[month_values - 1].reshape(-1, 1)
X_train = np.hstack([X_train, month_avg_values])

# Keep only rows whose |logerror| is below 0.4.
inlier_rows = np.abs(y_train) < 0.4
X_train = X_train[inlier_rows, :]
y_train = y_train[inlier_rows]

kfolds = 4

# One booster per fold; all of them are averaged at prediction time.
models = []
# No random_state is passed, so the shuffled split is not seeded here.
kfold = KFold(n_splits = kfolds, shuffle = True)
for fold_no, (fit_rows, held_out_rows) in enumerate(kfold.split(X_train, y_train), start=1):

    print('Training LGBM model with fold {}...'.format(fold_no))
    ltrain = lgb.Dataset(X_train[fit_rows], label=y_train[fit_rows], free_raw_data=False)
    lvalid = lgb.Dataset(X_train[held_out_rows], label=y_train[held_out_rows], free_raw_data=False)

    params = {
        'metric': 'mae',
        'max_depth': 100,
        'num_leaves': 32,
        'feature_fraction': .85,
        'bagging_fraction': .95,
        'bagging_freq': 8,
        'learning_rate': 0.01,
        'verbosity': 0,
    }

    # Both the fitting rows and the held-out rows are passed as validation sets.
    booster = lgb.train(params, ltrain, valid_sets=[ltrain, lvalid],
                        verbose_eval=200, num_boost_round=1000)
    models.append(booster)
                  
                  
print('Making predictions and praying for good results...')
# Extra last column: placeholder for the month-average feature, overwritten
# for each predicted month below.
X_test = np.hstack([X_test, np.zeros((X_test.shape[0], 1))])
folds = 10
n = int(X_test.shape[0] / folds)

# Predict chunk by chunk and stitch the pieces together afterwards.
chunks = []
for j in tqdm(range(folds)):
    # Every chunk holds n rows; the last one also takes the remainder.
    rows = slice(j*n, (j+1)*n) if j < folds - 1 else slice(j*n, None)
    X_test_ = X_test[rows, :]

    results = pd.DataFrame()
    results['ParcelId'] = sample_submission['ParcelId'].iloc[rows]

    for month in [10, 11, 12]:
        X_test_[:, -1] = month_avgs[month - 1]
        assert X_test_.shape[1] == X_test.shape[1]
        # Average the predictions of the fold models.
        y_pred = np.zeros(X_test_.shape[0])
        for model in models:
            y_pred += model.predict(X_test_) / kfolds
        # The same prediction fills this month's 2016 and 2017 columns.
        results['2016'+ str(month)] = y_pred
        results['2017'+ str(month)] = y_pred

    chunks.append(results)

results_ = pd.concat(chunks, axis = 0)

print('Saving predictions...')
# Put the columns in the order used by the sample submission.
lgb_results = results_[sample_submission.columns]
assert lgb_results.shape == sample_submission.shape

# Combine the results computed by the methods above

In [18]:
# Start from a copy of the LightGBM results; its columns 1..6 are overwritten
# by the blend.
combine_methods = lgb_results.copy()

In [26]:
# Weighted blend of the three models: 0.6 `submission`, 0.3 LightGBM,
# 0.1 XGBoost/OLS. Every operand is sliced to the six prediction columns so
# the ParcelId column never enters the arithmetic and the right-hand side has
# exactly the six columns being assigned.
combine_methods.iloc[:,1:7] = 0.6*submission.iloc[:,1:7] + 0.3*lgb_results.iloc[:,1:7] + \
                            0.1*xgb_ols.iloc[:,1:7]

In [27]:
# Show the first rows of the blended predictions.
combine_methods.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0.019003,0.019449,0.020227,0.01908,0.018554,0.019278
1,10759547,0.00765,0.008339,0.00942,0.008578,0.00829,0.009674
2,10843547,0.016101,0.015559,0.017389,0.015731,0.014207,0.016035
3,10859147,0.023825,0.02304,0.022669,0.023199,0.021748,0.02142
4,10879947,0.000136,0.003164,0.002299,0.002035,0.003977,0.003104


# Submit results

In [28]:
# Write the blended predictions to CSV without the index column.
combine_methods.to_csv('final_2017.csv',index=False)