In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import gc
from sklearn.linear_model import LinearRegression
import random
import datetime as dt


In [None]:
# Import Data
# Load the property-attributes table and keep only rows flagged IsTraining == 1.
train = pd.read_csv('../data/datathon_propattributes.csv')
# .copy() makes the filtered result an independent frame, so columns assigned
# to it later are not written into a slice of the raw table
# (avoids SettingWithCopyWarning).
train = train[train['IsTraining'] == 1].copy()

In [43]:
# Columns to remove from the table.
# Each name appears exactly once; the original list repeated
# 'prop_house_number_suffix'.
dropped_cols = [
    'irregular_lot_flg',
    'prop_house_number_suffix',
    'apn',
    'prop_house_number',
    'prop_house_number_2',
    'prop_direction_left',
    'prop_street_name',
    'prop_suffix',
    'prop_direction_right',
    'prop_unit_type',
    'prop_unit_number',
    'prop_city',
    'prop_state',
    'prop_zip_code',
    'prop_zip_plus_4',
    'zoning',
    'census_tract',
    'mobile_home_ind',
]

# Dwelling-type categories that each get their own 0/1 indicator column.
# (The list holds 27 entries even though the name says 25.)
dwelling_type_top_25 = [
    'Single Family Residential',
    'Condominium (Residential)',
    'Row house (Residential)',
    'Residential-Vacant Land',
    'Duplex (2 units, any combination)',
    'Triplex (3 units, any combination)',
    'Townhouse (Residential)',
    'Apartments (generic)',
    'Residential (General) (Single)',
    'Vacant Land (General)',
    'Commercial/Office/Residential Mixed Use',
    'Mobile home',
    'Multi-Family Dwellings (Generic, 2+)',
    'Rural Residence (Agricultural)',
    'Commercial (General)',
    'Unusable Land (Remnant, Steep, etc.)',
    'Retail Stores (Personal Services, Photography, Travel)',
    'Commercial-Vacant Land',
    'Exempt (full or partial)',
    'Office Bldg (General)',
    'Agricultural-Unimproved Vacant Land',
    'Agricultural / Rural',
    'Misc. Structures - Ranch, Farm, Fixtures',
    'Warehouse (Industrial)',
    'Commercial Building',
    'Seasonal, Cabin, Vacation Residence',
    'Condominium Offices',
]
# Add one column per category, named after the category itself:
# 1 where dwelling_type equals it, 0 everywhere else.
for dwell_type in dwelling_type_top_25:
    train[dwell_type] = (train.dwelling_type == dwell_type).astype('int64')

#train.drop(['dwelling_type'],axis = 1, inplace = True)
    
    
 

In [24]:
# Per-column flag: True when the column holds at least one missing value.
print(train.isnull().any())

fips_cd                       False
apn                           False
IsTraining                    False
prop_house_number              True
prop_house_number_2            True
prop_house_number_suffix       True
prop_direction_left            True
prop_street_name               True
prop_suffix                    True
prop_direction_right           True
prop_unit_type                 True
prop_unit_number               True
prop_city                      True
prop_state                    False
prop_zip_code                  True
prop_zip_plus_4                True
dwelling_type                  True
zoning                         True
census_tract                   True
mobile_home_ind                True
timeshare_ind                  True
acres                         False
land_square_footage           False
irregular_lot_flg              True
assessed_total_value          False
assessed_land_value           False
assessed_improvement_value    False
market_total_value          

In [20]:

# COLUMNS TO DROP
dropped_cols = ['irregular_lot_flg','prop_house_number_suffix','apn']
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
train = train.drop(columns=dropped_cols, errors='ignore')
properties = train.columns

# Column names grouped by dtype.
float_columns = properties[train.dtypes == 'float64']
int_columns = properties[train.dtypes == 'int64']

# pandas.Index has no .extend(); Index.append returns the concatenation
# (float columns first, then int columns).
numeric_train = train[float_columns.append(int_columns)]

KeyboardInterrupt: 

In [None]:
# Clean Data
# Distinct values found in the 'prop_house_number_suffix' column of numeric_train.
# NOTE(review): 'prop_house_number_suffix' is listed in dropped_cols and removed
# from `train` before numeric_train is built, so this lookup presumably raises
# KeyError -- confirm the intended cell order.
set(numeric_train['prop_house_number_suffix'])

In [None]:
# Model Training

# Parameters: blend weights used when the model predictions are combined.
XGB_WEIGHT = 0.6
BASELINE_WEIGHT = 0.0
OLS_WEIGHT = 0.06

# Weight of first in combination of two XGB models
XGB1_WEIGHT = 0.8

# Baseline based on mean of training data, per Oleg
BASELINE_PRED = 0.0115



##  XGBoost   ##



##### PROCESS DATA FOR XGBOOST

print( "\nProcessing data for XGBoost ...")
# Replace missing values with -1 in every column, then label-encode the
# object-typed columns to integer codes.
# NOTE(review): this loop needs `properties` to be a DataFrame, but other cells
# in this notebook bind `properties` to `train.columns` (an Index) -- confirm
# which cell is expected to define it before this one runs.
for c in properties.columns:
    properties[c]=properties[c].fillna(-1)
    if properties[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(properties[c].values))
        properties[c] = lbl.transform(list(properties[c].values))

# Left-join the property attributes onto the training rows by 'parcelid'.
# NOTE(review): 'parcelid', 'logerror' and 'transactiondate' are not among the
# columns printed for the datathon table elsewhere in this notebook -- verify
# that `train` is the frame this cell expects.
train_df = train.merge(properties, how='left', on='parcelid')
# NOTE(review): this x_train is reassigned below (after outlier filtering)
# before anything reads it.
x_train = train_df.drop(['parcelid', 'logerror','transactiondate'], axis=1)
x_test = properties.drop(['parcelid'], axis=1)
# shape

# drop outliers: keep rows with -0.4 < logerror < 0.419
train_df=train_df[ train_df.logerror > -0.4 ]
train_df=train_df[ train_df.logerror < 0.419 ]
x_train=train_df.drop(['parcelid', 'logerror','transactiondate'], axis=1)
y_train = train_df["logerror"].values.astype(np.float32)
# Mean of the filtered target; used as 'base_score' in the XGBoost parameters.
y_mean = np.mean(y_train)

##### RUN XGBOOST

print("\nSetting up data for XGBoost ...")

# Number of boosting rounds for the first model.
num_boost_rounds = 250

# Parameters of the first model (L1/L2 regularised, depth 5).
xgb_params = {
    'eta': 0.037,
    'max_depth': 5,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,
    'alpha': 0.4,
    'base_score': y_mean,
    'silent': 1
}

# Wrap the training and test matrices for XGBoost.
dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test)

# Fit on a copy of the parameters (with 'silent' set to 1) and predict the test set.
model = xgb.train(
    params={**xgb_params, 'silent': 1},
    dtrain=dtrain,
    num_boost_round=num_boost_rounds,
)

xgb_pred1 = model.predict(dtest)




##### RUN XGBOOST AGAIN

# Number of boosting rounds for the second model.
num_boost_rounds = 150

# Parameters of the second model (depth 6, no explicit lambda/alpha).
xgb_params = {
    'eta': 0.033,
    'max_depth': 6,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'base_score': y_mean,
    'silent': 1
}

# Refit on the same dtrain and predict the same dtest.
model = xgb.train(
    params={**xgb_params, 'silent': 1},
    dtrain=dtrain,
    num_boost_round=num_boost_rounds,
)

xgb_pred2 = model.predict(dtest)


##### COMBINE XGBOOST RESULTS
# Weighted average of the two XGBoost prediction vectors.
xgb_pred = XGB1_WEIGHT * xgb_pred1 + (1 - XGB1_WEIGHT) * xgb_pred2

# Drop the large intermediates and ask the collector to reclaim them.
del train_df, x_train, x_test, properties, dtest, dtrain, xgb_pred1, xgb_pred2
gc.collect()


##    OLS     ##

# Seed both the stdlib and NumPy generators with the same value.
random.seed(17)
np.random.seed(17)

# Read the three input tables; transactiondate is parsed as a datetime on load.
train = pd.read_csv("../input/train_2016_v2.csv", parse_dates=["transactiondate"])
properties = pd.read_csv("../input/properties_2016.csv")
submission = pd.read_csv("../input/sample_submission.csv")
# Row counts of the three tables, space-separated on one line.
print(*map(len, (train, properties, submission)))

def get_features(df):
    """Build date-derived features and fill missing values.

    The input frame is left untouched; all changes are made on a copy, so the
    function is safe to call on a column slice such as ``frame[cols]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``transactiondate`` column that ``pd.to_datetime`` can
        parse.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which ``transactiondate`` is replaced by the quarter
        number, ``transactiondate_year`` and ``transactiondate_month`` are
        set, and every remaining missing value is filled with -1.0.
    """
    dates = pd.to_datetime(df["transactiondate"])
    # Work on a copy so the caller's frame (or the slice it passed) is not mutated.
    out = df.copy()
    out["transactiondate_year"] = dates.dt.year
    out["transactiondate_month"] = dates.dt.month
    # The raw date is replaced by its quarter so that only numeric columns remain.
    out["transactiondate"] = dates.dt.quarter
    return out.fillna(-1.0)

def MAE(y, ypred):
    """Mean absolute error between targets and predictions.

    Elements are paired by position, so a pandas Series with a non-default
    index works the same as a list or an array.

    Parameters
    ----------
    y : array-like of numbers
        True values.
    ypred : array-like of numbers
        Predicted values; must hold at least ``len(y)`` entries. Entries
        beyond ``len(y)`` are ignored.

    Returns
    -------
    numpy.float64
        ``mean(|y - ypred|)``; NaN for empty input.

    Raises
    ------
    IndexError
        If ``ypred`` has fewer entries than ``y``.
    """
    #logerror=log(Zestimate)−log(SalePrice)
    y_arr = np.asarray(y, dtype=float)
    pred_arr = np.asarray(ypred, dtype=float)
    n = len(y_arr)
    if len(pred_arr) < n:
        raise IndexError("ypred has fewer elements than y")
    # Vectorised and positional: avoids the per-element Python loop and
    # label-based Series lookups.
    return np.mean(np.abs(y_arr - pred_arr[:n]))


# Attach property attributes to the training rows and to the submission rows.
train = pd.merge(train, properties, how='left', on='parcelid')
y = train['logerror'].values
test = pd.merge(submission, properties, how='left', left_on='ParcelId', right_on='parcelid')
properties = [] #memory

# Exclude object-typed columns plus the target and the id.
# Iterating (name, dtype) pairs avoids indexing train.dtypes with integer
# positions, which relies on a deprecated Series.__getitem__ fallback.
exc = [name for name, dtype in train.dtypes.items() if dtype == 'O'] + ['logerror','parcelid']
col = [c for c in train.columns if c not in exc]

train = get_features(train[col])
test['transactiondate'] = '2016-01-01' #should use the most common training date
test = get_features(test[col])

# Fit ordinary least squares on the prepared training frame.
reg = LinearRegression(n_jobs=-1)
reg.fit(train, y)
print('fit...')
# In-sample mean absolute error.
print(MAE(y, reg.predict(train)))

# Rebind to empty lists so the large objects can be freed.
train = []
y = []

# Transaction dates to predict for, and the matching submission column names.
test_dates = ['2016-10-01','2016-11-01','2016-12-01','2017-10-01','2017-11-01','2017-12-01']
test_columns = ['201610','201611','201612','201710','201711','201712']




##### COMBINE PREDICTIONS

print( "\nCombining XGBoost, LightGBM, and baseline predicitons ..." )
# The LightGBM weight is whatever remains after the other three weights.
lgb_weight = 1 - XGB_WEIGHT - BASELINE_WEIGHT - OLS_WEIGHT 
# Rescale the non-OLS weights by 1/(1 - OLS_WEIGHT); the OLS term is blended
# in separately in the loop below.
lgb_weight0 = lgb_weight / (1 - OLS_WEIGHT)
xgb_weight0 = XGB_WEIGHT / (1 - OLS_WEIGHT)
baseline_weight0 =  BASELINE_WEIGHT / (1 - OLS_WEIGHT)
# NOTE(review): `p_test` is not assigned anywhere in the visible notebook
# (presumably the LightGBM predictions) -- this line raises NameError unless
# another cell defines it; confirm.
pred0 = xgb_weight0*xgb_pred + baseline_weight0*BASELINE_PRED + lgb_weight0*p_test

print( "\nCombined XGB/LGB/baseline predictions:" )
print( pd.DataFrame(pred0).head() )

print( "\nPredicting with OLS and combining with XGB/LGB/baseline predicitons: ..." )
# One submission column per test date: set the date, rebuild the features,
# blend the OLS prediction with pred0, and store values rounded to 4 decimals.
for i in range(len(test_dates)):
    test['transactiondate'] = test_dates[i]
    pred = OLS_WEIGHT*reg.predict(get_features(test)) + (1-OLS_WEIGHT)*pred0
    submission[test_columns[i]] = [float(format(x, '.4f')) for x in pred]
    print('predict...', i)



In [8]:
# Model Inference
# Display the dtype of every column in `train` (shown as the cell output).
train.dtypes

fips_cd                         int64
apn                            object
IsTraining                      int64
prop_house_number              object
prop_house_number_2            object
prop_house_number_suffix      float64
prop_direction_left            object
prop_street_name               object
prop_suffix                    object
prop_direction_right           object
prop_unit_type                 object
prop_unit_number               object
prop_city                      object
prop_state                     object
prop_zip_code                 float64
prop_zip_plus_4               float64
dwelling_type                  object
zoning                         object
census_tract                  float64
mobile_home_ind                object
timeshare_ind                  object
acres                         float64
land_square_footage             int64
irregular_lot_flg             float64
assessed_total_value          float64
assessed_land_value           float64
assessed_imp

In [10]:
# Validation
# Collect the names of the float64-typed columns of `train`.
properties = train.columns
numeric_columns = properties[train.dtypes == 'float64']

In [14]:
# Visualizations
# Rebind numeric_columns to the full column index of numeric_train.
numeric_columns = numeric_train.columns


Index(['fips_cd', 'IsTraining', 'land_square_footage', 'tax_year',
       'delinquent_tax_year', 'assessed_year', 'building_square_feet',
       'total_living_square_feet', 'total_ground_floor_square_feet',
       'total_basement_square_feet', 'total_garage_parking_square_feet',
       'year_built', 'effective_year_built', 'bedrooms', 'total_rooms',
       'total_baths_calculated', 'fireplace_num'],
      dtype='object')

In [33]:
# Distinct dwelling_type values, most frequent first (value_counts order).
print(train['dwelling_type'].value_counts().to_dict().keys())

dict_keys(['Single Family Residential', 'Condominium (Residential)', 'Row house (Residential)', 'Residential-Vacant Land', 'Duplex (2 units, any combination)', 'Triplex (3 units, any combination)', 'Townhouse (Residential)', 'Apartments (generic)', 'Residential (General) (Single)', 'Vacant Land (General)', 'Commercial/Office/Residential Mixed Use', 'Mobile home', 'Multi-Family Dwellings (Generic, 2+)', 'Rural Residence (Agricultural)', 'Commercial (General)', 'Unusable Land (Remnant, Steep, etc.)', 'Retail Stores (Personal Services, Photography, Travel)', 'Commercial-Vacant Land', 'Exempt (full or partial)', 'Office Bldg (General)', 'Agricultural-Unimproved Vacant Land', 'Agricultural / Rural', 'Misc. Structures - Ranch, Farm, Fixtures', 'Warehouse (Industrial)', 'Commercial Building', 'Seasonal, Cabin, Vacation Residence', 'Condominium Offices', 'Miscellaneous (General)', 'Restaurant', 'Auto repair, auto parts, Garage', 'Misc Residential Improvement', 'Residential Common Area (Condo/P

In [75]:
def find_neighbors(train, lat,long,k,n1 = 'geocode_latitude',n2 = 'geocode_longitude'):
    """Return the k rows of ``train`` closest to the point (lat, long).

    Closeness is the squared Euclidean distance computed directly on the two
    coordinate columns (no geodesic correction is applied).

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns named by ``n1`` and ``n2``.
    lat, long : float
        Query point, in the same units as the ``n1`` / ``n2`` columns.
    k : int
        Number of rows to return; fewer are returned when ``train`` has fewer
        than ``k`` rows.
    n1, n2 : str
        Names of the latitude-like and longitude-like columns.

    Returns
    -------
    pandas.DataFrame
        The selected rows, nearest first, with their original index labels.
        Rows with a missing coordinate have NaN distance and sort last.
    """
    offsets = train[[n1, n2]].to_numpy(dtype=float) - np.array([lat, long], dtype=float)
    sq_dist = np.square(offsets).sum(axis=1)
    # argsort yields row *positions*, so rows must be picked with iloc;
    # using them as labels with .loc is wrong whenever the index is not 0..n-1
    # (e.g. after filtering rows).
    nearest_pos = np.argsort(sq_dist)[:k]
    return train.iloc[nearest_pos]

def avg_residuals(train,model, target = 'sale_amt'):
    """Average residual (prediction minus actual) of ``model`` over ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Feature columns plus the ``target`` column.
    model : object
        Anything with a ``predict(features)`` method returning one value per row.
    target : str
        Name of the column holding the actual values; it is removed from the
        frame before it is passed to ``model.predict``.

    Returns
    -------
    float
        Mean of ``model.predict(features) - train[target]``; NaN when
        ``train`` has no rows.
    """
    # drop(columns=...) removes the target *column*; a bare drop(label) looks
    # for a row with that label instead.
    features = train.drop(columns=target)
    residuals = model.predict(features) - train[target]
    # Mean rather than sum, as the function name states.
    return np.mean(residuals)
    
def K_NN_Residuals(model,train,lat,long,k = 5,target = 'sale_amt',n1 = 'geocode_latitude',n2 = 'geocode_longitude'):
    """Residual statistic of ``model`` over the k rows nearest to (lat, long).

    Selects rows with ``find_neighbors(train, lat, long, k, n1, n2)`` and
    returns whatever ``avg_residuals`` computes for those rows, ``model`` and
    ``target``.
    """
    neighbors = find_neighbors(train, lat, long, k, n1, n2)
    return avg_residuals(neighbors, model, target)

    
    

0    0
1    4
Name: col1, dtype: int64
   col1  col2
1     2     2
0     1     3
2     3     5


In [64]:
# Scratch check of np.argsort on a Series: the values of the result are the
# positions of the k smallest entries of x (here 5, 0, 1), not the entries
# themselves.
x = pd.Series([1,2,3,4,5,.1])
k=3
np.argsort(x)[:k]


0    5
1    0
2    1
dtype: int64