In [281]:
import pandas as pd
import numpy as np

from sklearn.calibration import calibration_curve

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsRegressor

# Seed numpy's global RNG so every later random draw in the notebook is repeatable.
RANDOM_SEED = 1995
np.random.seed(RANDOM_SEED)

# Hand-entered counts of units with / without a driveway.
n_driveway = 353
n_no_driveway = 80
n_all = n_no_driveway + n_driveway

# NOTE: despite the "prop_driveway" names, this is the share of units WITHOUT a
# driveway (n_no_driveway / n_all). The names are kept because later cells read them.
prop_driveway_mean = n_no_driveway / n_all

# Binomial standard error of that share: sqrt(p * (1 - p) / n).
binomial_variance = prop_driveway_mean*(1-prop_driveway_mean)/n_all
prop_driveway_se = np.sqrt(binomial_variance)

print(f"driveway prop mean: {prop_driveway_mean}, driveway prop sd: {prop_driveway_se}")

driveway prop mean: 0.18475750577367206, driveway prop sd: 0.01865093041001626


In [302]:
def get_p(df_training, df_everything, bootstrap = False):
    """Return calibrated class-1 probabilities for every row of df_everything.

    Steps:
      1. get_data() splits df_training into a fitting part and a calibration
         part and returns scaled feature matrices for both plus df_everything.
      2. A random forest is fitted on the fitting part.
      3. A logistic regression is fitted on the forest's calibration-set
         probabilities so that they are mapped to calibrated probabilities.
      4. The forest + calibrator are applied to df_everything.

    Parameters
    ----------
    df_training : pandas.DataFrame
        Labelled rows; forwarded unchanged to get_data().
    df_everything : pandas.DataFrame
        Rows to score; forwarded unchanged to get_data().
    bootstrap : bool, default False
        Forwarded to get_data(). This is unrelated to the forest's own
        ``bootstrap`` flag, which is always False here.

    Returns
    -------
    numpy.ndarray
        1-D array with one calibrated probability per row of X_all, taken from
        column 1 of ``predict_proba`` (class 1, assuming labels are 0/1).
    """
    X_train, y_train, X_cal, y_cal, X_all = get_data(df_training, df_everything, bootstrap = bootstrap)

    # train random forest
    rf = RandomForestClassifier(bootstrap = False, max_depth=6, n_estimators = 256)
    rf.fit(X_train, y_train)

    # train calibrator on the held-out forest scores; reshape(-1, 1) because
    # scikit-learn estimators expect a 2-D feature matrix.
    # C = 1e30 makes the inverse-regularisation strength so large that the
    # logistic fit is effectively unpenalised.
    pred_cal = rf.predict_proba(X_cal)[:,1].reshape(-1, 1)
    calibrator = LogisticRegression(C = 1e30)
    calibrator.fit(pred_cal, y_cal)

    # make predictions
    pred_all = rf.predict_proba(X_all)[:,1].reshape(-1, 1)
    return calibrator.predict_proba(pred_all)[:,1]

def get_data(df_train, df_everything, bootstrap = False):
    """Build scaled train / calibration / full feature matrices.

    df_train is split 80/20 into a fitting part and a calibration part.
    The fitting part is upsampled so class 0 makes up half of it; the
    calibration part is upsampled to the class-0 share given by
    prop_driveway_mean (or, when bootstrap is True, by a normal draw around
    it with standard deviation prop_driveway_se).

    When bootstrap is True both parts are additionally resampled with
    replacement and the resulting class-0 share of each is printed.

    Returns (X_train, y_train, X_cal, y_cal, X_all). The X_* values are the
    arrays produced by a StandardScaler fitted on the training features only;
    y_* are the 'has_parking' columns.
    """
    non_feature_cols = ['MBL', 'has_parking']

    fit_part, cal_part = train_test_split(df_train, test_size = 0.2)

    # Balance the fitting part 50/50.
    fit_part = upsample(fit_part, 0.5)

    # Target class-0 share for the calibration part.
    cal_prop = np.random.normal(prop_driveway_mean, prop_driveway_se) if bootstrap else prop_driveway_mean
    cal_part = upsample(cal_part, cal_prop)

    if bootstrap:
        # Resample rows with replacement and report the class-0 share of each part.
        fit_part = fit_part.sample(frac = 1, replace = True)
        print(fit_part['has_parking'].value_counts()[0]/len(fit_part))
        cal_part = cal_part.sample(frac = 1, replace = True)
        print(cal_part['has_parking'].value_counts()[0]/len(cal_part))

    # Targets
    y_train = fit_part['has_parking']
    y_cal = cal_part['has_parking']

    # Features: fit the scaler on the training features, reuse it everywhere else.
    # assumes df_everything carries the same feature columns as the training
    # frame (minus 'has_parking') — TODO confirm
    scaler = StandardScaler()
    X_train = scaler.fit_transform(fit_part.drop(non_feature_cols, axis = 1))
    X_cal = scaler.transform(cal_part.drop(non_feature_cols, axis = 1))
    X_all = scaler.transform(df_everything.drop('MBL', axis = 1))

    return X_train, y_train, X_cal, y_cal, X_all


def upsample(df, prop):
    """Duplicate class-0 rows so class 0 is about ``prop`` of the 0/1 rows.

    The target number of class-0 rows is ``prop / (1 - prop) * n1``, where n1
    is the number of rows with has_parking == 1. Missing rows are drawn with
    replacement from the existing class-0 rows and appended to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``has_parking`` column.
    prop : float
        Desired class-0 share, 0 <= prop < 1.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` followed by the extra class-0 rows. If class 0
        already meets or exceeds the target, a copy of ``df`` is returned
        unchanged (this function never downsamples).

    Raises
    ------
    ValueError
        If ``prop`` is outside [0, 1), or if extra class-0 rows are needed but
        ``df`` has no class-0 rows to draw from.
    """
    if not 0 <= prop < 1:
        raise ValueError(f"prop must satisfy 0 <= prop < 1, got {prop}")

    is_class0 = df.has_parking == 0
    n1 = int((df.has_parking == 1).sum())
    n0 = int(is_class0.sum())

    # Number of class-0 rows required for the requested share, and the shortfall.
    target_n0 = prop / (1 - prop) * n1
    n_extra = int(target_n0 - n0)

    # A negative count would make DataFrame.sample raise; nothing to add here.
    if n_extra <= 0:
        return df.copy()

    labels0 = df[is_class0]
    if labels0.empty:
        raise ValueError("cannot upsample class 0: no rows with has_parking == 0")

    labels0_upsample = labels0.sample(n_extra, replace = True)
    return pd.concat([df, labels0_upsample])

# Load the full address table and the labelled training table.
df_everything = pd.read_csv('../data/residence_addresses_googlestreetview_clean.csv', index_col = 0)
df_training = pd.read_csv('../data/df_training.csv', index_col = 0)
# Keep only rows whose label is not 2 (the models below treat has_parking as 0/1).
df_training = df_training[df_training.has_parking != 2]

# Mean-impute missing values, each frame with its own column means.
# numeric_only=True is required: without it DataFrame.mean() raises TypeError on
# pandas >= 2.0 as soon as a non-numeric column is present.
df_everything = df_everything.fillna(df_everything.mean(numeric_only = True))
df_training = df_training.fillna(df_training.mean(numeric_only = True))

prediction_df = pd.DataFrame()

prediction_df['MBL'] = df_everything['MBL']
# Point-estimate probabilities (no bootstrap resampling).
prediction_df['p'] = get_p(df_training, df_everything)

In [300]:
# Number of bootstrap refits used to estimate the per-row model variance.
bootstrap = 50

for sample_num in range(bootstrap):
    print(sample_num)
    prediction_df['sample_' + str(sample_num)] = get_p(df_training, df_everything, bootstrap = True)

# List exactly the columns written above. Matching on the substring 'sample'
# would also pick up stale sample_* columns left in prediction_df by an earlier
# run of this cell with a larger bootstrap count.
sample_cols = ['sample_' + str(sample_num) for sample_num in range(bootstrap)]
# Per-row variance of the predicted probability across the bootstrap refits.
prediction_df['variance'] = prediction_df[sample_cols].var(axis = 1)

0
0.4992534129692833
0.1529664045746962
1
0.5002134927412468
0.19116632160110422
2
0.5072572038420491
0.16995768688293372
3
0.5087178396768021
0.175177304964539
4
0.4997873697639804
0.16428571428571428
5
0.5015971039182283
0.18840579710144928
6
0.5002135383301303
0.18561643835616437
7
0.49914657563473436
0.17707618393960192
8
0.49211759693225393
0.21185286103542234
9
0.4945721583652618
0.16147308781869688
10
0.49680715197956576
0.1686746987951807
11
0.49904092071611256
0.1873259052924791
12
0.5008508827908955
0.17272727272727273
13
0.5045783645655877
0.19548872180451127
14
0.5042616663115278
0.1915041782729805
15
0.4968152866242038
0.19916142557651992
16
0.49946604015378043
0.2204515272244356
17
0.509263202725724
0.1376281112737921
18
0.5011724578981027
0.15555555555555556
19
0.5018073570061663
0.19272976680384088
20
0.49755006391137624
0.1886145404663923
21
0.5033041995310168
0.1774193548387097
22
0.5029761904761905
0.21492743607463718
23
0.4980785653287788
0.23755806237558064
24
0.50

In [301]:
# 95% half-width for the total count, using the bootstrap columns only:
# (row-wise mean of p * (1 - p) across samples, summed over rows)
# + (row-wise variance across samples, summed over rows).
boot_probs = prediction_df[sample_cols]
mean_bernoulli_var = (boot_probs * (1 - boot_probs)).mean(axis = 1).sum()
between_sample_var = boot_probs.var(axis = 1).sum()
np.sqrt(mean_bernoulli_var + between_sample_var) * 1.96

73.75457768345348

In [283]:
# Expected count: sum of the point-estimate probabilities over all rows.
p_hat = prediction_df['p']
point_est = p_hat.sum()

# Standard error of that sum: per-row Bernoulli variance p * (1 - p) plus the
# per-row bootstrap variance, summed over rows, then square-rooted.
per_row_var = p_hat * (1 - p_hat) + prediction_df['variance']
se = np.sqrt(per_row_var.sum())

0
0.5010665529010239
0.21002710027100271
1
0.4928556195350821
0.17422096317280453
2
0.5002130379207499
0.17464788732394365
3
0.5014899957428693
0.16341287057122197
4
0.5039428815004262
0.19710544452102
5
0.4981911044903171
0.16678346180798878
6
0.49701619778346123
0.16563997262149213
7
0.5001065643648764
0.16502808988764045
8
0.5007472245943638
0.19820441988950277
9
0.4973358908780904
0.2045769764216366


In [284]:
# Report the summed point estimate with a +/- 1.96 * se half-width
# (1.96 is the normal-approximation multiplier for a 95% interval).
print(f"# of driveways: {point_est} +/- {1.96 * se}")

# of driveways: 10119.965090707094 +/- 71.34345277062235


In [238]:
import operator

# The forest fitted inside get_p() is local to that function, so no `rf` exists
# at notebook level after Restart & Run All. Refit one here with the same
# settings and the same data preparation so this cell runs on a fresh kernel.
X_train_imp, y_train_imp = get_data(df_training, df_everything)[:2]
rf = RandomForestClassifier(bootstrap = False, max_depth=6, n_estimators = 256)
rf.fit(X_train_imp, y_train_imp)

# Feature names in the same column order get_data() feeds to the forest.
features = df_training.drop(['MBL','has_parking'], axis = 1).columns

importances = rf.feature_importances_

feature_imp = dict(zip(features, importances))

# Twenty most important features, highest importance first, written to CSV.
top_features = sorted(feature_imp.items(), key=operator.itemgetter(1), reverse = True)[:20]
pd.DataFrame(top_features).to_csv('../data/feature_imp.csv')