In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt

from keras.models import Model
from keras.layers import Input, Dense, Activation
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
import keras

import pandas
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, GaussianNoise
from keras.optimizers import Adam


import pickle
from datetime import datetime

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [None]:
def label_smoother(aerial, gsv):
    """Merge the aerial and street-view (GSV) driveway labels into one label.

    Both arguments are cast with ``int()`` and looked up as an
    ``(aerial, gsv)`` pair in a fixed table covering the values 0, 1 and 2.
    A pair outside that table raises ``KeyError``.
    """
    # Keys are (aerial, gsv); one row of the layout per aerial value.
    smoothed_label = {
        (0, 0): 0, (0, 1): 2, (0, 2): 0,
        (1, 0): 2, (1, 1): 1, (1, 2): 1,
        (2, 0): 0, (2, 1): 1, (2, 2): 2,
    }
    key = (int(aerial), int(gsv))
    return smoothed_label[key]

def clean_labels_redfin(labels):
    """Return only the ``has_parking`` and ``MBL`` columns of ``labels``, in that order."""
    wanted_columns = ['has_parking', 'MBL']
    return labels[wanted_columns]

def clean_labels_hand(labels):
    """Collapse the hand-made aerial / street-view driveway labels into ``final_label``.

    Side effect: the ``final_label`` column is also written onto the frame
    that was passed in, so the caller's object gains that column.

    Returns a frame holding only ``final_label`` and ``MBL``.
    """
    # smooth labels
    # labels = labels[~((labels['AERIAL_Driveway'] == 2) & (labels['GSV_Driveway'] == 2))]
    def smooth_row(row):
        return label_smoother(row['AERIAL_Driveway'], row['GSV_Driveway'])

    labels['final_label'] = labels.apply(smooth_row, axis = 1)
    return labels[['final_label', 'MBL']]

In [None]:
# Input file locations, relative to the notebook's working directory.
labels_redfin_path = '../data/redfin_clean.csv'
labels_hand_path = '../data/training/all_labels.csv'
#labels_hand_path_2 = '../data/additional_training_labels_120319.csv'
garage_path = '../data/garage.csv'
tabular_path = '../data/residence_addresses_googlestreetview_clean.csv'

# Load the Redfin-derived labels.
labels_redfin = pd.read_csv(labels_redfin_path)
# Every missing value, in any column, becomes 2. The data-prep cell later
# drops rows whose has_parking equals 2.
labels_redfin = labels_redfin.fillna(2)

In [None]:
# Load the hand-made labels, keeping the parcel id (MBL) and the two
# driveway label columns; the first CSV column is used as the index.
labels_hand = pd.read_csv(labels_hand_path, index_col = 0)[['MBL','AERIAL_Driveway', 'GSV_Driveway']]
#labels_hand_2 = pd.read_csv(labels_hand_path_2)[['MBL','AERIAL_Driveway', 'GSV_Driveway']]
#labels_hand = pd.concat([labels_hand, labels_hand_2], axis = 0)
# Missing labels become 2, one of the three values label_smoother accepts.
labels_hand = labels_hand.fillna(2)

In [None]:
labels_hand

In [None]:
labels_hand.columns

In [None]:
# load garages (first CSV column becomes the index)
garages = pd.read_csv(garage_path, index_col=0)

# load tabular data: the per-parcel feature table that is later merged onto
# the labels and used for the final predictions
tabular = pd.read_csv(tabular_path, index_col = 0)

In [None]:
# Reduce both label sources to their label column plus MBL.
labels_redfin_clean = clean_labels_redfin(labels_redfin)
# Note: clean_labels_hand also adds a `final_label` column to `labels_hand` itself.
labels_hand_clean = clean_labels_hand(labels_hand)

In [None]:
labels_hand_clean.final_label.value_counts()

In [None]:
# Share of driveways (label 1) among hand labels that are decided (0 or 1).
# Read the counts from the cleaned frame returned by clean_labels_hand rather
# than from `labels_hand`, which only carries `final_label` as a side effect.
label_count = labels_hand_clean.final_label.value_counts()
n_driveway = label_count.get(1, 0)      # .get: a class may be absent entirely
n_no_driveway = label_count.get(0, 0)
percent_driveway = n_driveway / (n_driveway + n_no_driveway)
percent_driveway

In [None]:
# Outer-join the three label sources so a parcel present in any of them is kept.
# No `on=` is given, so pandas joins on every column name the frames share.
# NOTE(review): presumably that is only 'MBL' — confirm against garage.csv.
labels_clean = labels_hand_clean.merge(labels_redfin_clean, how = 'outer').merge(garages, how = 'outer')

In [None]:
# Resolve one parking label per parcel, by priority:
#   1. `has_parking` (Redfin), 2. `HAS_GARAGE`, 3. `final_label` (hand labels).
# A vectorised fillna chain gives the same result as the row-wise apply with
# nested ternaries, without a Python call per row.
# NOTE(review): Redfin NaNs were filled with 2 before the merge, so a Redfin
# value of 2 takes priority over the garage and hand labels — confirm intended.
labels_clean['has_parking'] = (
    labels_clean['has_parking']
    .fillna(labels_clean['HAS_GARAGE'])
    .fillna(labels_clean['final_label'])
)
labels_clean = labels_clean.drop(['final_label', 'HAS_GARAGE'], axis = 1)

In [None]:
labels_clean

In [None]:
labels_clean.to_csv('../data/labels_final.csv')

In [None]:
def upsample(df, random_state=None):
    """Balance the classes by re-sampling "no parking" rows with replacement.

    Rows with ``has_parking`` of 1 or 0.9 count as positive, rows with 0 or
    0.1 as negative; any other value is carried through untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``has_parking`` column.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the draw can be made
        reproducible. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by enough re-sampled negative rows to match the
        number of positive rows. When negatives are not outnumbered, an
        unchanged copy of ``df`` is returned.

    Raises
    ------
    ValueError
        If positives exist but there is no negative row to sample from.
    """
    is_positive = (df.has_parking == 1) | (df.has_parking == 0.9)
    is_negative = (df.has_parking == 0) | (df.has_parking == 0.1)

    n_missing = int(is_positive.sum()) - int(is_negative.sum())
    if n_missing <= 0:
        # Nothing to add. A negative count would otherwise make `sample` raise.
        return df.copy()

    negatives = df[is_negative]
    if negatives.empty:
        raise ValueError('cannot upsample: no rows with has_parking equal to 0 or 0.1')

    extra_negatives = negatives.sample(n_missing, replace = True, random_state = random_state)
    return pd.concat([df, extra_negatives])

In [None]:
df = labels_clean.merge(tabular)

In [None]:
df.to_csv('../data/df_training.csv')

# EDA

In [None]:
import seaborn as sns
#scaler = StandardScaler()
#df_viz = scaler.fit_transform(df.drop('MBL', axis = 1))
# Visualisation frame: every column except the parcel id, class-balanced.
viz_columns = df.columns.drop(['MBL'])
df_viz = upsample(pd.DataFrame(df, columns = viz_columns))

In [None]:
# One density + rug plot per column, split by parking label.
# The masks come from df_viz itself, so they match the upsampled rows exactly
# instead of relying on pandas re-aligning a mask built on `df`.
no_driveway_rows = df_viz[df_viz.has_parking == 0]
driveway_rows = df_viz[df_viz.has_parking == 1]

for clm in df_viz.columns:
    try:
        sns.distplot(no_driveway_rows[clm], label = 'no driveway', rug=True, hist=False)
        sns.distplot(driveway_rows[clm], label = 'driveway', rug=True, hist=False)
    except Exception as err:
        # Some columns cannot be plotted (e.g. non-numeric or constant ones).
        # Report and skip them, and close the half-drawn figure so it does
        # not bleed into the next column's plot.
        plt.close()
        print(f'skipping {clm}: {err}')
        continue
    plt.legend()
    plt.title(clm)
    plt.show()

In [None]:
# Correlation of every column with the label. The correlation matrix is
# computed once instead of once per printed row.
parking_corr = df_viz.corr()['has_parking']
for feature_name, corr_value in parking_corr.items():
    print(feature_name)
    print(corr_value)

# Prep Data

In [None]:
# Fill missing numeric values with the column mean. numeric_only=True keeps
# the string column (MBL) out of the mean; pandas >= 2.0 raises TypeError on
# it otherwise, and older versions silently skipped it, so results are the same.
# NOTE(review): the means are computed before the train/test split, so test
# rows contribute to the values imputed into training rows.
df = df.fillna(df.mean(numeric_only=True))

In [None]:
# Seed NumPy's global generator so the split and the upsampling draws are
# reproducible on Restart & Run All. pandas `sample` uses that generator when
# it is not given a random_state. 27 is the seed the XGBoost cells also use.
SEED = 27
np.random.seed(SEED)

# 2 is the fill value used for missing labels; such rows cannot be trained on.
df = df[df.has_parking != 2]

df_train, df_test = train_test_split(df, test_size = 0.2, random_state = SEED)

# Class-balanced copies; the "_original" variables below keep the real class mix.
df_train_up = upsample(df_train)
df_test_up = upsample(df_test)

# Model inputs are the tabular columns minus the parcel id.
feature_columns = tabular.columns.drop('MBL')


def _binarize_labels(labels):
    """Map soft labels to hard 0/1 classes: values above 0.5 become 1."""
    return (labels > .5).astype(int)


X_train, y_train = df_train_up[feature_columns], _binarize_labels(df_train_up['has_parking'])
X_test, y_test = df_test_up[feature_columns], _binarize_labels(df_test_up['has_parking'])

X_train_original, y_train_original = df_train[feature_columns], _binarize_labels(df_train['has_parking'])
X_test_original, y_test_original = df_test[feature_columns], _binarize_labels(df_test['has_parking'])

# Fit the scaler on the (upsampled) training features only, then reuse it everywhere.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train_original = scaler.transform(X_train_original)
X_test_original = scaler.transform(X_test_original)


now = datetime.now()

# Persist the fitted scaler (binary pickle, despite the .txt suffix).
with open(f'../models/scaler_{now}.txt', 'wb') as f:
    pickle.dump(scaler, f)

# poly = PolynomialFeatures(degree = 1)
# X_train = poly.fit_transform(X_train)
# X_test = poly.transform(X_test)

In [None]:
X_train.shape

In [None]:
y_train.value_counts()

In [None]:
X_test.shape

In [None]:
y_test.value_counts()

# Logistic Regression

In [None]:
# L2-penalised logistic regression trained on the upsampled training set.
# C is scikit-learn's inverse regularisation strength, so 1e-3 is a strong penalty.
lr = LogisticRegression(C = 1e-3, penalty = 'l2', max_iter = 300)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print('validation stats on upsampled test set:')
# validate on upsampled
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

# validate on original data (test split without upsampling)
print('validation stats on regular test set:')
y_pred_original = lr.predict(X_test_original)
print(confusion_matrix(y_test_original,y_pred_original))
print(classification_report(y_test_original,y_pred_original))

In [None]:
now = datetime.now()

with open(f'../models/logreg_{now}.txt', 'wb') as f:
    pickle.dump(lr, f)

In [None]:
# Score only the "certain" predictions on the original test split: those whose
# predicted probability lies more than `threshold` away from 0.5.
threshold = .2
y_pred_original_proba = lr.predict_proba(X_test_original)
y_pred_original = lr.predict(X_test_original)
positive_proba = y_pred_original_proba[:, 1]
is_certain_pred = np.absolute(positive_proba - 0.5) > threshold
print(classification_report(y_test_original[is_certain_pred], y_pred_original[is_certain_pred]))
# fraction of the test rows that count as certain
print(is_certain_pred.sum() / is_certain_pred.size)

In [None]:
import seaborn as sns
sns.distplot(y_pred_original_proba[:,1][y_test_original == 1])
sns.distplot(y_pred_original_proba[:,1][y_test_original == 0])

In [None]:
# Pair each feature name with its fitted logistic-regression coefficient.
coef_feature_names = df_test[tabular.columns].drop('MBL', axis = 1).columns
flat_coefficients = lr.coef_.reshape(-1)
param_dict = {name: coef for name, coef in zip(coef_feature_names, flat_coefficients)}

param_dict

## Logistic Regression w/ Interactions

In [None]:
# select features for interaction with lasso
lr = LogisticRegression(C = 0.006, penalty = 'l1', max_iter = 300, solver = 'liblinear')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
# validate on upsampled
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

# validate on original data
print('validation stats on regular test set:')
y_pred_original = lr.predict(X_test_original)
print(confusion_matrix(y_test_original,y_pred_original))
print(classification_report(y_test_original,y_pred_original))

# A feature is selected by the L1 penalty when its coefficient is non-zero.
# Testing `> 0` would discard every selected feature whose coefficient is
# negative, so the test is `!= 0`.
training_clms = df_train_up[tabular.columns].drop('MBL', axis = 1).columns
lasso_coefficients = lr.coef_.reshape(-1)

interaction_columns = [clm for clm, coef in zip(training_clms, lasso_coefficients) if coef != 0]
interaction_clms_idx = [idx for idx, coef in enumerate(lasso_coefficients) if coef != 0]
non_interaction_clms_idx = [idx for idx, coef in enumerate(lasso_coefficients) if coef == 0]

In [None]:
# Expand only the selected columns to degree-2 polynomial features; the
# expansion is fitted on the training matrix and reused for both test matrices.
poly = PolynomialFeatures(degree = 2)
X_train_interact = poly.fit_transform(X_train[:,interaction_clms_idx])
X_test_interact = poly.transform(X_test[:,interaction_clms_idx])
X_test_original_interact = poly.transform(X_test_original[:,interaction_clms_idx])


# Append the remaining columns unchanged, after the expanded ones.
X_train_interact = np.concatenate([X_train_interact, X_train[:,non_interaction_clms_idx]], axis = 1)
X_test_interact = np.concatenate([X_test_interact, X_test[:,non_interaction_clms_idx]], axis = 1)
X_test_original_interact = np.concatenate([X_test_original_interact, X_test_original[:,non_interaction_clms_idx]], axis = 1)

In [None]:
X_train_interact.shape

In [None]:
# L2 logistic regression on the interaction-expanded features.
# Note: this rebinds `lr`, replacing the lasso model from the selection cell.
lr = LogisticRegression(C = 0.003, penalty = 'l2', max_iter = 300)
lr.fit(X_train_interact, y_train)

# validate on upsampled
y_pred = lr.predict(X_test_interact)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

# validate on original data
y_pred_original = lr.predict(X_test_original_interact)
print(confusion_matrix(y_test_original,y_pred_original))
print(classification_report(y_test_original,y_pred_original))

In [None]:
# accuracy on the upsampled test set: correct predictions over all predictions
is_correct = (y_test == y_pred)
is_correct.sum() / len(is_correct)

# Support Vector Machine

In [None]:
from sklearn.svm import SVC
# Support vector classifier with an RBF kernel, fitted on the scaled,
# upsampled training set; all other hyper-parameters are library defaults.
svclassifier = SVC(kernel='rbf')
svclassifier.fit(X_train, y_train)

In [None]:
# SVC scores on the upsampled test set
y_pred = svclassifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

# SVC scores on the test set with its original class mix
y_pred_original = svclassifier.predict(X_test_original)
print(confusion_matrix(y_test_original, y_pred_original))
print(classification_report(y_test_original, y_pred_original))

In [None]:
now = datetime.now()

with open(f'../models/svc_{now}.txt', 'wb') as f:
    pickle.dump(svclassifier, f)

# XGBoost

In [None]:
#Import libraries:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import metrics   #Additional scklearn functions
from sklearn.model_selection import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

In [None]:
# Try to remove the id and raw label columns. DataFrame.drop raises KeyError if
# ANY listed label is missing, and in that case nothing at all is dropped.
# NOTE(review): df is built from has_parking / garage / tabular columns, so the
# two *_Driveway columns are probably absent and 'MBL' then stays in df. The
# feature-importance cell drops 'MBL' from df, so it relies on that — confirm
# before making this drop tolerant of missing columns.
try:
    df = df.drop(['MBL','AERIAL_Driveway', 'GSV_Driveway'], axis = 1)
except KeyError:
    pass

# Hard 0/1 target: values above 0.5 become 1.
df.has_parking = df.has_parking.apply(lambda x: 1 if x > .5 else 0)

# Names read by the XGBoost cells; `target` is also read as a global by modelfit.
train = df
target = 'has_parking'
IDcol = 'MBL'

In [None]:
# Ratio of class-0 rows to class-1 rows; passed to XGBoost as scale_pos_weight.
count = train.has_parking.value_counts()
scale = count[0]/count[1]
scale

In [None]:
def modelfit(alg, dtrain, predictors, cv_folds=5, early_stopping_rounds=300):
    """Choose the number of boosting rounds for ``alg`` by cross-validation.

    Runs ``xgb.cv`` (AUC metric, early stopping) on ``dtrain[predictors]``
    against the column named by the global ``target``, for at most
    ``alg``'s current ``n_estimators`` rounds. The number of rows in the CV
    result is written back to ``alg`` as ``n_estimators``, printed, and returned.
    """
    feature_matrix = dtrain[predictors].values
    label_vector = dtrain[target].values
    max_rounds = alg.get_params()['n_estimators']

    cv_history = xgb.cv(
        alg.get_xgb_params(),
        xgb.DMatrix(feature_matrix, label=label_vector),
        num_boost_round=max_rounds,
        nfold=cv_folds,
        metrics='auc',
        early_stopping_rounds=early_stopping_rounds,
    )

    best_rounds = cv_history.shape[0]
    alg.set_params(n_estimators=best_rounds)
    print(best_rounds)
    return best_rounds

In [None]:
# Baseline booster with a generous n_estimators cap; modelfit trims it to the
# round count found by cross-validation.
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=scale,
 seed=27)

# Every column except the target and the id column is a predictor.
predictors = [x for x in train.columns if x not in [target, IDcol]]
best_n_estimators = modelfit(xgb1, train, predictors)

In [None]:
# Grid-search tree depth and minimum child weight at the CV-chosen round count.
param_test1 = {
 'max_depth':[2,3,4],
 'min_child_weight':[5,6,7]
}

# `iid=False` is no longer passed: the argument was removed from GridSearchCV
# in scikit-learn 0.24 (TypeError there), and False has been the default
# since 0.22, so the scoring is unchanged.
gsearch1 = (
    GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=best_n_estimators, max_depth=5,
                 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                 objective= 'binary:logistic', nthread=4, scale_pos_weight=scale, seed=27),
    param_grid = param_test1, scoring='roc_auc',n_jobs=4, cv=5)
)
gsearch1.fit(train[predictors],train[target])
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
gsearch1.best_params_

In [None]:
# Grid-search gamma with the depth / child weight found by gsearch1.
param_test2 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
# `iid=False` is no longer passed: the argument was removed from GridSearchCV
# in scikit-learn 0.24 (TypeError there), and False has been the default
# since 0.22, so the scoring is unchanged.
gsearch2 = GridSearchCV(
    estimator = XGBClassifier(
        learning_rate =0.1,
        n_estimators=best_n_estimators,
        max_depth=gsearch1.best_params_['max_depth'],
        min_child_weight=gsearch1.best_params_['min_child_weight'],
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective= 'binary:logistic',
        nthread=4,
        scale_pos_weight=scale,
        seed=27),
    param_grid = param_test2,
    scoring='roc_auc',
    n_jobs=4,
    cv=5
)
gsearch2.fit(train[predictors],train[target])
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

In [None]:
# Re-run the round-count search with the tuned depth, child weight and gamma,
# starting again from the 1000-round cap.
xgb2 = XGBClassifier(
    learning_rate =0.1, 
    n_estimators=1000, 
    max_depth=gsearch1.best_params_['max_depth'],
    min_child_weight=gsearch1.best_params_['min_child_weight'], 
    gamma=gsearch2.best_params_['gamma'], 
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic', 
    nthread=4, 
    scale_pos_weight=scale,
    seed=27
)
best_n_estimators = modelfit(xgb2, train, predictors)

In [None]:
# Final booster: the tuned hyper-parameters plus the re-estimated round count.
xgb3 = XGBClassifier(
    learning_rate =0.1, 
    n_estimators=best_n_estimators, 
    max_depth=gsearch1.best_params_['max_depth'],
    min_child_weight=gsearch1.best_params_['min_child_weight'], 
    gamma=gsearch2.best_params_['gamma'], 
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic', 
    nthread=4, 
    scale_pos_weight=scale,
    seed=27
)

In [None]:
# fit model
# This rebinds `train`, which until now was the whole of df.
# NOTE(review): the hyper-parameters above were tuned on all of df, including
# the rows that end up in `test` here, so the test scores are optimistic.
# The split is also unseeded.
train, test = train_test_split(df, test_size = 0.2)

In [None]:
xgb3.fit(train[predictors], train[target])
y_pred = xgb3.predict(test[predictors])
sum(y_pred == test[target])/len(test[target])

In [None]:
y_pred = xgb3.predict(test[predictors])
print(classification_report(test[target],y_pred))

# Neural Network

In [None]:
batch_size = 16
epochs = 40
layers = 6

# Fully connected binary classifier: input noise, then `layers` blocks of
# Dense(50, relu, L2) -> BatchNorm -> Dropout, then a sigmoid output unit.
model = Sequential()
# The input shape is declared once, on the first layer.
model.add(GaussianNoise(0.1, input_shape = (X_train.shape[1], )))
for _ in range(layers):
    model.add(Dense(50, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)))
    model.add(BatchNormalization())
    # model.add(Activation('relu'))
    model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
# Compile model
# Pass the configured optimizer object. The string 'adam' would build a fresh
# Adam with the default learning rate and silently ignore the 1e-4 set here.
# NOTE(review): newer Keras releases name this argument `learning_rate`; `lr`
# is kept to match the Keras version the notebook was written for — confirm.
adam = Adam(lr = 1e-4)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()


# The validation data is the same upsampled test split scored by evaluate()
# below, so the validation and test numbers describe the same rows.
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(X_test, y_test))
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

## Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid for the random forest: 2 x 4 x 4 = 32 combinations.
params = {
    'bootstrap' : [True, False],
    'n_estimators' : [16, 64, 256, 1024],
    'max_depth' : [3,4,5,6]
}

# Exhaustive search using all cores; scoring and CV folds are library defaults.
# No random_state is set on the forest, so results vary between runs.
rf_up = GridSearchCV(RandomForestClassifier(), params, n_jobs = -1, verbose = 2)

In [None]:
# Run the grid search on the upsampled training set; predict() then uses the
# best estimator found.
rf_up.fit(X_train, y_train)
y_pred = rf_up.predict(X_test)

print('validation stats on upsampled test set:')
# validate on upsampled
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

# validate on original data (test split without upsampling)
print('validation stats on regular test set:')
y_pred_original = rf_up.predict(X_test_original)
print(confusion_matrix(y_test_original,y_pred_original))
print(classification_report(y_test_original,y_pred_original))

In [None]:
sum(y_test==y_pred)/len(y_test==y_pred)

In [None]:
rf_up.best_params_

In [None]:
y_pred = rf_up.predict(X_test)
y_pred_proba = rf_up.predict_proba(X_test)
# A prediction is "certain" when its probability is far from 0.5 in either
# direction, the same rule the logistic-regression cell uses. Keeping only
# p > 0.6 would retain confident positives alone, so every scored prediction
# would be class 1 and the confusion matrix would be degenerate.
rf_certainty_margin = 0.1
certain = np.absolute(y_pred_proba[:,1] - 0.5) > rf_certainty_margin

print('validation stats on upsampled test set:')
# validate on upsampled
print(confusion_matrix(y_test[certain], y_pred[certain]))
print(classification_report(y_test[certain], y_pred[certain]))

In [None]:
# Take a fresh timestamp, as the other model-saving cells do. Without it the
# file name reuses the `now` left behind by the SVC cell.
now = datetime.now()

# Persist the fitted grid search (binary pickle, despite the .txt suffix).
with open(f'../models/random_forest_{now}.txt', 'wb') as f:
    pickle.dump(rf_up, f)

In [None]:
plt.hist(y_pred_proba[:,1])

In [None]:
rf_up.predict(X_test[0:,:])

In [None]:
from sklearn.calibration import calibration_curve
# Reliability diagram for the random forest on the upsampled test set.
# calibration_curve returns (fraction of positives, mean predicted probability)
# per bin, so res[1] goes on the x axis and res[0] on the y axis.
res = calibration_curve(y_test, y_pred_proba[:,1], n_bins=10)
print(res)
plt.plot(res[1], res[0])
# diagonal reference line (y = x)
plt.plot(np.linspace(0,1,20), np.linspace(0,1,20))

## Stacking

In [None]:
from sklearn.ensemble import StackingClassifier

# Base learners for the stack. `rf_up` is the whole GridSearchCV object and
# `lr` is whichever LogisticRegression was bound last.
# NOTE(review): StackingClassifier presumably refits clones of these on
# X_train, so the grid search would run again inside the stack — confirm.
models = [('rf', rf_up),('svc', svclassifier), ('lr',lr)]

stack = StackingClassifier(models)
stack.fit(X_train, y_train)
y_pred = stack.predict(X_test)

In [None]:
# `y_pred` here is the stack's prediction on X_test from the previous cell.
print('validation stats on upsampled test set:')
# validate on upsampled
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

# validate on original data (test split without upsampling)
print('validation stats on regular test set:')
y_pred_original = stack.predict(X_test_original)
print(confusion_matrix(y_test_original,y_pred_original))
print(classification_report(y_test_original,y_pred_original))

## Make Predictions

In [None]:
# Score every parcel in the tabular file with the tuned random forest.
tabular_no_mbl = tabular.drop('MBL', axis =1)
# Missing values are filled with means over the full tabular file.
# NOTE(review): training rows were imputed with means of the labelled subset
# instead — confirm the difference is acceptable.
tabular_no_na = tabular_no_mbl.fillna(tabular_no_mbl.mean())
# Apply the scaler fitted on the training features, then get class probabilities.
y_prob = rf_up.predict_proba(scaler.transform(tabular_no_na))

In [None]:
# One row per parcel: its id and the predicted probability of each class
# (column 0 of y_prob = no driveway, column 1 = driveway).
predictions_all = pd.DataFrame({
    'MBL': tabular['MBL'],
    'no_driveway': y_prob[:,0],
    'yes_driveway': y_prob[:,1],
})

In [None]:
predictions_all.to_csv('../data/predictions_final.csv')

## Feature Importances

In [None]:
# Take the feature names from the columns the forest was trained on (the
# tabular columns minus 'MBL', in that order). Deriving them from `df` fails
# with KeyError if 'MBL' was already dropped, and misaligns silently if df
# carries any extra column.
features = tabular.columns.drop('MBL')

importances = rf_up.best_estimator_.feature_importances_
# zip() would truncate silently on a length mismatch, so check it explicitly.
assert len(features) == len(importances), 'feature names do not match the fitted model'

feature_imp = dict(zip(features, importances))

import operator
# 20 most important features, highest first; computed once and reused for
# both the CSV export and the cell output.
top_features = sorted(feature_imp.items(), key=operator.itemgetter(1), reverse = True)[:20]
pd.DataFrame(top_features).to_csv('../data/feature_imp.csv')
top_features