In [1]:
import fiona
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

import geopandas as gpd
from shapely.geometry import Point, Polygon

import zipfile
import requests
import os
import shutil

from sklearn.preprocessing import *

from lnks import scl_cols

#import tensorflow as tf

%matplotlib inline

import warnings #DANGER: I triggered a ton of warnings.
warnings.filterwarnings('ignore')

np.random.seed()

from statistics import mean, median

# To start...

We import the data output of our data pipeline. We reset the index, drop index columns, and lag the data.

We explicitly print shape several times, making sure that we capture the magnitude of data lost from dropping NA values.

In [2]:
#Shift and shape vars
shiftmonths = 6
shapef = 'ward'
#Assign the split for holdout data.
holdout_date = 2015.5
#Get data
filestring = './data/'+shapef+'_out.csv'
df = pd.read_csv(filestring)
df = df.sort_values(['month', 'NAME'])# , 'ANC'])
df = df.reset_index(drop=True)
len(df.NAME.unique())

8

Now we examine the columns and lag the data.

In [3]:
df.columns

Index(['Unnamed: 0', 'NAME', 'Util_Indx_BBL', 'countBBL', 'countIssued',
       'month', 'SALEPRICE', 'Q_GDP', 'BIZ_Dist_Concentr',
       'GS_GRANTS_Concentr', 'LIQUOR_Concentr', 'PHARM_Concentr',
       'GROC_Concentr', 'BANKS_Concentr', 'CLUBS_Concentr', 'HOTELS_Concentr',
       'METRO_Concentr', 'pct_metro_coverage'],
      dtype='object')

In [4]:
print(df.shape)
shiftnum= (((len(df.NAME.unique()))*(shiftmonths)))

#Also generate some lagged y data in the opposite direction.
df['y']= df['countBBL'].shift(-shiftnum)
df['countBBL_prev_month'] = df['countBBL'].shift((len(df.NAME.unique())))
df['countBBL_prev_cycle'] = df['countBBL'].shift((shiftnum))
df = df[shiftnum:-(shiftnum+(len(df.NAME.unique())))]
df = df.dropna()
df.shape

(480, 18)


(376, 21)

The next cell cleans out vestigial columns and drops/fills/expands to dummies for our NA and categorical values.

In [5]:
df = pd.get_dummies(df, columns=['NAME'])
df = df.drop(['Unnamed: 0'], axis= 1)
print(df.shape)
df = df.astype('float')

df = df.dropna()
print(df.shape)

(376, 27)
(376, 27)


Here we start building our grid search inputs, beginning with the splits.

In [6]:
#Flexible adaptation of Dr. Braman's interactive gridsearch script
#implementation. 
#TODO Clean up and streamline
import sklearn
from sklearn.neural_network import *
from sklearn.neighbors import *
from sklearn.svm import *
from sklearn.gaussian_process import *
from sklearn.gaussian_process.kernels import *
from sklearn.tree import *
from sklearn.ensemble import *
from sklearn.naive_bayes import *
from sklearn.discriminant_analysis import *
from sklearn.linear_model import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
import random

#Frame up some separate DataFrames for scalar and stuff
scl_data = data = df



data     = data.reset_index(drop=True)
X = data.drop(['y'], axis=1)
y = data['y']

XH_train = data[data['month'] <= holdout_date-1]
yH_train = XH_train['y']
XH_train = XH_train.drop(['y'], axis=1)

XH_val = scl_data[scl_data['month'] >= holdout_date-1]
XH_val = XH_val[XH_val['month'] <= holdout_date]

yH_val = XH_val['y']
XH_val = XH_val.drop(['y'], axis=1)

XH_test  = data[data['month'] >= holdout_date]
yH_test  = XH_test['y']
XH_test  = XH_test.drop(['y'], axis=1)

ytr = sklearn.preprocessing.MinMaxScaler([0, 1]
            ).fit(y)
y = ytr.fit_transform(y)
y = pd.DataFrame(y, columns=['y'])

scl_data = scl_data.reset_index(drop=True)

In [7]:
y.y

0      0.063657
1      0.248852
2      0.030844
3      0.028481
4      0.071400
5      0.112744
6      0.015225
7      0.000919
8      0.066675
9      0.249245
10     0.033863
11     0.031631
12     0.072713
13     0.114713
14     0.014569
15     0.000000
16     0.074944
17     0.269589
18     0.038456
19     0.036225
20     0.084001
21     0.123507
22     0.018113
23     0.004988
24     0.076519
25     0.274970
26     0.040557
27     0.038588
28     0.086888
29     0.127445
         ...   
346    0.305814
347    0.245045
348    0.414490
349    0.601391
350    0.219845
351    0.211970
352    0.403334
353    0.926631
354    0.315002
355    0.255939
356    0.432865
357    0.626329
358    0.230214
359    0.216564
360    0.415015
361    0.956031
362    0.325765
363    0.265520
364    0.454784
365    0.651660
366    0.238220
367    0.221026
368    0.435490
369    1.000000
370    0.335215
371    0.270377
372    0.472503
373    0.673579
374    0.243864
375    0.231658
Name: y, Length: 376, dt

In [8]:
print(scl_data.month.max())
print(scl_data.shape)
scl_data = scl_data.dropna()
print(scl_data.shape)
sXH_train = scl_data[scl_data['month'] <= holdout_date-1]
syH_train = sXH_train['y']
sXH_train = sXH_train.drop(['y'], axis=1)

sXH_val = scl_data[scl_data['month'] >= holdout_date-1]
sXH_val = sXH_val[sXH_val['month'] <= holdout_date]

syH_val = sXH_val['y']
sXH_val = sXH_val.drop(['y'], axis=1)


sXH_test  = scl_data[scl_data['month'] >= holdout_date]
syH_test  = sXH_test['y']
sXH_test  = sXH_test.drop(['y'], axis=1)

2016.05
(376, 27)
(376, 27)


In [9]:
#Build scalers for the scl_data, other --------------------
scale_data_splits = [scl_data, sXH_train,sXH_test, syH_train, syH_test]
for scl_data in scale_data_splits:
    scaler = sklearn.preprocessing.StandardScaler(
                ).fit(scl_data)
    minmaxer = sklearn.preprocessing.MinMaxScaler([0, 1]
                ).fit(scl_data)

    scl = scaler.transform(scl_data)
    scl = minmaxer.transform(scl_data)
    try:
        scl_data = pd.DataFrame(scl, columns=scl_data.columns)
    except AttributeError as e:
        print(e)
        scl_data = pd.DataFrame(scl, columns=['y'])
    print(scl_data.shape)
    scl_data = scl_data.dropna()
    print(scl_data.shape)
    assert np.all(np.isfinite(scl_data))
    assert not np.any(np.isnan(scl_data))
    
    
#scl_data[scl_data.columns
#   ] = scaler.fit_transform(scl_data[scl_data.columns])

#----------------------------------------------------------


(376, 27)
(376, 27)
(240, 26)
(240, 26)
(40, 26)
(40, 26)
'Series' object has no attribute 'columns'
(240, 1)
(240, 1)
'Series' object has no attribute 'columns'
(40, 1)
(40, 1)


Let's make sure our data came out of the scalers intact:

In [10]:
y;

In [11]:
print(sXH_train.shape)
print(syH_train.shape)
print(sXH_test.shape)
print(syH_test.shape)


(240, 26)
(240,)
(40, 26)
(40,)


In [12]:
scl_data.columns

Index(['y'], dtype='object')

In [13]:
sX = scl_data.drop(['y'], axis=1)
sy = scl_data['y']



assert np.all(np.isfinite(X))
assert np.all(np.isfinite(y))
assert not np.any(np.isnan(X))
assert not np.any(np.isnan(y))

assert np.all(np.isfinite(sX))
assert np.all(np.isfinite(sy))
assert not np.any(np.isnan(sX))
assert not np.any(np.isnan(sy))

In [14]:
scl_data.columns

Index(['y'], dtype='object')

In [15]:
scl_data.describe()

Unnamed: 0,y
count,40.0
mean,0.281498
std,0.288366
min,0.0
25%,0.058288
50%,0.180698
75%,0.370207
max,1.0


This cell contains our a crude RNG, a list of regressors which benefit from scaled data, and hardcoded data used to generate our param_grid, et cetera.

In [16]:
#Make a short list of random states to insert into randomstate params.
scrambler = []
for scram in range(0, 10):
    scrambler.append(random.randint(0, 10000))   
print(scrambler)

to_scale = ['SVR']

names       = ['AdaBoostRegressor',
             'RandomForestRegressor',
             'SVR',
             #'KNeighborsRegressor',
             #'BaggingRegressor',
             'GradientBoostingRegressor',
             #'LinearRegression',
             #'MLPRegressor',
             #'SGDRegressor',
             'LassoLars'         
    
]

regressors = [AdaBoostRegressor(),
              RandomForestRegressor(),
              SVR(),
              #KNeighborsRegressor(),
              #BaggingRegressor(),
              GradientBoostingRegressor(),
              #LinearRegression(),
              #MLPRegressor(),
              #SGDRegressor(),
              LassoLars()
    
]

param_grids =[ 
    ['AdaBoostRegressor', dict(
        n_estimators=[80, 60, 30],
        learning_rate=[1, .5, .01],
        loss=['linear', 'square', 'exponential'],
        #random_state=scrambler[3:5]
        
    )],
        
    ['RandomForestRegressor', dict(
        max_depth=[5, 10, 15],
        criterion=['mse', 'mae'],
        #random_state=scrambler[:2]
    )],
    ['SVR', dict( #Most params for SVR are turned off right now, too expensive
        C=[1, .9],
        epsilon=[.1, .05],
        #kernel=['poly']
    )],
    ['GradientBoostingRegressor', dict(
        max_depth=[3, 6, 9, 12],
        min_samples_split=[2, 4, 8],
        presort=[False]
    )],
    ['LassoLars', dict(
        alpha=[0.1, 1, .5, .75],
        #random_state=[random.randint(0, 10000)]
    )],
    ]

[2014, 8342, 4193, 8347, 7523, 3262, 7751, 1117, 1803, 1418]


## Grid Search:

Here we implement an iterator that executes GridSearchCV and reports the best explained variance. The best_params attribute is then extracted, and used those on the whole training set, then predict on the holdout data.

Testing indicates that for some models, the fit on our full dataset modestly outperforms the CV regularly.

In [17]:
outcomes = []

for name, rgsr in zip(names, regressors):
    
    for item in param_grids:
        if item[0]==name:
            print(name + ':')
            params= item[1]
        
    
    cv = sklearn.model_selection.GridSearchCV(rgsr, param_grid=params,
                                              verbose=True, n_jobs=12,
                                              cv=3, pre_dispatch="2*n_jobs")
    
    if name not in to_scale:
        #X_train, y_train, X_test, y_test = sklearn.model_selection.train_test_split(X, y)
        fitted = cv.fit(XH_train, yH_train)
        score = cv.score(XH_val, yH_val)
        print(score)

        best = rgsr.set_params(**cv.best_params_)
        bestfit= best.fit(XH_train, yH_train)
        bestscore = best.score(XH_test, yH_test)
    if name in to_scale:
    #TODO: fix
        #X_train, y_train, X_test, y_test = sklearn.model_selection.train_test_split(sX, sy)
        fitted = cv.fit(sXH_train, syH_train)
        score = cv.score(sXH_val, syH_val)
        print(score)

        best = rgsr.set_params(**cv.best_params_)
        bestfit= best.fit(sXH_train, syH_train)
        bestscore = best.score(sXH_test, syH_test)

    print(name + " R2 with best model, score:")
    print(bestscore)
    
    outcomes.append((name, score, cv.cv_results_, cv.best_estimator_, 
                     cv.best_params_, bestscore, [yH_test, ]))
    
for nm in range(0, len(outcomes)):
    print()
    print(outcomes[nm][0])
    print(outcomes[nm][1])

    print()
    print('Best on real:')
    print(outcomes[nm][-1])
    
    

AdaBoostRegressor:
Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    1.9s
[Parallel(n_jobs=12)]: Done  81 out of  81 | elapsed:    5.3s finished


0.486307851489
AdaBoostRegressor R2 with best model, score:
-0.295456403069
RandomForestRegressor:
Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=12)]: Done  14 out of  18 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=12)]: Done  18 out of  18 | elapsed:    0.5s finished


0.508325357028
RandomForestRegressor R2 with best model, score:
-0.215484880595
SVR:
Fitting 3 folds for each of 4 candidates, totalling 12 fits
-0.72228423932
SVR R2 with best model, score:
-2.17424504056
GradientBoostingRegressor:
Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Done  36 out of  36 | elapsed:    2.4s finished


0.491382920394
GradientBoostingRegressor R2 with best model, score:
-0.250987467867
LassoLars:
Fitting 3 folds for each of 4 candidates, totalling 12 fits
0.538089107605
LassoLars R2 with best model, score:
0.122974310395

AdaBoostRegressor
0.486307851489

Best on real:
[336    3226.0
337    6898.0
338    2675.0
339    2299.0
340    3463.0
341    4821.0
342    2121.0
343    2075.0
344    3513.0
345    7401.0
346    2915.0
347    2452.0
348    3743.0
349    5167.0
350    2260.0
351    2200.0
352    3658.0
353    7645.0
354    2985.0
355    2535.0
356    3883.0
357    5357.0
358    2339.0
359    2235.0
360    3747.0
361    7869.0
362    3067.0
363    2608.0
364    4050.0
365    5550.0
366    2400.0
367    2269.0
368    3903.0
369    8204.0
370    3139.0
371    2645.0
372    4185.0
373    5717.0
374    2443.0
375    2350.0
Name: y, dtype: float64]

RandomForestRegressor
0.508325357028

Best on real:
[336    3226.0
337    6898.0
338    2675.0
339    2299.0
340    3463.0
341    4821.0
342  

[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.1s finished


# Data Analysis

In [18]:
data.corr()['y'].sort_values()

NAME_Ward 8           -0.258725
NAME_Ward 7           -0.230429
NAME_Ward 4           -0.177442
NAME_Ward 3           -0.140420
Q_GDP                 -0.116460
GS_GRANTS_Concentr    -0.047515
NAME_Ward 1           -0.018463
NAME_Ward 5            0.019506
SALEPRICE              0.196920
NAME_Ward 6            0.197231
METRO_Concentr         0.425276
CLUBS_Concentr         0.425276
HOTELS_Concentr        0.425276
GROC_Concentr          0.425276
PHARM_Concentr         0.425276
LIQUOR_Concentr        0.425276
BANKS_Concentr         0.425276
month                  0.525271
NAME_Ward 2            0.608741
BIZ_Dist_Concentr      0.628703
pct_metro_coverage     0.642598
countIssued            0.796433
countBBL_prev_cycle    0.842428
Util_Indx_BBL          0.888756
countBBL_prev_month    0.926182
countBBL               0.945369
y                      1.000000
Name: y, dtype: float64

In [19]:
#LassoLARS
blist = outcomes[-1] #this number is how we select which regressor
print(blist[0])
prms = LassoLars(**blist[4])
prms = prms.fit(sXH_train, syH_train)
print(prms)
print()
print('Score on test data:')
print(prms.score(XH_test, yH_test))
pred  = prms.predict(XH_test)
print()
print(prms.coef_path_) #Or whatever other attribute you want

LassoLars
LassoLars(alpha=0.5, copy_X=True, eps=2.2204460492503131e-16,
     fit_intercept=True, fit_path=True, max_iter=500, normalize=True,
     positive=False, precompute='auto', verbose=False)

Score on test data:
0.122974310395

[[  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   8.98255317e+03   9.55340064e+03   9.59248352e+03
    9.61018678e+03   9.39332633e+03   9.38695867e+03   9.05025704e+03
    8.68169153e+03   8.31590377e+03   8.13534781e+03   7.27032015e+03
    7.25493774e+03   6.81377578e+03   6.76649735e+03]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   5.58042089e+01
    5.67225641e+01   7.78481

In [20]:
lars_anc_beta = [i for i in zip(XH_test.columns, prms.coef_)]
sorted(lars_anc_beta, key=lambda x: x[1])

[('NAME_Ward 8', -82.220916654381668),
 ('NAME_Ward 7', -48.742254145036426),
 ('countBBL_prev_month', -0.056519416802164407),
 ('Util_Indx_BBL', 0.0),
 ('month', 0.0),
 ('SALEPRICE', 0.0),
 ('GS_GRANTS_Concentr', 0.0),
 ('PHARM_Concentr', 0.0),
 ('GROC_Concentr', 0.0),
 ('BANKS_Concentr', 0.0),
 ('CLUBS_Concentr', 0.0),
 ('HOTELS_Concentr', 0.0),
 ('METRO_Concentr', 0.0),
 ('NAME_Ward 1', 0.0),
 ('NAME_Ward 3', 0.0),
 ('NAME_Ward 4', 0.0),
 ('NAME_Ward 6', 0.0),
 ('LIQUOR_Concentr', 0.0016118683441137462),
 ('countIssued', 0.095442423209929184),
 ('countBBL_prev_cycle', 0.19412765993310804),
 ('countBBL', 0.62462243476349466),
 ('Q_GDP', 0.74679070569026995),
 ('NAME_Ward 5', 117.28209292226212),
 ('BIZ_Dist_Concentr', 166.96512972178331),
 ('NAME_Ward 2', 219.99307488072657),
 ('pct_metro_coverage', 513.87031014370427)]

In [21]:
#AdaBoost
blist = outcomes[0] #this number is how we select which regressor
print(blist[0])
prms = AdaBoostRegressor(**blist[4])
prms = prms.fit(sXH_train, syH_train)
print(prms)
print()
print('Score on test data:')
print(prms.score(XH_test, yH_test))
pred  = prms.predict(XH_test)
print()
print(prms.feature_importances_) #Or whatever other attribute you want

AdaBoostRegressor
AdaBoostRegressor(base_estimator=None, learning_rate=1, loss='linear',
         n_estimators=80, random_state=None)

Score on test data:
-0.33277575513

[  1.58521534e-02   2.88537509e-01   0.00000000e+00   2.50997386e-03
   0.00000000e+00   6.54527608e-04   1.78379663e-02   1.48029270e-02
   1.21784507e-03   1.40171781e-03   1.25656757e-03   4.52792978e-04
   1.15220482e-03   3.05801514e-03   6.16424452e-04   5.84291274e-02
   2.31394171e-01   3.29957825e-01   0.00000000e+00   3.08011790e-02
   0.00000000e+00   0.00000000e+00   0.00000000e+00   6.70734117e-05
   0.00000000e+00   0.00000000e+00]


In [22]:
ada_anc_beta = [i for i in zip(XH_test.columns, prms.feature_importances_)]
sorted(ada_anc_beta, key=lambda x: x[1])

[('countIssued', 0.0),
 ('SALEPRICE', 0.0),
 ('NAME_Ward 1', 0.0),
 ('NAME_Ward 3', 0.0),
 ('NAME_Ward 4', 0.0),
 ('NAME_Ward 5', 0.0),
 ('NAME_Ward 7', 0.0),
 ('NAME_Ward 8', 0.0),
 ('NAME_Ward 6', 6.7073411730050037e-05),
 ('BANKS_Concentr', 0.00045279297825894584),
 ('METRO_Concentr', 0.00061642445159790162),
 ('Q_GDP', 0.00065452760806307777),
 ('CLUBS_Concentr', 0.0011522048181082328),
 ('LIQUOR_Concentr', 0.0012178450692085525),
 ('GROC_Concentr', 0.0012565675675873503),
 ('PHARM_Concentr', 0.001401717811059645),
 ('month', 0.0025099738636884805),
 ('HOTELS_Concentr', 0.0030580151357639538),
 ('GS_GRANTS_Concentr', 0.01480292704292104),
 ('Util_Indx_BBL', 0.01585215338271995),
 ('BIZ_Dist_Concentr', 0.017837966293794985),
 ('NAME_Ward 2', 0.030801178969731165),
 ('pct_metro_coverage', 0.058429127385812088),
 ('countBBL_prev_month', 0.23139417063277445),
 ('countBBL', 0.28853750893792623),
 ('countBBL_prev_cycle', 0.32995782463925405)]

In [23]:
#RFR 
blist = outcomes[1] #this number is how we select which regressor
print(blist[0])
prms = RandomForestRegressor(**blist[4])
prms = prms.fit(sXH_train, syH_train)
print(prms)
print()
print('Score on test data:')
print(prms.score(XH_test, yH_test))
pred  = prms.predict(XH_test)
print()
print(prms.feature_importances_) #Or whatever other attribute you want

RandomForestRegressor
RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

Score on test data:
-0.27265920692

[  2.26665212e-02   4.10950210e-01   8.15831833e-03   1.90599737e-02
   9.94544328e-03   5.63567307e-03   5.76386149e-03   2.23237638e-03
   1.95875361e-03   1.10148075e-03   1.17643053e-03   1.58293458e-03
   9.03760236e-04   1.90727575e-03   9.00697880e-04   5.28942180e-02
   1.86316185e-01   2.63247383e-01   1.32098250e-04   0.00000000e+00
   1.54889952e-03   6.66141351e-04   3.08934118e-04   5.08233505e-04
   2.11481969e-04   2.22714155e-04]


In [24]:
rfr_anc_beta = [i for i in zip(XH_test.columns, prms.feature_importances_)]
sorted(rfr_anc_beta, key=lambda x: x[1])

[('NAME_Ward 2', 0.0),
 ('NAME_Ward 1', 0.00013209824971416816),
 ('NAME_Ward 7', 0.00021148196926529836),
 ('NAME_Ward 8', 0.0002227141553536217),
 ('NAME_Ward 5', 0.00030893411793786533),
 ('NAME_Ward 6', 0.00050823350540113053),
 ('NAME_Ward 4', 0.00066614135128426847),
 ('METRO_Concentr', 0.00090069788003957539),
 ('CLUBS_Concentr', 0.00090376023616838658),
 ('PHARM_Concentr', 0.0011014807473442429),
 ('GROC_Concentr', 0.0011764305304188191),
 ('NAME_Ward 3', 0.0015488995219074704),
 ('BANKS_Concentr', 0.0015829345830377383),
 ('HOTELS_Concentr', 0.0019072757509339343),
 ('LIQUOR_Concentr', 0.0019587536087629625),
 ('GS_GRANTS_Concentr', 0.002232376380754309),
 ('Q_GDP', 0.0056356730719543148),
 ('BIZ_Dist_Concentr', 0.0057638614896868027),
 ('countIssued', 0.0081583183315222119),
 ('SALEPRICE', 0.0099454432794882121),
 ('month', 0.019059973731764861),
 ('Util_Indx_BBL', 0.022666521158687968),
 ('pct_metro_coverage', 0.052894217967523235),
 ('countBBL_prev_month', 0.186316185172337

In [25]:
#GBR  
blist = outcomes[3] #this number is how we select which regressor
print(blist[0])
prms = GradientBoostingRegressor(**blist[4])
prms = prms.fit(sXH_train, syH_train)
print(prms)
print()
print('Score on test data:')
print(prms.score(XH_test, yH_test))
pred  = prms.predict(XH_test)
print()
print(prms.feature_importances_) #Or whatever other attribute you want

GradientBoostingRegressor
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=8,
             min_weight_fraction_leaf=0.0, n_estimators=100, presort=False,
             random_state=None, subsample=1.0, verbose=0, warm_start=False)

Score on test data:
-0.250128459647

[  5.56617334e-02   1.81699227e-01   7.45677046e-02   1.83987647e-01
   6.00724709e-02   4.84236430e-02   4.08526846e-02   2.89239906e-02
   6.58611963e-03   8.36870917e-04   6.51367260e-03   3.14502843e-03
   7.63963362e-03   5.48118033e-05   2.31632182e-03   5.28912226e-02
   5.47158958e-02   1.69925258e-01   0.00000000e+00   6.70280467e-05
   3.82052015e-04   1.47921737e-02   0.00000000e+00   3.08768523e-04
   3.12044325e-03   2.51559838e-03]


In [26]:
gbr_anc_beta = [i for i in zip(XH_test.columns, prms.feature_importances_)]
sorted(gbr_anc_beta, key=lambda x: x[1])

[('NAME_Ward 1', 0.0),
 ('NAME_Ward 5', 0.0),
 ('HOTELS_Concentr', 5.4811803299193319e-05),
 ('NAME_Ward 2', 6.7028046736908895e-05),
 ('NAME_Ward 6', 0.00030876852278285174),
 ('NAME_Ward 3', 0.00038205201521101574),
 ('PHARM_Concentr', 0.00083687091668497139),
 ('METRO_Concentr', 0.0023163218198976043),
 ('NAME_Ward 8', 0.0025155983793881207),
 ('NAME_Ward 7', 0.0031204432498475303),
 ('BANKS_Concentr', 0.0031450284308498343),
 ('GROC_Concentr', 0.0065136726039651434),
 ('LIQUOR_Concentr', 0.0065861196325634501),
 ('CLUBS_Concentr', 0.0076396336218830119),
 ('NAME_Ward 4', 0.014792173691443341),
 ('GS_GRANTS_Concentr', 0.028923990578859068),
 ('BIZ_Dist_Concentr', 0.040852684553990465),
 ('Q_GDP', 0.04842364300346258),
 ('pct_metro_coverage', 0.05289122255586224),
 ('countBBL_prev_month', 0.054715895752721302),
 ('Util_Indx_BBL', 0.055661733411353921),
 ('SALEPRICE', 0.060072470872531127),
 ('countIssued', 0.074567704564187445),
 ('countBBL_prev_cycle', 0.16992525798808281),
 ('count

# Charts

In [None]:
#Adapted from https://pythonspot.com/en/matplotlib-bar-chart/

objects     = [j[0] for j in outcomes]
y_pos       = np.arange(len(objects))

performance = [j[-1] for j in outcomes]
for jm in range(len(performance)):
    if performance[jm] < 0:
        performance[jm] = 0
performance

In [None]:
plt.barh(y_pos, performance, align='center', alpha=0.5)
plt.yticks(y_pos, objects)
plt.xlabel('R2 Score')
ti = "Scoring across models for "+shapef+", lagging by "+str(shiftmonths)+ " months."
plt.title(ti)
fl = './plots/' + shapef + "_shift" + str(shiftmonths)
plt.savefig(fl)

# Everything below is exploratory analysis for me.

In [None]:
for jm in range(0, 5):
    
    print(outcomes[jm][0])
    
    print(outcomes[jm][1])
    print(outcomes[jm][4])

In [None]:
best = AdaBoostRegressor(learning_rate=1, loss='square', n_estimators=60)
bestfit= best.fit(XH_train, yH_train)
bestscore = best.score(XH_test, yH_test)
print(outcomes[0][0])
print(bestscore)

best = RandomForestRegressor(max_depth=10)
bestfit= best.fit(XH_train, yH_train)
bestscore = best.score(XH_test, yH_test)
print(outcomes[1][0])
print(bestscore)

best = SVR(max_depth=10)
bestfit= best.fit(XH_train, yH_train)
bestscore = best.score(XH_test, yH_test)
print(outcomes[2][0])
print(bestscore)


best = GradientBoostingRegressor(max_depth=10)
bestfit= best.fit(XH_train, yH_train)
bestscore = best.score(XH_test, yH_test)
print(outcomes[3][0])
print(bestscore)

In [None]:


Xtrain = dat_xtrain.drop(['y'], axis=1)
y16 = dat_ytrain['y']
X15 = dat15.drop(['y'], axis=1)
y15 = dat15['y']

fitted    = outcomes[-2][3].fit(X15, y15)
predicted = fitted.predict(X16)

In [None]:
pred = pd.DataFrame(predicted, columns=['predicted'])
dat16 = dat16.reset_index()
pred['y'] = dat16['y']

In [None]:
def flagger_ranges(pred):
    pred['flag15'] = 0
    pred['flag15'][pred['predicted'].between(pred['y']*0.85, pred['y']*1.15)
                  ] = 1
    pred['flag05'] = 0
    pred['flag05'][pred['predicted'].between(pred['y']*0.85, pred['y']*1.15)
                  ] = 1
    pred['flag10'] = 0
    pred['flag10'][pred['predicted'].between(pred['y']*0.85, pred['y']*1.15)
                  ] = 1
    pred['flag_others']= 0
    pred['flag_others'][pred['flag05'] == 0] = 1
    return pred
pred = flagger_ranges(pred)
pred