In [1]:
import fiona
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

import geopandas as gpd
from shapely.geometry import Point, Polygon

import zipfile
import requests
import os
import shutil

from sklearn.preprocessing import *

from lnks import scl_cols

#import tensorflow as tf

%matplotlib inline

import warnings #DANGER: I triggered a ton of warnings.
warnings.filterwarnings('ignore')

np.random.seed()

from statistics import mean, median

# To start...

We import the data output of our data pipeline. We reset the index, drop index columns, and lag the data.

We explicitly print shape several times, making sure that we capture the magnitude of data lost from dropping NA values.

In [2]:
#Shift and shape vars
shiftmonths = 6
shapef = 'ward'
#Assign the split for holdout data.
holdout_date = 2015.5
#Get data
filestring = './data/'+shapef+'_out.csv'
df = pd.read_csv(filestring)
df = df.sort_values(['month', 'NAME'])# , 'ANC'])
df = df.reset_index(drop=True)
len(df.NAME.unique())

48

Now we examine the columns and lag the data.

In [3]:
df.columns

Index(['Unnamed: 0', 'NAME', 'Util_Indx_BBL', 'countBBL', 'countIssued',
       'month', 'SALEPRICE', 'Q_GDP', 'BIZ_Dist_Concentr',
       'GS_GRANTS_Concentr', 'LIQUOR_Concentr', 'PHARM_Concentr',
       'GROC_Concentr', 'BANKS_Concentr', 'CLUBS_Concentr', 'HOTELS_Concentr',
       'METRO_Concentr', 'pct_metro_coverage'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,NAME,Util_Indx_BBL,countBBL,countIssued,month,SALEPRICE,Q_GDP,BIZ_Dist_Concentr,GS_GRANTS_Concentr,LIQUOR_Concentr,PHARM_Concentr,GROC_Concentr,BANKS_Concentr,CLUBS_Concentr,HOTELS_Concentr,METRO_Concentr,pct_metro_coverage
0,17,ANC 1A,0.038227,213.0,1.0,2012.01,,0.663452,0.049174,0.981043,5572,5572,5572,5572,5572,5572,5572,0.307071
1,18,ANC 1B,0.045447,267.0,4.0,2012.01,727000.0,0.663452,0.029787,0.979311,5875,5875,5875,5875,5875,5875,5875,0.436447
2,0,ANC 1C,0.054142,200.0,1.0,2012.01,1986000.0,0.663452,0.998646,0.927854,3694,3694,3694,3694,3694,3694,3694,0.0
3,1,ANC 1D,0.030303,66.0,0.0,2012.01,783000.0,0.663452,0.264922,0.692668,2178,2178,2178,2178,2178,2178,2178,0.0
4,2,ANC 2A,0.068413,235.0,3.0,2012.01,158000000.0,0.663452,0.800974,0.0,3435,3435,3435,3435,3435,3435,3435,0.414556


In [5]:
print(df.shape)
shiftnum= (((len(df.NAME.unique()))*(shiftmonths)))

#Also generate some lagged y data in the opposite direction.
df['y']= df['countBBL'].shift(-shiftnum)
df['countBBL_prev_month'] = df['countBBL'].shift((len(df.NAME.unique())))
df['countBBL_prev_cycle'] = df['countBBL'].shift((shiftnum))
df = df[shiftnum:-(shiftnum+(len(df.NAME.unique())))]
df = df.dropna()
df.shape

(2880, 18)


(2182, 21)

The next cell cleans out vestigial columns and drops/fills/expands to dummies for our NA and categorical values.

In [6]:
df = df.drop('NAME', axis = 1)

In [7]:
df.columns

Index(['Unnamed: 0', 'Util_Indx_BBL', 'countBBL', 'countIssued', 'month',
       'SALEPRICE', 'Q_GDP', 'BIZ_Dist_Concentr', 'GS_GRANTS_Concentr',
       'LIQUOR_Concentr', 'PHARM_Concentr', 'GROC_Concentr', 'BANKS_Concentr',
       'CLUBS_Concentr', 'HOTELS_Concentr', 'METRO_Concentr',
       'pct_metro_coverage', 'y', 'countBBL_prev_month',
       'countBBL_prev_cycle'],
      dtype='object')

Here we start building our grid search inputs, beginning with the splits.

In [8]:
#Flexible adaptation of Dr. Braman's interactive gridsearch script
#implementation. 
#TODO Clean up and streamline
import sklearn
from sklearn.neural_network import *
from sklearn.neighbors import *
from sklearn.svm import *
from sklearn.gaussian_process import *
from sklearn.gaussian_process.kernels import *
from sklearn.tree import *
from sklearn.ensemble import *
from sklearn.naive_bayes import *
from sklearn.discriminant_analysis import *
from sklearn.linear_model import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
import random

#Frame up some separate DataFrames for scalar and stuff
scl_data = data = df



data     = data.reset_index(drop=True)
X = data.drop(['y'], axis=1)
y = data['y']

XH_train = data[data['month'] <= holdout_date-1]
yH_train = XH_train['y']
XH_train = XH_train.drop(['y'], axis=1)

XH_val = scl_data[scl_data['month'] >= holdout_date-1]
XH_val = XH_val[XH_val['month'] <= holdout_date]

yH_val = XH_val['y']
XH_val = XH_val.drop(['y'], axis=1)

XH_test  = data[data['month'] >= holdout_date]
yH_test  = XH_test['y']
XH_test  = XH_test.drop(['y'], axis=1)

ytr = sklearn.preprocessing.MinMaxScaler([0, 1]
            ).fit(y)
y = ytr.fit_transform(y)
y = pd.DataFrame(y, columns=['y'])

scl_data = scl_data.reset_index(drop=True)

In [9]:
y.y

0       0.030214
1       0.041713
2       0.035841
3       0.008930
4       0.108869
5       0.055046
6       0.001346
7       0.039021
8       0.039388
9       0.022141
10      0.017615
11      0.014190
12      0.010275
13      0.006606
14      0.011743
15      0.026667
16      0.025688
17      0.014679
18      0.008073
19      0.013700
20      0.034251
21      0.028991
22      0.035352
23      0.031927
24      0.042202
25      0.037309
26      0.028379
27      0.019205
28      0.020795
29      0.010398
          ...   
2152    0.087584
2153    0.059205
2154    0.050642
2155    0.061774
2156    0.127217
2157    0.108746
2158    0.145810
2159    0.125994
2160    0.171498
2161    0.145443
2162    0.125505
2163    0.115229
2164    0.068746
2165    0.052477
2166    0.059694
2167    0.057615
2168    0.042569
2169    0.075963
2170    0.034985
2171    0.062263
2172    0.040734
2173    0.055780
2174    0.473884
2175    1.000000
2176    0.380428
2177    0.320000
2178    0.508379
2179    0.6957

In [10]:
print(scl_data.month.max())
print(scl_data.shape)
scl_data = scl_data.dropna()
print(scl_data.shape)
sXH_train = scl_data[scl_data['month'] <= holdout_date-1]
syH_train = sXH_train['y']
sXH_train = sXH_train.drop(['y'], axis=1)

sXH_val = scl_data[scl_data['month'] >= holdout_date-1]
sXH_val = sXH_val[sXH_val['month'] <= holdout_date]

syH_val = sXH_val['y']
sXH_val = sXH_val.drop(['y'], axis=1)


sXH_test  = scl_data[scl_data['month'] >= holdout_date]
syH_test  = sXH_test['y']
sXH_test  = sXH_test.drop(['y'], axis=1)

2016.05
(2182, 20)
(2182, 20)


In [11]:
#Build scalers for the scl_data, other --------------------
scale_data_splits = [scl_data, sXH_train,sXH_test, syH_train, syH_test]
for scl_data in scale_data_splits:
    scaler = sklearn.preprocessing.StandardScaler(
                ).fit(scl_data)
    minmaxer = sklearn.preprocessing.MinMaxScaler([0, 1]
                ).fit(scl_data)

    scl = scaler.transform(scl_data)
    scl = minmaxer.transform(scl_data)
    try:
        scl_data = pd.DataFrame(scl, columns=scl_data.columns)
    except AttributeError as e:
        print(e)
        scl_data = pd.DataFrame(scl, columns=['y'])
    print(scl_data.shape)
    scl_data = scl_data.dropna()
    print(scl_data.shape)
    assert np.all(np.isfinite(scl_data))
    assert not np.any(np.isnan(scl_data))
    
    
#scl_data[scl_data.columns
#   ] = scaler.fit_transform(scl_data[scl_data.columns])

#----------------------------------------------------------


(2182, 20)
(2182, 20)
(1393, 19)
(1393, 19)
(232, 19)
(232, 19)
'Series' object has no attribute 'columns'
(1393, 1)
(1393, 1)
'Series' object has no attribute 'columns'
(232, 1)
(232, 1)


Let's make sure our data came out of the scalers intact:

In [12]:
y;

In [13]:
print(sXH_train.shape)
print(syH_train.shape)
print(sXH_test.shape)
print(syH_test.shape)


(1393, 19)
(1393,)
(232, 19)
(232,)


In [14]:
scl_data.columns

Index(['y'], dtype='object')

In [15]:
sX = scl_data.drop(['y'], axis=1)
sy = scl_data['y']



assert np.all(np.isfinite(X))
assert np.all(np.isfinite(y))
assert not np.any(np.isnan(X))
assert not np.any(np.isnan(y))

assert np.all(np.isfinite(sX))
assert np.all(np.isfinite(sy))
assert not np.any(np.isnan(sX))
assert not np.any(np.isnan(sy))

In [16]:
scl_data.columns

Index(['y'], dtype='object')

In [17]:
scl_data.describe()

Unnamed: 0,y
count,232.0
mean,0.144699
std,0.176489
min,0.0
25%,0.042825
50%,0.08046
75%,0.138425
max,1.0


This cell contains our a crude RNG, a list of regressors which benefit from scaled data, and hardcoded data used to generate our param_grid, et cetera.

In [18]:
#Make a short list of random states to insert into randomstate params.
scrambler = []
for scram in range(0, 10):
    scrambler.append(random.randint(0, 10000))   
print(scrambler)

to_scale = ['SVR']

names       = ['AdaBoostRegressor',
             'RandomForestRegressor',
             'SVR',
             #'KNeighborsRegressor',
             #'BaggingRegressor',
             'GradientBoostingRegressor',
             #'LinearRegression',
             #'MLPRegressor',
             #'SGDRegressor',
             'LassoLars'         
    
]

regressors = [AdaBoostRegressor(),
              RandomForestRegressor(),
              SVR(),
              #KNeighborsRegressor(),
              #BaggingRegressor(),
              GradientBoostingRegressor(),
              #LinearRegression(),
              #MLPRegressor(),
              #SGDRegressor(),
              LassoLars()
    
]

param_grids =[ 
    ['AdaBoostRegressor', dict(
        n_estimators=[80, 60, 30],
        learning_rate=[1, .5, .01],
        loss=['linear', 'square', 'exponential'],
        #random_state=scrambler[3:5]
        
    )],
        
    ['RandomForestRegressor', dict(
        max_depth=[5, 10, 15],
        criterion=['mse', 'mae'],
        #random_state=scrambler[:2]
    )],
    ['SVR', dict( #Most params for SVR are turned off right now, too expensive
        C=[1, .9],
        epsilon=[.1, .05],
        #kernel=['poly']
    )],
    ['GradientBoostingRegressor', dict(
        max_depth=[3, 6, 9, 12],
        min_samples_split=[2, 4, 8],
        presort=[False]
    )],
    ['LassoLars', dict(
        alpha=[0.1, 1, .5, .75],
        #random_state=[random.randint(0, 10000)]
    )],
    ]

[2086, 8363, 5711, 9027, 9616, 9636, 8469, 1299, 8386, 1374]


## Grid Search:

Here we implement an iterator that executes GridSearchCV and reports the best explained variance. The best_params attribute is then extracted, and used those on the whole training set, then predict on the holdout data.

Testing indicates that for some models, the fit on our full dataset modestly outperforms the CV regularly.

In [19]:
outcomes = []

for name, rgsr in zip(names, regressors):
    
    for item in param_grids:
        if item[0]==name:
            print(name + ':')
            params= item[1]
        
    
    cv = sklearn.model_selection.GridSearchCV(rgsr, param_grid=params,
                                              verbose=True, n_jobs=12,
                                              cv=3, pre_dispatch="2*n_jobs")
    
    if name not in to_scale:
        #X_train, y_train, X_test, y_test = sklearn.model_selection.train_test_split(X, y)
        fitted = cv.fit(XH_train, yH_train)
        score = cv.score(XH_val, yH_val)
        print(score)

        best = rgsr.set_params(**cv.best_params_)
        bestfit= best.fit(XH_train, yH_train)
        bestscore = best.score(XH_test, yH_test)
    if name in to_scale:
    #TODO: fix
        #X_train, y_train, X_test, y_test = sklearn.model_selection.train_test_split(sX, sy)
        fitted = cv.fit(sXH_train, syH_train)
        score = cv.score(sXH_val, syH_val)
        print(score)

        best = rgsr.set_params(**cv.best_params_)
        bestfit= best.fit(sXH_train, syH_train)
        bestscore = best.score(sXH_test, syH_test)

    print(name + " R2 with best model, score:")
    print(bestscore)
    
    outcomes.append((name, score, cv.cv_results_, cv.best_estimator_, 
                     cv.best_params_, bestscore, [yH_test, ]))
    
for nm in range(0, len(outcomes)):
    print()
    print(outcomes[nm][0])
    print(outcomes[nm][1])

    print()
    print('Best on real:')
    print(outcomes[nm][-1])
    
    

AdaBoostRegressor:
Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    3.8s
[Parallel(n_jobs=12)]: Done  81 out of  81 | elapsed:   10.9s finished


0.815357721573
AdaBoostRegressor R2 with best model, score:
0.650823455193
RandomForestRegressor:
Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=12)]: Done  14 out of  18 | elapsed:    3.1s remaining:    0.9s
[Parallel(n_jobs=12)]: Done  18 out of  18 | elapsed:    3.5s finished


0.8217710319
RandomForestRegressor R2 with best model, score:
0.648432896449
SVR:
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.6s remaining:    3.0s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.9s finished


-0.279852405194
SVR R2 with best model, score:
-0.486678210057
GradientBoostingRegressor:
Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=12)]: Done  36 out of  36 | elapsed:   13.9s finished


0.816597014116
GradientBoostingRegressor R2 with best model, score:
0.646529772496
LassoLars:
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.1s remaining:    0.5s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.5s finished


0.842413278411
LassoLars R2 with best model, score:
0.861735828473

AdaBoostRegressor
0.815357721573

Best on real:
[1950     880.0
1951    1126.0
1952     935.0
1953     285.0
1954    2597.0
1955    1063.0
1956    1122.0
1957     480.0
1958     656.0
1959     636.0
1960     406.0
1961     374.0
1962     248.0
1963     398.0
1964     678.0
1965     667.0
1966     431.0
1967     381.0
1968     475.0
1969     836.0
1970     738.0
1971    1033.0
1972     888.0
1973    1197.0
1974    1052.0
1975     842.0
1976     859.0
1977     520.0
1978     378.0
1979     455.0
         ...  
2152     745.0
2153     513.0
2154     443.0
2155     534.0
2156    1069.0
2157     918.0
2158    1221.0
2159    1059.0
2160    1431.0
2161    1218.0
2162    1055.0
2163     971.0
2164     591.0
2165     458.0
2166     517.0
2167     500.0
2168     377.0
2169     650.0
2170     315.0
2171     538.0
2172     362.0
2173     485.0
2174    3903.0
2175    8204.0
2176    3139.0
2177    2645.0
2178    4185.0
2179    5717.

# Data Analysis - ANC 

In [20]:
data.corr()['y'].sort_values()

Unnamed: 0            -0.427199
Q_GDP                 -0.059282
GS_GRANTS_Concentr     0.138242
SALEPRICE              0.244251
month                  0.269414
BIZ_Dist_Concentr      0.305893
pct_metro_coverage     0.316672
Util_Indx_BBL          0.458529
METRO_Concentr         0.766041
HOTELS_Concentr        0.766041
CLUBS_Concentr         0.766041
BANKS_Concentr         0.766041
LIQUOR_Concentr        0.766041
PHARM_Concentr         0.766041
GROC_Concentr          0.766041
countIssued            0.809520
countBBL_prev_cycle    0.926893
countBBL_prev_month    0.963903
countBBL               0.972928
y                      1.000000
Name: y, dtype: float64

In [21]:
#LassoLARS
blist = outcomes[-1] #this number is how we select which regressor
print(blist[0])
prms = LassoLars(**blist[4])
prms = prms.fit(sXH_train, syH_train)
print(prms)
print()
print('Score on test data:')
print(prms.score(XH_test, yH_test))
pred  = prms.predict(XH_test)
print()
print(prms.coef_path_) #Or whatever other attribute you want

LassoLars
LassoLars(alpha=0.1, copy_X=True, eps=2.2204460492503131e-16,
     fit_intercept=True, fit_path=True, max_iter=500, normalize=True,
     positive=False, precompute='auto', verbose=False)

Score on test data:
0.861735828473

[[  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   1.90816167e+04   1.98453888e+04   1.98531337e+04
    1.99704971e+04   1.99704463e+04   1.99014869e+04]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   3.04571657e+01
    1.33486371e+02   1.33685727e+02   2.11688346e+02]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   -2.00748155e+02  -2.01109263e+02  -3.38025040e+02]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0

In [22]:
lars_anc_beta = [i for i in zip(XH_test.columns, prms.coef_)]
sorted(lars_anc_beta, key=lambda x: x[1])

[('month', -12.270248109846495),
 ('Unnamed: 0', 0.0),
 ('Util_Indx_BBL', 0.0),
 ('SALEPRICE', 0.0),
 ('Q_GDP', 0.0),
 ('GS_GRANTS_Concentr', 0.0),
 ('PHARM_Concentr', 0.0),
 ('GROC_Concentr', 0.0),
 ('BANKS_Concentr', 0.0),
 ('CLUBS_Concentr', 0.0),
 ('HOTELS_Concentr', 0.0),
 ('METRO_Concentr', 0.0),
 ('countBBL_prev_month', 0.0),
 ('countBBL_prev_cycle', 0.0),
 ('LIQUOR_Concentr', 0.0033581870019101317),
 ('countIssued', 0.20059272489304059),
 ('countBBL', 1.0113707268605232),
 ('BIZ_Dist_Concentr', 6.4856998937977126),
 ('pct_metro_coverage', 10.567860821437002)]

In [23]:
#AdaBoost
blist = outcomes[0] #this number is how we select which regressor
print(blist[0])
prms = AdaBoostRegressor(**blist[4])
prms = prms.fit(sXH_train, syH_train)
print(prms)
print()
print('Score on test data:')
print(prms.score(XH_test, yH_test))
pred  = prms.predict(XH_test)
print()
print(prms.feature_importances_) #Or whatever other attribute you want

AdaBoostRegressor
AdaBoostRegressor(base_estimator=None, learning_rate=1, loss='square',
         n_estimators=60, random_state=None)

Score on test data:
0.639001698044

[  8.10945901e-05   5.05708069e-03   4.23808081e-01   0.00000000e+00
   3.26825190e-03   1.99173144e-04   2.69028472e-04   1.23716754e-02
   9.28362001e-03   3.05949710e-03   1.37342807e-03   3.89498659e-04
   1.27105978e-04   0.00000000e+00   2.43560199e-04   2.25904609e-04
   7.80708363e-03   1.78493079e-01   3.53942837e-01]


In [24]:
ada_anc_beta = [i for i in zip(XH_test.columns, prms.feature_importances_)]
sorted(ada_anc_beta, key=lambda x: x[1])

[('countIssued', 0.0),
 ('CLUBS_Concentr', 0.0),
 ('Unnamed: 0', 8.1094590078025809e-05),
 ('BANKS_Concentr', 0.00012710597815412771),
 ('SALEPRICE', 0.0001991731444612097),
 ('METRO_Concentr', 0.00022590460949200889),
 ('HOTELS_Concentr', 0.00024356019866830374),
 ('Q_GDP', 0.00026902847245823983),
 ('GROC_Concentr', 0.00038949865877234625),
 ('PHARM_Concentr', 0.0013734280710337519),
 ('LIQUOR_Concentr', 0.0030594971041761391),
 ('month', 0.0032682518996033079),
 ('Util_Indx_BBL', 0.0050570806899128155),
 ('pct_metro_coverage', 0.007807083632386011),
 ('GS_GRANTS_Concentr', 0.0092836200074869323),
 ('BIZ_Dist_Concentr', 0.01237167537067348),
 ('countBBL_prev_month', 0.1784930792698384),
 ('countBBL_prev_cycle', 0.35394283716916208),
 ('countBBL', 0.42380808113364293)]

In [25]:
#RFR 
blist = outcomes[1] #this number is how we select which regressor
print(blist[0])
prms = RandomForestRegressor(**blist[4])
prms = prms.fit(sXH_train, syH_train)
print(prms)
print()
print('Score on test data:')
print(prms.score(XH_test, yH_test))
pred  = prms.predict(XH_test)
print()
print(prms.feature_importances_) #Or whatever other attribute you want

RandomForestRegressor
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

Score on test data:
0.66670081616

[  2.71952232e-04   1.51398149e-03   7.25120999e-01   1.20615530e-04
   1.52890567e-03   2.25774475e-04   8.80281025e-05   1.47500237e-04
   7.73607779e-04   5.32695701e-05   1.86584598e-03   1.77683965e-05
   9.26199688e-05   2.09344833e-03   2.34968233e-03   1.67659283e-05
   1.45464045e-04   2.27405469e-01   3.61683022e-02]


In [26]:
rfr_anc_beta = [i for i in zip(XH_test.columns, prms.feature_importances_)]
sorted(rfr_anc_beta, key=lambda x: x[1])

[('METRO_Concentr', 1.6765928325301168e-05),
 ('GROC_Concentr', 1.776839646357646e-05),
 ('LIQUOR_Concentr', 5.3269570085663532e-05),
 ('Q_GDP', 8.8028102527494529e-05),
 ('BANKS_Concentr', 9.2619968829200095e-05),
 ('countIssued', 0.00012061552991423581),
 ('pct_metro_coverage', 0.00014546404507784926),
 ('BIZ_Dist_Concentr', 0.00014750023748539052),
 ('SALEPRICE', 0.00022577447457509747),
 ('Unnamed: 0', 0.00027195223183552311),
 ('GS_GRANTS_Concentr', 0.00077360777944966166),
 ('Util_Indx_BBL', 0.0015139814911027363),
 ('month', 0.0015289056678235867),
 ('PHARM_Concentr', 0.0018658459754626448),
 ('CLUBS_Concentr', 0.0020934483342079963),
 ('HOTELS_Concentr', 0.0023496823259912267),
 ('countBBL_prev_cycle', 0.036168302239970404),
 ('countBBL_prev_month', 0.227405468745637),
 ('countBBL', 0.72512099895523552)]

In [27]:
#GBR  
blist = outcomes[3] #this number is how we select which regressor
print(blist[0])
prms = GradientBoostingRegressor(**blist[4])
prms = prms.fit(sXH_train, syH_train)
print(prms)
print()
print('Score on test data:')
print(prms.score(XH_test, yH_test))
pred  = prms.predict(XH_test)
print()
print(prms.feature_importances_) #Or whatever other attribute you want

GradientBoostingRegressor
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100, presort=False,
             random_state=None, subsample=1.0, verbose=0, warm_start=False)

Score on test data:
0.650453063581

[ 0.02526476  0.0445745   0.25613845  0.05661967  0.16149232  0.04215735
  0.03799063  0.02799897  0.0239004   0.00576734  0.00496732  0.00623564
  0.01148616  0.00696566  0.0060399   0.00346679  0.02553598  0.13438211
  0.11901604]


In [29]:
gbr_anc_beta = [i for i in zip(XH_test.columns, prms.feature_importances_)]
sorted(gbr_anc_beta, key=lambda x: x[1])

[('METRO_Concentr', 0.0034667949489451642),
 ('PHARM_Concentr', 0.0049673192658439089),
 ('LIQUOR_Concentr', 0.0057673425050446068),
 ('HOTELS_Concentr', 0.0060399040613221631),
 ('GROC_Concentr', 0.0062356411758471295),
 ('CLUBS_Concentr', 0.0069656613262664121),
 ('BANKS_Concentr', 0.011486159419538695),
 ('GS_GRANTS_Concentr', 0.023900400086495611),
 ('Unnamed: 0', 0.025264758903454893),
 ('pct_metro_coverage', 0.025535982765437698),
 ('BIZ_Dist_Concentr', 0.027998972698139252),
 ('Q_GDP', 0.037990634392939551),
 ('SALEPRICE', 0.042157346797145931),
 ('Util_Indx_BBL', 0.04457449752261957),
 ('countIssued', 0.056619668144072009),
 ('countBBL_prev_cycle', 0.11901603654866419),
 ('countBBL_prev_month', 0.13438210732230454),
 ('month', 0.16149231958774748),
 ('countBBL', 0.25613845252817119)]

# Charts

In [None]:
#Adapted from https://pythonspot.com/en/matplotlib-bar-chart/

objects     = [j[0] for j in outcomes]
y_pos       = np.arange(len(objects))

performance = [j[-1] for j in outcomes]
for jm in range(len(performance)):
    if performance[jm] < 0:
        performance[jm] = 0
performance

In [None]:
plt.barh(y_pos, performance, align='center', alpha=0.5)
plt.yticks(y_pos, objects)
plt.xlabel('R2 Score')
ti = "Scoring across models for "+shapef+", lagging by "+str(shiftmonths)+ " months."
plt.title(ti)
fl = './plots/' + shapef + "_shift" + str(shiftmonths)
plt.savefig(fl)

# Everything below is exploratory analysis for me.

In [None]:
for jm in range(0, 5):
    
    print(outcomes[jm][0])
    
    print(outcomes[jm][1])
    print(outcomes[jm][4])

In [None]:
best = AdaBoostRegressor(learning_rate=1, loss='square', n_estimators=60)
bestfit= best.fit(XH_train, yH_train)
bestscore = best.score(XH_test, yH_test)
print(outcomes[0][0])
print(bestscore)

best = RandomForestRegressor(max_depth=10)
bestfit= best.fit(XH_train, yH_train)
bestscore = best.score(XH_test, yH_test)
print(outcomes[1][0])
print(bestscore)

best = SVR(max_depth=10)
bestfit= best.fit(XH_train, yH_train)
bestscore = best.score(XH_test, yH_test)
print(outcomes[2][0])
print(bestscore)


best = GradientBoostingRegressor(max_depth=10)
bestfit= best.fit(XH_train, yH_train)
bestscore = best.score(XH_test, yH_test)
print(outcomes[3][0])
print(bestscore)

In [None]:


Xtrain = dat_xtrain.drop(['y'], axis=1)
y16 = dat_ytrain['y']
X15 = dat15.drop(['y'], axis=1)
y15 = dat15['y']

fitted    = outcomes[-2][3].fit(X15, y15)
predicted = fitted.predict(X16)

In [None]:
pred = pd.DataFrame(predicted, columns=['predicted'])
dat16 = dat16.reset_index()
pred['y'] = dat16['y']

In [None]:
def flagger_ranges(pred):
    pred['flag15'] = 0
    pred['flag15'][pred['predicted'].between(pred['y']*0.85, pred['y']*1.15)
                  ] = 1
    pred['flag05'] = 0
    pred['flag05'][pred['predicted'].between(pred['y']*0.85, pred['y']*1.15)
                  ] = 1
    pred['flag10'] = 0
    pred['flag10'][pred['predicted'].between(pred['y']*0.85, pred['y']*1.15)
                  ] = 1
    pred['flag_others']= 0
    pred['flag_others'][pred['flag05'] == 0] = 1
    return pred
pred = flagger_ranges(pred)
pred