In [1]:
import fiona
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

import geopandas as gpd
from shapely.geometry import Point, Polygon

import zipfile
import requests
import os
import shutil

from sklearn.preprocessing import *

from lnks import scl_cols

#import tensorflow as tf

%matplotlib inline

import warnings #DANGER: I triggered a ton of warnings.
warnings.filterwarnings('ignore')

np.random.seed()

from statistics import mean, median

# To start...

We import the data output of our data pipeline. We reset the index, drop index columns, and lag the data.

We explicitly print shape several times, making sure that we capture the magnitude of data lost from dropping NA values.

In [2]:
#Shift and shape vars
shiftmonths = 6
shapef = 'anc'
#Assign the split for holdout data.
holdout_date = 2015.5
#Get data
filestring = './data/'+shapef+'_out.csv'
df = pd.read_csv(filestring)
df = df.sort_values(['month', 'NAME'])# , 'ANC'])
df = df.reset_index(drop=True)
len(df.NAME.unique())

40

Now we examine the columns and lag the data.

In [3]:
df.columns

Index(['Unnamed: 0', 'NAME', 'Util_Indx_BBL', 'countBBL', 'countIssued',
       'month', 'SALEPRICE', 'Q_GDP', 'BIZ_Dist_Concentr',
       'GS_GRANTS_Concentr', 'LIQUOR_Concentr', 'PHARM_Concentr',
       'GROC_Concentr', 'BANKS_Concentr', 'CLUBS_Concentr', 'HOTELS_Concentr',
       'METRO_Concentr', 'pct_metro_coverage'],
      dtype='object')

In [4]:
print(df.shape)
shiftnum= (((len(df.NAME.unique()))*(shiftmonths)))

#Also generate some lagged y data in the opposite direction.
df['y']= df['countBBL'].shift(-shiftnum)
df['countBBL_prev_month'] = df['countBBL'].shift((len(df.NAME.unique())))
df['countBBL_prev_cycle'] = df['countBBL'].shift((shiftnum))
df = df[shiftnum:-(shiftnum+(len(df.NAME.unique())))]
df = df.dropna()
df.shape

(2400, 18)


(1806, 21)

The next cell cleans out vestigial columns and drops/fills/expands to dummies for our NA and categorical values.

In [5]:
df = pd.get_dummies(df, columns=['NAME'])
df = df.drop(['Unnamed: 0'], axis= 1)
print(df.shape)
df = df.astype('float')

df = df.dropna()
print(df.shape)

(1806, 59)
(1806, 59)


Here we start building our grid search inputs, beginning with the splits.

In [6]:
#Flexible adaptation of Dr. Braman's interactive gridsearch script
#implementation. 
#TODO Clean up and streamline
import sklearn
from sklearn.neural_network import *
from sklearn.neighbors import *
from sklearn.svm import *
from sklearn.gaussian_process import *
from sklearn.gaussian_process.kernels import *
from sklearn.tree import *
from sklearn.ensemble import *
from sklearn.naive_bayes import *
from sklearn.discriminant_analysis import *
from sklearn.linear_model import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
import random

#Frame up some separate DataFrames for scalar and stuff
scl_data = data = df



data     = data.reset_index(drop=True)
X = data.drop(['y'], axis=1)
y = data['y']

XH_train = data[data['month'] <= holdout_date-1]
yH_train = XH_train['y']
XH_train = XH_train.drop(['y'], axis=1)

XH_val = scl_data[scl_data['month'] >= holdout_date-1]
XH_val = XH_val[XH_val['month'] <= holdout_date]

yH_val = XH_val['y']
XH_val = XH_val.drop(['y'], axis=1)

XH_test  = data[data['month'] >= holdout_date]
yH_test  = XH_test['y']
XH_test  = XH_test.drop(['y'], axis=1)

ytr = sklearn.preprocessing.MinMaxScaler([0, 1]
            ).fit(y)
y = ytr.fit_transform(y)
y = pd.DataFrame(y, columns=['y'])

scl_data = scl_data.reset_index(drop=True)

In [7]:
y.y

0       0.080640
1       0.111329
2       0.095658
3       0.023833
4       0.290565
5       0.146915
6       0.003591
7       0.104146
8       0.105126
9       0.059092
10      0.047013
11      0.037871
12      0.027424
13      0.017630
14      0.031342
15      0.071172
16      0.068560
17      0.039177
18      0.021548
19      0.036565
20      0.091414
21      0.077375
22      0.094352
23      0.085211
24      0.112635
25      0.099576
26      0.075743
27      0.051257
28      0.055501
29      0.027751
          ...   
1776    0.174665
1777    0.244205
1778    0.235390
1779    0.142018
1780    0.133203
1781    0.083578
1782    0.139732
1783    0.249102
1784    0.233758
1785    0.158015
1786    0.135162
1787    0.164871
1788    0.339536
1789    0.290238
1790    0.389161
1791    0.336272
1792    0.457721
1793    0.388182
1794    0.334966
1795    0.307542
1796    0.183480
1797    0.140059
1798    0.159321
1799    0.153771
1800    0.113614
1801    0.202742
1802    0.093373
1803    0.1661

In [8]:
print(scl_data.month.max())
print(scl_data.shape)
scl_data = scl_data.dropna()
print(scl_data.shape)
sXH_train = scl_data[scl_data['month'] <= holdout_date-1]
syH_train = sXH_train['y']
sXH_train = sXH_train.drop(['y'], axis=1)

sXH_val = scl_data[scl_data['month'] >= holdout_date-1]
sXH_val = sXH_val[sXH_val['month'] <= holdout_date]

syH_val = sXH_val['y']
sXH_val = sXH_val.drop(['y'], axis=1)


sXH_test  = scl_data[scl_data['month'] >= holdout_date]
syH_test  = sXH_test['y']
sXH_test  = sXH_test.drop(['y'], axis=1)

2016.05
(1806, 59)
(1806, 59)


In [9]:
#Build scalers for the scl_data, other --------------------
scale_data_splits = [scl_data, sXH_train,sXH_test, syH_train, syH_test]
for scl_data in scale_data_splits:
    scaler = sklearn.preprocessing.StandardScaler(
                ).fit(scl_data)
    minmaxer = sklearn.preprocessing.MinMaxScaler([0, 1]
                ).fit(scl_data)

    scl = scaler.transform(scl_data)
    scl = minmaxer.transform(scl_data)
    try:
        scl_data = pd.DataFrame(scl, columns=scl_data.columns)
    except AttributeError as e:
        print(e)
        scl_data = pd.DataFrame(scl, columns=['y'])
    print(scl_data.shape)
    scl_data = scl_data.dropna()
    print(scl_data.shape)
    assert np.all(np.isfinite(scl_data))
    assert not np.any(np.isnan(scl_data))
    
    
#scl_data[scl_data.columns
#   ] = scaler.fit_transform(scl_data[scl_data.columns])

#----------------------------------------------------------


(1806, 59)
(1806, 59)
(1153, 58)
(1153, 58)
(192, 58)
(192, 58)
'Series' object has no attribute 'columns'
(1153, 1)
(1153, 1)
'Series' object has no attribute 'columns'
(192, 1)
(192, 1)


Let's make sure our data came out of the scalers intact:

In [10]:
y;

In [11]:
print(sXH_train.shape)
print(syH_train.shape)
print(sXH_test.shape)
print(syH_test.shape)


(1153, 58)
(1153,)
(192, 58)
(192,)


In [12]:
scl_data.columns

Index(['y'], dtype='object')

In [13]:
sX = scl_data.drop(['y'], axis=1)
sy = scl_data['y']



assert np.all(np.isfinite(X))
assert np.all(np.isfinite(y))
assert not np.any(np.isnan(X))
assert not np.any(np.isnan(y))

assert np.all(np.isfinite(sX))
assert np.all(np.isfinite(sy))
assert not np.any(np.isnan(sX))
assert not np.any(np.isnan(sy))

In [14]:
scl_data.columns

Index(['y'], dtype='object')

In [15]:
scl_data.describe()

Unnamed: 0,y
count,192.0
mean,0.217014
std,0.159491
min,0.0
25%,0.110775
50%,0.165156
75%,0.307066
max,1.0


This cell contains our a crude RNG, a list of regressors which benefit from scaled data, and hardcoded data used to generate our param_grid, et cetera.

In [16]:
#Make a short list of random states to insert into randomstate params.
scrambler = []
for scram in range(0, 10):
    scrambler.append(random.randint(0, 10000))   
print(scrambler)

to_scale = ['SVR']

names       = ['AdaBoostRegressor',
             'RandomForestRegressor',
             'SVR',
             #'KNeighborsRegressor',
             #'BaggingRegressor',
             'GradientBoostingRegressor',
             #'LinearRegression',
             #'MLPRegressor',
             #'SGDRegressor',
             'LassoLars'         
    
]

regressors = [AdaBoostRegressor(),
              RandomForestRegressor(),
              SVR(),
              #KNeighborsRegressor(),
              #BaggingRegressor(),
              GradientBoostingRegressor(),
              #LinearRegression(),
              #MLPRegressor(),
              #SGDRegressor(),
              LassoLars()
    
]

param_grids =[ 
    ['AdaBoostRegressor', dict(
        n_estimators=[80, 60, 30],
        learning_rate=[1, .5, .01],
        loss=['linear', 'square', 'exponential'],
        #random_state=scrambler[3:5]
        
    )],
        
    ['RandomForestRegressor', dict(
        max_depth=[5, 10, 15],
        criterion=['mse', 'mae'],
        #random_state=scrambler[:2]
    )],
    ['SVR', dict( #Most params for SVR are turned off right now, too expensive
        C=[1, .9],
        epsilon=[.1, .05],
        #kernel=['poly']
    )],
    ['GradientBoostingRegressor', dict(
        max_depth=[3, 6, 9, 12],
        min_samples_split=[2, 4, 8],
        presort=[False]
    )],
    ['LassoLars', dict(
        alpha=[0.1, 1, .5, .75],
        #random_state=[random.randint(0, 10000)]
    )],
    ]

[1808, 4760, 4545, 1472, 5015, 7827, 3079, 9442, 9846, 8522]


## Grid Search:

Here we implement an iterator that executes GridSearchCV and reports the best explained variance. The best_params attribute is then extracted, and used those on the whole training set, then predict on the holdout data.

Testing indicates that for some models, the fit on our full dataset modestly outperforms the CV regularly.

In [17]:
outcomes = []

for name, rgsr in zip(names, regressors):
    
    for item in param_grids:
        if item[0]==name:
            print(name + ':')
            params= item[1]
        
    
    cv = sklearn.model_selection.GridSearchCV(rgsr, param_grid=params,
                                              verbose=True, n_jobs=12,
                                              cv=3, pre_dispatch="2*n_jobs")
    
    if name not in to_scale:
        #X_train, y_train, X_test, y_test = sklearn.model_selection.train_test_split(X, y)
        fitted = cv.fit(XH_train, yH_train)
        score = cv.score(XH_val, yH_val)
        print(score)

        best = rgsr.set_params(**cv.best_params_)
        bestfit= best.fit(XH_train, yH_train)
        bestscore = best.score(XH_test, yH_test)
    if name in to_scale:
    #TODO: fix
        #X_train, y_train, X_test, y_test = sklearn.model_selection.train_test_split(sX, sy)
        fitted = cv.fit(sXH_train, syH_train)
        score = cv.score(sXH_val, syH_val)
        print(score)

        best = rgsr.set_params(**cv.best_params_)
        bestfit= best.fit(sXH_train, syH_train)
        bestscore = best.score(sXH_test, syH_test)

    print(name + " R2 with best model, score:")
    print(bestscore)
    
    outcomes.append((name, score, cv.cv_results_, cv.best_estimator_, 
                     cv.best_params_, bestscore, [yH_test, ]))
    
for nm in range(0, len(outcomes)):
    print()
    print(outcomes[nm][0])
    print(outcomes[nm][1])

    print()
    print('Best on real:')
    print(outcomes[nm][-1])
    
    

AdaBoostRegressor:
Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    4.2s
[Parallel(n_jobs=12)]: Done  81 out of  81 | elapsed:   11.5s finished


0.665054286801
AdaBoostRegressor R2 with best model, score:
0.162457941236
RandomForestRegressor:
Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=12)]: Done  14 out of  18 | elapsed:    3.2s remaining:    0.9s
[Parallel(n_jobs=12)]: Done  18 out of  18 | elapsed:    3.6s finished


0.656981418417
RandomForestRegressor R2 with best model, score:
0.262904394844
SVR:
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.7s remaining:    3.6s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    1.2s finished


-0.473240311313
SVR R2 with best model, score:
-1.2219649632
GradientBoostingRegressor:
Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=12)]: Done  36 out of  36 | elapsed:   12.9s finished


0.639589177737
GradientBoostingRegressor R2 with best model, score:
0.270727272141
LassoLars:
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.1s remaining:    0.4s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.6s finished


0.710192212102
LassoLars R2 with best model, score:
0.684282362284

AdaBoostRegressor
0.665054286801

Best on real:
[1614     880.0
1615    1126.0
1616     935.0
1617     285.0
1618    2597.0
1619    1063.0
1620    1122.0
1621     480.0
1622     656.0
1623     636.0
1624     406.0
1625     374.0
1626     248.0
1627     398.0
1628     678.0
1629     667.0
1630     431.0
1631     381.0
1632     475.0
1633     836.0
1634     738.0
1635    1033.0
1636     888.0
1637    1197.0
1638    1052.0
1639     842.0
1640     859.0
1641     520.0
1642     378.0
1643     455.0
         ...  
1776     564.0
1777     777.0
1778     750.0
1779     464.0
1780     437.0
1781     285.0
1782     457.0
1783     792.0
1784     745.0
1785     513.0
1786     443.0
1787     534.0
1788    1069.0
1789     918.0
1790    1221.0
1791    1059.0
1792    1431.0
1793    1218.0
1794    1055.0
1795     971.0
1796     591.0
1797     458.0
1798     517.0
1799     500.0
1800     377.0
1801     650.0
1802     315.0
1803     538.

# Data Analysis - ANC 

In [18]:
data.corr()['y'].sort_values()

NAME_ANC 2D           -0.157043
NAME_ANC 3G           -0.127025
NAME_ANC 8B           -0.120614
NAME_ANC 1D           -0.111773
NAME_ANC 8D           -0.110460
NAME_ANC 8E           -0.110459
NAME_ANC 7F           -0.107834
NAME_ANC 5A           -0.103933
Q_GDP                 -0.094681
NAME_ANC 7E           -0.093497
NAME_ANC 7C           -0.092511
NAME_ANC 3F           -0.087482
NAME_ANC 4A           -0.083163
NAME_ANC 3B           -0.067292
NAME_ANC 8C           -0.066835
NAME_ANC 4D           -0.066473
NAME_ANC 5B           -0.065804
NAME_ANC 3E           -0.062881
NAME_ANC 7D           -0.054043
NAME_ANC 7B           -0.053850
NAME_ANC 3D           -0.034633
NAME_ANC 8A           -0.029727
NAME_ANC 3C           -0.021149
NAME_ANC 4B           -0.001649
NAME_ANC 4C            0.004583
NAME_ANC 6E            0.012039
NAME_ANC 5D            0.035697
NAME_ANC 6D            0.036477
NAME_ANC 5C            0.057430
NAME_ANC 6A            0.060772
NAME_ANC 1C            0.069525
NAME_ANC

In [19]:
#LassoLARS
blist = outcomes[-1] #this number is how we select which regressor
print(blist[0])
prms = LassoLars(**blist[4])
prms = prms.fit(sXH_train, syH_train)
print(prms)
print()
print('Score on test data:')
print(prms.score(XH_test, yH_test))
pred  = prms.predict(XH_test)
print()
print(prms.coef_path_) #Or whatever other attribute you want

LassoLars
LassoLars(alpha=0.1, copy_X=True, eps=2.2204460492503131e-16,
     fit_intercept=True, fit_path=True, max_iter=500, normalize=True,
     positive=False, precompute='auto', verbose=False)

Score on test data:
0.684282362284

[[    0.             0.             0.             0.             0.             0.
      0.             0.             0.             0.        ]
 [    0.          6216.72201882  6256.80861373  6265.69767498
   6271.48797788  6269.59571783  6257.96561371  6235.46600107
   6139.04689839  6058.37030459]
 [    0.             0.             0.             0.             0.             0.
      0.             0.             0.             0.        ]
 [    0.             0.             0.             0.            -8.44824813
    -39.33430977   -79.52376821   -85.33632522  -100.16624932
   -106.70627799]
 [    0.             0.             0.             0.             0.             0.
      0.             0.             0.             0.        ]
 [    0.   

In [20]:
lars_anc_beta = [i for i in zip(XH_test.columns, prms.coef_)]
sorted(lars_anc_beta, key=lambda x: x[1])

[('month', -4.2591021139608349),
 ('Util_Indx_BBL', 0.0),
 ('countIssued', 0.0),
 ('SALEPRICE', 0.0),
 ('Q_GDP', 0.0),
 ('PHARM_Concentr', 0.0),
 ('GROC_Concentr', 0.0),
 ('BANKS_Concentr', 0.0),
 ('CLUBS_Concentr', 0.0),
 ('HOTELS_Concentr', 0.0),
 ('METRO_Concentr', 0.0),
 ('countBBL_prev_cycle', 0.0),
 ('NAME_ANC 1A', 0.0),
 ('NAME_ANC 1B', 0.0),
 ('NAME_ANC 1C', 0.0),
 ('NAME_ANC 1D', 0.0),
 ('NAME_ANC 2A', 0.0),
 ('NAME_ANC 2C', 0.0),
 ('NAME_ANC 2D', 0.0),
 ('NAME_ANC 2E', 0.0),
 ('NAME_ANC 3B', 0.0),
 ('NAME_ANC 3C', 0.0),
 ('NAME_ANC 3D', 0.0),
 ('NAME_ANC 3E', 0.0),
 ('NAME_ANC 3F', 0.0),
 ('NAME_ANC 3G', 0.0),
 ('NAME_ANC 4A', 0.0),
 ('NAME_ANC 4B', 0.0),
 ('NAME_ANC 4C', 0.0),
 ('NAME_ANC 4D', 0.0),
 ('NAME_ANC 5A', 0.0),
 ('NAME_ANC 5B', 0.0),
 ('NAME_ANC 5C', 0.0),
 ('NAME_ANC 5D', 0.0),
 ('NAME_ANC 5E', 0.0),
 ('NAME_ANC 6A', 0.0),
 ('NAME_ANC 6B', 0.0),
 ('NAME_ANC 6C', 0.0),
 ('NAME_ANC 6D', 0.0),
 ('NAME_ANC 6E', 0.0),
 ('NAME_ANC 7B', 0.0),
 ('NAME_ANC 7C', 0.0),
 ('N

In [21]:
#AdaBoost
blist = outcomes[0] #this number is how we select which regressor
print(blist[0])
prms = AdaBoostRegressor(**blist[4])
prms = prms.fit(sXH_train, syH_train)
print(prms)
print()
print('Score on test data:')
print(prms.score(XH_test, yH_test))
pred  = prms.predict(XH_test)
print()
print(prms.feature_importances_) #Or whatever other attribute you want

AdaBoostRegressor
AdaBoostRegressor(base_estimator=None, learning_rate=1, loss='square',
         n_estimators=80, random_state=None)

Score on test data:
0.175647234336

[  1.81794620e-03   2.93414370e-01   4.09508032e-04   5.32752129e-03
   1.49346362e-03   2.50407895e-04   2.51404405e-03   1.27202622e-03
   5.22126972e-02   4.20296912e-02   3.48410873e-02   2.80261711e-02
   2.93298365e-02   4.15368900e-02   4.18834652e-02   5.59328168e-03
   2.20146763e-01   1.49228190e-01   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   3.92691286e-02
   0.00000000e+00   0.00000000e+00   0.00000000e+00   9.40351091e-03
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
 

In [22]:
ada_anc_beta = [i for i in zip(XH_test.columns, prms.feature_importances_)]
sorted(ada_anc_beta, key=lambda x: x[1])

[('NAME_ANC 1A', 0.0),
 ('NAME_ANC 1B', 0.0),
 ('NAME_ANC 1C', 0.0),
 ('NAME_ANC 1D', 0.0),
 ('NAME_ANC 2A', 0.0),
 ('NAME_ANC 2C', 0.0),
 ('NAME_ANC 2D', 0.0),
 ('NAME_ANC 2E', 0.0),
 ('NAME_ANC 3B', 0.0),
 ('NAME_ANC 3C', 0.0),
 ('NAME_ANC 3D', 0.0),
 ('NAME_ANC 3E', 0.0),
 ('NAME_ANC 3F', 0.0),
 ('NAME_ANC 3G', 0.0),
 ('NAME_ANC 4A', 0.0),
 ('NAME_ANC 4B', 0.0),
 ('NAME_ANC 4C', 0.0),
 ('NAME_ANC 4D', 0.0),
 ('NAME_ANC 5A', 0.0),
 ('NAME_ANC 5B', 0.0),
 ('NAME_ANC 5C', 0.0),
 ('NAME_ANC 5D', 0.0),
 ('NAME_ANC 5E', 0.0),
 ('NAME_ANC 6A', 0.0),
 ('NAME_ANC 6B', 0.0),
 ('NAME_ANC 6C', 0.0),
 ('NAME_ANC 6D', 0.0),
 ('NAME_ANC 6E', 0.0),
 ('NAME_ANC 7B', 0.0),
 ('NAME_ANC 7C', 0.0),
 ('NAME_ANC 7D', 0.0),
 ('NAME_ANC 7E', 0.0),
 ('NAME_ANC 7F', 0.0),
 ('NAME_ANC 8A', 0.0),
 ('NAME_ANC 8B', 0.0),
 ('NAME_ANC 8C', 0.0),
 ('NAME_ANC 8D', 0.0),
 ('NAME_ANC 8E', 0.0),
 ('Q_GDP', 0.00025040789524748798),
 ('countIssued', 0.00040950803197946893),
 ('GS_GRANTS_Concentr', 0.0012720262171296783),


In [23]:
#RFR 
blist = outcomes[1] #this number is how we select which regressor
print(blist[0])
prms = RandomForestRegressor(**blist[4])
prms = prms.fit(sXH_train, syH_train)
print(prms)
print()
print('Score on test data:')
print(prms.score(XH_test, yH_test))
pred  = prms.predict(XH_test)
print()
print(prms.feature_importances_) #Or whatever other attribute you want

RandomForestRegressor
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

Score on test data:
0.251312055252

[  2.02283447e-03   6.60635315e-01   3.19863454e-04   3.11861042e-03
   5.45470030e-04   2.48135117e-04   8.39666823e-04   4.87389525e-04
   5.38760575e-02   5.65097495e-05   5.56233443e-02   2.47953378e-02
   3.18914424e-04   2.36092051e-04   3.24489015e-02   7.52398985e-04
   9.90851257e-02   3.22376369e-02   1.12362002e-05   1.53055984e-05
   2.88424908e-04   1.12635579e-06   8.19950879e-08   3.09755644e-02
   0.00000000e+00   1.40389122e-04   4.23306699e-07   3.20073998e-04
   4.85376039e-06   2.01018010e-06   5.02048269e-06   3.65741931e-06
   7.48038225e-06   1.91383144e-05

In [24]:
rfr_anc_beta = [i for i in zip(XH_test.columns, prms.feature_importances_)]
sorted(rfr_anc_beta, key=lambda x: x[1])

[('NAME_ANC 2C', 0.0),
 ('NAME_ANC 2A', 8.1995087905884305e-08),
 ('NAME_ANC 8B', 1.0840195467495942e-07),
 ('NAME_ANC 5D', 3.2304648317395513e-07),
 ('NAME_ANC 2E', 4.2330669915968555e-07),
 ('NAME_ANC 4C', 4.7093578091800742e-07),
 ('NAME_ANC 1D', 1.1263557943792851e-06),
 ('NAME_ANC 3C', 2.0101800963179686e-06),
 ('NAME_ANC 6B', 2.2834485010441945e-06),
 ('NAME_ANC 8A', 2.5924143289289887e-06),
 ('NAME_ANC 7E', 2.9430940330511685e-06),
 ('NAME_ANC 5A', 3.042328149762627e-06),
 ('NAME_ANC 7F', 3.6519816892099328e-06),
 ('NAME_ANC 3E', 3.6574193115077753e-06),
 ('NAME_ANC 5B', 3.8715658589832485e-06),
 ('NAME_ANC 4A', 4.2747785048551197e-06),
 ('NAME_ANC 3B', 4.8537603859206763e-06),
 ('NAME_ANC 5C', 4.9079614203675476e-06),
 ('NAME_ANC 3D', 5.0204826874746332e-06),
 ('NAME_ANC 5E', 5.9404411587467802e-06),
 ('NAME_ANC 3F', 7.4803822549498193e-06),
 ('NAME_ANC 4B', 7.7445735664377928e-06),
 ('NAME_ANC 7D', 9.035127318528327e-06),
 ('NAME_ANC 4D', 9.5495700314294917e-06),
 ('NAME_ANC 8

In [25]:
#GBR  
blist = outcomes[3] #this number is how we select which regressor
print(blist[0])
prms = GradientBoostingRegressor(**blist[4])
prms = prms.fit(sXH_train, syH_train)
print(prms)
print()
print('Score on test data:')
print(prms.score(XH_test, yH_test))
pred  = prms.predict(XH_test)
print()
print(prms.feature_importances_) #Or whatever other attribute you want

GradientBoostingRegressor
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=8,
             min_weight_fraction_leaf=0.0, n_estimators=100, presort=False,
             random_state=None, subsample=1.0, verbose=0, warm_start=False)

Score on test data:
0.276460308395

[ 0.06554758  0.17594069  0.01762628  0.16832463  0.02560154  0.03097537
  0.01940928  0.02778066  0.00151913  0.00926952  0.00829651  0.0082839
  0.01168314  0.00438671  0.00679253  0.04777374  0.10553871  0.13935976
  0.          0.          0.          0.          0.          0.00518118
  0.          0.00081705  0.01667949  0.01750668  0.00030081  0.00071238
  0.01018708  0.00586049  0.          0.00183213  0.00232989  0.
  0.00595856  0.          0.          0.          0.00129933  0.
  0.00922482  0.          0.

In [26]:
gbr_anc_beta = [i for i in zip(XH_test.columns, prms.feature_importances_)]
sorted(gbr_anc_beta, key=lambda x: x[1])

[('NAME_ANC 1A', 0.0),
 ('NAME_ANC 1B', 0.0),
 ('NAME_ANC 1C', 0.0),
 ('NAME_ANC 1D', 0.0),
 ('NAME_ANC 2A', 0.0),
 ('NAME_ANC 2C', 0.0),
 ('NAME_ANC 3F', 0.0),
 ('NAME_ANC 4B', 0.0),
 ('NAME_ANC 4D', 0.0),
 ('NAME_ANC 5A', 0.0),
 ('NAME_ANC 5B', 0.0),
 ('NAME_ANC 5D', 0.0),
 ('NAME_ANC 6A', 0.0),
 ('NAME_ANC 7C', 0.0),
 ('NAME_ANC 7D', 0.0),
 ('NAME_ANC 7E', 0.0),
 ('NAME_ANC 7F', 0.0),
 ('NAME_ANC 8A', 0.0),
 ('NAME_ANC 8B', 0.0),
 ('NAME_ANC 8D', 0.0),
 ('NAME_ANC 3B', 0.00030081177003266481),
 ('NAME_ANC 8E', 0.00055777523938999171),
 ('NAME_ANC 3C', 0.00071237899636898293),
 ('NAME_ANC 2D', 0.00081705238811436982),
 ('NAME_ANC 5C', 0.0012993277478813947),
 ('LIQUOR_Concentr', 0.0015191344268845513),
 ('NAME_ANC 6B', 0.0015655727960165915),
 ('NAME_ANC 6C', 0.00171151931160489),
 ('NAME_ANC 3G', 0.0018321263033114765),
 ('NAME_ANC 4A', 0.0023298923169682833),
 ('HOTELS_Concentr', 0.0043867072209121992),
 ('NAME_ANC 2B', 0.0051811786860568665),
 ('NAME_ANC 3E', 0.0058604935313703085

# Charts

In [None]:
#Adapted from https://pythonspot.com/en/matplotlib-bar-chart/

objects     = [j[0] for j in outcomes]
y_pos       = np.arange(len(objects))

performance = [j[-1] for j in outcomes]
for jm in range(len(performance)):
    if performance[jm] < 0:
        performance[jm] = 0
performance

In [None]:
plt.barh(y_pos, performance, align='center', alpha=0.5)
plt.yticks(y_pos, objects)
plt.xlabel('R2 Score')
ti = "Scoring across models for "+shapef+", lagging by "+str(shiftmonths)+ " months."
plt.title(ti)
fl = './plots/' + shapef + "_shift" + str(shiftmonths)
plt.savefig(fl)

# Everything below is exploratory analysis for me.

In [None]:
for jm in range(0, 5):
    
    print(outcomes[jm][0])
    
    print(outcomes[jm][1])
    print(outcomes[jm][4])

In [None]:
best = AdaBoostRegressor(learning_rate=1, loss='square', n_estimators=60)
bestfit= best.fit(XH_train, yH_train)
bestscore = best.score(XH_test, yH_test)
print(outcomes[0][0])
print(bestscore)

best = RandomForestRegressor(max_depth=10)
bestfit= best.fit(XH_train, yH_train)
bestscore = best.score(XH_test, yH_test)
print(outcomes[1][0])
print(bestscore)

best = SVR(max_depth=10)
bestfit= best.fit(XH_train, yH_train)
bestscore = best.score(XH_test, yH_test)
print(outcomes[2][0])
print(bestscore)


best = GradientBoostingRegressor(max_depth=10)
bestfit= best.fit(XH_train, yH_train)
bestscore = best.score(XH_test, yH_test)
print(outcomes[3][0])
print(bestscore)

In [None]:


Xtrain = dat_xtrain.drop(['y'], axis=1)
y16 = dat_ytrain['y']
X15 = dat15.drop(['y'], axis=1)
y15 = dat15['y']

fitted    = outcomes[-2][3].fit(X15, y15)
predicted = fitted.predict(X16)

In [None]:
pred = pd.DataFrame(predicted, columns=['predicted'])
dat16 = dat16.reset_index()
pred['y'] = dat16['y']

In [None]:
def flagger_ranges(pred):
    pred['flag15'] = 0
    pred['flag15'][pred['predicted'].between(pred['y']*0.85, pred['y']*1.15)
                  ] = 1
    pred['flag05'] = 0
    pred['flag05'][pred['predicted'].between(pred['y']*0.85, pred['y']*1.15)
                  ] = 1
    pred['flag10'] = 0
    pred['flag10'][pred['predicted'].between(pred['y']*0.85, pred['y']*1.15)
                  ] = 1
    pred['flag_others']= 0
    pred['flag_others'][pred['flag05'] == 0] = 1
    return pred
pred = flagger_ranges(pred)
pred