In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time, datetime
import data_wrangling as dw

from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2, SelectKBest, f_classif
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.svm import LinearSVC



In [2]:
def time_mask(df, key = 'Proc.Start.Date', value = '01/01/08'):
    """Return the element-wise comparison ``df[key] >= cutoff``.

    Parameters
    ----------
    df : pandas.DataFrame (or anything indexable by ``key``)
        Data whose ``key`` column holds values comparable to a POSIX
        timestamp in seconds.
    key : str
        Label of the column to compare against the cutoff.
    value : str
        Cutoff date written as ``dd/mm/yy``.

    Returns
    -------
    The result of ``df[key] >= cutoff`` (a boolean Series for a DataFrame).
    """
    cutoff_date = datetime.datetime.strptime(value, '%d/%m/%y')
    # time.mktime reads the struct_time as local time, so the cutoff is
    # midnight of that date in the machine's timezone.
    cutoff_epoch = time.mktime(cutoff_date.timetuple())
    return df[key] >= cutoff_epoch

In [3]:
def split_df(df):
    """Split ``df`` into train/test parts using the ``time_mask`` cutoff.

    Rows where ``time_mask(df)`` is False form the training set; rows where
    it is True form the test set. In both parts the ``'Grant.Status'`` column
    becomes the target vector, and ``'Grant.Status'`` and ``'Start.date'`` are
    removed from the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'Grant.Status'``, ``'Start.date'`` and whichever column
        ``time_mask`` reads by default. ``df`` itself is not modified.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, finalDf_test, finalDf_train)``.
        The ``X_*``/``y_*`` items are numpy arrays; the two frames are the
        feature DataFrames (target and ``'Start.date'`` already dropped).
        Note the frames come back in test-then-train order.
    """
    def _features_and_target(frame):
        # One place for the column handling shared by both partitions.
        target = frame['Grant.Status'].values
        # drop() returns a new frame; deleting columns from a slice of `df`
        # is what triggers pandas' SettingWithCopy warning.
        features = frame.drop(['Grant.Status', 'Start.date'], axis=1)
        return features, target

    mask = time_mask(df)

    # `~` is the boolean NOT; unary minus is not a valid negation for
    # plain numpy boolean arrays.
    finalDf_train, y_train = _features_and_target(df[~mask])
    finalDf_test, y_test = _features_and_target(df[mask])

    X_train = finalDf_train.values
    X_test = finalDf_test.values

    return X_train, y_train, X_test, y_test, finalDf_test, finalDf_train

In [4]:
# Load the source tables and preprocess them with the project's
# data_wrangling helpers (their exact behaviour is defined in that module,
# not in this notebook), then split the result into train/test arrays and
# the matching feature frames.
orig = dw.get_tables()
munged = dw.munge_data(orig)
X_train, y_train, X_test, y_test, finalDf_test, finalDf_train = split_df(munged)

  if self.run_code(code, result):


In [72]:
def testing(X, y, X_holdout = None, y_holdout = None, a = None):
    if(a != None):
        train_labels = np.arange(X.shape[0])
        test_labels = np.arange(X_holdout.shape[0]) + X.shape[0]
        X = np.concatenate((X, X_holdout), axis = 0)
        y = np.concatenate((y, y_holdout), axis = 0)
        cv_custom = [(train_labels, test_labels)]
    estimators = [
            ('scale_predictors', StandardScaler()),
            #('feature_selector', LinearSVC(penalty='l1', dual=False)),
            #('feature_selector', SelectKBest(score_func=f_classif)),
            #('linearSVC', LinearSVC())
            ('randomforests', RandomForestClassifier())
            ]
    clf = Pipeline(estimators)
    params = dict(
            #linearSVC__C=[0.1, 1, 10],
            randomforests__max_depth=[5, 10, None], 
            randomforests__n_estimators=[10, 50, 100,1000,10000], 
            #feature_selector__C=[0.1, 1, 10]
            #feature_selector__score_func=[chi2],
            #feature_selector__k=[5, 10, 'all'] 
            )
    if(a != None):
        grid_search = GridSearchCV(clf, param_grid=params, cv=cv_custom, scoring = 'roc_auc', n_jobs = 6)
    else:
        grid_search = GridSearchCV(clf, param_grid=params, scoring = 'roc_auc', n_jobs = 6)
    grid_search.fit(X, y)
    return grid_search

In [73]:
def performance(results, param1, param2):
    """Pivot mean validation scores over two grid-search parameters.

    Parameters
    ----------
    results : object with a ``grid_scores_`` sequence
        Each entry must expose a ``parameters`` mapping and a
        ``mean_validation_score`` value.
    param1 : str
        Parameter name used for the rows of the table.
    param2 : str
        Parameter name used for the columns of the table.

    Returns
    -------
    pandas.DataFrame
        Table of mean scores indexed by ``param1`` with ``param2`` as columns.
        Missing parameter values (e.g. ``None``) appear as the string 'None'.
    """
    rows = [
        (entry.parameters[param1], entry.parameters[param2], entry.mean_validation_score)
        for entry in results.grid_scores_
    ]
    scores = pd.DataFrame(rows, columns = [param1, param2, 'means'])
    # A None setting is stored as NaN; label it so the pivot keeps that group.
    scores = scores.fillna('None')
    return scores.pivot_table(values = 'means', index = param1, columns = param2)

In [68]:
%matplotlib inline

%pylab

import matplotlib.pyplot as plt

plt.close('all')
pylab.rcParams['figure.figsize'] = (30.0, 30.0)

offset = 12*0
n = 24
cols = 3
myrange = finalDf_train.columns[list(range(6))+list(range(10, 14))+list(range(15, 20))+[37, 38, 42, 58, 63, 68, 69, 70, 76]]
fig, sbp = plt.subplots(n//cols, cols)
fig.tight_layout()
for i, l in enumerate(myrange[offset:offset+n]):
    #print(i,l)
    sbp[i //cols][i % cols].set_title('{}: '.format(i+offset)+l, size=30)
    finalDf_train[l].hist(ax=sbp[i //cols ][i % cols], bins = 30)

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [74]:
# Run the grid search twice: once with GridSearchCV's default cross-validation
# on the training rows, and once scored on the holdout rows (any non-None `a`
# puts testing() into holdout mode).
model_on_train = testing(X_train, y_train)
model_on_test = testing(X_train, y_train, X_holdout=X_test, y_holdout=y_test, a = 2)

# Bare expressions in the middle of a cell are never displayed, so print them.
print('best ROC AUC (CV on train): {}'.format(model_on_train.best_score_))
print('best ROC AUC (holdout):     {}'.format(model_on_test.best_score_))

# performance() builds a pivot table, so it can only take 2 parameters.
# Sorting the names makes the row/column orientation independent of dict
# iteration order and identical for both tables.
train_params = sorted(model_on_train.param_grid)
train_results = performance(model_on_train, train_params[0], train_params[1])
test_params = sorted(model_on_test.param_grid)
test_results = performance(model_on_test, test_params[0], test_params[1])
print(train_results)
print(test_results)

randomforests__n_estimators     10        50        100       1000      10000
randomforests__max_depth                                                     
5.0                          0.828633  0.854014  0.870623  0.881781  0.879510
10.0                         0.852909  0.886942  0.887675  0.895702  0.894369
None                         0.846546  0.879259  0.881840  0.889541  0.889702
randomforests__n_estimators     10        50        100       1000      10000
randomforests__max_depth                                                     
5.0                          0.849572  0.886786  0.888438  0.885124  0.887149
10.0                         0.868093  0.900456  0.897179  0.909500  0.910132
None                         0.823304  0.900249  0.909848  0.910488  0.910219
