In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import psycopg2 as pg
%matplotlib inline
import seaborn as sns
import dbcreds

In [2]:
import sys
import pickle
from utils import model_utils
from utils import db_utils
import prepare_feature_matrix
import train_model
import test_model
import evaluate_models

  from pandas.core import datetools


In [3]:
# Open a database connection through the project's db_utils helper.
# In the cells visible here, `conn` is only referenced by the commented-out
# db_utils.write_table(...) calls in the pipeline cell, and it is never
# explicitly closed.
conn = db_utils.connect_to_db()

In [4]:
# Experiment grid consumed by the pipeline cell: every time split is combined
# with every feature set and every model, and each trained model is evaluated
# at every cutoff.
config = {
    # "type" is handed to model_utils.train_model / test_model
    # ("rf" is presumably a random forest -- confirm in model_utils).
    "models": [
        {"type": "rf"},
    ],
    # "split_date" is parsed with pd.to_datetime and the two "*_timedelta"
    # strings with pd.Timedelta before being passed to
    # model_utils.split_train_test together with "type".
    "time_splits": [
        {
            "type": "action",
            "split_date": "2016-04-30",
            "train_timedelta": "365D",
            "test_timedelta": "183D",
        },
        {
            "type": "application",
            "split_date": "2016-04-30",
            "train_timedelta": "365D",
            "test_timedelta": "365D",
        },
    ],
    # "name" is passed to prepare_feature_matrix.generate_matrix.
    "feature_sets": [
        {"name": "demographics_only"},
    ],
    # "type" and "value" are passed to evaluate_models.evaluate_models.
    "cutoffs": [
        {"type": "number", "value": 1000},
    ],
}

In [5]:
# Run pipeline
# For every (time split, feature set, model) combination in `config`: build the
# feature matrix, split it into train/test, train the model, score the test set
# and evaluate the scores at every configured cutoff.
#
# The print calls use the single-argument parenthesised form so the cell runs
# unchanged on both Python 2 and Python 3.
#
# After the loops finish, the names model_id, model_obj, model_results,
# model_performance and model_config still hold the values from the LAST
# iteration; the inspection cells further down rely on that.
for time_split in config['time_splits']:
    split_type = time_split['type']
    split_date = pd.to_datetime(time_split['split_date'])
    train_timedelta = pd.Timedelta(time_split['train_timedelta'])
    test_timedelta = pd.Timedelta(time_split['test_timedelta'])

    for feature_set in config['feature_sets']:
        feature_set_name = feature_set['name']

        # NOTE(review): generate_matrix only receives the feature-set name, yet
        # it is re-run for every time split. If split_train_test does not
        # mutate its input, the matrix could be built once per feature set.
        print("Generating Feature Matrix")
        feature_matrix = prepare_feature_matrix.generate_matrix(feature_set_name)

        print("Splitting Train/Test Sets")
        train_data, test_data = model_utils.split_train_test(
            feature_matrix, split_type, split_date, train_timedelta, test_timedelta)

        for model in config['models']:
            model_type = model['type']
            # The id embeds the current time, so it differs on every run.
            model_id = model_utils.get_time() + '_' + model_type + '_' + split_type

            print("Training Model")
            model_obj = model_utils.train_model(model_type, train_data, {})
            print("Testing Model")
            model_results = model_utils.test_model(model_type, model_obj, test_data, model_id)

            # NOTE(review): model_performance is overwritten on every cutoff;
            # while the writes below stay disabled only the last cutoff's
            # metrics survive the loop.
            for k in config['cutoffs']:
                print("Evaluating Model Performance")
                model_performance = evaluate_models.evaluate_models(model_results, k["type"], k["value"])

                print("Appending performance metrics to db and rewriting performances.csv")
                # Persistence is currently DISABLED: the message above is
                # printed but nothing is written.
                # model_performance.to_csv('/mnt/data/shared/model_output/performances.csv',mode='w',header=False, index=False)
                # db_utils.write_table(model_performance,conn, 'model_output', 'performances', if_exists='append')

            print("Getting config details and rewriting config.csv")
            # One-row frame describing this run; index=[0] is required because
            # every value in the dict is a scalar.
            model_config = pd.DataFrame({'model_id':model_id, 'split_type':split_type, 'model_type':model_type, 'feature_set':feature_set_name, 'split_date':split_date}, index=[0])
            # Pin the column order explicitly.
            model_config = model_config.reindex(columns=['model_id', 'split_type', 'model_type','feature_set','split_date'])
            # Disabled:
            # model_config.to_csv('/mnt/data/shared/model_output/configs.csv',mode='w',header=False, index=False)

            print("Getting feature importances and rewriting feature_importances.csv")
            feature_importance = model_utils.get_feature_importances(model_obj, model_type, model_id, train_data, top_n=10)
            # Disabled:
            # feature_importance.to_csv('/mnt/data/shared/model_output/feature_importances.csv',mode='w',header=False,index=False)

            print("Appending model config and feature importances to db tables")
            # Disabled:
            # db_utils.write_table(model_config,conn, 'model_output', 'configs',if_exists='append')
            # db_utils.write_table(feature_importance,conn,'model_output','feature_importances',if_exists='append')

Generating Feature Matrix
Reading data from DB
Cleaning data
Generating LTU/Non-LTU labels
Building model feature matrix
Splitting Train/Test Sets
Training Model
Testing Model
Evaluating Model Performance
Calculating performance metrics for model: 2017-07-28_17:22:13_rf_action
Appending performance metrics to db and rewriting performances.csv
Getting config details and rewriting config.csv
Getting feature importances and rewriting feature_importances.csv
Appending model config and feature importances to db tables
Generating Feature Matrix
Reading data from DB
Cleaning data
Generating LTU/Non-LTU labels
Building model feature matrix
Splitting Train/Test Sets
Training Model
Testing Model
Evaluating Model Performance
Calculating performance metrics for model: 2017-07-28_17:22:51_rf_application
Appending performance metrics to db and rewriting performances.csv
Getting config details and rewriting config.csv
Getting feature importances and rewriting feature_importances.csv
Appending model c

In [8]:
def ky_evaluate_model(pred, ltu_actual, cutoff_k):
    """Build a frame of actual labels, scores and top-k predicted labels.

    The rows are ordered by score, highest first, and the first ``cutoff_k``
    rows are flagged as predicted positives.

    Parameters
    ----------
    pred : pandas.Series
        Model scores; aligned to ``ltu_actual`` by index.
    ltu_actual : pandas.Series
        Observed labels. Its index becomes the index of the result.
    cutoff_k : int
        Number of highest-scoring rows to flag as ``True``. Values above the
        number of rows flag every row; values below zero flag none.

    Returns
    -------
    pandas.DataFrame
        Columns ``prob``, ``actual`` and ``pred``, sorted by ``prob``
        descending. Ties in ``prob`` are ordered however ``sort_values``
        leaves them.
    """
    pred_df = pd.DataFrame({'prob': pred,
                            'actual': ltu_actual},
                           index=ltu_actual.index)
    pred_df = pred_df.sort_values('prob', ascending=False)

    n_rows = pred_df.shape[0]
    # Clamp the cutoff to [0, n_rows]; otherwise the label list has the wrong
    # length and the column assignment raises ValueError.
    n_flagged = max(0, min(cutoff_k, n_rows))
    pred_df['pred'] = [True] * n_flagged + [False] * (n_rows - n_flagged)
    return pred_df

# `model_results` and `model_id` are left over from the last iteration of the
# pipeline loop. The score column is named after the model id, which contains a
# timestamp, so it is looked up via `model_id` rather than a hard-coded string
# that would raise KeyError after any re-run.
# 1000 mirrors the "number" cutoff in `config`.
actual_pred = ky_evaluate_model(model_results.loc[:, model_id],
                                model_results.loc[:, 'ltu'],
                                1000)
actual_pred.head()

Unnamed: 0,actual,prob,pred
398478,True,0.89,True
127075,True,0.84,True
548005,True,0.83,True
853369,True,0.82,True
497005,False,0.81,True


In [9]:
# Compare to what the metrics give: recompute accuracy, precision, recall and
# ROC AUC with scikit-learn from the top-k labels in `actual_pred`, to check
# them against the values reported in `model_performance`.
import sklearn.metrics

y_true = actual_pred.loc[:, 'actual']
y_pred = actual_pred.loc[:, 'pred']

# NOTE(review): roc_auc_score is deliberately given the thresholded `pred`
# labels, not the `prob` scores, so the number is comparable with the
# roc_auc_score row of `model_performance`. An AUC over the raw scores would
# use actual_pred['prob'] instead.
for metric_fn in (sklearn.metrics.accuracy_score,
                  sklearn.metrics.precision_score,
                  sklearn.metrics.recall_score,
                  sklearn.metrics.roc_auc_score):
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(metric_fn(y_true, y_pred))

0.620229714709
0.496
0.242899118511
0.546360167122


In [10]:
# Preview the scored test set left over from the last pipeline iteration:
# the `ltu` label next to the score column named after the model id.
model_results.head()

Unnamed: 0,ltu,2017-07-28_17:22:51_rf_application
0,True,0.43
358,False,0.41
1338,False,0.41
1826,False,0.23
2160,False,0.21


In [11]:
# One-row configuration frame built for the last model trained by the pipeline loop.
model_config

Unnamed: 0,model_id,split_type,model_type,feature_set,split_date
0,2017-07-28_17:22:51_rf_application,application,rf,demographics_only,2016-04-30


In [12]:
# Metrics returned by evaluate_models.evaluate_models for the last model and
# last cutoff processed by the pipeline loop.
model_performance

Unnamed: 0,model_id,k_value,k_type,metric,value
0,2017-07-28_17:22:51_rf_application,1000,number,precision,0.496
1,2017-07-28_17:22:51_rf_application,1000,number,recall,0.242899
2,2017-07-28_17:22:51_rf_application,1000,number,accuracy,0.62023
3,2017-07-28_17:22:51_rf_application,1000,number,roc_auc_score,0.54636


In [13]:
# Raw importance array of the last trained model.
# NOTE(review): presumably one value per column of the training matrix, in
# column order -- confirm against model_utils before mapping values to names.
model_obj.feature_importances_

array([ 0.11783986,  0.16749163,  0.04396353,  0.10067518,  0.10083943,
        0.01712462,  0.0170005 ,  0.00053832,  0.00053117,  0.0085803 ,
        0.00795006,  0.02120158,  0.02050117,  0.0205954 ,  0.02065509,
        0.00905354,  0.00857765,  0.01562002,  0.00329237,  0.0029707 ,
        0.01531648,  0.01303159,  0.00221121,  0.00202134,  0.00506839,
        0.0052045 ,  0.01495949,  0.01330893,  0.01539374,  0.00282139,
        0.00280188,  0.0027757 ,  0.00290221,  0.00798251,  0.00739816,
        0.0079017 ,  0.00665612,  0.00968996,  0.00411646,  0.0028048 ,
        0.00748644,  0.00996631,  0.00399704,  0.01040587,  0.01260446,
        0.00659549,  0.00626674,  0.00971677,  0.00338807,  0.00257991,
        0.00548824,  0.00820947,  0.00451697,  0.00925854,  0.01301025,
        0.00902596,  0.00104405,  0.00397511,  0.00248192,  0.00387061,
        0.00238239,  0.00273923,  0.0027618 ,  0.00615983,  0.00269987])