In [72]:
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [73]:
import pandas as pd
import numpy as np
import datetime as dt
import math
import sklearn
import json
from dateutil.relativedelta import relativedelta
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline

In [74]:
from utils import db_utils
from utils import iefp_data_utils
from utils import pandas_utils
from utils import model_utils
from utils import train_test_utils
from utils import feature_utils


# Functions definitions

In [75]:
def model_metrics(pred,ltu_actual,cutoff_k):
    pred_dict = {'prob': pred,
                'actual': ltu_actual}
    pred_df = pd.DataFrame(pred_dict, index=ltu_actual.index)
    pred_df.sort_values('prob',ascending=False,inplace=True)
    pred_df['pred'] = (([True] * cutoff_k) + ([False] * (pred_df.shape[0]-cutoff_k)))

    true_positive = float(pred_df[(pred_df['pred']) & (pred_df['actual'])].shape[0])
    false_positive = float(pred_df[(pred_df['pred']) & (np.logical_not(pred_df['actual']))].shape[0])
    true_negative = float(pred_df[(np.logical_not(pred_df['pred'])) & (np.logical_not(pred_df['actual']))].shape[0])
    false_negative = float(pred_df[(np.logical_not(pred_df['pred'])) & (pred_df['actual'])].shape[0])
    precision = true_positive/(true_positive + false_positive)
    recall = true_positive/(true_positive + false_negative)
    accuracy = (true_positive+true_negative)/(true_positive+true_negative+false_positive+false_negative)
    roc_auc_score = sklearn.metrics.roc_auc_score(pred_df['actual'], pred_df['pred'])
    ltu=true_positive+false_negative
    return (true_positive,true_negative,false_positive,false_negative,precision,recall,accuracy,roc_auc_score)

def get_confmatrix_label(predicted_label,actual_label):
    if predicted_label & actual_label:
        return 'TP'
    elif predicted_label & (not actual_label):
        return 'FP'
    elif (not predicted_label) & (not actual_label):
        return 'TN'
    else:
        return 'FN'

def evaluate_classification(pred,actual_label,cutoff_k):
    pred_dict = {'prob': pred,
                'actual': actual_label}
    pred_df = pd.DataFrame(pred_dict, index=actual_label.index)
    pred_df.sort_values('prob',ascending=False,inplace=True)
    pred_df['pred'] = (([True] * cutoff_k) + ([False] * (pred_df.shape[0]-cutoff_k)))
    pred_df['cm'] = pred_df.apply(lambda x: get_confmatrix_label(x['pred'],x['actual']),axis=1)
    
    return pred_df

# Pipeline Config

In [249]:
config_filepath = "../../src/model/pipeline-config-applength.json"
with open(config_filepath) as data_file:
    config = json.load(data_file)

# Running Pipeline

In [77]:
#Reading Data from DB
conn = db_utils.connect_to_db()
apps, movs = iefp_data_utils.get_clean_data(conn)

In [78]:
print "generating unchanging system and historical dataframes"
system_info = feature_utils.generate_system_info(apps)

generating unchanging system and historical dataframes


In [79]:
feature_set_list = config['feature_sets'][0]['list']
ltu_length = config['labels'][0]['ltu_length']
time_split = config['time_splits'][0]
model = config['models'][0]

In [80]:
split_type = time_split['type']
action_date = pd.to_datetime(time_split['action_date'])
train_timedelta = pd.Timedelta(time_split['train_timedelta'])
test_window_size = pd.Timedelta(time_split['test_window_size'])
train_st_date = action_date - train_timedelta
train_apps,test_apps,train_movs,test_movs = train_test_utils.split_train_test_apps(apps,movs,action_date,train_st_date,ltu_length) 

In [81]:
print "Generating LTU Labels"
train_labels = train_test_utils.get_ltu_label_on_date(train_apps,movs,action_date,ltu_length)
test_labels = train_test_utils.get_ltu_label_on_date(test_apps,movs,action_date + test_window_size,ltu_length)

Generating LTU Labels
Generating LTU/Non-LTU labels
Generating LTU/Non-LTU labels


In [82]:
print "Extending data"
extended_train = train_test_utils.extend_data(train_apps,movs,train_labels,action_date,pd.Timedelta('30D'))
extended_test = train_test_utils.extend_data(test_apps,movs,test_labels,action_date,None)

Extending data


In [83]:
print "Generating Features"
pd.options.mode.chained_assignment = None #turns off warning for chained assignment
train_matrix = feature_utils.generate_matrix(extended_train,train_apps,train_movs,feature_set_list,system_info)
test_matrix = feature_utils.generate_matrix(extended_test,test_apps,test_movs,feature_set_list, system_info)

Generating Features
Generating matrix base
Adding dynamics with app-length features
Generating matrix base
Adding dynamics with app-length features


In [84]:
model_id = model_utils.get_time()+'_'+model['type']+'_'+str(len(feature_set_list))+'fsets'
print "Training Model"
model_obj = model_utils.train_model(model['type'],train_matrix,{})

Training Model


In [85]:
print "Testing Model"
model_results = model_utils.test_model(model['type'],model_obj,test_matrix,model_id)

Testing Model


In [86]:
model_results.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ltu,2017-08-15_09:16:45_rf_1fsets
application_id,ref_date,Unnamed: 2_level_1,Unnamed: 3_level_1
100349,2016-04-30,True,1.0
100282,2016-04-30,True,1.0
100305,2016-04-30,True,1.0
100344,2016-04-30,True,1.0
100337,2016-04-30,True,1.0


# Model Evaluation

In [339]:
def prepare_model_results_for_analysis(model_res,applications,model_id):
    model_results_apps = model_res.reset_index().rename(columns = {model_id : 'prob'}).merge(applications, left_on='application_id',right_on='table_index')
    model_results_apps['months_in_system'] =  np.floor(pandas_utils.difftime_in_months(model_results_apps['ref_date'],model_results_apps['app_start_date']))
    model_results_apps = model_results_apps[['application_id','months_in_system','prob','ltu']]
    
    return model_results_apps

def build_test_eval_subset_df(subsets_config,model_res_apps):
    #Calculate month bucket LTU prop
    ltu_prop_by_month = model_res_apps.groupby(['months_in_system']).ltu.mean().reset_index(name='ltu_prop')
    ltu_prop_overall = model_res_apps.ltu.mean()
    
    #Filter months in test set based on subset type and value
    test_subsets = []
    
    for subset_config in subsets_config:
        subset_type = subset_config['type']
        subset_cutoff = subset_config['cutoff']
    
        if (subset_type == 'month_thresh'):
            test_subsets.append({'subset_type':subset_type, 'subset_cutoff': subset_cutoff, 'month_thresh':subset_cutoff})
        elif (subset_type == 'ltu_prop_weight'):
            month_thresh = ltu_prop_by_month[ltu_prop_by_month['ltu_prop'] < ltu_prop_overall*subset_cutoff]['months_in_system'].max()
            test_subsets.append({'subset_type':subset_type, 'subset_cutoff': subset_cutoff, 'month_thresh':month_thresh})
        else:
            print 'Warning: Invalid subset type:', subset_type
        
    return pd.DataFrame(test_subsets).reindex(columns = ['subset_type','subset_cutoff','month_thresh'])

def model_metrics(pred,ltu_actual,cutoff_k):
    pred_dict = {'prob': pred,
                'actual': ltu_actual}
    pred_df = pd.DataFrame(pred_dict, index=ltu_actual.index)
    pred_df.sort_values('prob',ascending=False,inplace=True)
    pred_df['pred'] = (([True] * cutoff_k) + ([False] * (pred_df.shape[0]-cutoff_k)))

    true_positive = float(pred_df[(pred_df['pred']) & (pred_df['actual'])].shape[0])
    false_positive = float(pred_df[(pred_df['pred']) & (np.logical_not(pred_df['actual']))].shape[0])
    true_negative = float(pred_df[(np.logical_not(pred_df['pred'])) & (np.logical_not(pred_df['actual']))].shape[0])
    false_negative = float(pred_df[(np.logical_not(pred_df['pred'])) & (pred_df['actual'])].shape[0])
    precision = true_positive/(true_positive + false_positive)
    recall = true_positive/(true_positive + false_negative)
    accuracy = (true_positive+true_negative)/(true_positive+true_negative+false_positive+false_negative)
    roc_auc_score = sklearn.metrics.roc_auc_score(pred_df['actual'], pred_df['pred'])
    ltu=true_positive+false_negative
    return (true_positive,true_negative,false_positive,false_negative,precision,recall,accuracy,roc_auc_score)

def evaluate_model(model_results, k_cutoffs, test_subsets, model_id):
    
    model_performance = pd.DataFrame(columns=['model_id','k_value','k_type','metric','subset_type','subset_cutoff','value'])

    print "Calculating performance metrics for model: "+model_id

    model_performances_list = []
    
    for index, row in test_subsets.iterrows():
        subset_type = row['subset_type']
        subset_cutoff = row['subset_cutoff']
        month_threshold = row['month_thresh']
        
        for k in k_cutoffs:
            k_type = k['type']
            k_value = k['value']

            if k_type == "number":
                cutoff_number = k_value
            else:
                cutoff_number = 1 #placeholder; want to replace with function to convert a percentage to a number k
            model_results_subset = model_results[model_results['months_in_system'] <= month_threshold]
            subset_ltu_prop = model_results_subset.ltu.mean()
            subset_size = model_results_subset.shape[0]
            tp,tn,fp,fn,precision,recall,accuracy,roc_auc_score=model_metrics(model_results_subset.loc[:, 'prob'], model_results_subset.loc[:, 'ltu'], cutoff_number)
            
            model_performances_list.append({'model_id':model_id, 'k_value':k_value, 'k_type':k_type, 'metric':'precision', 'subset_type': subset_type, 'subset_cutoff': subset_cutoff, 'month_threshold': month_threshold,'subset_size': subset_size, 'subset_ltu_prop': subset_ltu_prop, 'value':precision})
            model_performances_list.append({'model_id':model_id, 'k_value':k_value, 'k_type':k_type, 'metric':'recall', 'subset_type': subset_type, 'subset_cutoff': subset_cutoff, 'month_threshold': month_threshold,'subset_size': subset_size, 'subset_ltu_prop': subset_ltu_prop, 'value':recall})
            model_performances_list.append({'model_id':model_id, 'k_value':k_value, 'k_type':k_type, 'metric':'accuracy', 'subset_type': subset_type, 'subset_cutoff': subset_cutoff, 'month_threshold': month_threshold,'subset_size': subset_size, 'subset_ltu_prop': subset_ltu_prop, 'value':accuracy})
            model_performances_list.append({'model_id':model_id, 'k_value':k_value, 'k_type':k_type, 'metric':'roc_auc_score', 'subset_type': subset_type, 'subset_cutoff': subset_cutoff, 'month_threshold': month_threshold,'subset_size': subset_size, 'subset_ltu_prop': subset_ltu_prop, 'value':roc_auc_score})

    model_performance = pd.DataFrame(model_performances_list)
    model_performance = model_performance.reindex(columns = ['model_id','k_value','k_type','metric','subset_type','subset_cutoff','month_threshold','subset_size','subset_ltu_prop','value'])
        
    return model_performance.reset_index(drop=True)

In [340]:
model_res_apps = prepare_model_results_for_analysis(model_results,apps,model_id)

In [341]:
model_res_apps.head()

Unnamed: 0,application_id,months_in_system,prob,ltu
0,100349,11.0,1.0,True
1,100282,11.0,1.0,True
2,100305,11.0,1.0,True
3,100344,11.0,1.0,True
4,100337,11.0,1.0,True


In [342]:
test_subsets = build_test_eval_subset_df(config['data_subsets'],model_res_apps)

In [343]:
test_subsets

Unnamed: 0,subset_type,subset_cutoff,month_thresh
0,month_thresh,4.0,4.0
1,month_thresh,6.0,6.0
2,month_thresh,12.0,12.0
3,ltu_prop_weight,1.5,9.0


In [344]:
model_performance = evaluate_model(model_results_apps, config['cutoffs'], test_subsets, model_id)

Calculating performance metrics for model: 2017-08-15_09:16:45_rf_1fsets


In [346]:
model_performance.head()

Unnamed: 0,model_id,k_value,k_type,metric,subset_type,subset_cutoff,month_threshold,subset_size,subset_ltu_prop,value
0,2017-08-15_09:16:45_rf_1fsets,50,number,precision,month_thresh,4.0,4.0,3863,0.391665,0.68
1,2017-08-15_09:16:45_rf_1fsets,50,number,recall,month_thresh,4.0,4.0,3863,0.391665,0.022472
2,2017-08-15_09:16:45_rf_1fsets,50,number,accuracy,month_thresh,4.0,4.0,3863,0.391665,0.612995
3,2017-08-15_09:16:45_rf_1fsets,50,number,roc_auc_score,month_thresh,4.0,4.0,3863,0.391665,0.507832
4,2017-08-15_09:16:45_rf_1fsets,100,number,precision,month_thresh,4.0,4.0,3863,0.391665,0.63


In [331]:
db_utils.write_table(model_performance,conn, 'model_output', 'performances2',if_exists='append')

In [246]:
month_subset_4 = build_test_eval_subset_df(config['data_subsets'],model_results,apps)
month_subset_6 = prepare_test_eval_subset('month_subset',6,model_results,apps)
month_subset_12 = prepare_test_eval_subset('month_subset',12,model_results,apps)

ltu_prop_subset_1_5 = prepare_test_eval_subset('ltu_prop',1.5,model_results,apps)

In [247]:
print(month_subset_4.describe())
print(month_subset_6.describe())
print(month_subset_12.describe())

print(ltu_prop_subset_1_2.describe())
print(ltu_prop_subset_1_3.describe())
print(ltu_prop_subset_1_5.describe())

       application_id  months_in_system         prob
count     3863.000000       3863.000000  3863.000000
mean    110910.451463          1.780999     0.317886
std       2072.393517          1.350406     0.130651
min      71223.000000          0.000000     0.000000
25%     109655.000000          1.000000     0.229770
50%     111076.000000          2.000000     0.277114
75%     112247.500000          3.000000     0.394026
max     123789.000000          4.000000     0.990952
       application_id  months_in_system         prob
count     4905.000000       4905.000000  4905.000000
mean    110087.320082          2.575535     0.379056
std       2509.440011          1.957036     0.180699
min      71223.000000          0.000000     0.000000
25%     108329.000000          1.000000     0.243987
50%     110411.000000          2.000000     0.305397
75%     111948.000000          4.000000     0.521276
max     124952.000000          6.000000     1.000000
       application_id  months_in_system       

In [222]:
model_results_apps = model_results.reset_index().rename(columns = {model_id : 'prob'}).merge(apps, left_on='application_id',right_on='table_index')
model_results_apps['months_in_system2'] =  model_results_apps.apply(lambda x: relativedelta(x['ref_date'],x['app_start_date']).months, axis=1)
model_results_apps['months_in_system'] =  np.floor(pandas_utils.difftime_in_months(model_results_apps['ref_date'],model_results_apps['app_start_date']))
model_results_apps = model_results_apps[['application_id','months_in_system','months_in_system2','prob','ltu']]

In [225]:
model_results_apps.describe()

Unnamed: 0,application_id,months_in_system,months_in_system2,prob
count,6899.0,6899.0,6899.0,6899.0
mean,108151.689375,4.358748,4.373242,0.508167
std,4004.302389,3.335434,3.332669,0.26853
min,70322.0,0.0,0.0,0.0
25%,105190.0,2.0,2.0,0.269502
50%,108847.0,4.0,4.0,0.455292
75%,111389.0,7.0,7.0,0.742563
max,124999.0,11.0,11.0,1.0


In [214]:
type(model_results_apps['months_in_system'][0])

numpy.float64

In [242]:
print evaluate_model(month_subset_4, 'number', 1000, model_id,'month_subset',4)
print evaluate_model(month_subset_6, 'number', 1000, model_id,'month_subset',6)
print evaluate_model(month_subset_12, 'number', 1000, model_id,'month_subset',12)
print evaluate_model(ltu_prop_subset_1_5, 'number', 1000, model_id,'ltu_prop',1.5)

Calculating performance metrics for model: 2017-08-15_09:16:45_rf_1fsets
                        model_id k_value  k_type         metric   subset_type  \
0  2017-08-15_09:16:45_rf_1fsets    1000  number      precision  month_subset   
1  2017-08-15_09:16:45_rf_1fsets    1000  number         recall  month_subset   
2  2017-08-15_09:16:45_rf_1fsets    1000  number       accuracy  month_subset   
3  2017-08-15_09:16:45_rf_1fsets    1000  number  roc_auc_score  month_subset   

  subset_cutoff     value  
0             4  0.553000  
1             4  0.365499  
2             4  0.635775  
3             4  0.587643  
Calculating performance metrics for model: 2017-08-15_09:16:45_rf_1fsets
                        model_id k_value  k_type         metric   subset_type  \
0  2017-08-15_09:16:45_rf_1fsets    1000  number      precision  month_subset   
1  2017-08-15_09:16:45_rf_1fsets    1000  number         recall  month_subset   
2  2017-08-15_09:16:45_rf_1fsets    1000  number       accuracy  

In [179]:
#Calculate month bucket LTU prop
ltu_prop_by_month = model_results_apps.groupby(['months_in_system']).ltu.mean().reset_index(name='ltu_prop')
ltu_prop_overall = model_results.ltu.mean()

In [180]:
print(ltu_prop_by_month)
print(ltu_prop_overall)

    months_in_system  ltu_prop
0                  0  0.317497
1                  1  0.326365
2                  2  0.352668
3                  3  0.480570
4                  4  0.567568
5                  5  0.579350
6                  6  0.614068
7                  7  0.716094
8                  8  0.800000
9                  9  0.805492
10                10  0.888571
11                11  0.980066
0.547760545007


In [187]:
ltu_prop_coef = 1.3
unbiased_months = ltu_prop_by_month[ltu_prop_by_month['ltu_prop'] < ltu_prop_overall*ltu_prop_coef]['months_in_system']
unbiased_set = model_results_apps[model_results_apps['months_in_system'].isin(unbiased_months)]

In [188]:
ltu_prop_overall*ltu_prop_coef

0.71208870850847961

In [189]:
unbiased_months

0    0
1    1
2    2
3    3
4    4
5    5
6    6
Name: months_in_system, dtype: int64

In [190]:
unbiased_set.head()

Unnamed: 0,application_id,months_in_system,prob,ltu
2011,105859,6,0.636924,True
2012,105860,6,0.71835,False
2013,105799,6,0.723506,True
2014,105812,6,0.688167,True
2015,105826,6,0.775273,True


In [191]:
unbiased_set = model_results_apps[model_results_apps['months_in_system'] <= 4]

In [192]:
unbiased_set.head()

Unnamed: 0,application_id,months_in_system,prob,ltu
3060,108004,4,0.602142,False
3061,108070,4,0.564362,True
3062,108002,4,0.554146,True
3063,108052,4,0.427,True
3064,108031,4,0.554146,True


In [239]:


def get_confmatrix_label(predicted_label,actual_label):
    if predicted_label & actual_label:
        return 'TP'
    elif predicted_label & (not actual_label):
        return 'FP'
    elif (not predicted_label) & (not actual_label):
        return 'TN'
    else:
        return 'FN'

def evaluate_classification(pred,actual_label,cutoff_k):
    pred_dict = {'prob': pred,
                'actual': actual_label}
    pred_df = pd.DataFrame(pred_dict, index=actual_label.index)
    pred_df.sort_values('prob',ascending=False,inplace=True)
    pred_df['pred'] = (([True] * cutoff_k) + ([False] * (pred_df.shape[0]-cutoff_k)))
    pred_df['cm'] = pred_df.apply(lambda x: get_confmatrix_label(x['pred'],x['actual']),axis=1)
    
    return pred_df

In [118]:
model_eval = evaluate_model(model_results, 'number', 1000)

Calculating performance metrics for model: 2017-08-15_09:16:45_rf_1fsets


In [119]:
model_eval.head()

Unnamed: 0,model_id,k_value,k_type,metric,value
0,2017-08-15_09:16:45_rf_1fsets,1000,number,precision,0.88
1,2017-08-15_09:16:45_rf_1fsets,1000,number,recall,0.232866
2,2017-08-15_09:16:45_rf_1fsets,1000,number,accuracy,0.5624
3,2017-08-15_09:16:45_rf_1fsets,1000,number,roc_auc_score,0.597202


In [70]:
res_subset = evaluate_model(model_results,apps,'all',0.0,model_id)

In [71]:
res_subset.head()

Unnamed: 0,application_id,ref_date,ltu,prob,anomes,mo_data_movimento,cfreguesia,dfreguesia,ctipo_movimento,dtipo_movimento,...,candidatura_data_ppe,ute_idade,intervencoes,sub_data_inicio,sub_data_fim,sub_data_extincao,sub_data_suspensao,table_index,app_start_date,months_since_app
0,100349,2016-04-30,True,1.0,201505,2015-05-04 16:53:22,110501,ALCABIDECHE,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,...,2013-01-14 00:00:00,53,1,2002-06-06 00:00:00,2004-05-26 00:00:00,2004-04-08 00:00:00,,100349,2015-05-04,11.893468
1,100282,2016-04-30,True,1.0,201505,2015-05-04 09:42:11,110507,U.F. DE CARCAVELOS E PAREDE,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,...,2009-04-06 00:00:00,42,1,,,,,100282,2015-05-04,11.893468
2,100305,2016-04-30,True,1.0,201505,2015-05-04 11:54:43,110501,ALCABIDECHE,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,...,2013-08-27 00:00:00,22,1,2013-08-27 00:00:00,2014-05-24 00:00:00,2014-01-23 00:00:00,2014-01-24 00:00:00,100305,2015-05-04,11.893468
3,100344,2016-04-30,True,1.0,201505,2015-05-04 16:18:30,110508,U.F. DE CASCAIS E ESTORIL,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,...,2009-11-04 00:00:00,50,1,,,,,100344,2015-05-04,11.893468
4,100337,2016-04-30,True,1.0,201505,2015-05-04 15:31:20,110506,SÃO DOMINGOS DE RANA,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,...,2013-12-27 00:00:00,22,1,,,,,100337,2015-05-04,11.893468


In [21]:
#Checking classes order in prediction result
print model_obj.classes_

[ 0.  1.]


In [45]:
classif_eval = evaluate_classification(model_results.loc[:,model_id], model_results.loc[:, 'ltu'], 2000).reset_index()

In [47]:
print classif_eval.shape
classif_eval.head()

(6899, 6)


Unnamed: 0,application_id,ref_date,actual,prob,pred,cm
0,100349,2016-04-30,True,1.0,True,TP
1,100960,2016-04-30,True,1.0,True,TP
2,100864,2016-04-30,True,1.0,True,TP
3,100887,2016-04-30,True,1.0,True,TP
4,103187,2016-04-30,True,1.0,True,TP


In [54]:
print test_matrix.shape
raw_test_matrix = test_matrix.reset_index()
raw_test_matrix.head()

(6899, 8)


Unnamed: 0,application_id,ref_date,ltu,apps_so_far,cat_changes_so_far,convocations_so_far,interventions_so_far,interviews_so_far,movements,months_so_far
0,100349,2016-04-30,True,1.0,0.0,5.0,3.0,2.0,11.0,12
1,100282,2016-04-30,True,1.0,0.0,2.0,5.0,0.0,8.0,12
2,100305,2016-04-30,True,1.0,0.0,0.0,0.0,0.0,1.0,12
3,100344,2016-04-30,True,1.0,0.0,2.0,1.0,0.0,4.0,12
4,100337,2016-04-30,True,1.0,2.0,4.0,2.0,0.0,9.0,12


In [59]:
test_results = classif_eval.merge(apps, left_on='application_id',right_on='table_index')[['application_id','app_start_date','ref_date','prob']].sort_values('prob', ascending=False)
test_results['months_since_app'] = pandas_utils.difftime_in_months(test_results['ref_date'],test_results['app_start_date']).astype(int)

In [60]:
test_results[500:550]

Unnamed: 0,application_id,app_start_date,ref_date,prob,months_since_app
518,101698,2015-06-11,2016-04-30,0.958229,10
519,101827,2015-06-15,2016-04-30,0.958229,10
520,102287,2015-06-12,2016-04-30,0.958229,10
521,101784,2015-06-12,2016-04-30,0.958229,10
522,101675,2015-06-09,2016-04-30,0.958229,10
523,101745,2015-06-11,2016-04-30,0.958229,10
509,102268,2015-06-29,2016-04-30,0.958229,10
510,102292,2015-06-29,2016-04-30,0.958229,10
494,102044,2015-06-19,2016-04-30,0.958229,10
508,102244,2015-06-29,2016-04-30,0.958229,10


In [None]:
applications_by_months_so_far = test_results.groupby(['months_so_far'])['application_id'].count().reset_index(name='total_count')
applications_by_months_so_far

In [None]:
cm_by_months_so_far = test_results.groupby(['cm','months_so_far'])['application_id'].count().reset_index(name='count')
cm_by_months_so_far

In [None]:
norm_cm_by_months_so_far = cm_by_months_so_far.merge(applications_by_months_so_far)
norm_cm_by_months_so_far.head()

In [None]:
norm_cm_by_months_so_far['proportion'] = norm_cm_by_months_so_far['count']/norm_cm_by_months_so_far['total_count']
norm_cm_by_months_so_far.head()

In [None]:
for cm_label in ['TP','FP','TN','FN']:
    cm_hist = norm_cm_by_months_so_far[norm_cm_by_months_so_far['cm'] ==  cm_label][['months_so_far','proportion']]
    cm_hist.plot(
        x='months_so_far',
        y='proportion',
        kind='bar',
        legend=False,
        title="Normalized Proportion of months_so_far in " + cm_label).set_ylabel('Proportion within group (%)')

In [None]:
applications_by_months_so_far.plot(
        x='months_so_far',
        y='total_count',
        kind='bar',
        legend=False,
        title="Proportion of months_so_far in test set").set_ylabel('Total Count')

In [None]:
test_matrix.time_so_far.describe()

In [None]:
test_matrix.months_so_far.describe()

In [None]:
classif_eval.groupby('cm').count()

In [None]:
#Analyze Distribution of months since application in TP and FN


In [None]:
classif_eval.shape

In [None]:
classif_eval.head()

# Testing Features Matrix

In [None]:
train_matrix.columns.to_series().groupby(train_matrix.dtypes).groups

In [None]:
test_matrix.head()

# Testing Extended Datasets

In [None]:
extended_train.head()

In [None]:
extended_test.head()

In [None]:
extended_train[['application_id','ltu']].drop_duplicates().ltu.sum()

In [None]:
extended_test[['application_id','ltu']].drop_duplicates().ltu.sum()

In [None]:
train_labels.groupby('ltu').count()/train_labels.shape[0]

In [None]:
test_labels.groupby('ltu').count()/test_labels.shape[0]

In [None]:
train_app_length = get_app_length(train_apps,movs,test_st_date)

In [None]:
train_app_length.describe()

In [None]:
test_app_length = get_app_length(test_apps,movs,test_st_date)

In [None]:
test_app_length.describe()

In [None]:
train_apps.shape

In [None]:
test_apps.shape