In [1]:
from collections import ChainMap
from collections import defaultdict

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from util import load_data, fdr, plot_report
# Let pandas display up to 100 rows before truncating DataFrame output.
pd.set_option('display.max_rows', 100)

In [2]:
# util.load_data returns six objects for 'var_30.csv': feature sets and labels
# for three splits (train / test / oot).
# NOTE(review): "oot" is presumably an out-of-time holdout — confirm in util.load_data.
x_train, x_test, x_oot, y_train, y_test, y_oot = load_data('var_30.csv')
# Show the first rows of the training features as a quick sanity check.
x_train.head()

Unnamed: 0,ssn_count_3,fulladdress_count_0,address_count_0,name_dob_count_3,homephone_count_14,name_dob_count_7,name_day_since,ssn_firstname_count_0_by_14,name_count_7,fulladdress_homephone_count_0_by_14,...,ssn_count_0_by_14,ssn_firstname_count_7,ssn_count_7,fulladdress_count_1_by_14,fulladdress_homephone_count_7,ssn_name_dob_count_7,name_count_14,ssn_firstname_count_0_by_30,ssn_lastname_count_0_by_30,ssn_count_14
722292,-0.047591,-0.041351,-0.042158,-0.046415,-0.437943,-0.055801,-0.66118,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
313122,-0.047591,-0.041351,-0.042158,-0.046415,-1.011308,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
698952,-0.047591,-0.041351,-0.042158,-0.046415,-1.011308,-0.055801,-1.533,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
739743,-0.047591,-0.041351,-0.042158,-0.046415,-0.437943,-0.055801,-1.309456,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
594226,-0.047591,-0.041351,-0.042158,-0.046415,-0.437943,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421


In [3]:
# A random forest is stochastic (bootstrap samples and per-split feature
# subsets). Pin random_state so the fitted model, and every report computed
# from it, is identical on each Restart & Run All.
rf = RandomForestClassifier(n_estimators=150,
                            max_depth=20,
                            max_features=10,
                            ccp_alpha=1e-6,
                            min_samples_leaf=2,
                            random_state=42,
                            n_jobs=8).fit(x_train, y_train)

In [8]:
def generate_report(clf, x, y, n_bins=20, bin_fraction=0.01):
    """Build a cumulative gains table over the highest-scored records.

    Records are ranked by the classifier's probability for its last class
    (``predict_proba(x)[:, -1]``), highest first, and the top of the ranking is
    cut into ``n_bins`` consecutive bins of ``int(bin_fraction * len(y))``
    records each. Records with equal scores keep their original relative order.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict_proba(x)``.
    x : array-like
        Feature matrix passed straight to ``clf.predict_proba``.
    y : array-like
        Labels aligned positionally with ``x``; entries equal to 1 are counted
        as fraud ("bads"), everything else as "goods".
    n_bins : int, default 20
        Number of bins (rows) in the report.
    bin_fraction : float, default 0.01
        Fraction of the population placed in each bin.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``percentile_bin`` (1..n_bins) with per-bin counts and
        percentages, their cumulative counterparts, and ``FPR``. Note that
        ``FPR`` here is cumulative goods divided by cumulative bads (goods
        flagged per bad caught), not a false-positive *rate*.

    Raises
    ------
    ValueError
        If ``n_bins`` is smaller than 1 or ``bin_fraction`` is not in (0, 1].

    Notes
    -----
    Ratios are plain pandas divisions, so a zero denominator (no fraud in
    ``y``, or fewer than ``1 / bin_fraction`` records) yields NaN/inf rather
    than raising.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be at least 1")
    if not 0 < bin_fraction <= 1:
        raise ValueError("bin_fraction must be in (0, 1]")

    # 1-D score and label vectors. Building a single array out of
    # (1-element array, scalar) pairs is ragged and raises on NumPy >= 1.24.
    scores = np.asarray(clf.predict_proba(x))[:, -1]
    is_bad = np.asarray(y).ravel() == 1

    num_total_record = len(is_bad)
    num_total_fraud = int(is_bad.sum())
    num_records = int(bin_fraction * num_total_record)

    # Stable sort on the negated score == descending order with ties left in
    # input order, which is what sorted(..., reverse=True) produces.
    ranking = np.argsort(-scores, kind='stable')
    ranked_bad = is_bad[ranking]

    bads_per_bin = [
        int(ranked_bad[i * num_records:(i + 1) * num_records].sum())
        for i in range(n_bins)
    ]

    stat = pd.DataFrame({'records': [num_records] * n_bins,
                         'bads': bads_per_bin})
    stat['total_records'] = stat.records.cumsum()
    stat['total_bads'] = stat.bads.cumsum()

    stat['percentile_bin'] = range(1, n_bins + 1)
    stat = stat.set_index('percentile_bin')

    stat['goods'] = stat.records - stat.bads
    stat['percent_goods'] = (stat.goods / stat.records * 100).round(2)
    # Round again: 100 - 97.48 would otherwise surface as 2.5199999999999996.
    stat['percent_bads'] = (100 - stat.percent_goods).round(2)

    stat['total_goods'] = stat.total_records - stat.total_bads
    stat['percent_total_bads'] = (stat.total_bads / num_total_fraud * 100).round(2)
    stat['percent_total_goods'] = (
        stat.total_goods / (num_total_record - num_total_fraud) * 100
    ).round(2)

    # Goods flagged per bad caught, cumulatively (see docstring).
    stat['FPR'] = (stat.total_goods / stat.total_bads).round(4)

    cols = ['records', 'goods', 'bads', 'percent_goods', 'percent_bads',
            'total_records', 'total_goods', 'total_bads',
            'percent_total_goods', 'percent_total_bads', 'FPR']
    return stat[cols]

In [9]:
# Report on the data the forest was fitted on (x_train / y_train).
generate_report(rf, x_train, y_train)

Unnamed: 0_level_0,records,goods,bads,percent_goods,percent_bads,total_records,total_goods,total_bads,percent_total_goods,percent_total_bads,FPR
percentile_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,6400,1573,4827,24.58,75.42,6400,1573,4827,0.25,52.77,0.3259
2,6400,6239,161,97.48,2.52,12800,7812,4988,1.24,54.53,1.5662
3,6400,6331,69,98.92,1.08,19200,14143,5057,2.24,55.28,2.7967
4,6400,6341,59,99.08,0.92,25600,20484,5116,3.25,55.92,4.0039
5,6400,6348,52,99.19,0.81,32000,26832,5168,4.25,56.49,5.192
6,6400,6348,52,99.19,0.81,38400,33180,5220,5.26,57.06,6.3563
7,6400,6350,50,99.22,0.78,44800,39530,5270,6.27,57.61,7.5009
8,6400,6349,51,99.2,0.8,51200,45879,5321,7.27,58.17,8.6223
9,6400,6347,53,99.17,0.83,57600,52226,5374,8.28,58.75,9.7183
10,6400,6354,46,99.28,0.72,64000,58580,5420,9.29,59.25,10.8081


In [10]:
# Same report on the x_test / y_test split returned by load_data.
generate_report(rf, x_test, y_test)

Unnamed: 0_level_0,records,goods,bads,percent_goods,percent_bads,total_records,total_goods,total_bads,percent_total_goods,percent_total_bads,FPR
percentile_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1600,380,1220,23.75,76.25,1600,380,1220,0.24,52.5,0.3115
2,1600,1543,57,96.44,3.56,3200,1923,1277,1.22,54.95,1.5059
3,1600,1582,18,98.88,1.12,4800,3505,1295,2.22,55.72,2.7066
4,1600,1583,17,98.94,1.06,6400,5088,1312,3.23,56.45,3.878
5,1600,1591,9,99.44,0.56,8000,6679,1321,4.24,56.84,5.056
6,1600,1584,16,99.0,1.0,9600,8263,1337,5.24,57.53,6.1803
7,1600,1581,19,98.81,1.19,11200,9844,1356,6.24,58.35,7.2596
8,1600,1588,12,99.25,0.75,12800,11432,1368,7.25,58.86,8.3567
9,1600,1587,13,99.19,0.81,14400,13019,1381,8.26,59.42,9.4272
10,1600,1591,9,99.44,0.56,16000,14610,1390,9.27,59.81,10.5108


In [11]:
# Same report on the x_oot / y_oot split.
# NOTE(review): presumably the out-of-time holdout — confirm in util.load_data.
generate_report(rf, x_oot, y_oot)

Unnamed: 0_level_0,records,goods,bads,percent_goods,percent_bads,total_records,total_goods,total_bads,percent_total_goods,percent_total_bads,FPR
percentile_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,2000,508,1492,25.4,74.6,2000,508,1492,0.26,51.08,0.3405
2,2000,1941,59,97.05,2.95,4000,2449,1551,1.24,53.1,1.579
3,2000,1974,26,98.7,1.3,6000,4423,1577,2.24,53.99,2.8047
4,2000,1980,20,99.0,1.0,8000,6403,1597,3.25,54.67,4.0094
5,2000,1980,20,99.0,1.0,10000,8383,1617,4.25,55.36,5.1843
6,2000,1985,15,99.25,0.75,12000,10368,1632,5.26,55.87,6.3529
7,2000,1987,13,99.35,0.65,14000,12355,1645,6.27,56.32,7.5106
8,2000,1982,18,99.1,0.9,16000,14337,1663,7.27,56.93,8.6212
9,2000,1987,13,99.35,0.65,18000,16324,1676,8.28,57.38,9.7399
10,2000,1986,14,99.3,0.7,20000,18310,1690,9.29,57.86,10.8343
