In [1]:
from collections import ChainMap
from collections import defaultdict

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from util import load_data, fdr, plot_report
# Let pandas render up to 100 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 100)

In [2]:
# load_data (imported from the local util module) returns six objects, unpacked
# here as feature/label pairs for three splits.
# NOTE(review): "oot" presumably means out-of-time -- confirm against util.load_data.
x_train, x_test, x_oot, y_train, y_test, y_oot = load_data('var_30.csv')
# Preview the first rows of the training features.
x_train.head()

Unnamed: 0,ssn_count_3,fulladdress_count_0,address_count_0,name_dob_count_3,homephone_count_14,name_dob_count_7,name_day_since,ssn_firstname_count_0_by_14,name_count_7,fulladdress_homephone_count_0_by_14,...,ssn_count_0_by_14,ssn_firstname_count_7,ssn_count_7,fulladdress_count_1_by_14,fulladdress_homephone_count_7,ssn_name_dob_count_7,name_count_14,ssn_firstname_count_0_by_30,ssn_lastname_count_0_by_30,ssn_count_14
739938,-0.047591,-0.041351,-0.042158,-0.046415,0.135423,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
419058,-0.047591,-0.041351,-0.042158,-0.046415,-1.011308,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
166467,-0.047591,-0.041351,-0.042158,-0.046415,-1.011308,-0.055801,-2.091859,0.123528,1.847681,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,1.496715,0.171472,0.171458,-0.072421
450056,-0.047591,-0.041351,-0.042158,-0.046415,0.135423,-0.055801,-1.100816,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
449847,-0.047591,-0.041351,-0.042158,-0.046415,-0.437943,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421


In [3]:
# Fit the random-forest classifier on the training split.
# random_state pins the bootstrap resampling and the per-split feature
# sub-sampling, so a Restart & Run All reproduces the same model and the same
# report numbers instead of a slightly different forest on every run.
rf = RandomForestClassifier(n_estimators=200,
                            max_depth=20,
                            max_features=10,
                            ccp_alpha=1e-6,
                            min_samples_leaf=2,
                            random_state=42,
                            n_jobs=8).fit(x_train, y_train)

In [4]:
def generate_report(clf, x, y, num_bins=20, bin_fraction=0.01):
    """Build a cumulative capture table over the highest-scored records.

    Records are ranked by the last column of ``clf.predict_proba(x)`` in
    descending order (ties keep their input order) and cut into ``num_bins``
    consecutive bins of ``int(bin_fraction * len(y))`` records each. With the
    defaults this covers the top 20% of the population in 1% steps.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict_proba``. The last probability
        column is used as the score (assumed to be the probability of the
        class labelled 1).
    x : array-like
        Feature matrix handed to ``clf.predict_proba``.
    y : array-like
        Labels aligned row-for-row with ``x``. Entries equal to 1 are counted
        as "bads"; everything else is counted as "goods".
    num_bins : int, default 20
        Number of bins to report.
    bin_fraction : float, default 0.01
        Share of the population placed in each bin (truncated to whole rows).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``percentile_bin`` (1..num_bins) with per-bin columns
        ``records, goods, bads, percent_goods, percent_bads`` and cumulative
        columns ``total_records, total_goods, total_bads, percent_total_goods,
        percent_total_bads, KS, FPR``. ``KS`` is ``percent_total_bads -
        percent_total_goods``; ``FPR`` is the cumulative goods-to-bads ratio
        ``total_goods / total_bads``.

    Raises
    ------
    ValueError
        If scores and labels differ in length, if the input is too small to
        put at least one record in each bin, or if the requested bins would
        cover more records than are available.
    """
    labels = np.asarray(y).ravel()
    # Take the score column as a flat 1-D vector. Keeping it 2-D (``[:, -1:]``)
    # and zipping it with the labels builds a ragged sequence that modern
    # NumPy refuses to turn into an array.
    scores = np.asarray(clf.predict_proba(x))[:, -1]
    if len(scores) != len(labels):
        raise ValueError(
            'predict_proba returned %d rows but y has %d labels'
            % (len(scores), len(labels)))

    num_total_record = len(labels)
    num_total_fraud = int(np.count_nonzero(labels == 1))
    num_records = int(bin_fraction * num_total_record)
    if num_bins < 1 or num_records < 1:
        raise ValueError(
            'need num_bins >= 1 and at least one record per bin; got '
            'num_bins=%d and a bin size of %d for %d records'
            % (num_bins, num_records, num_total_record))
    if num_bins * num_records > num_total_record:
        raise ValueError(
            '%d bins of %d records exceed the %d available records'
            % (num_bins, num_records, num_total_record))

    # A stable sort on the negated score ranks high scores first while keeping
    # tied records in their input order.
    order = np.argsort(-scores, kind='stable')
    top_labels = labels[order[:num_bins * num_records]]
    bads_per_bin = (top_labels.reshape(num_bins, num_records) == 1).sum(axis=1)

    stat = pd.DataFrame(
        {'records': num_records, 'bads': bads_per_bin.astype('int64')},
        index=pd.Index(np.arange(1, num_bins + 1), name='percentile_bin'))
    stat['total_records'] = stat.records.cumsum()
    stat['total_bads'] = stat.bads.cumsum()

    stat['goods'] = stat.records - stat.bads
    stat['percent_goods'] = (stat.goods / stat.records * 100).round(2)
    # Round again: the subtraction alone leaves residue like 2.5400000000000063.
    stat['percent_bads'] = (100 - stat.percent_goods).round(2)

    stat['total_goods'] = stat.total_records - stat.total_bads
    stat['percent_total_bads'] = (stat.total_bads / num_total_fraud * 100).round(2)
    stat['percent_total_goods'] = (
        stat.total_goods / (num_total_record - num_total_fraud) * 100).round(2)

    stat['KS'] = (stat.percent_total_bads - stat.percent_total_goods).round(2)
    stat['FPR'] = (stat.total_goods / stat.total_bads).round(4)

    cols = ['records', 'goods', 'bads', 'percent_goods', 'percent_bads',
            'total_records', 'total_goods', 'total_bads',
            'percent_total_goods', 'percent_total_bads', 'KS', 'FPR']
    return stat[cols]

In [5]:
# Report on the training split, i.e. the same data rf was fitted on.
generate_report(rf, x_train, y_train)

Unnamed: 0_level_0,records,goods,bads,percent_goods,percent_bads,total_records,total_goods,total_bads,percent_total_goods,percent_total_bads,KS,FPR
percentile_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,6800,1642,5158,24.15,75.85,6800,1642,5158,0.24,52.68,52.44,0.3183
2,6800,6627,173,97.46,2.54,13600,8269,5331,1.23,54.44,53.21,1.5511
3,6800,6721,79,98.84,1.16,20400,14990,5410,2.24,55.25,53.01,2.7708
4,6800,6731,69,98.99,1.01,27200,21721,5479,3.24,55.95,52.71,3.9644
5,6800,6747,53,99.22,0.78,34000,28468,5532,4.25,56.5,52.25,5.1461
6,6800,6746,54,99.21,0.79,40800,35214,5586,5.25,57.05,51.8,6.304
7,6800,6743,57,99.16,0.84,47600,41957,5643,6.26,57.63,51.37,7.4352
8,6800,6733,67,99.01,0.99,54400,48690,5710,7.26,58.31,51.05,8.5271
9,6800,6741,59,99.13,0.87,61200,55431,5769,8.27,58.92,50.65,9.6084
10,6800,6744,56,99.18,0.82,68000,62175,5825,9.28,59.49,50.21,10.6738


In [6]:
# Report on the test split returned by load_data.
generate_report(rf, x_test, y_test)

Unnamed: 0_level_0,records,goods,bads,percent_goods,percent_bads,total_records,total_goods,total_bads,percent_total_goods,percent_total_bads,KS,FPR
percentile_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1700,399,1301,23.47,76.53,1700,399,1301,0.24,53.3,53.06,0.3067
2,1700,1654,46,97.29,2.71,3400,2053,1347,1.23,55.18,53.95,1.5241
3,1700,1681,19,98.88,1.12,5100,3734,1366,2.23,55.96,53.73,2.7335
4,1700,1685,15,99.12,0.88,6800,5419,1381,3.23,56.58,53.35,3.924
5,1700,1685,15,99.12,0.88,8500,7104,1396,4.24,57.19,52.95,5.0888
6,1700,1690,10,99.41,0.59,10200,8794,1406,5.25,57.6,52.35,6.2546
7,1700,1686,14,99.18,0.82,11900,10480,1420,6.25,58.17,51.92,7.3803
8,1700,1688,12,99.29,0.71,13600,12168,1432,7.26,58.66,51.4,8.4972
9,1700,1685,15,99.12,0.88,15300,13853,1447,8.27,59.28,51.01,9.5736
10,1700,1682,18,98.94,1.06,17000,15535,1465,9.27,60.02,50.75,10.6041


In [7]:
# Report on the "oot" split (presumably out-of-time -- confirm against util.load_data).
generate_report(rf, x_oot, y_oot)

Unnamed: 0_level_0,records,goods,bads,percent_goods,percent_bads,total_records,total_goods,total_bads,percent_total_goods,percent_total_bads,KS,FPR
percentile_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1500,408,1092,27.2,72.8,1500,408,1092,0.28,50.56,50.28,0.3736
2,1500,1461,39,97.4,2.6,3000,1869,1131,1.26,52.36,51.1,1.6525
3,1500,1477,23,98.47,1.53,4500,3346,1154,2.26,53.43,51.17,2.8995
4,1500,1486,14,99.07,0.93,6000,4832,1168,3.27,54.07,50.8,4.137
5,1500,1483,17,98.87,1.13,7500,6315,1185,4.27,54.86,50.59,5.3291
6,1500,1490,10,99.33,0.67,9000,7805,1195,5.28,55.32,50.04,6.5314
7,1500,1486,14,99.07,0.93,10500,9291,1209,6.28,55.97,49.69,7.6849
8,1500,1486,14,99.07,0.93,12000,10777,1223,7.29,56.62,49.33,8.8119
9,1500,1488,12,99.2,0.8,13500,12265,1235,8.3,57.18,48.88,9.9312
10,1500,1487,13,99.13,0.87,15000,13752,1248,9.3,57.78,48.48,11.0192
