In [1]:
from collections import ChainMap
from collections import defaultdict

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from util import load_data, fdr, plot_report
pd.set_option('display.max_rows', 100)

In [2]:
# Read 'var_30.csv' through the project's `load_data` helper and unpack the six
# pieces it returns: feature frames and label vectors for the train, test and
# OOT splits (OOT presumably means "out-of-time" -- confirm in util.load_data).
(x_train, x_test, x_oot,
 y_train, y_test, y_oot) = load_data('var_30.csv', test_size=0.2, oot_size=0.15)
x_train.head()

Unnamed: 0,ssn_count_3,fulladdress_count_0,address_count_0,name_dob_count_3,homephone_count_14,name_dob_count_7,name_day_since,ssn_firstname_count_0_by_14,name_count_7,fulladdress_homephone_count_0_by_14,...,ssn_count_0_by_14,ssn_firstname_count_7,ssn_count_7,fulladdress_count_1_by_14,fulladdress_homephone_count_7,ssn_name_dob_count_7,name_count_14,ssn_firstname_count_0_by_30,ssn_lastname_count_0_by_30,ssn_count_14
367057,-0.047591,-0.041351,-0.042158,-0.046415,-0.437943,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
264836,-0.047591,-0.041351,-0.042158,-0.046415,-0.437943,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
19124,-0.047591,-0.041351,-0.042158,-0.046415,-0.437943,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
772555,-0.047591,-0.041351,-0.042158,-0.046415,-1.011308,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
71207,-0.047591,-0.041351,-0.042158,-0.046415,-1.011308,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421


In [3]:
# Fit the random forest on the training split.
# random_state pins the bootstrap resampling and per-split feature sub-sampling,
# so the reports below are reproducible after Restart Kernel -> Run All.
rf = RandomForestClassifier(n_estimators=200,
                            max_depth=20,
                            max_features=10,
                            ccp_alpha=1e-6,
                            min_samples_leaf=2,
                            random_state=42,
                            n_jobs=8).fit(x_train, y_train)

In [4]:
# Score the fitted forest on the OOT split with the project's `fdr` helper
# (imported from util; presumably a fraud-detection-rate metric -- confirm there).
fdr(rf, x_oot, y_oot)

0.5351851851851852

In [5]:
def generate_report(clf, x, y, num_bins=20, bin_fraction=0.01):
    """Build a score-ranked, cumulative bin report for a fitted classifier.

    Records are ranked by the probability of the last class returned by
    ``clf.predict_proba`` (highest first) and the top of the ranking is cut
    into ``num_bins`` consecutive bins, each holding
    ``int(bin_fraction * len(y))`` records.  Summary counts for the whole
    population are printed before the table is returned.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict_proba(x)``.
    x : array-like
        Feature matrix passed unchanged to ``clf.predict_proba``.
    y : array-like
        Labels aligned positionally with the rows of ``x``; a record is
        counted as "bad" (fraud) when its label equals 1.
    num_bins : int, default 20
        Number of bins to report.
    bin_fraction : float, default 0.01
        Share of the population placed in each bin.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``percentile_bin`` (1..num_bins) with columns
        ``records, goods, bads, percent_goods, percent_bads, total_records,
        total_goods, total_bads, percent_total_goods, percent_total_bads,
        KS, FPR``.  ``KS`` is ``percent_total_bads - percent_total_goods`` and
        ``FPR`` is the cumulative goods-to-bads ratio ``total_goods / total_bads``.

    Raises
    ------
    ValueError
        If ``num_bins`` is smaller than 1 or ``bin_fraction`` is negative.
    """
    if num_bins < 1:
        raise ValueError("num_bins must be a positive integer")
    if bin_fraction < 0:
        raise ValueError("bin_fraction must be non-negative")

    # 1-D score / label vectors; avoids building a ragged array of
    # (array([p]), label) pairs, which NumPy >= 1.24 rejects with ValueError.
    scores = np.asarray(clf.predict_proba(x))[:, -1]
    is_bad = np.asarray(y).ravel() == 1

    num_total_record = len(is_bad)
    num_total_fraud = int(is_bad.sum())
    num_total_good = num_total_record - num_total_fraud
    num_records = int(bin_fraction * num_total_record)

    print(f"total num of records: {num_total_record}")
    print(f"total num of frauds: {num_total_fraud}")
    print(f"total num of goods: {num_total_good}")
    print(f"Fraud rate: {num_total_fraud / num_total_record}")

    # Stable descending sort: tied scores keep their original order, which is
    # the same ordering sorted(..., reverse=True) produces.
    order = np.argsort(-scores, kind="stable")
    bads_ranked = is_bad[order].astype(np.int64)

    # Bin boundaries in ranked order, clipped so that an oversized
    # num_bins * bin_fraction cannot run past the end of the data.
    edges = np.minimum(np.arange(num_bins + 1) * num_records, num_total_record)
    # Prefix sums of bads; indexing at the edges gives cumulative bads per bin.
    bads_prefix = np.concatenate(([0], np.cumsum(bads_ranked)))
    cum_bads = bads_prefix[edges]

    stat = pd.DataFrame(
        {
            'records': np.diff(edges),
            'bads': np.diff(cum_bads),
            'total_records': edges[1:],
            'total_bads': cum_bads[1:],
        },
        index=pd.RangeIndex(1, num_bins + 1, name='percentile_bin'),
    )

    # Per-bin composition.
    stat['goods'] = stat['records'] - stat['bads']
    stat['percent_goods'] = (stat['goods'] / stat['records'] * 100).round(2)
    # Re-round: 100 - 97.6 would otherwise surface as 2.4000000000000057.
    stat['percent_bads'] = (100 - stat['percent_goods']).round(2)

    # Cumulative capture relative to the whole population.
    stat['total_goods'] = stat['total_records'] - stat['total_bads']
    stat['percent_total_bads'] = (stat['total_bads'] / num_total_fraud * 100).round(2)
    stat['percent_total_goods'] = (stat['total_goods'] / num_total_good * 100).round(2)

    stat['KS'] = (stat['percent_total_bads'] - stat['percent_total_goods']).round(2)
    stat['FPR'] = (stat['total_goods'] / stat['total_bads']).round(4)

    cols = ['records', 'goods', 'bads', 'percent_goods', 'percent_bads',
            'total_records', 'total_goods', 'total_bads',
            'percent_total_goods', 'percent_total_bads', 'KS', 'FPR']
    return stat[cols]

In [6]:
# In-sample report: the same training split the forest was fitted on.
generate_report(rf, x_train, y_train)

total num of records: 680000
total num of frauds: 9770
total num of goods: 670230
Fraud rate: 0.01436764705882353


Unnamed: 0_level_0,records,goods,bads,percent_goods,percent_bads,total_records,total_goods,total_bads,percent_total_goods,percent_total_bads,KS,FPR
percentile_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,6800,1599,5201,23.51,76.49,6800,1599,5201,0.24,53.23,52.99,0.3074
2,6800,6637,163,97.6,2.4,13600,8236,5364,1.23,54.9,53.67,1.5354
3,6800,6725,75,98.9,1.1,20400,14961,5439,2.23,55.67,53.44,2.7507
4,6800,6736,64,99.06,0.94,27200,21697,5503,3.24,56.33,53.09,3.9428
5,6800,6749,51,99.25,0.75,34000,28446,5554,4.24,56.85,52.61,5.1217
6,6800,6742,58,99.15,0.85,40800,35188,5612,5.25,57.44,52.19,6.2701
7,6800,6744,56,99.18,0.82,47600,41932,5668,6.26,58.01,51.75,7.398
8,6800,6747,53,99.22,0.78,54400,48679,5721,7.26,58.56,51.3,8.5088
9,6800,6741,59,99.13,0.87,61200,55420,5780,8.27,59.16,50.89,9.5882
10,6800,6745,55,99.19,0.81,68000,62165,5835,9.28,59.72,50.44,10.6538


In [7]:
# Report on the test split, which was not used to fit the forest.
generate_report(rf, x_test, y_test)

total num of records: 170000
total num of frauds: 2463
total num of goods: 167537
Fraud rate: 0.014488235294117647


Unnamed: 0_level_0,records,goods,bads,percent_goods,percent_bads,total_records,total_goods,total_bads,percent_total_goods,percent_total_bads,KS,FPR
percentile_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1700,438,1262,25.76,74.24,1700,438,1262,0.26,51.24,50.98,0.3471
2,1700,1650,50,97.06,2.94,3400,2088,1312,1.25,53.27,52.02,1.5915
3,1700,1680,20,98.82,1.18,5100,3768,1332,2.25,54.08,51.83,2.8288
4,1700,1685,15,99.12,0.88,6800,5453,1347,3.25,54.69,51.44,4.0483
5,1700,1673,27,98.41,1.59,8500,7126,1374,4.25,55.79,51.54,5.1863
6,1700,1690,10,99.41,0.59,10200,8816,1384,5.26,56.19,50.93,6.3699
7,1700,1685,15,99.12,0.88,11900,10501,1399,6.27,56.8,50.53,7.5061
8,1700,1693,7,99.59,0.41,13600,12194,1406,7.28,57.08,49.8,8.6728
9,1700,1692,8,99.53,0.47,15300,13886,1414,8.29,57.41,49.12,9.8204
10,1700,1691,9,99.47,0.53,17000,15577,1423,9.3,57.78,48.48,10.9466


In [8]:
# Report on the OOT split (presumably the out-of-time holdout -- confirm in util.load_data).
generate_report(rf, x_oot, y_oot)

total num of records: 150000
total num of frauds: 2160
total num of goods: 147840
Fraud rate: 0.0144


Unnamed: 0_level_0,records,goods,bads,percent_goods,percent_bads,total_records,total_goods,total_bads,percent_total_goods,percent_total_bads,KS,FPR
percentile_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1500,407,1093,27.13,72.87,1500,407,1093,0.28,50.6,50.32,0.3724
2,1500,1459,41,97.27,2.73,3000,1866,1134,1.26,52.5,51.24,1.6455
3,1500,1478,22,98.53,1.47,4500,3344,1156,2.26,53.52,51.26,2.8927
4,1500,1486,14,99.07,0.93,6000,4830,1170,3.27,54.17,50.9,4.1282
5,1500,1486,14,99.07,0.93,7500,6316,1184,4.27,54.81,50.54,5.3345
6,1500,1485,15,99.0,1.0,9000,7801,1199,5.28,55.51,50.23,6.5063
7,1500,1490,10,99.33,0.67,10500,9291,1209,6.28,55.97,49.69,7.6849
8,1500,1489,11,99.27,0.73,12000,10780,1220,7.29,56.48,49.19,8.8361
9,1500,1488,12,99.2,0.8,13500,12268,1232,8.3,57.04,48.74,9.9578
10,1500,1490,10,99.33,0.67,15000,13758,1242,9.31,57.5,48.19,11.0773
