In [1]:
from collections import ChainMap
from collections import defaultdict

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from util import load_data, fdr, plot_report
pd.set_option('display.max_rows', 100)

In [2]:
x_train, x_test, x_oot, y_train, y_test, y_oot = load_data('var_30.csv')
x_train.head()

Unnamed: 0,ssn_count_3,fulladdress_count_0,address_count_0,name_dob_count_3,homephone_count_14,name_dob_count_7,name_day_since,ssn_firstname_count_0_by_14,name_count_7,fulladdress_homephone_count_0_by_14,...,ssn_count_0_by_14,ssn_firstname_count_7,ssn_count_7,fulladdress_count_1_by_14,fulladdress_homephone_count_7,ssn_name_dob_count_7,name_count_14,ssn_firstname_count_0_by_30,ssn_lastname_count_0_by_30,ssn_count_14
581748,-0.047591,-0.041351,-0.042158,-0.046415,1.282153,-0.055801,-1.756544,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
72180,-0.047591,-0.041351,-0.042158,-0.046415,-0.437943,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
212705,-0.047591,-0.041351,-0.042158,-0.046415,0.135423,-0.055801,-1.845961,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
644881,-0.047591,-0.041351,-0.042158,-0.046415,-0.437943,-0.055801,-1.875767,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
578692,-0.047591,-0.041351,-0.042158,-0.046415,0.135423,-0.055801,-1.920476,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421


In [3]:
rf = RandomForestClassifier(n_estimators=150,
                            max_depth=20,
                            max_features=10,
                            ccp_alpha=1e-6,
                            min_samples_leaf=2,
                            n_jobs=8).fit(x_train, y_train)

In [4]:
def generate_report(clf, x, y):
    y_prob = clf.predict_proba(x)[:, -1:]
    num_total_fraud = len(y[y==1])
    num_total_record = len(y)
    num_records = int(0.01 * num_total_record)

    sorted_prob = np.asarray(sorted(zip(y_prob, y), key=lambda x: x[0], reverse=True))

    stat = defaultdict(list)
    for i in range(20):
        cutoff_bin = sorted_prob[i*num_records: (i+1)*num_records, 1:]
        stat['records'].append(num_records)
        stat['bads'].append(len(cutoff_bin[cutoff_bin==1]))
        if i == 0:
            stat['total_records'].append(stat['records'][0])
            stat['total_bads'].append(stat['bads'][0])
        else:
            stat['total_records'].append(stat['records'][i] + stat['total_records'][i-1])
            stat['total_bads'].append(stat['bads'][i] + stat['total_bads'][i-1])

    stat = pd.DataFrame(stat)
    stat['percentile_bin'] = range(1, 21)
    stat.set_index('percentile_bin', inplace=True)
    stat['goods'] = stat.records - stat.bads
    stat['percent_goods'] = (stat.goods / stat.records * 100).round(2)
    stat['percent_bads'] = 100 - stat.percent_goods

    stat['total_goods'] = stat.total_records - stat.total_bads
    stat['percent_total_bads'] = (stat.total_bads / num_total_fraud * 100).round(2)
    stat['percent_total_goods'] = (stat.total_goods / (num_total_record - num_total_fraud) * 100).round(2)
    
    cols = ['records', 'goods', 'bads', 'percent_goods', 'percent_bads',
            'total_records', 'total_goods', 'total_bads',
            'percent_total_goods', 'percent_total_bads']
    return stat[cols]

In [5]:
generate_report(rf, x_train, y_train)

Unnamed: 0_level_0,records,goods,bads,percent_goods,percent_bads,total_records,total_goods,total_bads,percent_total_goods,percent_total_bads
percentile_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,6400,1541,4859,24.08,75.92,6400,1541,4859,0.24,52.87
2,6400,6228,172,97.31,2.69,12800,7769,5031,1.23,54.74
3,6400,6326,74,98.84,1.16,19200,14095,5105,2.23,55.54
4,6400,6334,66,98.97,1.03,25600,20429,5171,3.24,56.26
5,6400,6347,53,99.17,0.83,32000,26776,5224,4.24,56.84
6,6400,6349,51,99.2,0.8,38400,33125,5275,5.25,57.39
7,6400,6351,49,99.23,0.77,44800,39476,5324,6.26,57.93
8,6400,6350,50,99.22,0.78,51200,45826,5374,7.26,58.47
9,6400,6343,57,99.11,0.89,57600,52169,5431,8.27,59.09
10,6400,6348,52,99.19,0.81,64000,58517,5483,9.28,59.66


In [6]:
generate_report(rf, x_test, y_test)

Unnamed: 0_level_0,records,goods,bads,percent_goods,percent_bads,total_records,total_goods,total_bads,percent_total_goods,percent_total_bads
percentile_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1600,408,1192,25.5,74.5,1600,408,1192,0.26,52.26
2,1600,1565,35,97.81,2.19,3200,1973,1227,1.25,53.79
3,1600,1579,21,98.69,1.31,4800,3552,1248,2.25,54.71
4,1600,1589,11,99.31,0.69,6400,5141,1259,3.26,55.2
5,1600,1586,14,99.12,0.88,8000,6727,1273,4.27,55.81
6,1600,1589,11,99.31,0.69,9600,8316,1284,5.27,56.29
7,1600,1580,20,98.75,1.25,11200,9896,1304,6.27,57.17
8,1600,1589,11,99.31,0.69,12800,11485,1315,7.28,57.65
9,1600,1592,8,99.5,0.5,14400,13077,1323,8.29,58.0
10,1600,1594,6,99.62,0.38,16000,14671,1329,9.3,58.26


In [7]:
generate_report(rf, x_oot, y_oot)

Unnamed: 0_level_0,records,goods,bads,percent_goods,percent_bads,total_records,total_goods,total_bads,percent_total_goods,percent_total_bads
percentile_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2000,505,1495,25.25,74.75,2000,505,1495,0.26,51.18
2,2000,1944,56,97.2,2.8,4000,2449,1551,1.24,53.1
3,2000,1975,25,98.75,1.25,6000,4424,1576,2.24,53.95
4,2000,1981,19,99.05,0.95,8000,6405,1595,3.25,54.6
5,2000,1984,16,99.2,0.8,10000,8389,1611,4.26,55.15
6,2000,1984,16,99.2,0.8,12000,10373,1627,5.26,55.7
7,2000,1982,18,99.1,0.9,14000,12355,1645,6.27,56.32
8,2000,1983,17,99.15,0.85,16000,14338,1662,7.28,56.9
9,2000,1985,15,99.25,0.75,18000,16323,1677,8.28,57.41
10,2000,1984,16,99.2,0.8,20000,18307,1693,9.29,57.96
