In [1]:
import numpy as np
import pandas as pd 
import random
import os
import warnings, gc
import time

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import confusion_matrix

# **Read File**

In [2]:
TRAIN_PATH = './input/train.parquet'

In [3]:
train_org = pd.read_parquet(TRAIN_PATH)

In [4]:
train_org.head()
train_org = train_org.dropna(axis=1, thresh=int(0.90*len(train_org)))
train_org=train_org.set_index(['customer_ID'])
train_org=train_org.ffill().bfill()
train_org=train_org.reset_index()
train_org=train_org.groupby('customer_ID').tail(1)
train_org=train_org.set_index(['customer_ID'])
train_org.drop(['S_2'],axis=1,inplace=True)

train_org.head()

Unnamed: 0_level_0,P_2,D_39,B_1,B_2,R_1,D_41,B_3,D_44,B_4,D_45,...,D_131,D_133,R_28,D_139,D_140,D_141,D_143,D_144,D_145,target
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-4532153018459703766,0.934745,0.009119,0.009382,1.007647,0.006104,0.001604,0.007174,0.003258,0.070793,0.740102,...,0.005702,0.00621,0.002715,0.007186,0.004234,0.005086,0.00581,0.00297,0.008533,0
-6696652885210834715,0.880519,0.178126,0.034684,1.004028,0.006911,0.005552,0.005068,0.008781,0.020626,0.266275,...,0.001928,0.002996,0.001701,0.00298,0.007479,0.00787,0.003284,0.003169,0.008514,0
7128959966677571777,0.880875,0.009704,0.004284,0.812649,0.00645,0.003796,0.007196,0.000628,0.031044,0.251598,...,0.00347,0.009881,0.007691,0.007383,0.006623,0.000964,0.002202,0.000834,0.003444,0
6537921148391624412,0.621776,0.001083,0.012564,1.006183,0.007829,0.004532,0.009937,0.007792,0.007235,0.085103,...,0.004576,0.001789,0.00514,0.002704,0.006184,0.001899,0.008183,0.00556,0.002983,0
2065103583825424365,0.8719,0.005573,0.007679,0.815746,0.001247,0.000231,0.005528,0.002436,0.269407,0.069952,...,0.008897,0.005045,0.003706,0.002974,0.004162,0.005764,0.008154,0.006944,0.000905,0


In [5]:
cols_used = train_org.columns

In [6]:
features = [x for x in train_org.columns.values if x not in ['customer_ID', 'target', 'S_2']]
X, y = train_org[features], train_org['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=100, stratify=y)

# **Evaluation Metric**

In [7]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [8]:
start_time = time.time()

In [9]:
tree, depth = 200, None

dtc = RandomForestClassifier(
    n_estimators = tree, random_state=0,
    max_depth = depth
)


dtc.fit(X_train, y_train)

RandomForestClassifier(n_estimators=150, random_state=0)

In [10]:
print("Random forest with " + str(tree) + " trees and " + str(depth) +" depth take: " + 
      "--- %s seconds ---" % (time.time() - start_time))

Random forest with 150 trees and None depth take: --- 1571.8680748939514 seconds ---


In [11]:
preds = dtc.predict(X_test)
# Compute accuracy
accuracy = accuracy_score(y_test, preds)
print(f'accuracy: {accuracy: .2%}')

y_pred = pd.DataFrame(y_test.copy(deep=True))
y_pred = y_pred.rename(columns={'target':'prediction'})
# preds_prob = xg_cl.predict_proba(X_test)[:,1]
y_pred['prediction'] = dtc.predict_proba(X_test)[:,1]

accuracy:  89.14%


In [67]:
print('Metric Evaluation Values\n')
# print(f'Numpy: {amex_metric(y_test.to_frame(), preds_prob_df)}')
print(f': {amex_metric(y_test.to_frame(), y_pred)}')

Metric Evaluation Values

Numpy: 0.754704230400788
