In [23]:
import numpy as np
import pandas as pd 
import random
import os
import warnings, gc
import time

from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import confusion_matrix

# **Read File**

In [24]:
TRAIN_PATH = './input/train.parquet'

In [25]:
%%time
def read_file(path = '', usecols = None):
    # LOAD DATAFRAME
    if usecols is not None: df = pd.read_parquet(path, columns=usecols)
    else: df = pd.read_parquet(path)

    print('shape of data:', df.shape)
    print("The training data begins on {} and ends on {}.".format(df['S_2'].min(),df['S_2'].max()))

    df = df.sort_values(['customer_ID','S_2'])
    df_out = df.groupby(['customer_ID']).nth(-1).reset_index(drop=False)
    print("There are {:,.0f} customers in the training set and {} features.".format(df_out.shape[0],df_out.shape[1]))
    del df
    _ = gc.collect()
    return df_out

print('Reading train data...')

train_o = read_file(path = TRAIN_PATH)

Reading train data...
shape of data: (5531451, 191)
The training data begins on 2017-03-01 00:00:00 and ends on 2018-03-31 00:00:00.
There are 458,913 customers in the training set and 191 features.
CPU times: user 46.4 s, sys: 3min 25s, total: 4min 11s
Wall time: 7min 47s


In [26]:
train_o.head()
train = train_o.dropna(axis=1, thresh=int(0.90*len(train_o)))
train = train.set_index(['customer_ID'])
train = train.ffill().bfill()
train = train.reset_index()
train = train.groupby('customer_ID').tail(1)
train = train.set_index(['customer_ID'])
train.drop(['S_2'],axis=1,inplace=True)

# train.head()

In [27]:
cols_used = train.columns

# **Evaluation Metric**

In [28]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [29]:
tree, depth = 200, None


dtc = RandomForestClassifier(
    n_estimators = tree, random_state=0,
    max_depth = depth
)


In [30]:
features = [x for x in train.columns.values if x not in ['customer_ID', 'target', 'S_2']]
X, y = train[features], train['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=100, stratify=y)

In [None]:
%%time
K = 5
cv = KFold(n_splits=K, random_state=200, shuffle=True)
results = pd.DataFrame(columns=['training_score', 'test_score'])
gini=[]
TRAIN_SUBSAMPLE = 1.0
    
for (train, test), i in zip(cv.split(X, y), range(K)):
    
    
    print('#'*25)
    print('### Fold',i+1)
    print('### Train size',len(train),'Valid size',len(test))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)
    
    
    dtc.fit(X.iloc[train], y.iloc[train])
    
    preds = dtc.predict(X_test)
    # Compute accuracy
    auc_score = accuracy_score(y_test, preds)

    y_pred = pd.DataFrame(y_test.copy(deep=True))
    y_pred = y_pred.rename(columns={'target':'prediction'})
    y_pred['prediction'] = dtc.predict_proba(X_test)[:,1]
    

    gini_score = amex_metric(y_test.to_frame(), y_pred)
    gini.append(gini_score)
    

    print("Validation Gini: {:.5f}, AUC: {:.4f} \n".format(gini_score,auc_score))



#########################
### Fold 1
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################
Validation Gini: 0.98480, AUC: 0.9788 

#########################
### Fold 2
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################
Validation Gini: 0.98417, AUC: 0.9794 

#########################
### Fold 3
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################
Validation Gini: 0.98482, AUC: 0.9793 

#########################
### Fold 4
### Train size 367131 Valid size 91782
### Training with 100% fold data...
#########################
Validation Gini: 0.98411, AUC: 0.9786 

#########################
### Fold 5
### Train size 367131 Valid size 91782
### Training with 100% fold data...
#########################
