In [8]:
import pandas as pd
import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score, classification_report, confusion_matrix
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
import category_encoders as ce

In [2]:
# Let pandas render up to 500 columns and 500 rows before truncating a displayed frame.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

In [3]:
%%time
idee = pd.read_csv('train_identity.csv')
transaction = pd.read_csv('train_transaction.csv')

Wall time: 17.7 s


In [4]:
# Attach the identity columns to every transaction. A left join keeps exactly
# the rows of `transaction`; an outer join would also emit rows that exist only
# in `idee`, and those rows carry no `isFraud` label.
merge = transaction.merge(idee, how='left', on='TransactionID')

# Categorical features: every string column plus the card/address code columns,
# which this notebook chooses to treat as categories rather than magnitudes.
objects = merge.select_dtypes('object')
objects = objects.join(merge[['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']])
objects_columns = objects.columns.to_list()
# Missing values become their own 'Unknown' level before the dtype conversion.
merge[objects_columns] = merge[objects_columns].fillna('Unknown').astype('category')

# Everything that is still numeric after the conversion above.
numbers = merge.select_dtypes('number')
numbers_columns = numbers.columns.to_list()

# Mean-impute the numeric features only. The label and the join key are left
# untouched, so a missing `isFraud` stays visible as NaN instead of being
# replaced by the mean label and later truncated to 0 by an integer cast.
# NOTE(review): the means are computed over all rows, before the train/test
# split, so held-out rows influence the imputed values; imputing after the
# split would avoid that.
impute_columns = [col for col in numbers_columns if col not in ('isFraud', 'TransactionID')]
merge[impute_columns] = merge[impute_columns].fillna(numbers[impute_columns].mean())

In [5]:
%%time
X  = merge.drop(columns='isFraud')
y = merge['isFraud'].astype('int') 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

Wall time: 4.19 s


In [6]:
# Settings for imbalanced-learn's balanced random forest, collected in one place
# so they are easy to inspect and tweak.
brf_params = dict(
    n_estimators=100,
    sampling_strategy='auto',
    replacement=True,
    n_jobs=-1,
    random_state=0,
)
brf = BalancedRandomForestClassifier(**brf_params)

In [10]:
# Number of output columns produced by the hashing encoder.
n_components = 500

# Fit the encoder on the training rows only. No `cols` argument is passed, so the
# encoder decides which columns to hash (presumably the category/string columns -
# confirm against the category_encoders documentation).
hashing_enc = ce.HashingEncoder(n_components=n_components, max_process=6)
hashing_enc.fit(X_train, y_train)

# Both splits are transformed with a fresh 0..n-1 index.
X_train_hashing, X_test_hashing = (
    hashing_enc.transform(split.reset_index(drop=True))
    for split in (X_train, X_test)
)

In [14]:
# Train on the hashed training features, then rank the held-out rows by the
# value in column 1 of predict_proba and measure ROC AUC against the true labels.
y_proba = brf.fit(X_train_hashing, y_train).predict_proba(X_test_hashing)[:, 1]
score = roc_auc_score(y_true=y_test, y_score=y_proba)
score

0.9230775880815327

In [15]:
# Hard class predictions for the held-out rows, summarised per class.
y_pred = brf.predict(X_test_hashing)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.87      0.93    170910
           1       0.19      0.82      0.30      6252

    accuracy                           0.87    177162
   macro avg       0.59      0.85      0.62    177162
weighted avg       0.96      0.87      0.90    177162



In [16]:
# Flatten the confusion matrix row by row into its four cells and print one per line.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f'tn: {tn}', f'fp: {fp}', f'fn: {fn}', f'tp: {tp}', sep='\n')

tn: 148574
fp: 22336
fn: 1110
tp: 5142
