In [1]:
import pandas as pd
import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import GridSearchCV
import category_encoders as ce

In [2]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [3]:
%%time
idee = pd.read_csv('train_identity.csv')
transaction = pd.read_csv('train_transaction.csv')

Wall time: 17.6 s


In [4]:
%%time
def pipeline(idee, transaction):
    merge = transaction.merge(idee, how='outer', on='TransactionID')
    objects = merge.select_dtypes('object')
    objects = objects.join(merge[['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']])
    objects['isFraud'] = merge['isFraud']
    objects.fillna("Unknown", inplace=True)
    objects = objects.astype('category')
    #print(objects.info())
    return objects
objects = pipeline(idee,transaction)

Wall time: 15.7 s


In [5]:
%%time
X  = objects.drop(columns='isFraud')
y = objects['isFraud'].astype('int')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

columns = X_train.columns.to_list()

Wall time: 217 ms


In [6]:
hashing_enc = ce.HashingEncoder(cols=columns, n_components=500, max_process=6).fit(X_train, y_train)

X_train_hashing = hashing_enc.transform(X_train.reset_index(drop=True))
X_test_hashing = hashing_enc.transform(X_test.reset_index(drop=True))

In [8]:
param_grid = {
    'sampling_strategy': ['majority', 'not majority', 'all'],
    'min_samples_split': [2, 3],
    'replacement': [False],
    'n_estimators': [100]
}

brf = BalancedRandomForestClassifier()

In [9]:
grid_search = GridSearchCV(
    estimator=brf,
    param_grid=param_grid,
    cv=None,
    n_jobs=-1,
    scoring='f1',
    verbose=2
)

In [10]:
grid_search.fit(X_train_hashing, y_train)
print(grid_search.best_params_)
print(grid_search.best_estimator_)
print(grid_search.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
{'min_samples_split': 2, 'n_estimators': 100, 'replacement': False, 'sampling_strategy': 'not majority'}
BalancedRandomForestClassifier(sampling_strategy='not majority')
0.5578274575385634


In [11]:
best_estimator = grid_search.best_estimator_
y_proba = best_estimator.predict_proba(X_test_hashing)[:, 1]
score = roc_auc_score(y_test, y_proba)
score

0.9193104565441405

In [12]:
y_pred = best_estimator.predict(X_test_hashing)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    170910
           1       0.81      0.44      0.57      6252

    accuracy                           0.98    177162
   macro avg       0.90      0.72      0.78    177162
weighted avg       0.97      0.98      0.97    177162



In [13]:
tn, fp, fn, tp = confusion_matrix(y_true=y_test, y_pred=y_pred).ravel()

print(f'tn: {tn}')
print(f'fp: {fp}')
print(f'fn: {fn}')
print(f'tp: {tp}')

tn: 170272
fp: 638
fn: 3477
tp: 2775


In [13]:
%%time
idee_test = pd.read_csv('test_identity.csv')
transaction_test = pd.read_csv('test_transaction.csv') 
merge_test = transaction_test.merge(idee_test, how='outer', on='TransactionID')
objects_test = merge_test.select_dtypes('object')
objects_test = objects_test.join(merge_test[['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']])
objects_test.fillna("Unknown", inplace=True)
objects_test = objects_test.astype('category')
objects_test.columns = X_train.columns

Wall time: 28.3 s


In [14]:
X2_hashed = hashing_enc.transform(objects_test.reset_index(drop=True))

In [15]:
%%time
y_pred_test = best_estimator.predict(X2_hashed)

Wall time: 18.2 s


In [16]:
sub_data = {'TransactionID': np.array(merge_test.TransactionID), 'isFraud': y_pred_test}
submission = pd.DataFrame(data=sub_data)
submission

Unnamed: 0,TransactionID,isFraud
0,3663549,0
1,3663550,1
2,3663551,0
3,3663552,0
4,3663553,0
...,...,...
506686,4170235,1
506687,4170236,0
506688,4170237,0
506689,4170238,0


In [None]:
#submission.to_csv('submission.csv', index=False, doublequote=False, sep=',')