In [None]:
import pandas as pd
import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import GridSearchCV
import category_encoders as ce

In [None]:
# Let wide/long frames render in full: raise both display limits to 500.
for option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option, 500)

In [None]:
%%time
# Read the two raw training tables from the working directory.
idee, transaction = (
    pd.read_csv(csv_name)
    for csv_name in ('train_identity.csv', 'train_transaction.csv')
)

In [None]:
%%time
def pipeline(idee, transaction):
    """Build the categorical feature frame from the identity and transaction tables.

    The two tables are joined on ``TransactionID``. Every ``object``-dtype
    column plus the card/address columns are kept, missing values are replaced
    by the literal ``"Unknown"``, and every column is cast to ``category``.

    Parameters
    ----------
    idee : pandas.DataFrame
        Identity table; must contain ``TransactionID``.
    transaction : pandas.DataFrame
        Transaction table; must contain ``TransactionID`` and the columns
        ``card1``, ``card2``, ``card3``, ``card5``, ``addr1``, ``addr2``.
        ``isFraud`` is optional, so unlabeled tables are accepted as well.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``transaction``, all columns of dtype ``category``.
        ``isFraud`` is the last column when the input carries it.
    """
    # Left join: the label lives on the transaction table, so identity-only
    # rows would carry no label and end up as "Unknown" after the fill below.
    merged = transaction.merge(idee, how='left', on='TransactionID')

    extra_columns = ['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']
    features = merged.select_dtypes('object').join(merged[extra_columns])
    if 'isFraud' in merged.columns:
        features['isFraud'] = merged['isFraud']

    # Non-inplace fillna: writing a string into a float column in place is a
    # deprecated silent upcast in recent pandas; the copying form upcasts
    # the column to object as intended.
    return features.fillna("Unknown").astype('category')
# Categorical feature frame (label column included) for the training tables.
objects = pipeline(idee, transaction)

In [None]:
%%time
# Separate the integer label from the categorical predictors.
y = objects['isFraud'].astype('int')
X = objects.drop('isFraud', axis=1)

# 70/30 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

# Names of every predictor column; all of them are handed to the encoder.
columns = list(X_train.columns)

In [None]:
# Hash every predictor column into 500 components; the encoder is fitted on
# the training split only and then reused for the hold-out split.
hashing_enc = ce.HashingEncoder(cols=columns, n_components=500, max_process=6)
hashing_enc = hashing_enc.fit(X_train, y_train)

# Both splits are re-indexed from 0 before being transformed.
X_train_hashing, X_test_hashing = (
    hashing_enc.transform(split.reset_index(drop=True))
    for split in (X_train, X_test)
)

In [None]:
# Hyper-parameter candidates explored by the grid search.
param_grid = {
    'sampling_strategy': ['auto', 'majority', 'not majority', 'all'],
    # A comma was missing after this entry, which made the cell a SyntaxError.
    'class_weight': [None, 'balanced', 'balanced_subsample'],
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is no longer
    # accepted by current scikit-learn releases.
    'max_features': ['sqrt', 0.5],
    'replacement': [True, False],
    'n_estimators': [80, 100, 120],
}

# Base estimator handed to the grid search; constructed without arguments, so
# the candidate values in param_grid supply the settings being tuned.
brf = BalancedRandomForestClassifier()

In [None]:
# GridSearchCV accepts no `random_state`; the seed belongs on the estimator,
# which is the only stochastic part of this search (an integer `cv` uses
# unshuffled folds).
brf.set_params(random_state=0)

# Exhaustive 5-fold search over param_grid on all cores, ranked by F1.
# The metric keyword is `scoring` (the previous `score=` raised TypeError).
grid_search = GridSearchCV(
    estimator=brf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='f1',
)

In [None]:
# Run the search on the hashed training split, then report the winning
# parameter set, the refitted estimator and its cross-validated score.
grid_search.fit(X_train_hashing, y_train)
for result_attr in ('best_params_', 'best_estimator_', 'best_score_'):
    print(getattr(grid_search, result_attr))

In [None]:
# ROC AUC of the best model on the hold-out split, computed from the
# predicted probability in column 1 of predict_proba.
best_estimator = grid_search.best_estimator_
class_probabilities = best_estimator.predict_proba(X_test_hashing)
y_proba = class_probabilities[:, 1]
score = roc_auc_score(y_test, y_proba)
score

In [None]:
# Per-class precision / recall / F1 on the hold-out split.
y_pred = best_estimator.predict(X_test_hashing)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Flatten the confusion matrix into its four cells and print each one.
confusion_cells = confusion_matrix(y_true=y_test, y_pred=y_pred).ravel()
tn, fp, fn, tp = confusion_cells

for cell_name, cell_count in zip(('tn', 'fp', 'fn', 'tp'), confusion_cells):
    print(f'{cell_name}: {cell_count}')

In [None]:
%%time
# Load the unlabeled test tables and apply the same preparation steps as the
# training data: join, keep object + card/address columns, fill, categorise.
idee_test = pd.read_csv('test_identity.csv')
transaction_test = pd.read_csv('test_transaction.csv')

# Left join: one row per test transaction, so every predicted row maps to a
# TransactionID that exists in test_transaction.csv.
merge_test = transaction_test.merge(idee_test, how='left', on='TransactionID')
objects_test = merge_test.select_dtypes('object')
objects_test = objects_test.join(merge_test[['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']])

# Non-inplace fillna: writing a string into a float column in place is a
# deprecated silent upcast in recent pandas; the copying form upcasts
# the column to object as intended.
objects_test = objects_test.fillna("Unknown").astype('category')
# NOTE(review): hashing_enc was fitted on the training column names; confirm
# the test tables expose identical column names and order before transforming.

In [None]:
# Encode the test features with the encoder fitted on the training split.
objects_test_reindexed = objects_test.reset_index(drop=True)
X2_hashed = hashing_enc.transform(objects_test_reindexed)

In [None]:
%%time
# Class labels predicted by the best grid-search estimator for the test rows.
# NOTE(review): predict() yields hard labels, whereas the hold-out check scored
# predict_proba() with ROC AUC — confirm which form the submission should carry.
y_pred_test = best_estimator.predict(X2_hashed)

In [None]:
# Pair each test TransactionID with its predicted label and preview the frame.
sub_data = {
    'TransactionID': merge_test['TransactionID'].to_numpy(),
    'isFraud': y_pred_test,
}
submission = pd.DataFrame(sub_data)
submission

In [None]:
#submission.to_csv('submission.csv', index=False, doublequote=False, sep=',')