In [1]:
import os

import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Let wide/long frames render up to 500 columns and 500 rows before pandas
# truncates the displayed output.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 500)

In [None]:
%%time
# Read the two raw CSV files from the current working directory:
# `idee` holds the identity table, `transaction` the transaction table.
idee, transaction = (
    pd.read_csv(csv_name)
    for csv_name in ('train_identity.csv', 'train_transaction.csv')
)

In [None]:
%%time
def pipeline(idee, transaction):
    """Build the all-categorical modelling frame from the two raw tables.

    The transaction table is outer-joined with the identity table on
    ``TransactionID``. From the joined frame the function keeps every
    object-dtype column, the six card/address code columns (``card1``,
    ``card2``, ``card3``, ``card5``, ``addr1``, ``addr2``) and the ``isFraud``
    label, replaces missing values with the string ``"Unknown"`` and casts
    every column (the label included) to ``category``.

    Parameters
    ----------
    idee : pandas.DataFrame
        Identity table; must contain a ``TransactionID`` column.
    transaction : pandas.DataFrame
        Transaction table; must contain ``TransactionID``, ``isFraud`` and the
        six card/address columns listed above.

    Returns
    -------
    pandas.DataFrame
        One row per row of the outer join. Columns are ordered as: object
        columns, card/address columns, ``isFraud``; all of dtype ``category``.

    Raises
    ------
    KeyError
        If ``TransactionID``, ``isFraud`` or one of the card/address columns
        is missing from the inputs.
    """
    merged = transaction.merge(idee, how='outer', on='TransactionID')
    code_columns = ['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']

    objects = (
        merged.select_dtypes('object')
        .join(merged[code_columns])
        .assign(isFraud=merged['isFraud'])
        # Fill out-of-place: the numeric code columns have to be upcast to
        # object to hold the "Unknown" sentinel, and pandas has deprecated
        # doing that upcast silently through an in-place fill.
        .fillna("Unknown")
        .astype('category')
    )
    return objects
# Run the preprocessing on the two raw tables loaded above.
objects = pipeline(idee=idee, transaction=transaction)

In [None]:
%%time
# Separate the label from the predictors; `isFraud` is cast to a plain
# integer column for the estimator.
y = objects['isFraud'].astype('int')
X = objects.drop(columns='isFraud')

# Hold out 30% of the rows for evaluation; the seed is fixed so the split is
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

# Every predictor column is passed to the encoder further down.
columns = list(X_train.columns)

In [None]:
# Number of hashed output columns. It was referenced without ever being
# defined in this notebook (NameError on a fresh kernel); 8 is the
# HashingEncoder default. Tune this value if more/fewer buckets are wanted.
n_components = 8

# Fit the hashing encoder on the training predictors only, so nothing about
# the hold-out rows is used while fitting.
hashing_enc = ce.HashingEncoder(cols=columns, n_components=n_components, max_process=6).fit(X_train, y_train)

# The frames are re-indexed 0..n-1 before being transformed; the encoded rows
# keep the positional order of X_train / X_test.
X_train_hashing = hashing_enc.transform(X_train.reset_index(drop=True))
X_test_hashing = hashing_enc.transform(X_test.reset_index(drop=True))

In [None]:
# Random-forest hyper-parameter candidates for the grid search: two values per
# parameter, i.e. 2**5 = 32 combinations.
param_grid = dict(
    max_depth=[80, 100],
    max_features=[2, 5],
    min_samples_leaf=[3, 6],
    min_samples_split=[8, 11],
    n_estimators=[100, 200],
)

# Base estimator for the grid search. This must be the classifier: the target
# is the binary `isFraud` label and the evaluation cell calls `predict_proba`,
# which a regressor does not provide (RandomForestRegressor was also never
# imported). The seed matches the one used for the train/test split so reruns
# give the same forest.
rf = RandomForestClassifier(random_state=123)

In [None]:
# Exhaustive search over `param_grid` with 5-fold cross-validation, using all
# available cores and ranking candidates by F1.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='f1',  # the keyword is `scoring`; `score=` raises TypeError
)

In [None]:
# Run the search on the hashed training matrix, then report the winning
# parameter set, the refitted estimator and its mean cross-validated score.
grid_search.fit(X_train_hashing, y_train)
print(
    grid_search.best_params_,
    grid_search.best_estimator_,
    grid_search.best_score_,
    sep='\n',
)

In [None]:
# Score the hold-out set with the best model found by the search.
best_estimator = grid_search.best_estimator_

# Column index 1 of `predict_proba` is used as the score for ROC AUC
# (the second entry of the estimator's `classes_`).
y_proba = best_estimator.predict_proba(X_test_hashing)[:, 1]
score = roc_auc_score(y_true=y_test, y_score=y_proba)

# Bare last expression so the notebook displays the AUC.
score