In [None]:
# Lib Imports
import pandas as pd
import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
import category_encoders as ce

In [None]:
# Let pandas render up to 500 columns and 500 rows before truncating output.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 500)

In [None]:
%%time
# Load the two raw tables from the working directory; `pipeline` later joins
# them on their shared TransactionID column.
idee = pd.read_csv(filepath_or_buffer='train_identity.csv')
transaction = pd.read_csv(filepath_or_buffer='train_transaction.csv')

In [None]:
%%time
def pipeline(idee, transaction):
    """Build the categorical modelling frame from the two raw tables.

    Parameters
    ----------
    idee : pandas.DataFrame
        Identity table; must contain a ``TransactionID`` column.
    transaction : pandas.DataFrame
        Transaction table; must contain ``TransactionID``, ``isFraud`` and the
        ``card1``, ``card2``, ``card3``, ``card5``, ``addr1``, ``addr2`` columns.

    Returns
    -------
    pandas.DataFrame
        Every object-dtype column of the merged frame plus the card/address
        columns, with missing values replaced by ``"Unknown"`` and cast to
        ``category``, followed by the unmodified ``isFraud`` label column.
    """
    # Left join: only rows of `transaction` carry a label. An outer join would
    # add identity-only rows whose isFraud is missing and cannot be cast to int.
    merge = transaction.merge(idee, how='left', on='TransactionID')

    id_like_columns = ['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']
    features = merge.select_dtypes('object').join(merge[id_like_columns])

    # Non-inplace fillna: writing a string into float columns in place is an
    # incompatible-dtype set, whereas the copying form simply upcasts to object.
    features = features.fillna("Unknown").astype('category')

    # Attach the label last so it is never imputed or turned into a category.
    features['isFraud'] = merge['isFraud']
    return features
# Materialise the categorical feature frame (label column included) once for the cells below.
objects = pipeline(idee=idee, transaction=transaction)

In [None]:
# Disabled step, kept for reference: would drop a hand-picked subset of columns in place.
# objects.drop(columns=['M4','id_15','id_16','id_28','id_29','id_35','id_36','id_38','DeviceType','DeviceInfo'], inplace=True)
# Show (rows, columns) of the prepared frame.
objects.shape

In [None]:
%%time
# Separate the categorical features from the binary fraud label.
X = objects.drop(columns='isFraud')
y = objects['isFraud'].astype('int')

# Balance the classes by randomly discarding majority-class rows. The seed is
# fixed so that a kernel restart reproduces the same sample and the same scores.
under = RandomUnderSampler(random_state=123)
X_rus, y_rus = under.fit_resample(X, y)

# NOTE(review): the hold-out set is carved out of the resampled data, so it is
# class-balanced rather than distributed like the raw data — confirm that this
# is the evaluation setting you want.
X_train, X_test, y_train, y_test = train_test_split(
    X_rus, y_rus, test_size=0.3, random_state=123, stratify=y_rus)

# The tree is passed positionally because the first parameter is called
# `base_estimator` in older imbalanced-learn releases and `estimator` in newer
# ones; the positional form works with both.
bbc = BalancedBaggingClassifier(DecisionTreeClassifier(),
                                sampling_strategy='auto',
                                replacement=True,
                                random_state=0,
                                n_estimators=100,
                                n_jobs=-1)


In [None]:
# Names of every feature column; all of them are handed to the hashing encoder.
columns = list(X_train.columns)

def get_score(model, X, y, X_test, y_test):
    """Fit ``model`` on ``(X, y)`` and return its ROC AUC on ``(X_test, y_test)``.

    The model is fitted in place. The score is computed from column 1 of
    ``model.predict_proba(X_test)``.
    """
    model.fit(X, y)
    class_probabilities = model.predict_proba(X_test)
    return roc_auc_score(y_test, class_probabilities[:, 1])

In [None]:
# Sanity check: shapes of the train/test features and labels, in that order.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
%%time
# Hash-width values to evaluate; the wider sweep below is disabled.
# n_components_list = np.arange(100, 1000, 100).tolist()
n_components_list = [500]
# String labels of the same values, used as categorical x-ticks when plotting.
n_components_list_str = list(map(str, n_components_list))

# One ROC AUC per entry of n_components_list, in the same order.
fh_logit_scores = []

for n_components in n_components_list:
    # Fit the hashing encoder on the training split only.
    hashing_enc = ce.HashingEncoder(cols=columns, n_components=n_components, max_process=4)
    hashing_enc.fit(X_train, y_train)

    # Encode both splits on a fresh 0..n-1 index.
    X_train_hashing = hashing_enc.transform(X_train.reset_index(drop=True))
    X_test_hashing = hashing_enc.transform(X_test.reset_index(drop=True))

    # Train the bagging ensemble on the hashed features and score the hold-out split.
    fe_logit_score = get_score(bbc, X_train_hashing, y_train, X_test_hashing, y_test)
    fh_logit_scores.append(fe_logit_score)

In [None]:
# ROC AUC for each tested hash width. The marker keeps the data visible when
# only one width was evaluated (a marker-less line through one point draws nothing).
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(n_components_list_str, fh_logit_scores, linewidth=3, marker='o')
# The scored model is the balanced bagging ensemble of decision trees, not a
# logistic regression, so the title names it accordingly.
ax.set_title('n_components vs roc_auc for feature hashing with balanced bagging (decision trees)')
ax.set_xlabel('n_components')
ax.set_ylabel('score')
plt.show()

In [None]:
# Display the ROC AUC collected for each tested n_components value.
fh_logit_scores

In [None]:
# Relative change between consecutive scores, computed in log space:
# exp(log s[i+1] - log s[i]) - 1  ==  s[i+1] / s[i] - 1.
log_scores = np.log(fh_logit_scores)
growth_rate = np.exp(np.diff(log_scores)) - 1

In [None]:
# Display the relative change between consecutive scores (an empty array when only one value was tested).
growth_rate

In [None]:
# plt.figure(figsize=(8, 5))
# plt.plot(n_components_list_str, growth_rate, linewidth=3)
# plt.title('n_compontents vs growth_rate for feature hashing with logistic regression')
# plt.xlabel('n_components')
# plt.ylabel('GRate')
# plt.show()