In [1]:
# Lib Imports
import pandas as pd
import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
import category_encoders as ce

In [2]:
# Setting Pandas column display option
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [3]:
%%time
idee = pd.read_csv('train_identity.csv')
transaction = pd.read_csv('train_transaction.csv')

Wall time: 16 s


In [4]:
%%time
def pipeline(idee, transaction):
    merge = transaction.merge(idee, how='outer', on='TransactionID')
    objects = merge.select_dtypes('object')
    objects = objects.join(merge[['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']])
    objects['isFraud'] = merge['isFraud']
    objects.fillna("Unknown", inplace=True)
    objects = objects.astype('object')
    #objects = objects.astype('category')
    print(objects.info())
    return objects
objects = pipeline(idee,transaction)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 590540 entries, 0 to 590539
Data columns (total 38 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   ProductCD      590540 non-null  object
 1   card4          590540 non-null  object
 2   card6          590540 non-null  object
 3   P_emaildomain  590540 non-null  object
 4   R_emaildomain  590540 non-null  object
 5   M1             590540 non-null  object
 6   M2             590540 non-null  object
 7   M3             590540 non-null  object
 8   M4             590540 non-null  object
 9   M5             590540 non-null  object
 10  M6             590540 non-null  object
 11  M7             590540 non-null  object
 12  M8             590540 non-null  object
 13  M9             590540 non-null  object
 14  id_12          590540 non-null  object
 15  id_15          590540 non-null  object
 16  id_16          590540 non-null  object
 17  id_23          590540 non-null  object
 18  id_2

In [5]:
# objects.drop(columns=['M4','id_15','id_16','id_28','id_29','id_35','id_36','id_38','DeviceType','DeviceInfo'], inplace=True)
print(objects.shape)
print(objects.isFraud.value_counts())

(590540, 38)
0    569877
1     20663
Name: isFraud, dtype: int64


In [6]:
%%time
X  = objects.drop(columns='isFraud')
y = objects['isFraud'].astype('int') 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                sampling_strategy='all',
                                replacement=False,
                                random_state=0,
                                n_estimators=100)

brf = BalancedRandomForestClassifier(sampling_strategy='auto',
                                     replacement=False,
                                     random_state=0,
                                     n_estimators=100,
                                     n_jobs=-1)

rf = RandomForestClassifier(random_state=0,
                            class_weight='balanced',
                            n_estimators=100,
                            n_jobs=-1)

bada = RUSBoostClassifier(base_estimator=AdaBoostClassifier(),
                         sampling_strategy='auto',
                         replacement=True,
                         random_state=0,
                         n_estimators=100)

dtc = DecisionTreeClassifier(random_state=0, class_weight='balanced')

Wall time: 1.15 s


In [7]:
columns = X_train.columns.to_list()

def get_score(model, X, y, X_test, y_test):
    model.fit(X, y)
    y_pred = model.predict_proba(X_test)[:,1]
    score = roc_auc_score(y_test, y_pred)
    return score

In [8]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(413378, 37) (177162, 37) (413378,) (177162,)


## Encoders

In [9]:
# hashing encoder
# n_components_list = np.arange(100, 4000, 100).tolist()
n_components_list = [1000]
n_components_list_str = [str(i) for i in n_components_list]

fh_model_scores = []

for n_components in n_components_list:
    hashing_enc = ce.HashingEncoder(cols=columns, n_components=n_components, max_process=6).fit(X_train, y_train)

    X_train_hashing = hashing_enc.transform(X_train.reset_index(drop=True))
    X_test_hashing = hashing_enc.transform(X_test.reset_index(drop=True))

    fe_model_score = get_score(brf, X_train_hashing, y_train, X_test_hashing, y_test)
    fh_model_scores.append(fe_model_score)

In [10]:
'''
# leave one out encoder
# %%time
leaveone_enc = ce.LeaveOneOutEncoder(cols=columns, sigma=0.05).fit(X_train, y_train)

X_train_leaveone = leaveone_enc.transform(X_train.reset_index(drop=True))
X_test_leaveone = leaveone_enc.transform(X_test.reset_index(drop=True))
    

fe_model_score = get_score(brf, X_train_leaveone, y_train, X_test_leaveone, y_test)
'''

'\n# leave one out encoder\n# %%time\nleaveone_enc = ce.LeaveOneOutEncoder(cols=columns, sigma=0.05).fit(X_train, y_train)\n\nX_train_leaveone = leaveone_enc.transform(X_train.reset_index(drop=True))\nX_test_leaveone = leaveone_enc.transform(X_test.reset_index(drop=True))\n    \n\nfe_model_score = get_score(brf, X_train_leaveone, y_train, X_test_leaveone, y_test)\n'

In [11]:
'''
# binary encoder
binary_enc = ce.BinaryEncoder(cols=columns, drop_invariant=True).fit(X_train, y_train)

X_train_binary = binary_enc.transform(X_train.reset_index(drop=True))
X_test_binary = binary_enc.transform(X_test.reset_index(drop=True))
    

fe_model_score = get_score(brf, X_train_binary, y_train, X_test_binary, y_test)
'''

'\n# binary encoder\nbinary_enc = ce.BinaryEncoder(cols=columns, drop_invariant=True).fit(X_train, y_train)\n\nX_train_binary = binary_enc.transform(X_train.reset_index(drop=True))\nX_test_binary = binary_enc.transform(X_test.reset_index(drop=True))\n    \n\nfe_model_score = get_score(brf, X_train_binary, y_train, X_test_binary, y_test)\n'

In [12]:
'''
# target encoder
target_enc = ce.TargetEncoder(cols=columns, min_samples_leaf=100.000, smoothing=1).fit(X_train, y_train)

X_train_target = target_enc.transform(X_train.reset_index(drop=True))
X_test_target = target_enc.transform(X_test.reset_index(drop=True))
    

fe_model_score = get_score(brf, X_train_target, y_train, X_test_target, y_test)
'''

'\n# target encoder\ntarget_enc = ce.TargetEncoder(cols=columns, min_samples_leaf=100.000, smoothing=1).fit(X_train, y_train)\n\nX_train_target = target_enc.transform(X_train.reset_index(drop=True))\nX_test_target = target_enc.transform(X_test.reset_index(drop=True))\n    \n\nfe_model_score = get_score(brf, X_train_target, y_train, X_test_target, y_test)\n'

In [13]:
'''
# weight of evidence encoder
evidence_enc = ce.WOEEncoder(cols=columns, randomized=True, sigma=0.6, regularization=0.6).fit(X_train, y_train)

#randomized=True, sigma=0.1, regularization=0.1

X_train_evidence = evidence_enc.transform(X_train.reset_index(drop=True))
X_test_evidence = evidence_enc.transform(X_test.reset_index(drop=True))
    

fe_model_score = get_score(brf, X_train_evidence, y_train, X_test_evidence, y_test)
'''

'\n# weight of evidence encoder\nevidence_enc = ce.WOEEncoder(cols=columns, randomized=True, sigma=0.6, regularization=0.6).fit(X_train, y_train)\n\n#randomized=True, sigma=0.1, regularization=0.1\n\nX_train_evidence = evidence_enc.transform(X_train.reset_index(drop=True))\nX_test_evidence = evidence_enc.transform(X_test.reset_index(drop=True))\n    \n\nfe_model_score = get_score(brf, X_train_evidence, y_train, X_test_evidence, y_test)\n'

In [14]:
#plt.figure(figsize=(8, 5))
#plt.plot(n_components_list_str, fh_model_scores, linewidth=3)
#plt.title('n_compontents vs roc_auc for feature hashing with logistic regression')
#plt.xlabel('n_components')
#plt.ylabel('score')
#plt.show()

In [15]:
fh_model_scores
#fe_model_score

[0.9258916704316547]

In [16]:
y_pred = brf.predict(X_test_hashing)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.85      0.92    170910
           1       0.17      0.86      0.29      6252

    accuracy                           0.85    177162
   macro avg       0.58      0.86      0.60    177162
weighted avg       0.97      0.85      0.89    177162



In [17]:
from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix(y_true=y_test, y_pred=y_pred).ravel()

print(f'tn: {tn}')
print(f'fp: {fp}')
print(f'fn: {fn}')
print(f'tp: {tp}')

tn: 145380
fp: 25530
fn: 878
tp: 5374


### Test submission

In [18]:
%%time
idee_test = pd.read_csv('test_identity.csv')
transaction_test = pd.read_csv('test_transaction.csv') 
merge_test = transaction_test.merge(idee_test, how='outer', on='TransactionID')
objects_test = merge_test.select_dtypes('object')
objects_test = objects_test.join(merge_test[['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']])
objects_test.fillna("Unknown", inplace=True)
objects_test = objects_test.astype('object')
#objects_test = objects_test.astype('category')
objects_test.columns = X_train.columns

Wall time: 25.8 s


In [19]:
X2_hashed = hashing_enc.transform(objects_test.reset_index(drop=True))
#X2_evidence = evidence_enc.transform(objects_test.reset_index(drop=True))

In [21]:
%%time
y_pred = brf.predict(X2_hashed)

Wall time: 6.47 s


In [22]:
sub_data = {'TransactionID': np.array(merge_test.TransactionID), 'isFraud': y_pred}
submission = pd.DataFrame(data=sub_data)
submission

Unnamed: 0,TransactionID,isFraud
0,3663549,0
1,3663550,1
2,3663551,0
3,3663552,0
4,3663553,0
...,...,...
506686,4170235,1
506687,4170236,0
506688,4170237,0
506689,4170238,0


In [23]:
submission.to_csv('submission.csv', index=False, doublequote=False, sep=',')

In [None]:
#growth_rate = np.exp(np.diff(np.log(fh_logit_scores))) - 1

In [None]:
#growth_rate

In [None]:
#plt.figure(figsize=(8, 5))
#plt.plot(n_components_list_str, growth_rate, linewidth=3)
#plt.title('n_compontents vs growth_rate for feature hashing with logistic regression')
#plt.xlabel('n_components')
#plt.ylabel('GRate')
#plt.show()