In [1]:
# Lib Imports
import pandas as pd
import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from category_encoders import TargetEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer

In [2]:
# Setting Pandas column display option
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [3]:
%%time
idee = pd.read_csv('train_identity.csv')
transaction = pd.read_csv('train_transaction.csv')

Wall time: 16.1 s


In [4]:
%%time
def pipeline(idee, transaction):
    merge = transaction.merge(idee, how='outer', on='TransactionID')
    objects = merge.select_dtypes('object')
    objects = objects.join(merge[['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']])
    objects['isFraud'] = merge['isFraud']
    objects.fillna("Unknown", inplace=True)
    objects = objects.astype('object')
    #objects = objects.astype('category')
    print(objects.info())
    return objects
objects = pipeline(idee,transaction)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 590540 entries, 0 to 590539
Data columns (total 38 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   ProductCD      590540 non-null  object
 1   card4          590540 non-null  object
 2   card6          590540 non-null  object
 3   P_emaildomain  590540 non-null  object
 4   R_emaildomain  590540 non-null  object
 5   M1             590540 non-null  object
 6   M2             590540 non-null  object
 7   M3             590540 non-null  object
 8   M4             590540 non-null  object
 9   M5             590540 non-null  object
 10  M6             590540 non-null  object
 11  M7             590540 non-null  object
 12  M8             590540 non-null  object
 13  M9             590540 non-null  object
 14  id_12          590540 non-null  object
 15  id_15          590540 non-null  object
 16  id_16          590540 non-null  object
 17  id_23          590540 non-null  object
 18  id_2

In [5]:
%%time
X  = objects.drop(columns='isFraud')
y = objects['isFraud'].astype('int') 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

Wall time: 1.04 s


In [6]:
brf = BalancedRandomForestClassifier(sampling_strategy='auto',
                                     replacement=False,
                                     random_state=0,
                                     n_estimators=100,
                                     n_jobs=-1)

In [7]:
class TargetEncoderCV(TargetEncoder):
    """Cross-fold target encoder.
    """
    
    def __init__(self, n_splits=5, shuffle=True, cols=None):
        """Cross-fold target encoding for categorical features.
        
        Parameters
        ----------
        n_splits : int
            Number of cross-fold splits. Default = 3.
        shuffle : bool
            Whether to shuffle the data when splitting into folds.
        cols : list of str
            Columns to target encode.
        """
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.cols = cols
        

    def fit(self, X, y):
        """Fit cross-fold target encoder to X and y
        
        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to encode
        y : pandas Series, shape = [n_samples]
            Target values.
            
        Returns
        -------
        self : encoder
            Returns self.
        """
        self._target_encoder = TargetEncoder(cols=self.cols)
        self._target_encoder.fit(X, y)
        return self

    
    def transform(self, X, y=None):
        """Perform the target encoding transformation.

        Uses cross-fold target encoding for the training fold,
        and uses normal target encoding for the test fold.

        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to encode

        Returns
        -------
        pandas DataFrame
            Input DataFrame with transformed columns
        """

        # Use target encoding from fit() if this is test data
        if y is None:
            return self._target_encoder.transform(X)

        # Compute means for each fold
        self._train_ix = []
        self._test_ix = []
        self._fit_tes = []
        kf = KFold(n_splits=self.n_splits, shuffle=self.shuffle)
        for train_ix, test_ix in kf.split(X):
            self._train_ix.append(train_ix)
            self._test_ix.append(test_ix)
            te = TargetEncoder(cols=self.cols)
            if isinstance(X, pd.DataFrame):
                self._fit_tes.append(te.fit(X.iloc[train_ix,:],
                                            y.iloc[train_ix]))
            elif isinstance(X, np.ndarray):
                self._fit_tes.append(te.fit(X[train_ix,:],
                                            y[train_ix]))
            else:
                raise TypeError('X must be DataFrame or ndarray')

        # Apply means across folds
        Xo = X.copy()
        for ix in range(len(self._test_ix)):
            test_ix = self._test_ix[ix]
            if isinstance(X, pd.DataFrame):
                Xo.iloc[test_ix,:] = \
                    self._fit_tes[ix].transform(X.iloc[test_ix,:])
            elif isinstance(X, np.ndarray):
                Xo[test_ix,:] = \
                    self._fit_tes[ix].transform(X[test_ix,:])
            else:
                raise TypeError('X must be DataFrame or ndarray')
        return Xo

            
    def fit_transform(self, X, y=None):
        """Fit and transform the data via target encoding.
        
        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to encode
        y : pandas Series, shape = [n_samples]
            Target values (required!).

        Returns
        -------
        pandas DataFrame
            Input DataFrame with transformed columns
        """
        return self.fit(X, y).transform(X, y)


In [8]:
te = TargetEncoderCV().fit(X_train, y_train)
X_train_cv = te.transform(X_train.reset_index(drop=True))
X_test_cv = te.transform(X_test.reset_index(drop=True))

In [9]:
brf.fit(X_train_cv, y_train)

BalancedRandomForestClassifier(n_jobs=-1, random_state=0)

In [10]:
y_proba = brf.predict_proba(X_test_cv)[:,1]
score = roc_auc_score(y_test, y_proba)
score

0.9230374394405948

In [11]:
y_pred = brf.predict(X_test_cv)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.86      0.92    170910
           1       0.18      0.84      0.30      6252

    accuracy                           0.86    177162
   macro avg       0.59      0.85      0.61    177162
weighted avg       0.96      0.86      0.90    177162



In [12]:
from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix(y_true=y_test, y_pred=y_pred).ravel()

print(f'tn: {tn}')
print(f'fp: {fp}')
print(f'fn: {fn}')
print(f'tp: {tp}')

tn: 147190
fp: 23720
fn: 972
tp: 5280


In [13]:
%%time
idee_test = pd.read_csv('test_identity.csv')
transaction_test = pd.read_csv('test_transaction.csv') 
merge_test = transaction_test.merge(idee_test, how='outer', on='TransactionID')
objects_test = merge_test.select_dtypes('object')
objects_test = objects_test.join(merge_test[['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']])
objects_test.fillna("Unknown", inplace=True)
objects_test = objects_test.astype('object')
#objects_test = objects_test.astype('category')
objects_test.columns = X_train.columns

Wall time: 25.6 s


In [14]:
X2_cv = te.transform(objects_test.reset_index(drop=True))

In [15]:
y_pred_test = brf.predict(X2_cv)

In [16]:
sub_data = {'TransactionID': np.array(merge_test.TransactionID), 'isFraud': y_pred_test}
submission = pd.DataFrame(data=sub_data)
submission

Unnamed: 0,TransactionID,isFraud
0,3663549,0
1,3663550,0
2,3663551,0
3,3663552,0
4,3663553,0
...,...,...
506686,4170235,0
506687,4170236,0
506688,4170237,0
506689,4170238,0


In [17]:
submission.to_csv('submission.csv', index=False, doublequote=False, sep=',')