In [None]:
"""
Fraud Detection Model Training Script

This script contains a structured pipeline for fraud detection using multiple machine learning classifiers.
"""

# Install necessary libraries (uncomment if needed)
# %pip install imbalanced-learn category_encoders scikit-learn xgboost lightgbm

In [86]:
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [1]:
## import necessary libraries
import pandas as pd # type: ignore
import numpy as np # type: ignore
import matplotlib.pyplot as plt # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
import category_encoders as ce
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
### Read files
train_transaction_v0 = pd.read_csv("./train_transaction.csv")
train_identity = pd.read_csv("./train_identity.csv")
test_transaction_v0 = pd.read_csv("./test_transaction.csv")
test_identity = pd.read_csv("./test_identity.csv")

In [15]:
## Merge the transaction and the identify dataset
train_transaction = pd.merge(train_transaction_v0, train_identity, on="TransactionID", how="left")

test_transaction = pd.merge(test_transaction_v0, test_identity, on="TransactionID", how="left")

In [4]:
## Check for duplicated records
for df in [train_transaction,train_identity]:
    duplicate = df.duplicated().sum()
    print(f"Count of duplicated records is: {duplicate}")
    
## Check for dataset shape
for df in [train_transaction,train_identity]:
    shape = df.shape
    print(f"Shape of dataset is: {shape[0]:,} rows and {shape[1]:,} columns")

Count of duplicated records is: 0
Count of duplicated records is: 0
Shape of dataset is: 590,540 rows and 434 columns
Shape of dataset is: 144,233 rows and 41 columns


In [5]:
## Separate the input and target features
features = train_transaction.drop(columns="isFraud")
target = train_transaction[["isFraud"]]

### Split the transaction data using 80% trainset
X_train, X_test, Y_train, Y_test = train_test_split(features, target, train_size=0.8)


In [8]:
### Preprocessing Class

## create custom class to transform the transaction date column to Timedelta
class tran_dt(BaseEstimator,TransformerMixin):
    def __init__(self, col = "TransactionDT"):
        self.col=col
    
    def fit(self, X, Y=None):
        """Fit does nothing as no learning is required"""
        return self
    
    def transform(self,X):
        X_new = X.copy()
        X_new[self.col] = (X_new[self.col]/(24*60*60)).astype("float")
        return X_new
    
    
## create custom class to drop High NAs columns in train_transaction
class drop_na(BaseEstimator, TransformerMixin):
    def __init__(self, threshold = 0.2, ID_col = "TransactionID"):
        self.threshold = threshold
        self.ID_col = ID_col
        self.column_to_drop = []
        
    def fit(self, X, Y=None):
        self.column_to_drop = [self.ID_col] + [
            col for col in X.columns if X[col].isna().sum()>= self.threshold * len(X)
            ]
        return self
    
    def transform(self, X):
        X_new = X.copy()
        X_new = X_new.drop(columns = self.column_to_drop, errors ="ignore")
        return X_new
    
## create custom class to treat the missing values in the P_emaildomain using "nomail.com"
class email_na(BaseEstimator,TransformerMixin):
    def __init__(self, col = "P_emaildomain"):
        self.col=col
    
    def fit(self, X, Y=None):
        """Fit does nothing as no learning is required"""
        return self
    
    def transform(self,X):
        X_new = X.copy()
        X_new.loc[X_new[self.col].isna(), self.col] = "nomail.com"
        return X_new
        

## create custom class to treat the missing values in the add1|2 using 0
class addr_na_handler(BaseEstimator,TransformerMixin):
    def __init__(self, cols=["addr1", "addr2"]):
        self.cols = cols
        
    def fit(self, X, Y=None):
        """Fit does nothing as no learning is required"""
        return self
    
    def transform(self, X):
        X_new = X.copy()
        for col in self.cols:
            X_new.loc[X_new[col].isna(), col] = 0.0
        return X_new

 

## Create a custum class to treat missing values
class imputer(BaseEstimator,TransformerMixin):
    def __init__(self):
        self.non_numeric_col = None
        self.numeric_col = None
        self.mode_imputer = SimpleImputer(strategy="most_frequent")
        self.median_imputer = SimpleImputer(strategy="median")
        
    def fit(self, X, Y=None):
        self.non_numeric_col = X.select_dtypes(exclude=[np.number]).columns
        self.numeric_col = X.select_dtypes(include=[np.number]).columns
        self.mode_imputer.fit(X[self.non_numeric_col])
        self.median_imputer.fit(X[self.numeric_col])
        return self
    
    def transform(self, X):
        X_new = X.copy()
        X_new[self.numeric_col] = self.median_imputer.transform(X[self.numeric_col])
        X_new[self.non_numeric_col] = self.mode_imputer.transform(X[self.non_numeric_col])
        return X_new
    
## Create a custom class to encode categorical variables
class encoder(BaseEstimator,TransformerMixin):
    def __init__(self):
        self.non_numeric_cols = None
        self.encoder = None
        
    def fit(self, X, Y):
        self.non_numeric_cols = X.select_dtypes(exclude=[np.number]).columns
        self.encoder = ce.TargetEncoder(cols=self.non_numeric_cols)
        self.encoder.fit(X[self.non_numeric_cols], Y)
        return self
        
    def transform(self, X):
        X_new = X.copy()
        X_new[self.non_numeric_cols] = self.encoder.transform(X[self.non_numeric_cols])
        return X_new

In [10]:
### Model Development

# Create pipeline for the randomforest classifier
pipeline_rfc = make_pipeline(tran_dt(), drop_na(),email_na(),
                         addr_na_handler(),imputer(),encoder(),
                         SMOTE(random_state=42), StandardScaler(),
                         RandomForestClassifier(n_estimators=100,random_state=42, n_jobs=-1))

# Create pipeline for the logistic classifier
pipeline_lgr = make_pipeline(tran_dt(), drop_na(),email_na(),
                         addr_na_handler(),imputer(),encoder(),
                         SMOTE(random_state=42), StandardScaler(),
                         LogisticRegression( solver="saga"))

# Create pipeline for the GradientBoost classifier
pipeline_gb = make_pipeline(tran_dt(), drop_na(),email_na(),
                         addr_na_handler(),imputer(),encoder(),
                         SMOTE(random_state=42), StandardScaler(),
                         GradientBoostingClassifier(random_state=42))

# Create pipeline for the HistGradientBoost classifier
pipeline_hgb = make_pipeline(tran_dt(), drop_na(),email_na(),
                         addr_na_handler(),imputer(),encoder(),
                         SMOTE(random_state=42), StandardScaler(),
                         HistGradientBoostingClassifier(random_state=42))

# Create pipeline for the xgboost classifier
pipeline_xgb = make_pipeline(tran_dt(), drop_na(threshold=1.1),email_na(),
                         addr_na_handler(),imputer(),encoder(),
                         SMOTE(random_state=42), StandardScaler(),
                         xgb.XGBClassifier(learning_rate=0.01,
                                           eval_metric ="auc",random_state=42,subsample=1,
                                           use_label_encoder=False, n_jobs=-1))

## Stacking Ensemble
st_clf = StackingClassifier(
    estimators=[("rf_clf",pipeline_rfc), ("logistic",pipeline_lgr),
                ("GradientBoost",pipeline_gb),("HistGB",pipeline_hgb),
                ("xgb_cfl", pipeline_xgb)],
    final_estimator= lgb.LGBMClassifier()
)

In [11]:
## Fit the models

for model in [pipeline_rfc,pipeline_lgr,pipeline_gb, pipeline_hgb,pipeline_xgb, st_clf]:
    model.fit(X_train,Y_train)

  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
Parameters: { "use_label_encoder" } are not used.

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 16567, number of negative: 455865
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003125 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1121
[LightGBM] [Info] Number of data points in the train set: 472432, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035067 -> initscore=-3.314784
[LightGBM] [Info] Start training from score -3.314784


In [14]:
from sklearn.metrics import accuracy_score, roc_auc_score

models_ = {"Randomforest":pipeline_rfc, "Logistic":pipeline_lgr,
           "GradientBoost":pipeline_gb, "HistGB":pipeline_hgb,
           "XGBoost":pipeline_xgb, "Stacked":st_clf}

for key, value in models_.items():
    pred = value.predict(X_test)
    pred_prob = value.predict_proba(X_test)
    acc = accuracy_score(Y_test, pred)
    auc = roc_auc_score(Y_test,pred_prob[:,1])
    print(f"Accuracy of {key} model is {acc:.6f}")
    print(f"ROC of {key} model is {auc:.6f}")
    



Accuracy of Randomforest model is 0.980171
ROC of Randomforest model is 0.919911
Accuracy of Logistic model is 0.754597
ROC of Logistic model is 0.822420
Accuracy of GradientBoost model is 0.943907
ROC of GradientBoost model is 0.875694
Accuracy of HistGB model is 0.973558
ROC of HistGB model is 0.902777
Accuracy of XGBoost model is 0.938929
ROC of XGBoost model is 0.861592




Accuracy of Stacked model is 0.981043
ROC of Stacked model is 0.935505




In [None]:
### Rename test columns to fix the name issue

## create a name dictionary
col_names_dict = dict(zip(test_transaction.columns, train_transaction.drop(columns=["isFraud"]).columns))

## Rename the test data columns
test_data = test_transaction.rename(columns=col_names_dict)



In [None]:
### Make prediction with the test data

prediction = pipeline_hgb.predict_proba(test_data)

## create a submission dataframe
submission_file = test_data[["TransactionID"]]

## Add the prediction to the submission dataframe
submission_file["isFraud"] = pd.DataFrame(prediction[:,1].round(1))

## Save submission file to computer
submission_file.to_csv("submission.csv")