In [1]:
#Standard library
import os
import warnings

#Core Libraries
import numpy as np
import pandas as pd
import seaborn as sns

#Visualization
from colorama import Fore, Style
import sweetviz as sv
from tqdm import tqdm
import matplotlib.pyplot as plt
import plotly.express as px

#Preprocessing
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

#Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb

#Metrics
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, roc_auc_score

# Ensemble
from sklearn.ensemble import StackingClassifier, VotingClassifier

# Silence FutureWarning explicitly, then every remaining warning category.
# The second filter uses the default category (Warning), which FutureWarning
# subclasses, so after this cell no warnings are shown at all.
warnings.filterwarnings(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore")

In [2]:
# Path of the transactions CSV. Set the CREDITCARD_CSV environment variable to
# run the notebook on another machine; when it is unset, the original author's
# local path is used so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    'CREDITCARD_CSV',
    r'C:\Users\Zarul\Desktop\Data_Analyst\My_Project\Supervised\4_CreditCard_Fraud_Detection\creditcard_2023.csv',
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Show the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,...,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,17982.1,0
1,1,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,...,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,6531.37,0


In [4]:
# Column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568630 entries, 0 to 568629
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      568630 non-null  int64  
 1   V1      568630 non-null  float64
 2   V2      568630 non-null  float64
 3   V3      568630 non-null  float64
 4   V4      568630 non-null  float64
 5   V5      568630 non-null  float64
 6   V6      568630 non-null  float64
 7   V7      568630 non-null  float64
 8   V8      568630 non-null  float64
 9   V9      568630 non-null  float64
 10  V10     568630 non-null  float64
 11  V11     568630 non-null  float64
 12  V12     568630 non-null  float64
 13  V13     568630 non-null  float64
 14  V14     568630 non-null  float64
 15  V15     568630 non-null  float64
 16  V16     568630 non-null  float64
 17  V17     568630 non-null  float64
 18  V18     568630 non-null  float64
 19  V19     568630 non-null  float64
 20  V20     568630 non-null  float64
 21  V21     56

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0,...,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0
mean,284314.5,-5.638058000000001e-17,-1.319545e-16,-3.5187880000000004e-17,-2.879008e-17,7.997245e-18,-3.958636e-17,-3.198898e-17,2.1092730000000003e-17,3.9986230000000005e-17,...,4.7583610000000004e-17,3.9486400000000004e-18,6.194741e-18,-2.799036e-18,-3.1789050000000006e-17,-7.497417000000001e-18,-3.59876e-17,2.6091010000000002e-17,12041.957635,0.5
std,164149.486121,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,...,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,6919.644449,0.5
min,0.0,-3.495584,-49.96657,-3.18376,-4.951222,-9.952786,-21.11111,-4.351839,-10.75634,-3.751919,...,-19.38252,-7.734798,-30.29545,-4.067968,-13.61263,-8.226969,-10.49863,-39.03524,50.01,0.0
25%,142157.25,-0.5652859,-0.4866777,-0.6492987,-0.6560203,-0.2934955,-0.4458712,-0.2835329,-0.1922572,-0.5687446,...,-0.1664408,-0.4904892,-0.2376289,-0.6515801,-0.5541485,-0.6318948,-0.3049607,-0.2318783,6054.8925,0.0
50%,284314.5,-0.09363846,-0.1358939,0.0003528579,-0.07376152,0.08108788,0.07871758,0.2333659,-0.1145242,0.09252647,...,-0.03743065,-0.02732881,-0.05968903,0.01590123,-0.008193162,-0.01189208,-0.1729111,-0.01392973,12030.15,0.5
75%,426471.75,0.8326582,0.3435552,0.628538,0.7070047,0.4397368,0.4977881,0.5259548,0.04729905,0.5592621,...,0.1479787,0.4638817,0.1557153,0.7007374,0.5500147,0.6728879,0.334023,0.4095903,18036.33,1.0
max,568629.0,2.229046,4.361865,14.12583,3.201536,42.71689,26.1684,217.873,5.95804,20.27006,...,8.08708,12.63251,31.70763,12.96564,14.62151,5.623285,113.2311,77.25594,24039.93,1.0


In [7]:
# Build the Sweetviz report for the full frame with `Class` marked as the
# target feature, then render it inline in the notebook.
my_report = sv.analyze(df, target_feat='Class')
my_report.show_notebook(w='100%', h='full')

                                             |          | [  0%]   00:00 -> (? left)

In [8]:
#V1-V28 are confidential (anonymized) features, so domain-driven feature engineering is not possible.

In [9]:
# Train / validation / test split.
# 20% of all rows are held out as the test set; 20% of the remaining rows
# become the validation set (roughly 64% / 16% / 20% overall).
train_val_df, test_df = train_test_split(df, random_state=42, test_size=0.2)
train_df, val_df = train_test_split(train_val_df, random_state=42, test_size=0.20)

# Report the size of each split.
for split_name, split_frame in (('train_df', train_df), ('val_df', val_df), ('test_df', test_df)):
    print(f'{split_name}.shape: ', split_frame.shape)

train_df.shape:  (363923, 31)
val_df.shape:  (90981, 31)
test_df.shape:  (113726, 31)


In [10]:
# Target column, and every other column of `df` as a candidate input.
# NOTE(review): `id` stays in X_cols; it is not listed in numerical_features,
# so the preprocessor is not given it — confirm that is intended.
y_col = 'Class'
X_cols = [column for column in df.columns if column != y_col]

# Independent copies of the inputs / target for each split.
train_inputs, val_inputs, test_inputs = (
    part[X_cols].copy() for part in (train_df, val_df, test_df)
)
train_target, val_target, test_target = (
    part[y_col].copy() for part in (train_df, val_df, test_df)
)

In [11]:
# Numeric model inputs: V1..V28 followed by Amount
# (the dataset has no categorical columns).
numerical_features = [f'V{index}' for index in range(1, 29)] + ['Amount']

#Preprocessor for pipeline
# Numeric branch: mean-impute, standardize, then PCA with n_components=0.95
# (a float in (0, 1) asks PCA to keep enough components for that share of
# the variance).
numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
])

# Only the columns in numerical_features are routed through the branch.
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_steps, numerical_features)],
)

#Models to train with default hyperparameter
# (CatBoost only has its training log silenced via verbose=0.)
models = dict(
    KNeighbors=KNeighborsClassifier(),
    CatBoost=CatBoostClassifier(verbose=0),
)

#Creating pipeline
def create_pipeline(model):
    """Return an unfitted Pipeline that preprocesses inputs and then applies `model`.

    Parameters
    ----------
    model : estimator
        Estimator placed in the final ``'model'`` step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: ``'preprocessor'`` followed by ``'model'``.

    Notes
    -----
    The module-level ``preprocessor`` is cloned rather than reused, so every
    pipeline owns its own transformer. Pipeline.fit fits its steps in place;
    without the clone, fitting one pipeline would silently refit the
    transformer held by every other pipeline built from this function.
    """
    return Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('model', model)
    ])

In [12]:
# Fit each candidate on the training split and score it on the validation split.
results = {}
for name, model in tqdm(models.items(), desc="Evaluating Models"):
    fitted_pipeline = create_pipeline(model).fit(train_inputs, train_target)
    val_predictions = fitted_pipeline.predict(val_inputs)
    results[name] = {'Accuracy': accuracy_score(val_target, val_predictions)}

# One line per model, accuracy to four decimals.
for model_name, scores in results.items():
    print(f"{model_name}: ACCURACY={scores['Accuracy']:.4f}")

Evaluating Models: 100%|██████████| 2/2 [03:03<00:00, 91.63s/it] 

KNeighbors: ACCURACY=0.9973
CatBoost: ACCURACY=0.9995





In [13]:
#Ensembling
#Creating pipeline
def create_ensemble_pipeline(model):
    """Return an unfitted Pipeline that preprocesses inputs and then applies an ensemble.

    Parameters
    ----------
    model : estimator
        Ensemble classifier (e.g. a StackingClassifier or VotingClassifier)
        placed in the final step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: ``'preprocessor'`` followed by ``'model'``.

    Notes
    -----
    * The final step is named ``'model'`` (it was ``'regressor'``): the
      estimators passed in are classifiers, and the other pipelines in this
      notebook already use ``'model'``.
    * The module-level ``preprocessor`` is cloned so the Stacking and Voting
      pipelines do not share, and refit, the same transformer instance.
    """
    return Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('model', model)
    ])

#Models for ensembling
best_models = dict(
    KNeighbors=KNeighborsClassifier(),
    CatBoost=CatBoostClassifier(verbose=0),
)

# (name, estimator) pairs handed to both ensembles; each ensemble receives
# its own list object built from these pairs.
base_estimators = [
    ('kneighbors', best_models['KNeighbors']),
    ('catboost', best_models['CatBoost']),
]

#StackingClassifier & VotingClassifier
stacking_classifier = StackingClassifier(estimators=list(base_estimators))
voting_classifier = VotingClassifier(estimators=list(base_estimators))

# Wrap each ensemble behind the shared preprocessing step.
ensemble_models = {
    label: create_ensemble_pipeline(ensemble)
    for label, ensemble in (
        ('Stacking', stacking_classifier),
        ('Voting', voting_classifier),
    )
}

In [14]:
# Fit each ensemble pipeline on the training split and score it on validation.
ensemble_results = {}
for name, ensemble_pipeline in tqdm(ensemble_models.items(), desc="Evaluating Ensemble"):
    ensemble_pipeline.fit(train_inputs, train_target)
    val_predictions = ensemble_pipeline.predict(val_inputs)
    ensemble_results[name] = {'Accuracy': accuracy_score(val_target, val_predictions)}

# One line per ensemble, accuracy to four decimals.
for model_name, scores in ensemble_results.items():
    print(f"{model_name}: ACCURACY={scores['Accuracy']:.4f}")

Evaluating Ensemble: 100%|██████████| 2/2 [16:37<00:00, 498.85s/it]

Stacking: ACCURACY=0.9997
Voting: ACCURACY=0.9996





In [15]:
#Summary
#KNeighbors: ACCURACY=0.9973
#CatBoost: ACCURACY=0.9995

#Stacking: ACCURACY=0.9997
#Voting: ACCURACY=0.9996

#Stacking (KNeighbors + CatBoost) works best on validation accuracy

In [16]:
#Final model (stacking = KNeighbors + CatBoost), built fresh and fit on the training split
# Same preprocessing as used during model selection: mean-impute, standardize,
# then PCA with n_components=0.95, applied to the numerical_features columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=0.95))
        ]), numerical_features),
    ])

#Models for ensembling
best_models = {
    'KNeighbors': KNeighborsClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0),
}

stacking_classifier = StackingClassifier(
    estimators=[
        ('kneighbors', best_models['KNeighbors']),
        ('catboost', best_models['CatBoost']),
    ],
)

# `pipeline` is the object the single-input prediction helper calls.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', stacking_classifier)
])

pipeline.fit(train_inputs, train_target)

# Score the final model once on the held-out test split. Model selection
# above only used the validation split, so this is the first time the test
# rows are used and it gives an estimate not influenced by that selection.
test_accuracy = accuracy_score(test_target, pipeline.predict(test_inputs))
print(f"Stacking (final): TEST ACCURACY={test_accuracy:.4f}")

In [17]:
#Helper function for single input
def predict_fraud_(input_data):
    """Predict the class label for a single record.

    `input_data` is wrapped in a one-row DataFrame (the example cell passes a
    dict of feature name -> value) and handed to the module-level `pipeline`.
    The first, and only, element of the resulting prediction array is returned.
    """
    single_row = pd.DataFrame([input_data])
    return pipeline.predict(single_row)[0]

In [26]:
#Single Input Predict
# Values for V1..V28 in feature order, seven per row.
_v_values = (
    1.724955, -1.032151, -0.167094, -1.304005, 0.007942, -0.497093, 0.431224,    # V1-V7
    -0.310432, 0.127247, 1.280402, -0.592005, -0.559465, 1.517358, 2.192510,     # V8-V14
    -0.880859, -0.041408, 1.034441, 0.285828, -0.583600, -0.298212, -0.121933,   # V15-V21
    0.064814, -0.173478, -0.203415, 0.269615, 0.084079, -0.370434, -0.219972,    # V22-V28
)

# Feature name -> value mapping: V1..V28 first, Amount last.
example_input = {
    f'V{position}': value for position, value in enumerate(_v_values, start=1)
}
example_input['Amount'] = 6325.64

# Predict for the example record and translate the label into a message;
# any label other than 0 or 1 falls through to the error text.
predict_fraud = predict_fraud_(example_input)
label_messages = {
    0: 'Fraud Prediction: Not a Fraud',
    1: 'Fraud Prediction: Fraud',
}
print(label_messages.get(predict_fraud, 'Error. Please check input'))

Fraud Prediction: Not a Fraud
