# **Credit Card Fraud Detection System**

## **Table of contents**

1. Introduction
2. Data cleaning
3. Data analysis
4. Data preprocessing for training
5. Models training and evaluation
6. Conclusions



### **1. Introduction**

#### **Requested Modules**

In [1]:
!pip install imblearn==0.0
!pip install plotly
!pip install xgboost
!pip install ipywidgets
!pip install --upgrade pip

# Data manipulation modules
import numpy as np
import pandas as pd
from datetime import datetime

# Data visualization modules
import plotly.graph_objects as go
import plotly.graph_objs as go
from plotly.offline import iplot
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

# Machine learning modules
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV, HalvingGridSearchCV
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, mutual_info_classif
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

Collecting imblearn==0.0
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0
Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-24.3.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.3.2
    Uninstalling pip-23.3.2:
      Successfully uninstalled pip-23.3.2
Successfully installed pip-24.3.1


#### **Data reading**

In [2]:
dataset = pd.read_csv("/kaggle/input/creditcardfraud/creditcard.csv")
dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


### **2. Data cleaning**

#### **Database shaping**

In [3]:
print("Dataset Shape:", dataset.shape)

Dataset Shape: (284807, 31)


#### **Columns data type check¶**

In [4]:
print("\nData columns types before conversion:")
dataset.dtypes


Data columns types before conversion:


Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class       int64
dtype: object

#### **Finding and handling NULL values**

In [5]:
print("Null Values:")
dataset.isnull().sum()

Null Values:


Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

#### **Identifying and Managing Duplicated Records**

In [6]:
if  dataset.duplicated().sum() > 0:
  print("Before: ", dataset.duplicated().sum())
  dataset = dataset.drop_duplicates()
  print("After: ", dataset.duplicated().sum())
else:
  print("No duplicated records found.")

Before:  1081
After:  0


### **3. Data Analysis**

#### **Pie chart function**

In [7]:
def pie_chart(dataset, feature_name, title):
    fig_fraudulent = go.Figure()
    fraudulent = dataset[feature_name].value_counts().sort_index()
    fig_fraudulent.add_trace(go.Pie(labels=["Legitimate", "Fraudulent"], values=fraudulent.values, name="Fraudulent", marker_colors=['#4F89FE', '#FFBC35'], pull=[0.1, 0], textinfo='percent+label', textposition='outside', textfont={'color': 'black', 'size': 17}))
    fig_fraudulent.update_layout(title_text=title, showlegend=False, title_x=0.5, font_size=16)
    fig_fraudulent.update_xaxes(title_text=feature_name, dtick=1, range=[-0.5,23.5])
    fig_fraudulent.update_yaxes(title_text="Transactions number")
    return fig_fraudulent

#### **Bar chart function**

In [8]:
def bar_chart(dataset, feature, feature_name, title):

    fig_fraudulent = go.Figure()
    fraudulent = dataset[feature].value_counts()
    fig_fraudulent.add_trace(go.Bar(x=fraudulent.index, y=fraudulent.values, name="Fraudulent", marker_color="red"))
    fig_fraudulent.update_layout(title_text=title, showlegend=False, title_x=0.5, height=600, width=1000, font=dict(size=16))
    fig_fraudulent.update_xaxes(title_text=feature_name, dtick=1, range=[-0.5,23.5])
    fig_fraudulent.update_yaxes(title_text="Transactions number")
    fig_fraudulent.show()

#### **Histogram function**

In [9]:
import plotly.graph_objs as go
from plotly.offline import iplot

def histogram_chart(dataset, feature, feature_name, title):

    trace_amount_fraud = go.Histogram(x=dataset[feature], marker=dict(color='red'), xbins=dict(start=0, end=2500, size=50))
    layout_amount_fraud = go.Layout(title=title, xaxis=dict(title=feature_name), yaxis=dict(title='Transactions number'), bargap=0.1, title_x=0.5, height=600, width=900, font=dict(size=16))
    fig_amount_fraud = go.Figure(data=[trace_amount_fraud], layout=layout_amount_fraud)

    iplot(fig_amount_fraud)

#### **Data visualization for the representative columns**

In [10]:
fraud_transactions = dataset.loc[dataset["Class"] == 1]
fraud_transactions["Hour"] = fraud_transactions["Time"].apply(lambda x: datetime.utcfromtimestamp(x).hour)
bar_chart(fraud_transactions, 'Hour', 'Hour', 'Graphic representation of the times at which fraudulent transactions took place')

In [11]:
histogram_chart(fraud_transactions, 'Amount', 'Amount', 'Graphical representation of the most recurrent amounts in fraudulent activities')

#### **Class distribution for credit card transactions**

In [12]:
pie_chart(dataset, 'Class', 'Class distribution for credit card transactions')

### **4. Data Preprocessing**

#### **Data Scaling by using RobustScaler**

##### **Data scaling function**

In [13]:
def data_scaling(data, feature):
    return RobustScaler().fit_transform(data[[feature]])

##### **Data Scaling**

In [14]:
dataset['Scaled Time'] = data_scaling(dataset, 'Time')
dataset['Scaled Amount'] = data_scaling(dataset, 'Amount')
dataset = dataset.drop(['Time', 'Amount'], axis=1)
dataset.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,Scaled Time,Scaled Amount
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,-0.99529,1.774718
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,-0.99529,-0.26853
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,-0.995279,4.959811
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,-0.995279,1.411487
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0,-0.995267,0.667362


#### **Outliers handling**

##### **Outliers handling function**

In [15]:
def detect_replace_outliers(df, column_names):
    outliers = {}
    for column in column_names:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        outliers_count = len(df[(df[column] < lower_bound) | (df[column] > upper_bound)])
        outliers[column] = outliers_count

        median = df[column].median()
        df[column] = np.where((df[column] < lower_bound) | (df[column] > upper_bound), median, df[column])

    return df, outliers

##### **Finding and handling outliers**

In [16]:
# Columns to check
columns_to_check = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28']

# Replacing outliers with median value
dataset, detected_outliers = detect_replace_outliers(dataset, columns_to_check)

print("The number of detected outliers:")
detected_outliers

The number of detected outliers:


{'V1': 6948,
 'V2': 13390,
 'V3': 3306,
 'V4': 11094,
 'V5': 12221,
 'V6': 22886,
 'V7': 8839,
 'V8': 23904,
 'V9': 8199,
 'V10': 9345,
 'V11': 735,
 'V12': 15282,
 'V13': 3362,
 'V14': 14060,
 'V15': 2884,
 'V16': 8180,
 'V17': 7353,
 'V18': 7468,
 'V19': 10150,
 'V20': 27553,
 'V21': 14401,
 'V22': 1298,
 'V23': 18467,
 'V24': 4758,
 'V25': 5333,
 'V26': 5665,
 'V27': 38799,
 'V28': 30094}

#### **Dataset splitting into features (X) and targets (y) datasets**

In [17]:
X = dataset.drop(['Class'], axis=1)
y = dataset['Class']

#### **Finding the most important features by using SelectKBest**

In [18]:
selector = SelectKBest(mutual_info_classif, k='all')
X_new = selector.fit_transform(X, y)

#Finding the most important features
featureScores_df = pd.DataFrame({'Features': X.columns, 'Score': selector.scores_}).sort_values(by='Score', ascending=False)
colorscale=[[0, '#4F89FE'], [.5,'#F3F3F3'], [1, '#F3F3F3']]

#Dataset visualization as a table
fig = ff.create_table(featureScores_df, index=False, colorscale=colorscale)
fig.update_layout(font=dict(size=16))
fig.show()

### **5.  Model training and evaluation**

#### **Splitting the features dataset into train and test datasets**

In [19]:
#I chose a 70-30 distribution: 70% training, 30% testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### **SMOTE Technique for handling the imbalanced dataset**

In [20]:
print("Before SMOTE:")
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

print("After SMOTE:")
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

Before SMOTE:
X_train shape: (198608, 30)
y_train shape: (198608,)
X_test shape: (85118, 30)
y_test shape: (85118,)
After SMOTE:
X_train shape: (396538, 30)
y_train shape: (396538,)
X_test shape: (85118, 30)
y_test shape: (85118,)


#### **Model training functions**

##### **1. Training function with GridSearchCV and StratifiedKFold methods**

In [21]:
def model_with_gridsearch(model, param_grid, name):

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1_macro', cv=cv, n_jobs=-1)
    grid.fit(X_train, y_train)

    print("Model:", name)
    print("Best hyperparameters:", grid.best_params_)
    print("F1-macro score from training:", grid.best_score_)

    y_pred = grid.predict(X_test)
    print("F1-macro score from testing", f1_score(y_test, y_pred, average='macro'))
    return y_pred

##### **2. Training function with neural networks**

In [22]:
def feed_forward_training_with(model, criterion, optimizer, train_loader, test_loader, y_test, X_train_tensor):
    num_epochs = 1000
    for epoch in range(num_epochs):
        model.train()
        for xb, yb in train_loader:
            preds = model(xb)
            loss = criterion(preds, yb)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    model.eval()
    all_preds = []
    with torch.no_grad():
        for xb, yb in test_loader:
            preds = model(xb)
            preds = torch.sigmoid(preds) >= 0.5
            all_preds.extend(preds.numpy())


    all_preds = [item for sublist in all_preds for item in sublist]

    print("Model: Feed Forward Neural Network")
    print("Best hyperparameters:", f1_score(y_train, (torch.sigmoid(model.forward(X_train_tensor)) >= 0.5).float().numpy(), average='macro'))
    print("F1-macro score from test:", f1_score(y_test.to_numpy(), all_preds, average='macro'))
    return all_preds

#### **Model evaluation functions**

##### **1. Confusion matrix function**

In [23]:
def plot_confusion_matrix(y_test, y_pred, name):
  
    matrix = confusion_matrix(y_test, y_pred)
    fig = ff.create_annotated_heatmap(z=matrix[::-1],
                                      x=["Legitimate transactions", "Fraudulent transactions"],
                                      y=["Fraudulent transactions", "Legitimate transactions"],
                                      colorscale=[[0, '#F3F3F3'], [.5,'#F3F3F3'], [1, '#4F89FE']],
                                      showscale=True)
    fig.update_layout(title=name, title_x=0.60, font=dict(size=18))
    return fig

##### **2. Classification report function**

In [24]:
def generate_classification_report(y_test, y_pred):

    report = classification_report(y_test, y_pred, target_names=['Legitimate', 'Fraudulent'], output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    colorscale=[[0, '#4F89FE'], [.5,'#F3F3F3'], [1, '#F3F3F3']]  
    fig = ff.create_table(report_df, index=True, colorscale=colorscale)
    return fig

#### **Logistic regression**

##### **1. Model Training**

In [25]:

warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning) 

# Regresia logistică
lr = LogisticRegression(random_state=42, max_iter=10000)

# Hiperparametrii pentru regresia logistică
param_grid_lr = {
    'solver': ['lbfgs', 'saga'],
    'penalty': ['l2'],
    'C': [100, 10, 1.0, 0.1, 0.01, 0.001]
}

y_pred_lr = model_with_gridsearch(lr, param_grid_lr,"Logistic regression")

Model: Logistic regression
Best hyperparameters: {'C': 1.0, 'penalty': 'l2', 'solver': 'saga'}
F1-macro score from training: 0.8006229259612349
F1-macro score from testing 0.46228723811306305


##### **2. Evaluare**

In [26]:
plot_confusion_matrix(y_test, y_pred_lr, "Logistic regression")

In [27]:
generate_classification_report(y_test, y_pred_lr)

#### **Random Forest Classifier**

##### **1. Model Training**

In [28]:
# Random Forest Classifier
rfc = RandomForestClassifier()

# Random Forest Classifier Hyperparameters
param_grid_rfc = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'max_features': ['sqrt', 'log2']
}

y_pred_rfc = model_with_gridsearch(rfc, param_grid_rfc, "Random Forest Classifier")

Model: Random Forest Classifier
Best hyperparameters: {'max_depth': 30, 'max_features': 'sqrt', 'n_estimators': 300}
F1-macro score from training: 0.9998234723995786
F1-macro score from testing 0.8904602747382045


##### **2. Model evaluation**

In [29]:
plot_confusion_matrix(y_test, y_pred_rfc,'Random Forest Classifier')

In [30]:
generate_classification_report(y_test, y_pred_rfc)

#### **Linear Support Vector Classifier**

##### **1. Model Training**

In [31]:
# Linear Support Vector Classifier
lsvc = LinearSVC(random_state=42, dual=False)

# Linear Support Vector Classifier Hyperparameters
param_grid_lsvc = {
    'C': [0.01, 0.1, 1, 10, 100],
    'max_iter': [1000, 10000, 50000],
}

y_pred_lsvc = model_with_gridsearch(lsvc, param_grid_lsvc, "Linear Support Vector Classifier")

Model: Linear Support Vector Classifier
Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
F1-macro score from training: 0.8001944937845893
F1-macro score from testing 0.4653413510812953


##### **2. Model Evaluation**

In [32]:
plot_confusion_matrix(y_test, y_pred_lsvc, 'Linear Support Vector Classifier')

In [33]:
generate_classification_report(y_test, y_pred_lsvc)

#### **XGBoost Classifier**

##### **1. Model training**

In [34]:
# XGBoost Classifier
xgb = XGBClassifier(random_state=42)

# XGBoost Classifier Hyperparameters
param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'gamma': [0, 1]
}

y_pred_xgb =model_with_gridsearch(xgb, param_grid_xgb, "XGBoost Classifier")

Model: XGBoost Classifier
Best hyperparameters: {'gamma': 0, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300, 'subsample': 1.0}
F1-macro score from training: 0.9998007758195893
F1-macro score from testing 0.8921429631694611


##### **2. Model Evaluation**

In [35]:
plot_confusion_matrix(y_test, y_pred_xgb, 'XGBoost Classifier')

In [36]:
generate_classification_report(y_test, y_pred_xgb)

#### **Feed Forward Neural Network**

##### **1. Model Training**

In [37]:
class FeedForwardNeuralNetwork(nn.Module):
    def __init__(self, input_size):
        super(FeedForwardNeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, 30)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(30, 15)
        self.relu = nn.ReLU()
        self.fc3 = nn.Linear(15, 1)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

# Converting the datasets into tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

# Adapting training and test sets for neural networks
train_data = TensorDataset(X_train_tensor, y_train_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_data, batch_size=4096, shuffle=True, num_workers=4)
test_loader = DataLoader(test_data, batch_size=4096, shuffle=False, num_workers=4)

# Initializing the model, Adam optimizer and loss function
model = FeedForwardNeuralNetwork(input_size=X_train.shape[1])
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Using BCEWithLogitsLoss, recommended for binary classification
criterion = nn.BCEWithLogitsLoss()

all_preds = feed_forward_training_with(model, criterion, optimizer, train_loader, test_loader, y_test, X_train_tensor)

Model: Feed Forward Neural Network
Best hyperparameters: 0.9999974781735924
F1-macro score from test: 0.7665881229032747


##### **2. Model Evaluation**

In [38]:
plot_confusion_matrix(y_test, all_preds, 'Feed Forward Neural Network')

In [39]:
generate_classification_report(y_test, all_preds)