# **Credit Card Fraud Detection System**

## **Cuprins**

1. Introduction
2. Data cleaning
3. Data analysis
4. Data preprocessing for training
5. Models training and evaluation
6. Conclusions

### **1. Introduction**

#### **Requested Modules**

In [1]:
!pip install imblearn==0.0
!pip install plotly
!pip install xgboost
!pip install ipywidgets
!pip install --upgrade pip

# Manipularea și gestionarea datelor
import numpy as np
import pandas as pd
from datetime import datetime

# Data Visualization Models
import plotly.graph_objects as go
import plotly.graph_objs as go
from plotly.offline import iplot
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import iplot

# Machine Learning Modules
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV, HalvingGridSearchCV
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, mutual_info_classif
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

Collecting imblearn==0.0
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.4/238.4 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting sklearn-compat<1,>=0.1
  Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.13.0 imblearn-0.0 sklearn-compat-0.1.3
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting plotly
  Downloading plotly-5.24.1-py3-none-any.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m00:01

#### **Data Reading**

In [2]:
dataset = pd.read_csv("/kaggle/input/credit-card-fraud/card_transdata.csv")
dataset.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


### **2. Data cleaning**

#### **Data Shape Check**

In [3]:
print("Dataset Shape:", dataset.shape)

Dataset Shape: (1000000, 8)


#### **Columns data type check**

In [4]:
print("\nData columns types before conversion:")
dataset.dtypes


Data columns types before conversion:


distance_from_home                float64
distance_from_last_transaction    float64
ratio_to_median_purchase_price    float64
repeat_retailer                   float64
used_chip                         float64
used_pin_number                   float64
online_order                      float64
fraud                             float64
dtype: object

#### **Finding and handling NULL values**

In [5]:
print("Null Values:")
dataset.isnull().sum()

Null Values:


distance_from_home                0
distance_from_last_transaction    0
ratio_to_median_purchase_price    0
repeat_retailer                   0
used_chip                         0
used_pin_number                   0
online_order                      0
fraud                             0
dtype: int64

#### **Identifying and Managing Duplicated Records**

In [6]:
if  dataset.duplicated().sum() > 0:
  print("Before: ", dataset.duplicated().sum())
  dataset = dataset.drop_duplicates()
  print("After: ", dataset.duplicated().sum())
else:
  print("No duplicated records found.")

No duplicated records found.


### **3. Data Analysis**

#### **Pie Chart Function**

In [7]:
def pie_chart(dataset, feature_name, title):
    fig_fraudulent = go.Figure()
    if feature_name in ['Class', 'fraud']:
        transactions = dataset[feature].value_counts()
    else:
        fraudulent = dataset.loc[dataset['fraud'] == 1, feature_name].value_counts().sort_index()
        legitimate = dataset.loc[dataset['fraud'] == 0, feature_name].value_counts().sort_index()
        transactions = pd.concat([fraudulent, legitimate], axis=1, keys=['Fraudulent', 'Legitimate'])
    fig_fraudulent.add_trace(go.Pie(labels=["Legitimate", "Fraudulent"], values=fraudulent.values, name="Fraudulent", marker_colors=['#4F89FE', '#FFBC35'], pull=[0.1, 0], textinfo='percent+label', textposition='outside', textfont={'color': 'black', 'size': 24}))
    fig_fraudulent.update_layout(title_text=title, showlegend=False, title_x=0.5,title_y=0.99, font_size=20)
    fig_fraudulent.update_xaxes(title_text=feature_name, dtick=1, range=[-0.5,23.5])
    fig_fraudulent.update_yaxes(title_text="Transaction Number")
    fig_fraudulent.show()

#### **Bar Chart Function**

In [8]:
def bar_chart(dataset, feature, feature_name, title):
    
    fig_fraudulent = go.Figure()
    fraudulent = dataset[feature].value_counts()
    fig_fraudulent.add_trace(go.Bar(x=fraudulent.index, y=fraudulent.values, name="Fraudulent", marker_color="red"))
    fig_fraudulent.update_layout(title_text=title, showlegend=False, title_x=0.5, height=600, width=1000, font=dict(size=16))
    fig_fraudulent.update_xaxes(title_text=feature_name, dtick=1, range=[-0.5,23.5])
    fig_fraudulent.update_yaxes(title_text="Transaction number")
    fig_fraudulent.show()

#### **Histogram Function**

In [9]:
def histogram_chart(dataset, feature, feature_name, title):

    trace_amount_fraud = go.Histogram(x=dataset[feature], marker=dict(color='red'), xbins=dict(start=0, end=2500, size=50))
    layout_amount_fraud = go.Layout(title=title, xaxis=dict(title=feature_name), yaxis=dict(title='Transaction n'), bargap=0.1, title_x=0.5, height=600, width=900, font=dict(size=16))
    fig_amount_fraud = go.Figure(data=[trace_amount_fraud], layout=layout_amount_fraud)

    iplot(fig_amount_fraud)

#### **Graphical visualization of meaningful data**

In [10]:
pie_chart(dataset, 'repeat_retailer', 'Repeat Retailer')

In [11]:
pie_chart(dataset, 'used_chip', 'Used chip')

In [12]:
pie_chart(dataset, 'used_pin_number', 'Used PIN code')

In [13]:
pie_chart(dataset, 'online_order', 'Online Transaction')

#### **Class distribution for credit card transactions**

In [14]:
#pie_chart(dataset, 'fraud', 'Class distribution of fraudulent and legitimate transactions')

### **4. Data Preprocessing**

#### **Data Scaling by using RobustScaler**

##### **Data Scaling Function**

In [15]:
def data_scaling(data, feature):
    return RobustScaler().fit_transform(data[[feature]])

##### **Data Scaling**

In [16]:
dataset['Scaled distance_from_home'] = data_scaling(dataset, 'distance_from_home')
dataset['Scaled distance_from_last_transaction'] = data_scaling(dataset, 'distance_from_last_transaction')
dataset['Scaled ratio_to_median_purchase_price'] = data_scaling(dataset, 'ratio_to_median_purchase_price')
dataset = dataset.drop(['distance_from_home', 'distance_from_last_transaction', 'ratio_to_median_purchase_price'], axis=1)
dataset.head()

Unnamed: 0,repeat_retailer,used_chip,used_pin_number,online_order,fraud,Scaled distance_from_home,Scaled distance_from_last_transaction,Scaled ratio_to_median_purchase_price
0,1.0,1.0,0.0,0.0,0.0,2.19108,-0.224744,0.585071
1,1.0,0.0,0.0,0.0,0.0,0.03943,-0.269055,0.182947
2,1.0,0.0,0.0,1.0,0.0,-0.223026,-0.063254,-0.351702
3,1.0,1.0,0.0,1.0,0.0,-0.353069,1.504177,-0.39184
4,1.0,1.0,0.0,1.0,0.0,1.565134,-0.141273,0.755879


#### **Handling Outliers**

##### **Outlier detection function**

In [17]:
def detect_replace_outliers(df, column_names):
    outliers = {}
    for column in column_names:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        outliers_count = len(df[(df[column] < lower_bound) | (df[column] > upper_bound)])
        outliers[column] = outliers_count

        median = df[column].median()
        df[column] = np.where((df[column] < lower_bound) | (df[column] > upper_bound), median, df[column])

    return df, outliers

##### **Finding and handling outliers**

In [18]:
# Specifying columns for verification
columns_to_check = ['Scaled distance_from_home', 'Scaled distance_from_last_transaction', 'Scaled ratio_to_median_purchase_price']

# Replacing outliers with median value
dataset, detected_outliers = detect_replace_outliers(dataset, columns_to_check)

print("The number of detected outliers:")
detected_outliers

The number of detected outliers:


{'Scaled distance_from_home': 103631,
 'Scaled distance_from_last_transaction': 124367,
 'Scaled ratio_to_median_purchase_price': 84386}

#### **Dataset splitting into features (X) and targets (y) datasets**

In [19]:
X = dataset.drop(['fraud'], axis=1)
y = dataset['fraud']

#### **Finding the most important features by using SelectKBest**

In [20]:
selector = SelectKBest(mutual_info_classif, k='all')
X_new = selector.fit_transform(X, y)

#Finding the most important features
featureScores_df = pd.DataFrame({'Features': X.columns, 'Score': selector.scores_}).sort_values(by='Score', ascending=False)
colorscale=[[0, '#4F89FE'], [.5,'#F3F3F3'], [1, '#F3F3F3']]

#Dataset visualization as a table
fig = ff.create_table(featureScores_df, index=False, colorscale=colorscale)
fig.update_layout(font=dict(size=16))
fig.show()

### **5. Model training and evaluation**

#### **Splitting the features dataset into train and test datasets**

In [21]:
#I chose a 70-30 distribution: 70% training, 30% testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### **SMOTE Technique for handling the imbalanced dataset**

In [22]:
print("Before SMOTE:")
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

print("After SMOTE:")
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

Before SMOTE:
X_train shape: (700000, 7)
y_train shape: (700000,)
X_test shape: (300000, 7)
y_test shape: (300000,)
After SMOTE:
X_train shape: (1277452, 7)
y_train shape: (1277452,)
X_test shape: (300000, 7)
y_test shape: (300000,)


#### **Model training functions**

##### **1. Training function with GridSearchCV and StratifiedKFold methods**

In [23]:
def model_with_gridsearch(model, param_grid, name):

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1_macro', cv=cv, n_jobs=-1)
    grid.fit(X_train, y_train)

    print("Model:", name)
    print("Best hyperparameters:", grid.best_params_)
    print("F1-macro score from training:", grid.best_score_)

    y_pred = grid.predict(X_test)
    print("F1-macro score from testing", f1_score(y_test, y_pred, average='macro'))
    return y_pred

##### **2. Training function with neural networks**

In [24]:
def feed_forward_training_with(model, criterion, optimizer, train_loader, test_loader, y_test, X_train_tensor):
    num_epochs = 200
    for epoch in range(num_epochs):
        model.train()
        for xb, yb in train_loader:
            preds = model(xb)
            loss = criterion(preds, yb)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    model.eval()
    all_preds = []
    with torch.no_grad():
        for xb, yb in test_loader:
            preds = model(xb)
            preds = torch.sigmoid(preds) >= 0.5
            all_preds.extend(preds.numpy())


    all_preds = [item for sublist in all_preds for item in sublist]

    print("Model: Feed Forward Neural Network")
    print("Best hyperparameters:", f1_score(y_train, (torch.sigmoid(model.forward(X_train_tensor)) >= 0.5).float().numpy(), average='macro'))
    print("F1-macro score from test:", f1_score(y_test.to_numpy(), all_preds, average='macro'))
    return all_preds

#### **Model evaluation functions**

##### **1. Confusion matrix function**

In [25]:
def plot_confusion_matrix(y_test, y_pred, name):
  
    matrix = confusion_matrix(y_test, y_pred)
    fig = ff.create_annotated_heatmap(z=matrix[::-1],
                                      x=["Legitimate transactions", "Fraudulent transactions"],
                                      y=["Fraudulent transactions", "Legitimate transactions"],
                                      colorscale=[[0, '#F3F3F3'], [.5,'#F3F3F3'], [1, '#4F89FE']],
                                      showscale=True)
    fig.update_layout(title=name, title_x=0.60, font=dict(size=18))
    return fig


##### **2. Classification report function**

In [26]:
def generate_classification_report(y_test, y_pred):

    report = classification_report(y_test, y_pred, target_names=['Legitimate', 'Fraudulent'], output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    colorscale=[[0, '#4F89FE'], [.5,'#F3F3F3'], [1, '#F3F3F3']]  
    fig = ff.create_table(report_df, index=True, colorscale=colorscale)
    return fig


#### **Logistic Regression Model**

##### **1. Model Training**

In [27]:

warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning) 

# Logistic Regression
lr = LogisticRegression(random_state=42, max_iter=10000)

# Logistic Regression Hyperparameters
param_grid_lr = {
    'solver': ['lbfgs', 'saga'],
    'penalty': ['l2'],
    'C': [100, 10, 1.0, 0.1, 0.01, 0.001]
}

y_pred_lr = model_with_gridsearch(lr, param_grid_lr,"Logistic Regression")


Model: Logistic Regression
Best hyperparameters: {'C': 0.001, 'penalty': 'l2', 'solver': 'lbfgs'}
F1-macro score from training: 0.7066703894234629
F1-macro score from testing 0.47875902209502863


##### **2. Model Evaluation**

In [28]:
plot_confusion_matrix(y_test, y_pred_lr, "Logistic Regression")

In [29]:
generate_classification_report(y_test, y_pred_lr)

#### **Random Forest Classifier**

##### **1. Model Training**

In [30]:
# Random Forest Classifier
rfc = RandomForestClassifier()

# Random Forest Classifier Hyperparameters
param_grid_rfc = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'max_features': ['sqrt', 'log2']
}

y_pred_rfc = model_with_gridsearch(rfc, param_grid_rfc, "Random Forest Classifier")

Model: Random Forest Classifier
Best hyperparameters: {'max_depth': 30, 'max_features': 'log2', 'n_estimators': 300}
F1-macro score from training: 0.9737983935594554
F1-macro score from testing 0.8903407049789605


##### **2. Model Evaluation**

In [31]:
plot_confusion_matrix(y_test, y_pred_rfc,'Random Forest Classifier')

In [32]:
generate_classification_report(y_test, y_pred_rfc)

#### **Linear Support Vector Classifier**

##### **1. Model Training**

In [33]:
# Linear Support Vector Classifier
lsvc = LinearSVC(random_state=42, dual=False)

# Linear Support Vector Classifier Hyperparameters
param_grid_lsvc = {
    'C': [0.01, 0.1, 1, 10, 100],
    'max_iter': [1000, 10000, 50000],
}

y_pred_lsvc = model_with_gridsearch(lsvc, param_grid_lsvc, "Linear Support Vector Classifier")

Model: Linear Support Vector Classifier
Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
F1-macro score from training: 0.694624066516194
F1-macro score from testing 0.45772951563104114


##### **2. Model Evaluation**

In [34]:
plot_confusion_matrix(y_test, y_pred_lsvc, 'Linear Support Vector Classifier')

In [35]:
generate_classification_report(y_test, y_pred_lsvc)

#### **XGBoost Classifier**

##### **1. Model Training**

In [36]:
# XGBoost Classifier
xgb = XGBClassifier(random_state=42)

# XGBoost Hyperparameters
param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'gamma': [0, 1]
}

y_pred_xgb =model_with_gridsearch(xgb, param_grid_xgb, "XGBoost Classifier")

Model: XGBoost Classifier
Best hyperparameters: {'gamma': 0, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.8}
F1-macro score from training: 0.970796695135042
F1-macro score from testing 0.8814124738065188


##### **2. Model Evaluation**

In [37]:
plot_confusion_matrix(y_test, y_pred_xgb, 'XGBoost Classifier')

In [38]:
generate_classification_report(y_test, y_pred_xgb)

#### **Feed Forward Neural Network**

##### **1. Model Training**

In [39]:
class FeedForwardNeuralNetwork(nn.Module):
    def __init__(self, input_size):
        super(FeedForwardNeuralNetwork, self).__init__()
        # La intrare s-a adăugat numărul de caracteristici din setul de date
        self.fc1 = nn.Linear(input_size, 7)
        self.relu = nn.ReLU()
        # Doar un strat ascuns
        self.fc2 = nn.Linear(7, 4)
        self.relu = nn.ReLU()
        self.fc3 = nn.Linear(4, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

# Datasets conversion to tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

# Adapting training and test sets for neural networks
train_data = TensorDataset(X_train_tensor, y_train_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_data, batch_size=4096, shuffle=True, num_workers=4)
test_loader = DataLoader(test_data, batch_size=4096, shuffle=False, num_workers=4)

# Initializing the model, Adam optimizer and loss function
model = FeedForwardNeuralNetwork(input_size=X_train.shape[1])
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Using BCEWithLogitsLoss, recommended for binary classification
criterion = nn.BCEWithLogitsLoss()

all_preds = feed_forward_training_with(model, criterion, optimizer, train_loader, test_loader, y_test, X_train_tensor)

Model: Feed Forward Neural Network
Best hyperparameters: 0.9398897172478695
F1-macro score from test: 0.8490921966266103


##### **2. Model Evaluation**

In [40]:
plot_confusion_matrix(y_test, all_preds, 'Feed Forward Neural Network')

In [41]:
generate_classification_report(y_test, all_preds)