# Credit Card Fraud Detection

## Importing Required Libraries

We will first import the necessary libraries for data processing, visualization, and model building.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

## Loading the Dataset

We load the dataset and explore its structure.

In [3]:
data = pd.read_csv("creditcard.csv")  

In [4]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [6]:
data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.01448,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.05508,2.03503,-0.738589,0.868229,1.058415,0.02433,0.294869,0.5848,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.24964,-0.557828,2.630515,3.03126,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.24044,0.530483,0.70251,0.689799,-0.377961,0.623708,-0.68618,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.0,0
284806,172792.0,-0.533413,-0.189733,0.703337,-0.506271,-0.012546,-0.649617,1.577006,-0.41465,0.48618,...,0.261057,0.643078,0.376777,0.008797,-0.473649,-0.818267,-0.002415,0.013649,217.0,0


In [7]:
# Checking for null and duplicate values

data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [8]:
data['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [9]:
data.duplicated().sum()

1081

In [10]:
# Data Preprocessing

X = data.drop(columns=['Class'])
y = data['Class']

In [11]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Undersampling 
As data is highly imbalanced so there is a need to perform undersampling this will increase the model accuracy. 

In [12]:
fraud = data[data["Class"] == 1].index
non_fraud = data[data["Class"] == 0].index

In [13]:
non_fraud_sample = np.random.choice(non_fraud, size=len(fraud), replace=False)
undersampled = np.concatenate([fraud, non_fraud_sample])

In [14]:
data_undersampled = data.loc[undersampled]

In [15]:
data_undersampled.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
170999,120449.0,2.06397,0.006309,-1.044578,0.410669,-0.081738,-1.201912,0.233628,-0.377167,0.429602,...,-0.280769,-0.636703,0.333282,0.066412,-0.274342,0.193013,-0.06574,-0.058424,0.89,0
254139,156598.0,-1.336972,0.586122,1.731874,-0.688714,-0.268617,-0.588559,0.241238,0.415884,-0.353379,...,-0.106474,-0.60964,-0.06713,0.065232,0.232045,0.387722,-0.139675,-0.050755,55.0,0
133549,80432.0,1.295106,0.211234,0.297101,0.507506,-0.269583,-0.716426,0.027159,-0.153502,0.133071,...,-0.294286,-0.865299,0.079174,-0.129673,0.271893,0.126683,-0.032036,0.013202,1.79,0
230592,146399.0,2.052455,-1.263379,-0.699983,-0.926914,-1.041312,-0.248748,-1.092513,-0.00966,0.217632,...,0.40884,1.004517,0.076417,0.498257,-0.188224,-0.088993,-0.006512,-0.029206,84.0,0
95317,65252.0,-0.139181,1.073295,0.98132,0.258572,0.515011,-0.398194,0.674585,-0.115165,-0.632995,...,-0.2062,-0.559388,-0.060814,-0.475305,-0.601959,0.137824,0.140199,0.167594,1.98,0


In [16]:
X_under = data_undersampled.drop(columns=["Class"])
y_under = data_undersampled["Class"]
X_under_scaled = scaler.fit_transform(X_under)

## Test-Train Split

In [17]:
# Split Dataset
X_train, X_test, y_train, y_test = train_test_split(X_under_scaled, y_under, test_size=0.2, random_state=42, stratify=y_under)

## Model Training

Here we will train our model using Logistic Regression and Random Forest Classifier

In [18]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Random Forest": RandomForestClassifier()
}

## Training and Evaluating Multiple Models

We iterate over a dictionary of models and train each one on the training set at the end will evaluate its performance using key classification metrics:

- **Accuracy**:  Measures overall correctness.
- **Precision**: Measures how many predicted fraud cases were actually fraud.
- **Recall**:    Measures how many actual fraud cases were correctly identified.
- **F1 Score**:  Harmonic mean of Precision and Recall, balancing both.


In [19]:
# Looping through the dictionary of models
for name, model in models.items():
    print(f"\n------- {name} -------")
    
# Train model on model training dataset and make predictions on test set    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
#  Model Evaluation
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
    


------- Logistic Regression -------
Accuracy: 0.9594
Precision: 0.9688
Recall: 0.9490
F1 Score: 0.9588

------- Random Forest -------
Accuracy: 0.9594
Precision: 0.9688
Recall: 0.9490
F1 Score: 0.9588
