# Credit Card Fraud Detection using Anomaly Detection

Predict fraud transactions using Isolation Forest Algorithm.

## 1. Import the libraries

Import the required libraries.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, average_precision_score, precision_recall_curve
from sklearn.metrics import fbeta_score, roc_auc_score, roc_curve

## 2. Load the Dataset

Download the dataset from Kaggle.

In [None]:
# Read the credit card transactions CSV from the Kaggle input directory
# and preview the first rows to sanity-check the columns.
dataset_path = '/kaggle/input/creditcardfraud/creditcard.csv'
data = pd.read_csv(dataset_path)
data.head()

## 3. Preprocess the Data

Normalise the **Amount** column and drop the **Time** column because it is irrelevant to fraud detection.

In [None]:
# Every column except 'Amount' is already normalised, so only 'Amount' is
# standardised here (zero mean, unit variance).
scaler = StandardScaler()
amount_column = data[['Amount']]  # 2-D selection, as the scaler expects
data['Amount'] = scaler.fit_transform(amount_column)

# 'Time' is not used as a feature.
data = data.drop(columns='Time')

## 4. Train - Test Split

Split the dataset into training (80%) and testing (20%) sets.

In [None]:
# Features: every column except the label.
x_data = data.drop(columns='Class')

# Labels re-encoded to the IsolationForest convention:
# fraud (Class == 1) -> -1 (anomaly), everything else -> 1 (inlier).
is_fraud = data['Class'] == 1
y_data = np.where(is_fraud, -1, 1)

In [None]:
# Hold out 20% of the rows for evaluation. The label is heavily imbalanced,
# so the split is stratified on it to keep the same fraud ratio in the
# training and test partitions; random_state pins the shuffle so that
# re-running the notebook yields the same split.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=42,
    stratify=y_data,
)

## 5. Train the Model

**IsolationForest** is an unsupervised algorithm that isolates anomalies by randomly partitioning data.

* **contamination=0.0035**: Tells the model to expect a fraud rate of 0.35%.

In [None]:
# Unsupervised fit: only the features are passed, never the labels.
# contamination is the expected share of anomalies and determines the
# threshold predict() uses to flag a sample. random_state fixes the random
# tree construction so predictions and metrics are reproducible across runs.
model = IsolationForest(contamination=0.0035, random_state=42)
model.fit(x_train)

## 6. Make Predictions

Make the predictions and map them to binary labels.

In [None]:
# IsolationForest.predict returns -1 for anomalies and 1 for inliers.
y_predicted = model.predict(x_test)

# Map predictions: anomaly -> 1 and normal -> 0
y_predicted = np.where(y_predicted == -1, 1, 0)

# Map the ground truth the same way, but only while it is still in the
# -1/1 encoding. y_test is rebound in place, so re-running this cell without
# the guard would turn every label into 0 and break the metrics below.
if np.isin(y_test, (-1, 1)).all():
    y_test = np.where(y_test == -1, 1, 0)

## 7. Evaluate the model

### a. Confusion Matrix

Shows True Positives (TP), False Positives (FP), True Negatives (TN), and False Negatives (FN).

In [None]:
# Rows are the true classes and columns the predicted classes (0 = normal, 1 = fraud).
conf_matrix = confusion_matrix(y_test, y_predicted)
print(conf_matrix)

### b. Classification Report

Includes precision, recall, and F1-score.

* **Precision**: When the model says something is positive, how often is it actually correct?
$$
\text{Precision} = \frac{TP}{TP + FP}
$$

* **Recall**: How many of the actual positives did the model correctly find?
$$
\text{Recall} = \frac{TP}{TP + FN}
$$

* **F1-Score**: Can the model balance precision and recall, or is it sacrificing one for the other?
$$
F_1 = 2 \cdot \frac{\text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}
$$

* **Support**: How many real examples of each class are in the dataset?

In [None]:
# Per-class precision, recall, F1-score and support as a text table.
report = classification_report(y_test, y_predicted)
print(report)

### c. Number of Predictions of each class

Plot the number of predictions of each class made by the model.

In [None]:
# Bar chart of how many test samples were predicted as each class.
ax = sns.countplot(x=y_predicted)
ax.set_title('Predicted Normal vs. Fraud')
ax.set_xlabel('Predicted Label (0 = Normal, 1 = Fraud)')
plt.show()

### d. Anomaly Scores

Lower values from **decision_function** indicate higher anomaly likelihood. We invert the sign so that higher scores mean "more anomalous", for compatibility with the other metrics.

In [None]:
# decision_function gives lower values to more anomalous samples; flip the
# sign so that a higher score means "more likely fraud".
raw_scores = model.decision_function(x_test)
scores = -raw_scores
print(f"Anomalous Scores: {scores}")

### e. Average Precision Score

Summarizes the precision-recall curve, suitable for imbalanced data.

In [None]:
# Average precision computed from the continuous anomaly scores
# (not the thresholded 0/1 predictions).
ap_score = average_precision_score(y_true=y_test, y_score=scores)
print(f"Average Precision Score: {ap_score}")

### f. Precision - Recall Curve

Shows the trade-off between precision and recall. A high area under the curve (AP) indicates good performance.

In [None]:
# Precision/recall pairs over all score thresholds; the thresholds themselves
# are not needed for the plot.
precision, recall, _ = precision_recall_curve(y_test, scores)

fig, ax = plt.subplots()
ax.plot(recall, precision)
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
plt.show()

### g. F2-Score

Emphasizes recall over precision (weighted by beta=2), critical in fraud detection where missing fraud is costlier than false alerts.

In [None]:
# F-beta with beta=2 weights recall more heavily than precision; it is
# computed on the thresholded 0/1 predictions.
f2 = fbeta_score(y_true=y_test, y_pred=y_predicted, beta=2)
print(f"F2-Score: {f2}")

### h. ROC - AUC Score

Measures the model’s ability to distinguish between classes. Less informative than AP for imbalanced data but still useful.

In [None]:
# Area under the ROC curve, computed from the continuous anomaly scores.
roc_score = roc_auc_score(y_true=y_test, y_score=scores)
print(f"ROC Score: {roc_score}")

In [None]:
# False/true positive rates over all score thresholds; the thresholds
# themselves are not needed for the plot.
fpr, tpr, _ = roc_curve(y_test, scores)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
plt.show()