In [None]:
# Step 1: Collect Real Fraud Data
# Use a publicly available fraud detection dataset. 
# For this example, we’ll use the Credit Card Fraud Detection Dataset from Kaggle.
import pandas as pd

# Load the dataset. The path is relative, so the CSV must sit in the
# notebook's working directory.
DATA_PATH = "creditcard.csv"
data = pd.read_csv(DATA_PATH)

# Fail early with a clear message if this is not the expected file: the later
# steps all rely on the "Class" label column (1 = fraud).
if "Class" not in data.columns:
    raise ValueError(f"{DATA_PATH} has no 'Class' column; expected the credit card fraud dataset.")

# Rich display (last expression of the cell) instead of print(): the plain-text
# repr of a wide frame wraps its columns into several hard-to-read chunks.
data.head()

In [3]:
# Step 2: Preprocess the Data
# Preprocess the data by normalizing it and splitting it into features and labels.

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

# Separate features and labels
X = data.drop("Class", axis=1)
y = data["Class"]

# Split into training and testing sets BEFORE scaling, so that no statistic of
# the held-out rows (mean / std) leaks into the training pipeline.
# stratify=y keeps the fraud rate identical in both splits, which matters
# when the positive class is rare.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Normalize the features: fit the scaler on the training rows only, then apply
# the same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with any cell that still reads X_scaled:
# the full feature matrix under the train-fitted scaling.
X_scaled = scaler.transform(X)

In [None]:
# Step 3: Train a GAN for Synthetic Fraud Data Generation
# Train a GAN to generate synthetic fraud data.

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, LeakyReLU, BatchNormalization, Input
from tensorflow.keras.models import Model, Sequential

# Define the GAN generator
def build_generator(latent_dim=100, n_features=None):
    """Build the GAN generator: latent noise vector -> synthetic feature row.

    Args:
        latent_dim: Size of the input noise vector. Defaults to 100, the value
            the rest of the notebook samples noise with.
        n_features: Width of the generated row. Defaults to the number of
            columns of the module-level ``X_train``.

    Returns:
        An uncompiled Keras ``Sequential`` model (it is trained only through
        the stacked generator+discriminator model).
    """
    if n_features is None:
        n_features = X_train.shape[1]

    model = Sequential([
        Dense(128, input_dim=latent_dim),
        LeakyReLU(alpha=0.2),
        BatchNormalization(),
        Dense(256),
        LeakyReLU(alpha=0.2),
        BatchNormalization(),
        # Linear output: the real rows are standardized (zero mean, unit
        # variance) and therefore unbounded. A tanh output would clamp every
        # generated feature to [-1, 1] and could never reproduce values
        # further than one standard deviation from the mean.
        Dense(n_features, activation='linear'),
    ])
    return model

# Define the GAN discriminator
def build_discriminator():
    """Build the GAN discriminator: feature row -> probability of being real.

    The input width is taken from the module-level ``X_train``. The model is
    returned uncompiled; the caller chooses loss and optimizer.
    """
    n_features = X_train.shape[1]

    disc = Sequential()
    # Two hidden blocks, each a dense layer followed by a leaky ReLU.
    disc.add(Dense(128, input_dim=n_features))
    disc.add(LeakyReLU(alpha=0.2))
    disc.add(Dense(256))
    disc.add(LeakyReLU(alpha=0.2))
    # Single sigmoid unit: binary classification (real/fake).
    disc.add(Dense(1, activation='sigmoid'))
    return disc

# Combine the GAN
# Seed both libraries before any weights are created so a Restart & Run All
# reproduces the same initialization and the same noise / batch draws.
np.random.seed(42)
tf.random.set_seed(42)

latent_dim = 100  # size of the noise vector; matches build_generator's default input

generator = build_generator()
discriminator = build_discriminator()

# Compile the discriminator while it is still trainable, so its own
# train_on_batch calls update its weights ...
discriminator.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(0.0002, 0.5))
# ... then freeze it before building the stacked model, so that training the
# stacked model only updates the generator.
discriminator.trainable = False

gan_input = Input(shape=(latent_dim,))
gan_output = discriminator(generator(gan_input))
gan = Model(gan_input, gan_output)
gan.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(0.0002, 0.5))

# The generator's output is later labelled as fraud, so the "real" examples
# shown to the discriminator must be fraud rows only. Sampling from the whole
# training set would teach the GAN the overall distribution instead.
fraud_train = np.asarray(X_train)[np.asarray(y_train) == 1]
if len(fraud_train) == 0:
    raise ValueError("No fraud rows (Class == 1) in the training split; cannot train the GAN.")

# Train the GAN
epochs = 10000
batch_size = 32
real_labels = np.ones((batch_size, 1))   # loop-invariant targets, built once
fake_labels = np.zeros((batch_size, 1))
for epoch in range(epochs):
    noise = np.random.normal(0, 1, (batch_size, latent_dim))
    # verbose=0: otherwise predict() prints a progress bar on every iteration.
    fake_data = generator.predict(noise, verbose=0)
    real_data = fraud_train[np.random.randint(0, fraud_train.shape[0], batch_size)]

    # Discriminator step: real fraud rows -> 1, generated rows -> 0.
    discriminator_loss_real = discriminator.train_on_batch(real_data, real_labels)
    discriminator_loss_fake = discriminator.train_on_batch(fake_data, fake_labels)
    discriminator_loss = 0.5 * np.add(discriminator_loss_real, discriminator_loss_fake)

    # Generator step: push the (frozen) discriminator to call generated rows real.
    generator_loss = gan.train_on_batch(noise, real_labels)

    if epoch % 1000 == 0:
        print(f"Epoch {epoch}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}")

In [None]:
# Step 4: Combine Real and Synthetic Data
# Generate synthetic fraud data and combine it with the real dataset.
# Generate synthetic fraud data
# Generate only as many rows as are needed to bring the fraud class up to
# parity with the non-fraud class. Generating len(X_train) rows would make
# fraud the majority class of the augmented set.
y_train_labels = np.asarray(y_train).astype(int)
n_fraud = int((y_train_labels == 1).sum())
n_non_fraud = len(y_train_labels) - n_fraud
n_synthetic = max(n_non_fraud - n_fraud, 0)

# Read the noise size from the generator itself rather than repeating a literal.
latent_dim = generator.input_shape[-1]

if n_synthetic > 0:
    noise = np.random.normal(0, 1, (n_synthetic, latent_dim))
    synthetic_fraud_data = generator.predict(noise, verbose=0)
else:
    # Already balanced: nothing to add, but keep the array shape stackable.
    synthetic_fraud_data = np.empty((0, np.asarray(X_train).shape[1]))

# Combine real and synthetic data
X_augmented = np.vstack([X_train, synthetic_fraud_data])
# Label synthetic data as fraud (1). Integer ones keep the label dtype integral;
# np.ones() without a dtype would turn every label into a float.
y_augmented = np.hstack([y_train_labels, np.ones(len(synthetic_fraud_data), dtype=int)])

assert len(X_augmented) == len(y_augmented)

In [None]:
# Step 5: Train a Fraud Detection Model
# Train a machine learning model (e.g., Random Forest) on the augmented dataset.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Fit a Random Forest on the augmented (real + synthetic) training rows.
# fit() returns the estimator itself, so construction and training chain.
model = RandomForestClassifier(random_state=42).fit(X_augmented, y_augmented)

# Score the model on the test split and print both text summaries.
y_pred = model.predict(X_test)
for heading, metric in (
    ("Classification Report:", classification_report),
    ("Confusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))

In [None]:
# Step 6: Evaluate the Model
# Analyze the model’s performance using metrics like precision, recall, and F1-score.

from sklearn.metrics import confusion_matrix, precision_recall_curve
import seaborn as sns

# pyplot is used throughout this cell but was never imported anywhere in the
# notebook, so on a fresh kernel the first plt call raised NameError.
import matplotlib.pyplot as plt

# Confusion Matrix
class_names = ["Non-Fraud", "Fraud"]
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

# Precision-Recall Curve
# Column 1 of predict_proba is the probability of the second entry of
# model.classes_, i.e. the fraud class for 0/1 labels.
fraud_scores = model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, fraud_scores)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, marker=".")
ax.set_title("Precision-Recall Curve")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
plt.show()

In [None]:
# Use SHAP (SHapley Additive exPlanations) or feature importance plots to explain which features contribute most to fraud detection.
import shap

# Explain a fixed random subsample of the test split rather than every row:
# computing SHAP values for a whole forest over the full test set is very slow,
# and a summary plot does not need every row.
X_test_rows = np.asarray(X_test)
n_explain = min(1000, len(X_test_rows))
sample_idx = np.random.default_rng(42).choice(len(X_test_rows), size=n_explain, replace=False)
X_explain = X_test_rows[sample_idx]

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_explain)

# For a classifier, shap returns one set of values per class. Depending on the
# installed shap version this is a list of per-class arrays or a single
# (rows, features, classes) array -- NOTE(review): confirm against the
# installed version. Keep only the fraud class (index 1) so the summary plot
# shows what drives a fraud prediction.
if isinstance(shap_values, list):
    fraud_shap_values = shap_values[1]
elif np.ndim(shap_values) == 3:
    fraud_shap_values = shap_values[:, :, 1]
else:
    fraud_shap_values = shap_values

# Derive feature names by dropping the label column by name, instead of
# assuming "Class" is the last column of the frame.
feature_names = list(data.columns.drop("Class"))
shap.summary_plot(fraud_shap_values, X_explain, feature_names=feature_names)