#### MLP Diffusion Model with CTGAN in the pipeline

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata

# Read the transactions table from disk
df = pd.read_csv('creditcard.csv')

# Show how many rows fall into each class before any augmentation
sns.countplot(data=df, x='Class')
plt.title("Original Dataset Class Distribution")
plt.show()

# Separate the label column from the feature columns
y = df['Class']
X = df.drop(columns=['Class'])

# Standardise every feature column: subtract its mean, divide by its std
feature_mean = X.mean()
feature_std = X.std()
X = (X - feature_mean) / feature_std

# Wrap features and labels as float32 tensors; labels become a column vector
X_tensor = torch.tensor(X.to_numpy(), dtype=torch.float32)
y_tensor = torch.tensor(y.to_numpy(), dtype=torch.float32).reshape(-1, 1)

# Shuffled mini-batches of 64 (features, label) pairs
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, shuffle=True, batch_size=64)

# Define MLP Diffusion Model
class MLPDiffusion(nn.Module):
    """Three-layer fully connected network with matching input and output width.

    Layer sizes are ``input_dim -> 128 -> 64 -> input_dim``. ReLU follows the
    first two layers; the last layer is linear. ``forward`` takes only the
    feature tensor (no timestep or other conditioning input).

    Args:
        input_dim: Number of features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, input_dim)

    def forward(self, x):
        """Return the network output for ``x``; last dimension must be ``input_dim``."""
        hidden = x
        # Both hidden layers share the same "linear then ReLU" pattern.
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        return self.fc3(hidden)

# Build the network sized to the feature count, plus its loss and optimizer
input_dim = X_tensor.size(1)
mlp_diffusion = MLPDiffusion(input_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(mlp_diffusion.parameters(), lr=1e-3)

# Fit the network to reproduce its own input (MSE between output and batch);
# the labels yielded by the loader are not used here.
epochs = 10
for epoch in range(epochs):
    for batch_X, _ in dataloader:
        output = mlp_diffusion(batch_X)
        loss = criterion(output, batch_X)

        # Clear stale gradients, backpropagate, then update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Push 200 standard-normal noise vectors through the trained network to get
# 200 synthetic rows; no gradients are needed for sampling.
with torch.no_grad():
    noise = torch.randn(200, input_dim)
    generated_data = mlp_diffusion(noise).numpy()

# Label the columns like the original feature frame
generated_df = pd.DataFrame(generated_data, columns=X.columns)

# Define metadata for the CTGAN synthesizer.
# The metadata has to describe exactly the table passed to `fit`. The CSV also
# contains the 'Class' column, which `generated_df` (features only) does not,
# so detecting from the CSV declares a column that is missing from the
# training data and SDV's fit-time validation rejects it. Detect from the
# frame that is actually used for training instead.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(generated_df)

# Train CTGAN on the generated data using sdv
synthesizer = CTGANSynthesizer(
    metadata,
    enforce_rounding=False,
    epochs=500,
    verbose=True,
)

synthesizer.fit(generated_df)

# Generate 500 more synthetic samples using CTGAN
ctgan_generated_data = synthesizer.sample(num_rows=500)
# Keep the feature columns in the same order as the original feature frame
ctgan_generated_df = pd.DataFrame(ctgan_generated_data, columns=X.columns)

# Hold out a real test split first: both classifiers are scored on these same
# rows, and the augmented model must never see them during training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest classification on original data
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Classification Report on Original Data:")
print(classification_report(y_test, y_pred))

# Gather every synthetic row with the feature columns in the original order
synthetic_X = pd.concat([generated_df, ctgan_generated_df], ignore_index=True)[X.columns]

# NOTE(review): both generators were fitted on features only, so synthetic rows
# carry no 'Class'. Concatenating them unlabeled yields NaN labels, which the
# classifier cannot train on. They are pseudo-labelled here with the baseline
# model; switch to class-conditional generation if real labels are required.
synthetic_df = synthetic_X.assign(Class=rf.predict(synthetic_X))

# Merge the original dataset with generated data.
# The normalized features `X` are used (not the raw `df`) because the synthetic
# rows live in the normalized feature space.
augmented_df = pd.concat([X.assign(Class=y), synthetic_df], ignore_index=True)

# Plot the augmented data showing both classes
sns.countplot(x='Class', data=augmented_df)
plt.title("Augmented Dataset Class Distribution")
plt.show()

# Split augmented data into features and labels
X_augmented = augmented_df.drop(columns=['Class'])
y_augmented = augmented_df['Class']

# Training set for the augmented model: real training rows plus synthetic rows
# (the held-out real test rows are deliberately excluded to avoid leakage).
X_aug_train = pd.concat([X_train, synthetic_X], ignore_index=True)
y_aug_train = pd.concat([y_train, synthetic_df['Class']], ignore_index=True)

# Random Forest classification on augmented data
rf2 = RandomForestClassifier(n_estimators=100, random_state=42)
rf2.fit(X_aug_train, y_aug_train)
y_aug_pred = rf2.predict(X_test)  # score the augmented model, not the baseline
print("Classification Report on Augmented Data:")
print(classification_report(y_test, y_aug_pred))
