In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


# Define the Attention Mechanism
class SelfAttention(nn.Module):
    def __init__(self, hidden_dim):
        super(SelfAttention, self).__init__()
        self.query = nn.Linear(hidden_dim, hidden_dim)
        self.key = nn.Linear(hidden_dim, hidden_dim)
        self.value = nn.Linear(hidden_dim, hidden_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        Q = self.query(x)
        K = self.key(x)
        V = self.value(x)

        # Scaled dot-product attention
        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / (K.size(-1) ** 0.5)
        attention_weights = self.softmax(attention_scores)
        attention_output = torch.matmul(attention_weights, V)
        return attention_output


# Define the Autoencoder with Attention
class AutoencoderWithAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.input_dim = input_dim

        # Encoder with attention
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim * 2),
            nn.ReLU(),
            nn.Linear(hidden_dim * 2, hidden_dim)
        )
        self.attention = SelfAttention(hidden_dim)

        # Separate decoders for features and outcome
        self.decoder_features = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 2),
            nn.ReLU(),
            nn.Linear(hidden_dim * 2, input_dim - 1)
        )
        self.decoder_outcome = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 2),
            nn.ReLU(),
            nn.Linear(hidden_dim * 2, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        # Encode, apply attention, and decode
        encoded = self.encoder(x)
        attended = self.attention(encoded.unsqueeze(1)).squeeze(1)  # Add and remove batch dimension for attention
        decoded_features = self.decoder_features(attended)
        decoded_outcome = self.decoder_outcome(attended)
        return torch.cat([decoded_features, decoded_outcome], dim=1)

    def generate_synthetic_data(self, num_samples):
        device = next(self.parameters()).device
        with torch.no_grad():
            z = torch.randn(num_samples, self.encoder[-1].out_features, device=device)
            attended = self.attention(z.unsqueeze(1)).squeeze(1)
            features = self.decoder_features(attended)
            outcome = self.decoder_outcome(attended)
            synthetic = torch.cat([features, outcome], dim=1)
            # Round outcome
            synthetic[:, -1] = torch.round(synthetic[:, -1])
        return synthetic.cpu().numpy()


# Training Function
def train_autoencoder(data, input_dim, hidden_dim, epochs=20, batch_size=32, lr=0.001):
    data_min = data.min()
    data_max = data.max()

    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)
    data_tensor = torch.FloatTensor(scaled_data)

    model = AutoencoderWithAttention(input_dim, hidden_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for i in range(0, len(data_tensor), batch_size):
            batch = data_tensor[i:i+batch_size]

            optimizer.zero_grad()
            output = model(batch)
            loss = criterion(output, batch)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        if (epoch + 1) % 100 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss / len(data_tensor):.4f}")

    return model, scaler, data_min, data_max


# Scale synthetic data back to original range
def scale_to_original_range(synthetic_data, original_min, original_max):
    """Scale synthetic data to match original data ranges"""
    df_synthetic = pd.DataFrame(synthetic_data)
    for col in df_synthetic.columns[:-1]:  # Skip outcome column
        min_val = original_min[col]
        max_val = original_max[col]
        df_synthetic[col] = df_synthetic[col].clip(lower=min_val, upper=max_val)
    # Ensure outcome is binary
    df_synthetic.iloc[:, -1] = df_synthetic.iloc[:, -1].round().clip(0, 1)
    return df_synthetic.values


# Example usage
if __name__ == "__main__":
    # Load sample tabular data
    path = '../Datasets/diabetes.csv'
    df = pd.read_csv(path)

    input_dim = df.shape[1]  # Get number of features
    hidden_dim = 5  # Latent space dimension

    # Train the autoencoder with attention
    autoencoder, scaler, data_min, data_max = train_autoencoder(df, input_dim, hidden_dim, epochs=1000)

    # Generate synthetic data
    num_synthetic_instances = 100
    synthetic_data = autoencoder.generate_synthetic_data(num_synthetic_instances)

    # Inverse transform the synthetic data
    synthetic_data = scaler.inverse_transform(synthetic_data)

    # Scale to original ranges
    synthetic_data = scale_to_original_range(synthetic_data, data_min, data_max)

    # Convert synthetic data to dataframe
    synthetic_df = pd.DataFrame(synthetic_data, columns=df.columns)

    print("\nGenerated Synthetic Data:")
    print(synthetic_df.head())

    # Print ranges comparison
    print("\nOriginal Data Ranges:")
    print(df.agg(['min', 'max']))
    print("\nSynthetic Data Ranges:")
    print(synthetic_df.agg(['min', 'max']))


Epoch [100/1000], Loss: 0.0089
Epoch [200/1000], Loss: 0.0082
Epoch [300/1000], Loss: 0.0070
Epoch [400/1000], Loss: 0.0057
Epoch [500/1000], Loss: 0.0054
Epoch [600/1000], Loss: 0.0053
Epoch [700/1000], Loss: 0.0052
Epoch [800/1000], Loss: 0.0051
Epoch [900/1000], Loss: 0.0051
Epoch [1000/1000], Loss: 0.0050

Generated Synthetic Data:
   Pregnancies     Glucose  BloodPressure  SkinThickness    Insulin  \
0     2.693395  188.197388      81.042862      36.024353   2.040438   
1     2.344492  143.005814      78.279419      41.532654  66.812958   
2     5.678009  199.000000      35.411930      31.237715   0.000000   
3     4.492698  159.829376      78.797722      20.205566   0.000000   
4     1.718907  182.068207      55.836700      30.258989   0.027148   

         BMI  DiabetesPedigreeFunction        Age  Outcome  
0  40.926815                  1.042608  30.895632      1.0  
1  39.627724                  1.664703  38.121571      1.0  
2  43.193768                  1.061494  21.000000   

  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]


In [2]:
synthetic_df.to_csv('diabetes_autoencoder_attention.csv', index=False) 

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import numpy as np
import matplotlib.pyplot as plt
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, f1_score

real_df = df
augmented_df = pd.concat([real_df, synthetic_df], ignore_index=True)

X = real_df.iloc[:, :-1].values  # Features
y = real_df.iloc[:, -1].values 
# Check and print the original class distribution
unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")# Labels (binary classification)

X_balanced = augmented_df.iloc[:, :-1].values  # Features
y_balanced = augmented_df.iloc[:, -1].values  # Labels (binary classification)

# Step 6: Split the dataset into training and test sets (original and balanced)
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

# Step 7: Train a simple classifier on both original and generated datasets
clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

clf_bal = RandomForestClassifier(random_state=42)
clf_bal.fit(X_train_bal, y_train_bal)

# Step 8: Predict and calculate recall and F1 scores
y_pred_orig = clf_orig.predict(X_test_orig)
y_pred_bal = clf_bal.predict(X_test_orig)

recall_orig = recall_score(y_test_orig, y_pred_orig)
recalls_bal = recall_score(y_test_orig, y_pred_bal)

f1_orig = f1_score(y_test_orig, y_pred_orig)
f1_bal = f1_score(y_test_orig, y_pred_bal)

# Step 9: Print the performance metrics
print(f"Recall score (original data): {recall_orig:.4f}")
print(f"Recall score (generated data): {recalls_bal:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")
print(f"F1 score (generated data): {f1_bal:.4f}")
print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))


print(f"Number of fake samples generated: {len(augmented_df)-len(real_df)}")

Class distribution before augmentation: {0: 500, 1: 268}
Recall score (original data): 0.6625
Recall score (generated data): 0.8000
F1 score (original data): 0.6503
F1 score (generated data): 0.7805
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.82      0.80      0.81       151
           1       0.64      0.66      0.65        80

    accuracy                           0.75       231
   macro avg       0.73      0.73      0.73       231
weighted avg       0.76      0.75      0.75       231

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.89      0.87      0.88       151
           1       0.76      0.80      0.78        80

    accuracy                           0.84       231
   macro avg       0.83      0.83      0.83       231
weighted avg       0.85      0.84      0.85       231

Number of fake samples generated: 100
