In [18]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.input_dim = input_dim
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim*2),
            nn.ReLU(),
            nn.Linear(hidden_dim*2, hidden_dim)
        )
        # Separate decoders for features and outcome
        self.decoder_features = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim*2),
            nn.ReLU(),
            nn.Linear(hidden_dim*2, input_dim-1)
        )
        self.decoder_outcome = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim*2),
            nn.ReLU(),
            nn.Linear(hidden_dim*2, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        encoded = self.encoder(x)
        decoded_features = self.decoder_features(encoded)
        decoded_outcome = self.decoder_outcome(encoded)
        return torch.cat([decoded_features, decoded_outcome], dim=1)
    
    def generate_synthetic_data(self, num_samples):
        device = next(self.parameters()).device
        with torch.no_grad():
            z = torch.randn(num_samples, self.encoder[-1].out_features, device=device)
            features = self.decoder_features(z)
            outcome = self.decoder_outcome(z)
            synthetic = torch.cat([features, outcome], dim=1)
            # Round outcome
            synthetic[:, -1] = torch.round(synthetic[:, -1])
        return synthetic.cpu().numpy()

def train_autoencoder(data, input_dim, hidden_dim, epochs=20, batch_size=32, lr=0.001):
    data_min = data.min()
    data_max = data.max()
    
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)
    data_tensor = torch.FloatTensor(scaled_data)
    
    model = Autoencoder(input_dim, hidden_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for i in range(0, len(data_tensor), batch_size):
            batch = data_tensor[i:i+batch_size]
            
            optimizer.zero_grad()
            output = model(batch)
            loss = criterion(output, batch)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        if (epoch + 1) % 100 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(data_tensor):.4f}")
    
    return model, scaler, data_min, data_max

def scale_to_original_range(synthetic_data, original_min, original_max):
    """Scale synthetic data to match original data ranges"""
    df_synthetic = pd.DataFrame(synthetic_data)
    for col in df_synthetic.columns[:-1]:  # Skip outcome column
        min_val = original_min[col]
        max_val = original_max[col]
        df_synthetic[col] = df_synthetic[col].clip(lower=min_val, upper=max_val)
    # Ensure outcome is binary
    df_synthetic.iloc[:, -1] = df_synthetic.iloc[:, -1].round().clip(0, 1)
    return df_synthetic.values


# Example usage
if __name__ == "__main__":
    # Load sample tabular data
    path = '../Datasets/diabetes.csv'
    df = pd.read_csv(path)

    input_dim = df.shape[1]  # Get number of features
    hidden_dim = 5  # Latent space dimension

    # Train the autoencoder with all return values
    autoencoder, scaler, data_min, data_max = train_autoencoder(df, input_dim, hidden_dim, epochs=1000)

    # Generate synthetic data
    num_synthetic_instances = 100
    synthetic_data = autoencoder.generate_synthetic_data(num_synthetic_instances)
    
    # Inverse transform the synthetic data
    synthetic_data = scaler.inverse_transform(synthetic_data)
    
    # Scale to original ranges
    synthetic_data = scale_to_original_range(synthetic_data, data_min, data_max)
    
    # Convert synthetic data to dataframe
    synthetic_df = pd.DataFrame(synthetic_data, columns=df.columns)

    print("\nGenerated Synthetic Data:")
    print(synthetic_df.head())
    
    # Print ranges comparison
    print("\nOriginal Data Ranges:")
    print(df.agg(['min', 'max']))
    print("\nSynthetic Data Ranges:")
    print(synthetic_df.agg(['min', 'max']))

Epoch [100/1000], Loss: 0.0069
Epoch [200/1000], Loss: 0.0065
Epoch [300/1000], Loss: 0.0063
Epoch [400/1000], Loss: 0.0063
Epoch [500/1000], Loss: 0.0060
Epoch [600/1000], Loss: 0.0057
Epoch [700/1000], Loss: 0.0056
Epoch [800/1000], Loss: 0.0056
Epoch [900/1000], Loss: 0.0056
Epoch [1000/1000], Loss: 0.0055

Generated Synthetic Data:
   Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
0     4.137497  129.704895      64.045044      24.056353   99.633102   
1     6.095989  103.327438      67.251556      24.571205   63.307220   
2     0.000000  171.529541      55.871586       0.000000  413.077667   
3     0.806957  139.542511      47.475578       4.704753  264.075989   
4     0.383999  122.763748      65.841385      22.157076   63.391430   

         BMI  DiabetesPedigreeFunction        Age  Outcome  
0  33.584423                     0.078  31.330311      1.0  
1  43.038231                     0.078  39.475033      0.0  
2  29.430025                     0.078  26.267

  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]


In [19]:
synthetic_df.to_csv('diabetes_autoencoder.csv', index=False) 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import numpy as np
import matplotlib.pyplot as plt
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, f1_score

real_df = df
augmented_df = pd.concat([real_df, synthetic_df], ignore_index=True)

X = real_df.iloc[:, :-1].values  # Features
y = real_df.iloc[:, -1].values 
# Check and print the original class distribution
unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")# Labels (binary classification)

X_balanced = augmented_df.iloc[:, :-1].values  # Features
y_balanced = augmented_df.iloc[:, -1].values  # Labels (binary classification)

# Step 6: Split the dataset into training and test sets (original and balanced)
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

# Step 7: Train a simple classifier on both original and generated datasets
clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

clf_bal = RandomForestClassifier(random_state=42)
clf_bal.fit(X_train_bal, y_train_bal)

# Step 8: Predict and calculate recall and F1 scores
y_pred_orig = clf_orig.predict(X_test_orig)
y_pred_bal = clf_bal.predict(X_test_orig)

recall_orig = recall_score(y_test_orig, y_pred_orig)
recalls_bal = recall_score(y_test_orig, y_pred_bal)

f1_orig = f1_score(y_test_orig, y_pred_orig)
f1_bal = f1_score(y_test_orig, y_pred_bal)

# Step 9: Print the performance metrics
print(f"Recall score (original data): {recall_orig:.4f}")
print(f"Recall score (generated data): {recalls_bal:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")
print(f"F1 score (generated data): {f1_bal:.4f}")
print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))


print(f"Number of fake samples generated: {len(augmented_df)-len(real_df)}")

Class distribution before augmentation: {0: 500, 1: 268}
Class distribution before augmentation: {0: 500, 1: 268}
Recall score (original data): 0.6625
Recall score (generated data): 0.8375
F1 score (original data): 0.6503
F1 score (generated data): 0.7976
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.82      0.80      0.81       151
           1       0.64      0.66      0.65        80

    accuracy                           0.75       231
   macro avg       0.73      0.73      0.73       231
weighted avg       0.76      0.75      0.75       231

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.91      0.86      0.88       151
           1       0.76      0.84      0.80        80

    accuracy                           0.85       231
   macro avg       0.84      0.85      0.84       231
weighted avg       0.86      0.85      0.85       231

Number of

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.preprocessing import StandardScaler


class AutoencoderWithAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_heads=1):
        super().__init__()
        self.input_dim = input_dim

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim * 2),
            nn.ReLU(),
            nn.Linear(hidden_dim * 2, hidden_dim)
        )

        # Attention layer
        self.attention = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=num_heads, batch_first=True)

        # Decoders
        self.decoder_features = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 2),
            nn.ReLU(),
            nn.Linear(hidden_dim * 2, input_dim - 1)
        )
        self.decoder_outcome = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 2),
            nn.ReLU(),
            nn.Linear(hidden_dim * 2, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        # Encode
        encoded = self.encoder(x)
        # Reshape for attention layer
        encoded = encoded.unsqueeze(1)  # Shape: [batch_size, seq_len=1, hidden_dim]
        # Apply attention
        attn_output, _ = self.attention(encoded, encoded, encoded)
        attn_output = attn_output.squeeze(1)  # Shape: [batch_size, hidden_dim]
        # Decode
        decoded_features = self.decoder_features(attn_output)
        decoded_outcome = self.decoder_outcome(attn_output)
        return torch.cat([decoded_features, decoded_outcome], dim=1)

def train_autoencoder(data, input_dim, hidden_dim, epochs=20, batch_size=32, lr=0.001):
    data_min = data.min()
    data_max = data.max()
    
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)
    data_tensor = torch.FloatTensor(scaled_data)
    
    model = Autoencoder(input_dim, hidden_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for i in range(0, len(data_tensor), batch_size):
            batch = data_tensor[i:i+batch_size]
            
            optimizer.zero_grad()
            output = model(batch)
            loss = criterion(output, batch)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        if (epoch + 1) % 100 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(data_tensor):.4f}")
    
    return model, scaler, data_min, data_max

def scale_to_original_range(synthetic_data, original_min, original_max):
    """Scale synthetic data to match original data ranges"""
    df_synthetic = pd.DataFrame(synthetic_data)
    for col in df_synthetic.columns[:-1]:  # Skip outcome column
        min_val = original_min[col]
        max_val = original_max[col]
        df_synthetic[col] = df_synthetic[col].clip(lower=min_val, upper=max_val)
    # Ensure outcome is binary
    df_synthetic.iloc[:, -1] = df_synthetic.iloc[:, -1].round().clip(0, 1)
    return df_synthetic.values


# Example usage
if __name__ == "__main__":
    # Load sample tabular data
    path = '../Datasets/diabetes.csv'
    df = pd.read_csv(path)

    input_dim = df.shape[1]  # Get number of features
    hidden_dim = 5  # Latent space dimension

    # Train the autoencoder with all return values
    autoencoder, scaler, data_min, data_max = train_autoencoder(df, input_dim, hidden_dim, epochs=1000)

    # Generate synthetic data
    num_synthetic_instances = 100
    synthetic_data = autoencoder.generate_synthetic_data(num_synthetic_instances)
    
    # Inverse transform the synthetic data
    synthetic_data = scaler.inverse_transform(synthetic_data)
    
    # Scale to original ranges
    synthetic_data = scale_to_original_range(synthetic_data, data_min, data_max)
    
    # Convert synthetic data to dataframe
    synthetic_df = pd.DataFrame(synthetic_data, columns=df.columns)

    print("\nGenerated Synthetic Data:")
    print(synthetic_df.head())
    
    # Print ranges comparison
    print("\nOriginal Data Ranges:")
    print(df.agg(['min', 'max']))
    print("\nSynthetic Data Ranges:")
    print(synthetic_df.agg(['min', 'max']))

Epoch [100/1000], Loss: 0.0066
Epoch [200/1000], Loss: 0.0058
Epoch [300/1000], Loss: 0.0054
Epoch [400/1000], Loss: 0.0052
Epoch [500/1000], Loss: 0.0051
Epoch [600/1000], Loss: 0.0051
Epoch [700/1000], Loss: 0.0050
Epoch [800/1000], Loss: 0.0050
Epoch [900/1000], Loss: 0.0050
Epoch [1000/1000], Loss: 0.0050

Generated Synthetic Data:
   Pregnancies     Glucose  BloodPressure  SkinThickness    Insulin  \
0          0.0  190.954681      71.938553      12.902526   9.399074   
1          0.0  159.229172      68.740921      21.334652   6.453776   
2          0.0  158.607544      76.263916       3.243194  14.481685   
3          0.0  131.857071      80.641312       4.763150  13.026881   
4          0.0  143.806610      82.653435       2.294254   9.106364   

         BMI  DiabetesPedigreeFunction        Age  Outcome  
0  28.166277                  0.252532  21.000000      1.0  
1  26.751492                  0.506460  21.509159      1.0  
2  27.542755                  0.078000  21.000000   

  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]
  min_val = original_min[col]
  max_val = original_max[col]


In [26]:
synthetic_df.to_csv('diabetes_autoencoder_attention.csv', index=False) 

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import numpy as np
import matplotlib.pyplot as plt
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, f1_score

real_df = df
augmented_df = pd.concat([real_df, synthetic_df], ignore_index=True)

X = real_df.iloc[:, :-1].values  # Features
y = real_df.iloc[:, -1].values 
# Check and print the original class distribution
unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")# Labels (binary classification)

X_balanced = augmented_df.iloc[:, :-1].values  # Features
y_balanced = augmented_df.iloc[:, -1].values  # Labels (binary classification)

# Step 6: Split the dataset into training and test sets (original and balanced)
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

# Step 7: Train a simple classifier on both original and generated datasets
clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

clf_bal = RandomForestClassifier(random_state=42)
clf_bal.fit(X_train_bal, y_train_bal)

# Step 8: Predict and calculate recall and F1 scores
y_pred_orig = clf_orig.predict(X_test_orig)
y_pred_bal = clf_bal.predict(X_test_orig)

recall_orig = recall_score(y_test_orig, y_pred_orig)
recalls_bal = recall_score(y_test_orig, y_pred_bal)

f1_orig = f1_score(y_test_orig, y_pred_orig)
f1_bal = f1_score(y_test_orig, y_pred_bal)

# Step 9: Print the performance metrics
print(f"Recall score (original data): {recall_orig:.4f}")
print(f"Recall score (generated data): {recalls_bal:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")
print(f"F1 score (generated data): {f1_bal:.4f}")
print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))


print(f"Number of fake samples generated: {len(augmented_df)-len(real_df)}")

Class distribution before augmentation: {0: 500, 1: 268}
Recall score (original data): 0.6625
Recall score (generated data): 0.8125
F1 score (original data): 0.6503
F1 score (generated data): 0.7784
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.82      0.80      0.81       151
           1       0.64      0.66      0.65        80

    accuracy                           0.75       231
   macro avg       0.73      0.73      0.73       231
weighted avg       0.76      0.75      0.75       231

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.90      0.85      0.87       151
           1       0.75      0.81      0.78        80

    accuracy                           0.84       231
   macro avg       0.82      0.83      0.83       231
weighted avg       0.84      0.84      0.84       231

Number of fake samples generated: 100
