In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, recall_score, f1_score
from ForestDiffusion import ForestDiffusionModel

# Function to add Gaussian noise
def add_gaussian_noise(data, epsilon, delta, sensitivity):
    sigma = (sensitivity / epsilon) * np.sqrt(2 * np.log(1.25 / delta))
    noise = np.random.normal(0, sigma, size=data.shape)
    return data + noise

# Parameters for differential privacy
epsilon = 1  # Privacy budget
delta = 1e-5   
sensitivity = 0.5

# Load the data
data = pd.read_csv('mammography_dataset.csv')
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Class distribution before augmentation
unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

# Separate minority class
X_minority = X[y == 1]

# Train-test split for original data
X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train a Random Forest on the original data
clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

# Predict and calculate recall and F1 score on original data
y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)
print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")

# Loop over different values of n
n_values = [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5, 2.75, 3, 3.25]  # Example range of n values
results = []

for n in n_values:
    print(f"\nRunning for n = {n}...")

    # Forest Diffusion Model for data augmentation
    forest_model = ForestDiffusionModel(X_minority, label_y=None, n_t=50, duplicate_K=100,
                                         bin_indexes=[], cat_indexes=[], int_indexes=[],
                                         diffusion_type='flow', n_jobs=-1)

    # Generate synthetic data
    num_samples = round((len(X_minority)) * n)
    X_minority_fake = forest_model.generate(batch_size=num_samples)

    # Add Gaussian noise to synthetic data for differential privacy
    X_minority_fake_noisy = add_gaussian_noise(X_minority_fake, epsilon, delta, sensitivity)

    # Combine the noisy synthetic data with original data
    X_balanced = np.concatenate((X, X_minority_fake_noisy), axis=0)
    y_balanced = np.concatenate((y, np.ones(X_minority_fake_noisy.shape[0])), axis=0)

    # Class distribution after augmentation
    unique, counts = np.unique(y_balanced, return_counts=True)
    class_dist_after = dict(zip(unique, counts))
    print(f"Class distribution after augmentation: {class_dist_after}")

    # Train-test split for augmented data
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)

    # Train a Random Forest on the augmented data
    clf_bal = RandomForestClassifier(random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Predict and calculate recall and F1 score on augmented data
    y_pred_bal = clf_bal.predict(X_test)
    recall_bal = recall_score(y_test, y_pred_bal)
    f1_bal = f1_score(y_test, y_pred_bal)
    print(f"Recall score (generated data with DP, n={n}): {recall_bal:.4f}")
    print(f"F1 score (generated data with DP, n={n}): {f1_bal:.4f}")

    # Store results
    results.append((n, recall_bal, f1_bal))

# Print summary of results
print("\nSummary of Results:")
print("n\tRecall\tF1")
for n, recall, f1 in results:
    print(f"{n}\t{recall:.4f}\t{f1:.4f}")

### Noise is added in the latent space

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, recall_score
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier
# Transformer Autoencoder with separate feature handling
class TabularTransformerAE(nn.Module):
    def __init__(self, feature_info, embed_dim=8, latent_dim=32):
        super().__init__()
        self.feature_info = feature_info
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        
        # Feature processing modules
        self.embeddings = nn.ModuleDict()
        self.proj_binary = nn.Linear(1, embed_dim)
        self.proj_numerical = nn.Linear(1, embed_dim)
        
        # Create embeddings for categorical features
        for i, (ftype, params) in enumerate(feature_info):
            if ftype == 'categorical':
                self.embeddings[f'emb_{i}'] = nn.Embedding(params['num_classes'], embed_dim)
                
        # Transformer encoder
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=4, dim_feedforward=64),
            num_layers=2
        )
        
        # Latent projection
        self.num_features = len(feature_info)
        self.latent_proj = nn.Linear(self.num_features * embed_dim, latent_dim)
        
        # Decoder components
        self.decoder_input = nn.Linear(latent_dim, self.num_features * embed_dim)
        self.transformer_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=embed_dim, nhead=4, dim_feedforward=64),
            num_layers=2
        )
        
        # Output heads
        self.heads = nn.ModuleList()
        for ftype, params in feature_info:
            if ftype == 'categorical':
                self.heads.append(nn.Linear(embed_dim, params['num_classes']))
            else:
                self.heads.append(nn.Linear(embed_dim, 1))

    def encode(self, x):
        embeddings = []
        for i, (ftype, params) in enumerate(self.feature_info):
            feature = x[:, i].unsqueeze(1)
            if ftype == 'categorical':
                emb = self.embeddings[f'emb_{i}'](feature.long().squeeze())
            elif ftype == 'binary':
                emb = self.proj_binary(feature)
            else:
                emb = self.proj_numerical(feature)
            embeddings.append(emb.unsqueeze(0))
        
        embeddings = torch.cat(embeddings)
        encoded = self.transformer_encoder(embeddings)
        latent = self.latent_proj(encoded.permute(1, 0, 2).flatten(1))
        return latent

    def decode(self, latent):
        batch_size = latent.size(0)
        x = self.decoder_input(latent)
        x = x.view(batch_size, self.num_features, self.embed_dim).permute(1, 0, 2)
        decoded = self.transformer_decoder(x, x)
        
        outputs = []
        for i, head in enumerate(self.heads):
            outputs.append(head(decoded[i]))
        return torch.cat(outputs, dim=1)

    def forward(self, x):
        latent = self.encode(x)
        return self.decode(latent)

# Data preparation and preprocessing
def preprocess_data(df):
    # Identify feature types
    feature_info = []
    label_encoders = {}
    scalers = {}
    
    categorical = []
    binary = []
    numerical = []
    
    for col in df.columns:
        unique = df[col].nunique()
        if unique > 2:
            categorical.append(col)
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            label_encoders[col] = le
            feature_info.append(('categorical', {'num_classes': unique}))
        elif unique == 2:
            binary.append(col)
            df[col] = df[col].astype(int)
            feature_info.append(('binary', None))
        else:
            numerical.append(col)
            feature_info.append(('numerical', None))
    
    # Scale numerical features
    if numerical:
        scaler = MinMaxScaler()
        df[numerical] = scaler.fit_transform(df[numerical])
        scalers['numerical'] = scaler
    
    return df, feature_info, label_encoders, scalers


In [2]:
# Function to add Gaussian noise
def add_gaussian_noise(data, epsilon, delta, sensitivity):
    sigma = (sensitivity / epsilon) * np.sqrt(2 * np.log(1.25 / delta))
    noise = np.random.normal(0, sigma, size=data.shape)
    return data + noise

# Parameters for differential privacy
epsilon = 1  # Privacy budget
delta = 1e-5   
sensitivity = 0.5

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, recall_score, f1_score
from ForestDiffusion import ForestDiffusionModel
# Main execution
dataset_list = ['reduced_diabetes','reduced_cardio_train','reduced_smart_grid_stability','mammography','oil','coil_2000']
for dataset in dataset_list:
    print(f"Running for dataset: {dataset}")
    file_path = f'..\\..\\..\\Datasets\\Original Data\\{dataset}.csv'
    data = pd.read_csv(file_path)

    # Preprocess entire dataset
    X_orig = data.iloc[:, :-1]
    y_orig = data.iloc[:, -1]

    # Process minority class
    real_minority = data[y_orig == 1]
    X_minority = real_minority.iloc[:, :-1]
    y_minority = real_minority.iloc[:, -1]

    # Preprocess minority data
    X_processed, feature_info, label_encoders, scalers = preprocess_data(X_minority.copy())
    input_dim = X_processed.shape[1]
    # Initialize model
    latent_dim = 4
    num_epochs = 1000
    batch_size = 32
    model = TabularTransformerAE(feature_info, latent_dim=latent_dim)
    optimizer = Adam(model.parameters(), lr=0.001)

    # Convert data to tensor
    X_tensor = torch.tensor(X_processed.values, dtype=torch.float32)

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        for i in range(0, len(X_tensor), batch_size):
            batch = X_tensor[i:i+batch_size]
            optimizer.zero_grad()
            
            reconstructed = model(batch)
            loss = 0
            
            # Calculate per-feature loss
            start_idx = 0
            for j, (ftype, params) in enumerate(feature_info):
                if ftype == 'categorical':
                    end_idx = start_idx + params['num_classes']
                    loss += F.cross_entropy(reconstructed[:, start_idx:end_idx], 
                                            batch[:, j].long())
                    start_idx = end_idx
                else:
                    end_idx = start_idx + 1
                    if ftype == 'binary':
                        loss += F.binary_cross_entropy_with_logits(
                            reconstructed[:, start_idx], batch[:, j])
                    else:
                        loss += F.mse_loss(reconstructed[:, start_idx], batch[:, j])
                    start_idx = end_idx
            
            loss.backward()
            optimizer.step()

    # Generate latent space
    with torch.no_grad():
        latent = model.encode(X_tensor).numpy()

    # Apply Forest Diffusion
    forest_model = ForestDiffusionModel(
        X=latent,
        n_t=50,
        duplicate_K=100,
        diffusion_type='flow',
        n_jobs=-1
    )
    # Training parameters
    multiplier_list = [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5, 2.75, 3]
    # multiplier_list = [0.25]
    results = []


    for multiplier in multiplier_list:
        print(f"Results for multiplier={multiplier}")
        
        # Generate synthetic latent samples
        synthetic_latent = forest_model.generate(batch_size=round(multiplier * len(X_tensor)))
        # Add Gaussian noise to synthetic data for differential privacy
        synthetic_latent = add_gaussian_noise(synthetic_latent, epsilon, delta, sensitivity)
        # Decode samples
        with torch.no_grad():
            synthetic_tensor = model.decode(torch.tensor(synthetic_latent, dtype=torch.float32))
        
        # Convert to original feature space
        synthetic_df = pd.DataFrame()
        start_idx = 0
        for j, col in enumerate(X_minority.columns):
            ftype, params = feature_info[j]
            
            if ftype == 'categorical':
                end_idx = start_idx + params['num_classes']
                probs = F.softmax(synthetic_tensor[:, start_idx:end_idx], dim=1)
                preds = torch.argmax(probs, dim=1).numpy()
                synthetic_df[col] = label_encoders[col].inverse_transform(preds)
                start_idx = end_idx
            else:
                end_idx = start_idx + 1
                if ftype == 'binary':
                    preds = (torch.sigmoid(synthetic_tensor[:, start_idx]) > 0.5).numpy().astype(int)
                    synthetic_df[col] = preds
                else:
                    vals = synthetic_tensor[:, start_idx].numpy()
                    synthetic_df[col] = scalers['numerical'].inverse_transform(vals.reshape(-1, 1))
                start_idx = end_idx
        
        # Rest of the evaluation pipeline (same as original)
        synthetic_df[data.columns[-1]] = 1

        augmented_dataset = pd.concat([data, synthetic_df], ignore_index=True)
        
        y_balanced = augmented_dataset.iloc[:, -1] 
        X_balanced= augmented_dataset.iloc[:, :-1]  
        X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.3, random_state=42)
        X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)
        # Class distribution after augmentation
        unique, counts = np.unique(y_balanced, return_counts=True)
        class_dist_after = dict(zip(unique, counts))
        print(f"Class distribution after augmentation: {class_dist_after}")
        # Step 7: Train a simple classifier on both original and generated datasets
        # Train-test split for augmented data
        X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)

        # Train a Random Forest on the augmented data
        clf_bal = RandomForestClassifier(random_state=42)
        clf_bal.fit(X_train_bal, y_train_bal)

        # Predict and calculate recall and F1 score on augmented data
        y_pred_bal = clf_bal.predict(X_test_orig)
        recall_bal = recall_score(y_test_orig, y_pred_bal)
        f1_bal = f1_score(y_test_orig, y_pred_bal)
        print(f"Recall score (generated data with DP, n={multiplier}): {recall_bal:.4f}")
        print(f"F1 score (generated data with DP, n={multiplier}): {f1_bal:.4f}")

        # Store results
        results.append((multiplier, recall_bal, f1_bal))

    # Print summary of results
    print("\nSummary of Results:")
    print("n\tRecall\tF1")
    for multiplier, recall, f1 in results:
        print(f"{multiplier}\t{recall:.4f}\t{f1:.4f}")

Running for dataset: reduced_diabetes




Results for multiplier=0.25
Class distribution after augmentation: {0: 500, 1: 75}
Recall score (generated data with DP, n=0.25): 0.8824
F1 score (generated data with DP, n=0.25): 0.9375
Results for multiplier=0.5
Class distribution after augmentation: {0: 500, 1: 90}
Recall score (generated data with DP, n=0.5): 0.9412
F1 score (generated data with DP, n=0.5): 0.9697
Results for multiplier=0.75
Class distribution after augmentation: {0: 500, 1: 105}
Recall score (generated data with DP, n=0.75): 0.8824
F1 score (generated data with DP, n=0.75): 0.9375
Results for multiplier=1
Class distribution after augmentation: {0: 500, 1: 120}
Recall score (generated data with DP, n=1): 0.9412
F1 score (generated data with DP, n=1): 0.9412
Results for multiplier=1.25
Class distribution after augmentation: {0: 500, 1: 135}
Recall score (generated data with DP, n=1.25): 0.9412
F1 score (generated data with DP, n=1.25): 0.9697
Results for multiplier=1.5
Class distribution after augmentation: {0: 500,



Results for multiplier=0.25
Class distribution after augmentation: {0: 35021, 1: 5000}
Recall score (generated data with DP, n=0.25): 0.7681
F1 score (generated data with DP, n=0.25): 0.8562
Results for multiplier=0.5
Class distribution after augmentation: {0: 35021, 1: 6000}
Recall score (generated data with DP, n=0.5): 0.7765
F1 score (generated data with DP, n=0.5): 0.8583
Results for multiplier=0.75
Class distribution after augmentation: {0: 35021, 1: 7000}
Recall score (generated data with DP, n=0.75): 0.7874
F1 score (generated data with DP, n=0.75): 0.8624
Results for multiplier=1
Class distribution after augmentation: {0: 35021, 1: 8000}
Recall score (generated data with DP, n=1): 0.7882
F1 score (generated data with DP, n=1): 0.8606
Results for multiplier=1.25
Class distribution after augmentation: {0: 35021, 1: 9000}
Recall score (generated data with DP, n=1.25): 0.7748
F1 score (generated data with DP, n=1.25): 0.8529
Results for multiplier=1.5
Class distribution after augme



Results for multiplier=0.25
Class distribution after augmentation: {0: 6380, 1: 1250}
Recall score (generated data with DP, n=0.25): 0.8344
F1 score (generated data with DP, n=0.25): 0.9097
Results for multiplier=0.5
Class distribution after augmentation: {0: 6380, 1: 1500}
Recall score (generated data with DP, n=0.5): 0.8742
F1 score (generated data with DP, n=0.5): 0.9279
Results for multiplier=0.75
Class distribution after augmentation: {0: 6380, 1: 1750}
Recall score (generated data with DP, n=0.75): 0.8808
F1 score (generated data with DP, n=0.75): 0.9333
Results for multiplier=1
Class distribution after augmentation: {0: 6380, 1: 2000}
Recall score (generated data with DP, n=1): 0.9007
F1 score (generated data with DP, n=1): 0.9379
Results for multiplier=1.25
Class distribution after augmentation: {0: 6380, 1: 2250}
Recall score (generated data with DP, n=1.25): 0.9007
F1 score (generated data with DP, n=1.25): 0.9396
Results for multiplier=1.5
Class distribution after augmentati



Results for multiplier=0.25
Class distribution after augmentation: {-1: 10923, 1: 325}
Recall score (generated data with DP, n=0.25): 0.8514
F1 score (generated data with DP, n=0.25): 0.9065
Results for multiplier=0.5
Class distribution after augmentation: {-1: 10923, 1: 390}
Recall score (generated data with DP, n=0.5): 0.8649
F1 score (generated data with DP, n=0.5): 0.9078
Results for multiplier=0.75
Class distribution after augmentation: {-1: 10923, 1: 455}
Recall score (generated data with DP, n=0.75): 0.8514
F1 score (generated data with DP, n=0.75): 0.8936
Results for multiplier=1
Class distribution after augmentation: {-1: 10923, 1: 520}
Recall score (generated data with DP, n=1): 0.8378
F1 score (generated data with DP, n=1): 0.8986
Results for multiplier=1.25
Class distribution after augmentation: {-1: 10923, 1: 585}
Recall score (generated data with DP, n=1.25): 0.9189
F1 score (generated data with DP, n=1.25): 0.9315
Results for multiplier=1.5
Class distribution after augme



Results for multiplier=0.25


ValueError: non-broadcastable output operand with shape (10,1) doesn't match the broadcast shape (10,2)

In [10]:
augmented_dataset[data.columns[-1]] = 1
augmented_dataset

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,Outcome
0,18393,2,168,62.0,110,80,1,1,0,0,1,1,
1,17474,1,156,56.0,100,60,1,1,0,0,0,1,
2,21914,1,151,67.0,120,80,2,2,0,0,0,1,
3,22113,1,157,93.0,130,80,3,1,0,0,1,1,
4,17668,1,158,71.0,110,70,1,1,0,0,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40016,22515,1,165,56.0,130,90,1,1,0,0,0,1,1.0
40017,19686,1,166,65.0,120,80,2,2,0,0,1,1,1.0
40018,19104,1,159,56.0,120,90,1,1,0,0,0,1,1.0
40019,17319,1,165,56.0,120,90,1,1,0,0,1,1,1.0


In [9]:
data

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,17474,1,156,56.0,100,60,1,1,0,0,0,0
2,21914,1,151,67.0,120,80,2,2,0,0,0,0
3,22113,1,157,93.0,130,80,3,1,0,0,1,0
4,17668,1,158,71.0,110,70,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
39016,20962,2,174,83.0,120,80,3,1,0,0,1,1
39017,21151,1,178,69.0,130,90,1,1,0,0,1,1
39018,17500,2,182,110.0,130,90,2,2,0,0,1,1
39019,21074,1,165,80.0,150,80,1,1,0,0,1,1
