#### Reduced Diabetes

In [15]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, recall_score
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier

# Transformer Autoencoder with separate feature handling
class TabularTransformerAE(nn.Module):
    def __init__(self, feature_info, embed_dim=8, latent_dim=32):
        super().__init__()
        self.feature_info = feature_info
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        
        # Feature processing modules
        self.embeddings = nn.ModuleDict()
        self.proj_binary = nn.Linear(1, embed_dim)
        self.proj_numerical = nn.Linear(1, embed_dim)
        
        # Create embeddings for categorical features
        for i, (ftype, params) in enumerate(feature_info):
            if ftype == 'categorical':
                self.embeddings[f'emb_{i}'] = nn.Embedding(params['num_classes'], embed_dim)
                
        # Transformer encoder
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=4, dim_feedforward=64),
            num_layers=2
        )
        
        # Latent projection
        self.num_features = len(feature_info)
        self.latent_proj = nn.Linear(self.num_features * embed_dim, latent_dim)
        
        # Decoder components
        self.decoder_input = nn.Linear(latent_dim, self.num_features * embed_dim)
        self.transformer_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=embed_dim, nhead=4, dim_feedforward=64),
            num_layers=2
        )
        
        # Output heads
        self.heads = nn.ModuleList()
        for ftype, params in feature_info:
            if ftype == 'categorical':
                self.heads.append(nn.Linear(embed_dim, params['num_classes']))
            else:
                self.heads.append(nn.Linear(embed_dim, 1))

    def encode(self, x):
        embeddings = []
        for i, (ftype, params) in enumerate(self.feature_info):
            feature = x[:, i].unsqueeze(1)
            if ftype == 'categorical':
                emb = self.embeddings[f'emb_{i}'](feature.long().squeeze())
            elif ftype == 'binary':
                emb = self.proj_binary(feature)
            else:
                emb = self.proj_numerical(feature)
            embeddings.append(emb.unsqueeze(0))
        
        embeddings = torch.cat(embeddings)
        encoded = self.transformer_encoder(embeddings)
        latent = self.latent_proj(encoded.permute(1, 0, 2).flatten(1))
        return latent

    def decode(self, latent):
        batch_size = latent.size(0)
        x = self.decoder_input(latent)
        x = x.view(batch_size, self.num_features, self.embed_dim).permute(1, 0, 2)
        decoded = self.transformer_decoder(x, x)
        
        outputs = []
        for i, head in enumerate(self.heads):
            outputs.append(head(decoded[i]))
        return torch.cat(outputs, dim=1)

    def forward(self, x):
        latent = self.encode(x)
        return self.decode(latent)

# Data preparation and preprocessing
def preprocess_data(df):
    # Identify feature types
    feature_info = []
    label_encoders = {}
    scalers = {}
    
    categorical = []
    binary = []
    numerical = []
    
    for col in df.columns:
        unique = df[col].nunique()
        if unique > 2:
            categorical.append(col)
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            label_encoders[col] = le
            feature_info.append(('categorical', {'num_classes': unique}))
        elif unique == 2:
            binary.append(col)
            df[col] = df[col].astype(int)
            feature_info.append(('binary', None))
        else:
            numerical.append(col)
            feature_info.append(('numerical', None))
    
    # Scale numerical features
    if numerical:
        scaler = MinMaxScaler()
        df[numerical] = scaler.fit_transform(df[numerical])
        scalers['numerical'] = scaler
    
    return df, feature_info, label_encoders, scalers

# Main execution
dataset = 'reduced_diabetes'
file_path = f'..\\..\\..\\Datasets\\Original Data\\{dataset}.csv'
data = pd.read_csv(file_path)

# Preprocess entire dataset
X_orig = data.iloc[:, :-1]
y_orig = data.iloc[:, -1]

# Process minority class
real_minority = data[y_orig == 1]
X_minority = real_minority.iloc[:, :-1]
y_minority = real_minority.iloc[:, -1]

# Preprocess minority data
X_processed, feature_info, label_encoders, scalers = preprocess_data(X_minority.copy())
input_dim = X_processed.shape[1]

# Training parameters
multiplier_list = [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5, 2.75, 3]
# multiplier_list = [0.25]
latent_dim = 4
num_epochs = 1000
batch_size = 32

for multiplier in multiplier_list:
    print(f"Results for multiplier={multiplier}")
    
    # Initialize model
    model = TabularTransformerAE(feature_info, latent_dim=latent_dim)
    optimizer = Adam(model.parameters(), lr=0.001)
    
    # Convert data to tensor
    X_tensor = torch.tensor(X_processed.values, dtype=torch.float32)
    
    # Training loop
    for epoch in range(num_epochs):
        model.train()
        for i in range(0, len(X_tensor), batch_size):
            batch = X_tensor[i:i+batch_size]
            optimizer.zero_grad()
            
            reconstructed = model(batch)
            loss = 0
            
            # Calculate per-feature loss
            start_idx = 0
            for j, (ftype, params) in enumerate(feature_info):
                if ftype == 'categorical':
                    end_idx = start_idx + params['num_classes']
                    loss += F.cross_entropy(reconstructed[:, start_idx:end_idx], 
                                          batch[:, j].long())
                    start_idx = end_idx
                else:
                    end_idx = start_idx + 1
                    if ftype == 'binary':
                        loss += F.binary_cross_entropy_with_logits(
                            reconstructed[:, start_idx], batch[:, j])
                    else:
                        loss += F.mse_loss(reconstructed[:, start_idx], batch[:, j])
                    start_idx = end_idx
            
            loss.backward()
            optimizer.step()
    
    # Generate latent space
    with torch.no_grad():
        latent = model.encode(X_tensor).numpy()
    
    # Apply Forest Diffusion
    forest_model = ForestDiffusionModel(
        X=latent,
        n_t=50,
        duplicate_K=100,
        diffusion_type='flow',
        n_jobs=-1
    )
    
    # Generate synthetic latent samples
    synthetic_latent = forest_model.generate(batch_size=round(multiplier * len(X_tensor)))
    
    # Decode samples
    with torch.no_grad():
        synthetic_tensor = model.decode(torch.tensor(synthetic_latent, dtype=torch.float32))
    
    # Convert to original feature space
    synthetic_df = pd.DataFrame()
    start_idx = 0
    for j, col in enumerate(X_minority.columns):
        ftype, params = feature_info[j]
        
        if ftype == 'categorical':
            end_idx = start_idx + params['num_classes']
            probs = F.softmax(synthetic_tensor[:, start_idx:end_idx], dim=1)
            preds = torch.argmax(probs, dim=1).numpy()
            synthetic_df[col] = label_encoders[col].inverse_transform(preds)
            start_idx = end_idx
        else:
            end_idx = start_idx + 1
            if ftype == 'binary':
                preds = (torch.sigmoid(synthetic_tensor[:, start_idx]) > 0.5).numpy().astype(int)
                synthetic_df[col] = preds
            else:
                vals = synthetic_tensor[:, start_idx].numpy()
                synthetic_df[col] = scalers['numerical'].inverse_transform(vals.reshape(-1, 1))
            start_idx = end_idx
    
    # Rest of the evaluation pipeline (same as original)
    synthetic_df['Outcome'] = 1
    augmented_dataset = pd.concat([data, synthetic_df], ignore_index=True)
    
    y_balanced = augmented_dataset.iloc[:, -1] 
    X_balanced= augmented_dataset.iloc[:, :-1]  
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    # Step 7: Train a simple classifier on both original and generated datasets
    
    from xgboost import XGBClassifier  # Replacing RandomForestClassifier with XGBoost
    # Train classifier using XGBoost
    clf_orig = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    clf_orig.fit(X_train_orig, y_train_orig)

    clf_bal = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    clf_bal.fit(X_train_bal, y_train_bal)

    # Predict and evaluate
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recall_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recall_bal:.4f}")
    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))
    from sklearn.svm import SVC
    clf_orig = SVC(kernel='rbf', random_state=42)
    clf_orig.fit(X_train_orig, y_train_orig)
    
    clf_bal = SVC(kernel='rbf', random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Predict and calculate recall and F1 scores
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recall_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recall_bal:.4f}")
    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))

Results for multiplier=0.25




Recall score (original data): 0.1765
Recall score (generated data): 0.6471
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.91      0.97      0.94       151
           1       0.43      0.18      0.25        17

    accuracy                           0.89       168
   macro avg       0.67      0.57      0.60       168
weighted avg       0.86      0.89      0.87       168

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.96      0.99      0.98       151
           1       0.92      0.65      0.76        17

    accuracy                           0.96       168
   macro avg       0.94      0.82      0.87       168
weighted avg       0.96      0.96      0.96       168

Results for multiplier=0.5




Recall score (original data): 0.1765
Recall score (generated data): 0.5294
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.91      0.97      0.94       151
           1       0.43      0.18      0.25        17

    accuracy                           0.89       168
   macro avg       0.67      0.57      0.60       168
weighted avg       0.86      0.89      0.87       168

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.95      0.99      0.97       151
           1       0.82      0.53      0.64        17

    accuracy                           0.94       168
   macro avg       0.88      0.76      0.81       168
weighted avg       0.94      0.94      0.93       168

Results for multiplier=0.75




Recall score (original data): 0.1765
Recall score (generated data): 0.8824
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.91      0.97      0.94       151
           1       0.43      0.18      0.25        17

    accuracy                           0.89       168
   macro avg       0.67      0.57      0.60       168
weighted avg       0.86      0.89      0.87       168

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       151
           1       0.88      0.88      0.88        17

    accuracy                           0.98       168
   macro avg       0.93      0.93      0.93       168
weighted avg       0.98      0.98      0.98       168

Results for multiplier=1




Recall score (original data): 0.1765
Recall score (generated data): 0.8235
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.91      0.97      0.94       151
           1       0.43      0.18      0.25        17

    accuracy                           0.89       168
   macro avg       0.67      0.57      0.60       168
weighted avg       0.86      0.89      0.87       168

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.98      0.99      0.98       151
           1       0.88      0.82      0.85        17

    accuracy                           0.97       168
   macro avg       0.93      0.91      0.92       168
weighted avg       0.97      0.97      0.97       168

Results for multiplier=1.25




Recall score (original data): 0.1765
Recall score (generated data): 1.0000
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.91      0.97      0.94       151
           1       0.43      0.18      0.25        17

    accuracy                           0.89       168
   macro avg       0.67      0.57      0.60       168
weighted avg       0.86      0.89      0.87       168

Classification Report (generated data):
               precision    recall  f1-score   support

           0       1.00      0.97      0.99       151
           1       0.81      1.00      0.89        17

    accuracy                           0.98       168
   macro avg       0.90      0.99      0.94       168
weighted avg       0.98      0.98      0.98       168

Results for multiplier=1.5




Recall score (original data): 0.1765
Recall score (generated data): 0.9412
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.91      0.97      0.94       151
           1       0.43      0.18      0.25        17

    accuracy                           0.89       168
   macro avg       0.67      0.57      0.60       168
weighted avg       0.86      0.89      0.87       168

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       151
           1       0.94      0.94      0.94        17

    accuracy                           0.99       168
   macro avg       0.97      0.97      0.97       168
weighted avg       0.99      0.99      0.99       168

Results for multiplier=1.75




Recall score (original data): 0.1765
Recall score (generated data): 0.8824
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.91      0.97      0.94       151
           1       0.43      0.18      0.25        17

    accuracy                           0.89       168
   macro avg       0.67      0.57      0.60       168
weighted avg       0.86      0.89      0.87       168

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       151
           1       0.88      0.88      0.88        17

    accuracy                           0.98       168
   macro avg       0.93      0.93      0.93       168
weighted avg       0.98      0.98      0.98       168

Results for multiplier=2




Recall score (original data): 0.1765
Recall score (generated data): 0.9412
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.91      0.97      0.94       151
           1       0.43      0.18      0.25        17

    accuracy                           0.89       168
   macro avg       0.67      0.57      0.60       168
weighted avg       0.86      0.89      0.87       168

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       151
           1       0.89      0.94      0.91        17

    accuracy                           0.98       168
   macro avg       0.94      0.96      0.95       168
weighted avg       0.98      0.98      0.98       168

Results for multiplier=2.25




Recall score (original data): 0.1765
Recall score (generated data): 0.9412
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.91      0.97      0.94       151
           1       0.43      0.18      0.25        17

    accuracy                           0.89       168
   macro avg       0.67      0.57      0.60       168
weighted avg       0.86      0.89      0.87       168

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       151
           1       0.84      0.94      0.89        17

    accuracy                           0.98       168
   macro avg       0.92      0.96      0.94       168
weighted avg       0.98      0.98      0.98       168

Results for multiplier=2.5




Recall score (original data): 0.1765
Recall score (generated data): 1.0000
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.91      0.97      0.94       151
           1       0.43      0.18      0.25        17

    accuracy                           0.89       168
   macro avg       0.67      0.57      0.60       168
weighted avg       0.86      0.89      0.87       168

Classification Report (generated data):
               precision    recall  f1-score   support

           0       1.00      0.98      0.99       151
           1       0.85      1.00      0.92        17

    accuracy                           0.98       168
   macro avg       0.93      0.99      0.95       168
weighted avg       0.98      0.98      0.98       168

Results for multiplier=2.75




Recall score (original data): 0.1765
Recall score (generated data): 1.0000
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.91      0.97      0.94       151
           1       0.43      0.18      0.25        17

    accuracy                           0.89       168
   macro avg       0.67      0.57      0.60       168
weighted avg       0.86      0.89      0.87       168

Classification Report (generated data):
               precision    recall  f1-score   support

           0       1.00      0.97      0.98       151
           1       0.77      1.00      0.87        17

    accuracy                           0.97       168
   macro avg       0.89      0.98      0.93       168
weighted avg       0.98      0.97      0.97       168

Results for multiplier=3




Recall score (original data): 0.1765
Recall score (generated data): 0.9412
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.91      0.97      0.94       151
           1       0.43      0.18      0.25        17

    accuracy                           0.89       168
   macro avg       0.67      0.57      0.60       168
weighted avg       0.86      0.89      0.87       168

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.99      0.97      0.98       151
           1       0.80      0.94      0.86        17

    accuracy                           0.97       168
   macro avg       0.90      0.96      0.92       168
weighted avg       0.97      0.97      0.97       168



#### Mammography

In [18]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, recall_score
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier

# Transformer Autoencoder with separate feature handling
class TabularTransformerAE(nn.Module):
    def __init__(self, feature_info, embed_dim=8, latent_dim=32):
        super().__init__()
        self.feature_info = feature_info
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        
        # Feature processing modules
        self.embeddings = nn.ModuleDict()
        self.proj_binary = nn.Linear(1, embed_dim)
        self.proj_numerical = nn.Linear(1, embed_dim)
        
        # Create embeddings for categorical features
        for i, (ftype, params) in enumerate(feature_info):
            if ftype == 'categorical':
                self.embeddings[f'emb_{i}'] = nn.Embedding(params['num_classes'], embed_dim)
                
        # Transformer encoder
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=4, dim_feedforward=64),
            num_layers=2
        )
        
        # Latent projection
        self.num_features = len(feature_info)
        self.latent_proj = nn.Linear(self.num_features * embed_dim, latent_dim)
        
        # Decoder components
        self.decoder_input = nn.Linear(latent_dim, self.num_features * embed_dim)
        self.transformer_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=embed_dim, nhead=4, dim_feedforward=64),
            num_layers=2
        )
        
        # Output heads
        self.heads = nn.ModuleList()
        for ftype, params in feature_info:
            if ftype == 'categorical':
                self.heads.append(nn.Linear(embed_dim, params['num_classes']))
            else:
                self.heads.append(nn.Linear(embed_dim, 1))

    def encode(self, x):
        embeddings = []
        for i, (ftype, params) in enumerate(self.feature_info):
            feature = x[:, i].unsqueeze(1)
            if ftype == 'categorical':
                emb = self.embeddings[f'emb_{i}'](feature.long().squeeze())
            elif ftype == 'binary':
                emb = self.proj_binary(feature)
            else:
                emb = self.proj_numerical(feature)
            embeddings.append(emb.unsqueeze(0))
        
        embeddings = torch.cat(embeddings)
        encoded = self.transformer_encoder(embeddings)
        latent = self.latent_proj(encoded.permute(1, 0, 2).flatten(1))
        return latent

    def decode(self, latent):
        batch_size = latent.size(0)
        x = self.decoder_input(latent)
        x = x.view(batch_size, self.num_features, self.embed_dim).permute(1, 0, 2)
        decoded = self.transformer_decoder(x, x)
        
        outputs = []
        for i, head in enumerate(self.heads):
            outputs.append(head(decoded[i]))
        return torch.cat(outputs, dim=1)

    def forward(self, x):
        latent = self.encode(x)
        return self.decode(latent)

# Data preparation and preprocessing
def preprocess_data(df):
    # Identify feature types
    feature_info = []
    label_encoders = {}
    scalers = {}
    
    categorical = []
    binary = []
    numerical = []
    
    for col in df.columns:
        unique = df[col].nunique()
        if unique > 2:
            categorical.append(col)
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            label_encoders[col] = le
            feature_info.append(('categorical', {'num_classes': unique}))
        elif unique == 2:
            binary.append(col)
            df[col] = df[col].astype(int)
            feature_info.append(('binary', None))
        else:
            numerical.append(col)
            feature_info.append(('numerical', None))
    
    # Scale numerical features
    if numerical:
        scaler = MinMaxScaler()
        df[numerical] = scaler.fit_transform(df[numerical])
        scalers['numerical'] = scaler
    
    return df, feature_info, label_encoders, scalers

# Main execution
dataset = 'mammography'
file_path = f'..\\..\\..\\Datasets\\Original Data\\{dataset}.csv'
data = pd.read_csv(file_path)

# Preprocess entire dataset
X_orig = data.iloc[:, :-1]
y_orig = data.iloc[:, -1]

# Process minority class
real_minority = data[y_orig == 1]
X_minority = real_minority.iloc[:, :-1]
y_minority = real_minority.iloc[:, -1]

# Preprocess minority data
X_processed, feature_info, label_encoders, scalers = preprocess_data(X_minority.copy())
input_dim = X_processed.shape[1]

# Training parameters
multiplier_list = [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5, 2.75, 3]
# multiplier_list = [0.25]
latent_dim = 4
num_epochs = 1000
batch_size = 32
# Initialize model
model = TabularTransformerAE(feature_info, latent_dim=latent_dim)
optimizer = Adam(model.parameters(), lr=0.001)

# Convert data to tensor
X_tensor = torch.tensor(X_processed.values, dtype=torch.float32)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for i in range(0, len(X_tensor), batch_size):
        batch = X_tensor[i:i+batch_size]
        optimizer.zero_grad()
        
        reconstructed = model(batch)
        loss = 0
        
        # Calculate per-feature loss
        start_idx = 0
        for j, (ftype, params) in enumerate(feature_info):
            if ftype == 'categorical':
                end_idx = start_idx + params['num_classes']
                loss += F.cross_entropy(reconstructed[:, start_idx:end_idx], 
                                        batch[:, j].long())
                start_idx = end_idx
            else:
                end_idx = start_idx + 1
                if ftype == 'binary':
                    loss += F.binary_cross_entropy_with_logits(
                        reconstructed[:, start_idx], batch[:, j])
                else:
                    loss += F.mse_loss(reconstructed[:, start_idx], batch[:, j])
                start_idx = end_idx
        
        loss.backward()
        optimizer.step()

# Generate latent space
with torch.no_grad():
    latent = model.encode(X_tensor).numpy()

# Apply Forest Diffusion
forest_model = ForestDiffusionModel(
    X=latent,
    n_t=50,
    duplicate_K=100,
    diffusion_type='flow',
    n_jobs=-1
)
for multiplier in multiplier_list:
    print(f"Results for multiplier={multiplier}")
    
    
    
    # Generate synthetic latent samples
    synthetic_latent = forest_model.generate(batch_size=round(multiplier * len(X_tensor)))
    
    # Decode samples
    with torch.no_grad():
        synthetic_tensor = model.decode(torch.tensor(synthetic_latent, dtype=torch.float32))
    
    # Convert to original feature space
    synthetic_df = pd.DataFrame()
    start_idx = 0
    for j, col in enumerate(X_minority.columns):
        ftype, params = feature_info[j]
        
        if ftype == 'categorical':
            end_idx = start_idx + params['num_classes']
            probs = F.softmax(synthetic_tensor[:, start_idx:end_idx], dim=1)
            preds = torch.argmax(probs, dim=1).numpy()
            synthetic_df[col] = label_encoders[col].inverse_transform(preds)
            start_idx = end_idx
        else:
            end_idx = start_idx + 1
            if ftype == 'binary':
                preds = (torch.sigmoid(synthetic_tensor[:, start_idx]) > 0.5).numpy().astype(int)
                synthetic_df[col] = preds
            else:
                vals = synthetic_tensor[:, start_idx].numpy()
                synthetic_df[col] = scalers['numerical'].inverse_transform(vals.reshape(-1, 1))
            start_idx = end_idx
    
    # Rest of the evaluation pipeline (same as original)
    synthetic_df['target'] = 1
    augmented_dataset = pd.concat([data, synthetic_df], ignore_index=True)
    
    y_balanced = augmented_dataset.iloc[:, -1] 
    X_balanced= augmented_dataset.iloc[:, :-1]  
    # Replace -1 with 0 in the target variables
    y_orig = y_orig.replace(-1, 0)
    y_balanced = y_balanced.replace(-1, 0)
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    from xgboost import XGBClassifier  # Replacing RandomForestClassifier with XGBoost
    # Train classifier using XGBoost
    clf_orig = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    clf_orig.fit(X_train_orig, y_train_orig)

    clf_bal = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    clf_bal.fit(X_train_bal, y_train_bal)

    # Predict and evaluate
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recall_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recall_bal:.4f}")
    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))


    # Replace -1 with 0 in the target variables
    y_orig = y_orig.replace(-1, 0)
    y_balanced = y_balanced.replace(-1, 0)
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    # Train an SVM classifier
    from sklearn.svm import SVC
    clf_orig = SVC(kernel='rbf', random_state=42)
    clf_orig.fit(X_train_orig, y_train_orig)
    
    clf_bal = SVC(kernel='rbf', random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Predict and calculate recall and F1 scores
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recall_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recall_bal:.4f}")
    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))




Results for multiplier=0.25
Recall score (original data): 0.5946
Recall score (generated data): 0.7027
Classification Report (original data):
               precision    recall  f1-score   support

          -1       0.99      1.00      0.99      3281
           1       0.92      0.59      0.72        74

    accuracy                           0.99      3355
   macro avg       0.95      0.80      0.86      3355
weighted avg       0.99      0.99      0.99      3355

Classification Report (generated data):
               precision    recall  f1-score   support

          -1       0.99      1.00      1.00      3281
           1       0.96      0.70      0.81        74

    accuracy                           0.99      3355
   macro avg       0.98      0.85      0.90      3355
weighted avg       0.99      0.99      0.99      3355

Results for multiplier=0.5
Recall score (original data): 0.5946
Recall score (generated data): 0.7297
Classification Report (original data):
               precis

#### Reduced smart grid stability

In [19]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, recall_score
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier

# Transformer Autoencoder with separate feature handling
class TabularTransformerAE(nn.Module):
    def __init__(self, feature_info, embed_dim=8, latent_dim=32):
        super().__init__()
        self.feature_info = feature_info
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        
        # Feature processing modules
        self.embeddings = nn.ModuleDict()
        self.proj_binary = nn.Linear(1, embed_dim)
        self.proj_numerical = nn.Linear(1, embed_dim)
        
        # Create embeddings for categorical features
        for i, (ftype, params) in enumerate(feature_info):
            if ftype == 'categorical':
                self.embeddings[f'emb_{i}'] = nn.Embedding(params['num_classes'], embed_dim)
                
        # Transformer encoder
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=4, dim_feedforward=64),
            num_layers=2
        )
        
        # Latent projection
        self.num_features = len(feature_info)
        self.latent_proj = nn.Linear(self.num_features * embed_dim, latent_dim)
        
        # Decoder components
        self.decoder_input = nn.Linear(latent_dim, self.num_features * embed_dim)
        self.transformer_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=embed_dim, nhead=4, dim_feedforward=64),
            num_layers=2
        )
        
        # Output heads
        self.heads = nn.ModuleList()
        for ftype, params in feature_info:
            if ftype == 'categorical':
                self.heads.append(nn.Linear(embed_dim, params['num_classes']))
            else:
                self.heads.append(nn.Linear(embed_dim, 1))

    def encode(self, x):
        embeddings = []
        for i, (ftype, params) in enumerate(self.feature_info):
            feature = x[:, i].unsqueeze(1)
            if ftype == 'categorical':
                emb = self.embeddings[f'emb_{i}'](feature.long().squeeze())
            elif ftype == 'binary':
                emb = self.proj_binary(feature)
            else:
                emb = self.proj_numerical(feature)
            embeddings.append(emb.unsqueeze(0))
        
        embeddings = torch.cat(embeddings)
        encoded = self.transformer_encoder(embeddings)
        latent = self.latent_proj(encoded.permute(1, 0, 2).flatten(1))
        return latent

    def decode(self, latent):
        batch_size = latent.size(0)
        x = self.decoder_input(latent)
        x = x.view(batch_size, self.num_features, self.embed_dim).permute(1, 0, 2)
        decoded = self.transformer_decoder(x, x)
        
        outputs = []
        for i, head in enumerate(self.heads):
            outputs.append(head(decoded[i]))
        return torch.cat(outputs, dim=1)

    def forward(self, x):
        latent = self.encode(x)
        return self.decode(latent)

# Data preparation and preprocessing
def preprocess_data(df):
    # Identify feature types
    feature_info = []
    label_encoders = {}
    scalers = {}
    
    categorical = []
    binary = []
    numerical = []
    
    for col in df.columns:
        unique = df[col].nunique()
        if unique > 2:
            categorical.append(col)
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            label_encoders[col] = le
            feature_info.append(('categorical', {'num_classes': unique}))
        elif unique == 2:
            binary.append(col)
            df[col] = df[col].astype(int)
            feature_info.append(('binary', None))
        else:
            numerical.append(col)
            feature_info.append(('numerical', None))
    
    # Scale numerical features
    if numerical:
        scaler = MinMaxScaler()
        df[numerical] = scaler.fit_transform(df[numerical])
        scalers['numerical'] = scaler
    
    return df, feature_info, label_encoders, scalers

# Main execution
dataset = 'reduced_smart_grid_stability'
file_path = f'..\\..\\..\\Datasets\\Original Data\\{dataset}.csv'
data = pd.read_csv(file_path)

# Preprocess entire dataset
X_orig = data.iloc[:, :-1]
y_orig = data.iloc[:, -1]

# Process minority class
real_minority = data[y_orig == 1]
X_minority = real_minority.iloc[:, :-1]
y_minority = real_minority.iloc[:, -1]

# Preprocess minority data
X_processed, feature_info, label_encoders, scalers = preprocess_data(X_minority.copy())
input_dim = X_processed.shape[1]

# Training parameters
multiplier_list = [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5, 2.75, 3]
# multiplier_list = [0.25]
latent_dim = 4
num_epochs = 1000
batch_size = 32
# Initialize model
model = TabularTransformerAE(feature_info, latent_dim=latent_dim)
optimizer = Adam(model.parameters(), lr=0.001)

# Convert data to tensor
X_tensor = torch.tensor(X_processed.values, dtype=torch.float32)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for i in range(0, len(X_tensor), batch_size):
        batch = X_tensor[i:i+batch_size]
        optimizer.zero_grad()
        
        reconstructed = model(batch)
        loss = 0
        
        # Calculate per-feature loss
        start_idx = 0
        for j, (ftype, params) in enumerate(feature_info):
            if ftype == 'categorical':
                end_idx = start_idx + params['num_classes']
                loss += F.cross_entropy(reconstructed[:, start_idx:end_idx], 
                                        batch[:, j].long())
                start_idx = end_idx
            else:
                end_idx = start_idx + 1
                if ftype == 'binary':
                    loss += F.binary_cross_entropy_with_logits(
                        reconstructed[:, start_idx], batch[:, j])
                else:
                    loss += F.mse_loss(reconstructed[:, start_idx], batch[:, j])
                start_idx = end_idx
        
        loss.backward()
        optimizer.step()

# Generate latent space
with torch.no_grad():
    latent = model.encode(X_tensor).numpy()

# Apply Forest Diffusion
forest_model = ForestDiffusionModel(
    X=latent,
    n_t=50,
    duplicate_K=100,
    diffusion_type='flow',
    n_jobs=-1
)
for multiplier in multiplier_list:
    print(f"Results for multiplier={multiplier}")
    
    
    
    # Generate synthetic latent samples
    synthetic_latent = forest_model.generate(batch_size=round(multiplier * len(X_tensor)))
    
    # Decode samples
    with torch.no_grad():
        synthetic_tensor = model.decode(torch.tensor(synthetic_latent, dtype=torch.float32))
    
    # Convert to original feature space
    synthetic_df = pd.DataFrame()
    start_idx = 0
    for j, col in enumerate(X_minority.columns):
        ftype, params = feature_info[j]
        
        if ftype == 'categorical':
            end_idx = start_idx + params['num_classes']
            probs = F.softmax(synthetic_tensor[:, start_idx:end_idx], dim=1)
            preds = torch.argmax(probs, dim=1).numpy()
            synthetic_df[col] = label_encoders[col].inverse_transform(preds)
            start_idx = end_idx
        else:
            end_idx = start_idx + 1
            if ftype == 'binary':
                preds = (torch.sigmoid(synthetic_tensor[:, start_idx]) > 0.5).numpy().astype(int)
                synthetic_df[col] = preds
            else:
                vals = synthetic_tensor[:, start_idx].numpy()
                synthetic_df[col] = scalers['numerical'].inverse_transform(vals.reshape(-1, 1))
            start_idx = end_idx
    
    # Rest of the evaluation pipeline (same as original)
    synthetic_df['stabf'] = 1
    augmented_dataset = pd.concat([data, synthetic_df], ignore_index=True)
    
    y_balanced = augmented_dataset.iloc[:, -1] 
    X_balanced= augmented_dataset.iloc[:, :-1]  
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    # Step 7: Train a simple classifier on both original and generated datasets
    from xgboost import XGBClassifier  # Replacing RandomForestClassifier with XGBoost
    # Train classifier using XGBoost
    clf_orig = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    clf_orig.fit(X_train_orig, y_train_orig)

    clf_bal = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    clf_bal.fit(X_train_bal, y_train_bal)

    # Predict and evaluate
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recall_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recall_bal:.4f}")
    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))
    from sklearn.svm import SVC
    clf_orig = SVC(kernel='rbf', random_state=42)
    clf_orig.fit(X_train_orig, y_train_orig)
    
    clf_bal = SVC(kernel='rbf', random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Predict and calculate recall and F1 scores
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recall_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recall_bal:.4f}")
    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))




Results for multiplier=0.25
Recall score (original data): 0.4338
Recall score (generated data): 0.8742
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.92      1.00      0.96      1912
           1       0.96      0.43      0.60       302

    accuracy                           0.92      2214
   macro avg       0.94      0.72      0.78      2214
weighted avg       0.92      0.92      0.91      2214

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1912
           1       0.97      0.87      0.92       302

    accuracy                           0.98      2214
   macro avg       0.97      0.93      0.95      2214
weighted avg       0.98      0.98      0.98      2214

Results for multiplier=0.5
Recall score (original data): 0.4338
Recall score (generated data): 0.8874
Classification Report (original data):
               precis

#### Reduced Cardio Train

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, recall_score
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier

# Transformer Autoencoder with separate feature handling
class TabularTransformerAE(nn.Module):
    def __init__(self, feature_info, embed_dim=8, latent_dim=32):
        super().__init__()
        self.feature_info = feature_info
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        
        # Feature processing modules
        self.embeddings = nn.ModuleDict()
        self.proj_binary = nn.Linear(1, embed_dim)
        self.proj_numerical = nn.Linear(1, embed_dim)
        
        # Create embeddings for categorical features
        for i, (ftype, params) in enumerate(feature_info):
            if ftype == 'categorical':
                self.embeddings[f'emb_{i}'] = nn.Embedding(params['num_classes'], embed_dim)
                
        # Transformer encoder
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=4, dim_feedforward=64),
            num_layers=2
        )
        
        # Latent projection
        self.num_features = len(feature_info)
        self.latent_proj = nn.Linear(self.num_features * embed_dim, latent_dim)
        
        # Decoder components
        self.decoder_input = nn.Linear(latent_dim, self.num_features * embed_dim)
        self.transformer_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=embed_dim, nhead=4, dim_feedforward=64),
            num_layers=2
        )
        
        # Output heads
        self.heads = nn.ModuleList()
        for ftype, params in feature_info:
            if ftype == 'categorical':
                self.heads.append(nn.Linear(embed_dim, params['num_classes']))
            else:
                self.heads.append(nn.Linear(embed_dim, 1))

    def encode(self, x):
        embeddings = []
        for i, (ftype, params) in enumerate(self.feature_info):
            feature = x[:, i].unsqueeze(1)
            if ftype == 'categorical':
                emb = self.embeddings[f'emb_{i}'](feature.long().squeeze())
            elif ftype == 'binary':
                emb = self.proj_binary(feature)
            else:
                emb = self.proj_numerical(feature)
            embeddings.append(emb.unsqueeze(0))
        
        embeddings = torch.cat(embeddings)
        encoded = self.transformer_encoder(embeddings)
        latent = self.latent_proj(encoded.permute(1, 0, 2).flatten(1))
        return latent

    def decode(self, latent):
        batch_size = latent.size(0)
        x = self.decoder_input(latent)
        x = x.view(batch_size, self.num_features, self.embed_dim).permute(1, 0, 2)
        decoded = self.transformer_decoder(x, x)
        
        outputs = []
        for i, head in enumerate(self.heads):
            outputs.append(head(decoded[i]))
        return torch.cat(outputs, dim=1)

    def forward(self, x):
        latent = self.encode(x)
        return self.decode(latent)

# Data preparation and preprocessing
def preprocess_data(df):
    # Identify feature types
    feature_info = []
    label_encoders = {}
    scalers = {}
    
    categorical = []
    binary = []
    numerical = []
    
    for col in df.columns:
        unique = df[col].nunique()
        if unique > 2:
            categorical.append(col)
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            label_encoders[col] = le
            feature_info.append(('categorical', {'num_classes': unique}))
        elif unique == 2:
            binary.append(col)
            df[col] = df[col].astype(int)
            feature_info.append(('binary', None))
        else:
            numerical.append(col)
            feature_info.append(('numerical', None))
    
    # Scale numerical features
    if numerical:
        scaler = MinMaxScaler()
        df[numerical] = scaler.fit_transform(df[numerical])
        scalers['numerical'] = scaler
    
    return df, feature_info, label_encoders, scalers

# Main execution
dataset = 'reduced_cardio_train'
file_path = f'..\\..\\..\\Datasets\\Original Data\\{dataset}.csv'
data = pd.read_csv(file_path)

# Preprocess entire dataset
X_orig = data.iloc[:, :-1]
y_orig = data.iloc[:, -1]

# Process minority class
real_minority = data[y_orig == 1]
X_minority = real_minority.iloc[:, :-1]
y_minority = real_minority.iloc[:, -1]

# Preprocess minority data
X_processed, feature_info, label_encoders, scalers = preprocess_data(X_minority.copy())
input_dim = X_processed.shape[1]

# Training parameters
multiplier_list = [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5, 2.75, 3]
# multiplier_list = [0.25]
latent_dim = 4
num_epochs = 1000
batch_size = 32
# Initialize model
model = TabularTransformerAE(feature_info, latent_dim=latent_dim)
optimizer = Adam(model.parameters(), lr=0.001)

# Convert data to tensor
X_tensor = torch.tensor(X_processed.values, dtype=torch.float32)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for i in range(0, len(X_tensor), batch_size):
        batch = X_tensor[i:i+batch_size]
        optimizer.zero_grad()
        
        reconstructed = model(batch)
        loss = 0
        
        # Calculate per-feature loss
        start_idx = 0
        for j, (ftype, params) in enumerate(feature_info):
            if ftype == 'categorical':
                end_idx = start_idx + params['num_classes']
                loss += F.cross_entropy(reconstructed[:, start_idx:end_idx], 
                                        batch[:, j].long())
                start_idx = end_idx
            else:
                end_idx = start_idx + 1
                if ftype == 'binary':
                    loss += F.binary_cross_entropy_with_logits(
                        reconstructed[:, start_idx], batch[:, j])
                else:
                    loss += F.mse_loss(reconstructed[:, start_idx], batch[:, j])
                start_idx = end_idx
        
        loss.backward()
        optimizer.step()

# Generate latent space
with torch.no_grad():
    latent = model.encode(X_tensor).numpy()

# Apply Forest Diffusion
forest_model = ForestDiffusionModel(
    X=latent,
    n_t=50,
    duplicate_K=100,
    diffusion_type='flow',
    n_jobs=-1
)
for multiplier in multiplier_list:
    print(f"Results for multiplier={multiplier}")
    
    
    
    # Generate synthetic latent samples
    synthetic_latent = forest_model.generate(batch_size=round(multiplier * len(X_tensor)))
    
    # Decode samples
    with torch.no_grad():
        synthetic_tensor = model.decode(torch.tensor(synthetic_latent, dtype=torch.float32))
    
    # Convert to original feature space
    synthetic_df = pd.DataFrame()
    start_idx = 0
    for j, col in enumerate(X_minority.columns):
        ftype, params = feature_info[j]
        
        if ftype == 'categorical':
            end_idx = start_idx + params['num_classes']
            probs = F.softmax(synthetic_tensor[:, start_idx:end_idx], dim=1)
            preds = torch.argmax(probs, dim=1).numpy()
            synthetic_df[col] = label_encoders[col].inverse_transform(preds)
            start_idx = end_idx
        else:
            end_idx = start_idx + 1
            if ftype == 'binary':
                preds = (torch.sigmoid(synthetic_tensor[:, start_idx]) > 0.5).numpy().astype(int)
                synthetic_df[col] = preds
            else:
                vals = synthetic_tensor[:, start_idx].numpy()
                synthetic_df[col] = scalers['numerical'].inverse_transform(vals.reshape(-1, 1))
            start_idx = end_idx
    
    # Rest of the evaluation pipeline (same as original)
    synthetic_df['cardio'] = 1
    augmented_dataset = pd.concat([data, synthetic_df], ignore_index=True)
    
    y_balanced = augmented_dataset.iloc[:, -1] 
    X_balanced= augmented_dataset.iloc[:, :-1]  
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    # Step 7: Train a simple classifier on both original and generated datasets
    from xgboost import XGBClassifier  # Replacing RandomForestClassifier with XGBoost
    # Train classifier using XGBoost
    clf_orig = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    clf_orig.fit(X_train_orig, y_train_orig)

    clf_bal = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    clf_bal.fit(X_train_bal, y_train_bal)

    # Predict and evaluate
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recall_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recall_bal:.4f}")
    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))
    from sklearn.svm import SVC
    clf_orig = SVC(kernel='rbf', random_state=42)
    clf_orig.fit(X_train_orig, y_train_orig)
    
    clf_bal = SVC(kernel='rbf', random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Predict and calculate recall and F1 scores
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recall_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recall_bal:.4f}")
    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))




Results for multiplier=0.25
Recall score (original data): 0.0840
Recall score (generated data): 0.7067
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.90      0.98      0.94     10517
           1       0.37      0.08      0.14      1190

    accuracy                           0.89     11707
   macro avg       0.64      0.53      0.54     11707
weighted avg       0.85      0.89      0.86     11707

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.97      0.98      0.98     10517
           1       0.83      0.71      0.76      1190

    accuracy                           0.96     11707
   macro avg       0.90      0.84      0.87     11707
weighted avg       0.95      0.96      0.95     11707

Results for multiplier=0.5
Recall score (original data): 0.0840
Recall score (generated data): 0.7269
Classification Report (original data):
               precis

#### coil_2000

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, recall_score
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier

# Transformer Autoencoder with separate feature handling
class TabularTransformerAE(nn.Module):
    def __init__(self, feature_info, embed_dim=8, latent_dim=32):
        super().__init__()
        self.feature_info = feature_info
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        
        # Feature processing modules
        self.embeddings = nn.ModuleDict()
        self.proj_binary = nn.Linear(1, embed_dim)
        self.proj_numerical = nn.Linear(1, embed_dim)
        
        # Create embeddings for categorical features
        for i, (ftype, params) in enumerate(feature_info):
            if ftype == 'categorical':
                self.embeddings[f'emb_{i}'] = nn.Embedding(params['num_classes'], embed_dim)
                
        # Transformer encoder
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=4, dim_feedforward=64),
            num_layers=2
        )
        
        # Latent projection
        self.num_features = len(feature_info)
        self.latent_proj = nn.Linear(self.num_features * embed_dim, latent_dim)
        
        # Decoder components
        self.decoder_input = nn.Linear(latent_dim, self.num_features * embed_dim)
        self.transformer_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=embed_dim, nhead=4, dim_feedforward=64),
            num_layers=2
        )
        
        # Output heads
        self.heads = nn.ModuleList()
        for ftype, params in feature_info:
            if ftype == 'categorical':
                self.heads.append(nn.Linear(embed_dim, params['num_classes']))
            else:
                self.heads.append(nn.Linear(embed_dim, 1))

    def encode(self, x):
        embeddings = []
        for i, (ftype, params) in enumerate(self.feature_info):
            feature = x[:, i].unsqueeze(1)
            if ftype == 'categorical':
                emb = self.embeddings[f'emb_{i}'](feature.long().squeeze())
            elif ftype == 'binary':
                emb = self.proj_binary(feature)
            else:
                emb = self.proj_numerical(feature)
            embeddings.append(emb.unsqueeze(0))
        
        embeddings = torch.cat(embeddings)
        encoded = self.transformer_encoder(embeddings)
        latent = self.latent_proj(encoded.permute(1, 0, 2).flatten(1))
        return latent

    def decode(self, latent):
        batch_size = latent.size(0)
        x = self.decoder_input(latent)
        x = x.view(batch_size, self.num_features, self.embed_dim).permute(1, 0, 2)
        decoded = self.transformer_decoder(x, x)
        
        outputs = []
        for i, head in enumerate(self.heads):
            outputs.append(head(decoded[i]))
        return torch.cat(outputs, dim=1)

    def forward(self, x):
        latent = self.encode(x)
        return self.decode(latent)

# Data preparation
# Update the preprocess_data function to handle numerical features individually
def preprocess_data(df):
    feature_info = []
    label_encoders = {}
    scalers = {}  # Store scalers for each numerical column
    
    for col in df.columns:
        unique = df[col].nunique()
        if unique > 2:
            # Categorical feature
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            label_encoders[col] = le
            feature_info.append(('categorical', {'num_classes': unique}))
        elif unique == 2:
            # Binary feature
            df[col] = df[col].astype(int)
            feature_info.append(('binary', None))
        else:
            # Numerical feature
            scaler = MinMaxScaler()
            df[col] = scaler.fit_transform(df[[col]])
            scalers[col] = scaler  # Store scaler for this column
            feature_info.append(('numerical', None))
    
    return df, feature_info, label_encoders, scalers
    
    # Scale numerical features
    if numerical:
        scaler = MinMaxScaler()
        df[numerical] = scaler.fit_transform(df[numerical])
        scalers['numerical'] = scaler
    
    return df, feature_info, label_encoders, scalers
def inverse_transform_numerical(synthetic_tensor, feature_info, scalers, start_idx):
    col_name = feature_info[start_idx][0]  # Get the column name
    if col_name in scalers:
        vals = synthetic_tensor[:, start_idx].numpy()
        return scalers[col_name].inverse_transform(vals.reshape(-1, 1)).flatten()
    else:
        raise ValueError(f"Scaler not found for column: {col_name}")
# Main execution
dataset = 'coil_2000'
file_path = f'..\\..\\..\\Datasets\\Original Data\\{dataset}.csv'
data = pd.read_csv(file_path)

# Preprocess entire dataset
X_orig = data.iloc[:, :-1]
y_orig = data.iloc[:, -1]

# Process minority class
real_minority = data[y_orig == 1]
X_minority = real_minority.iloc[:, :-1]
y_minority = real_minority.iloc[:, -1]

# Preprocess minority data
X_processed, feature_info, label_encoders, scalers = preprocess_data(X_minority.copy())
input_dim = X_processed.shape[1]

# Training parameters
multiplier_list = [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5, 2.75, 3]
# multiplier_list = [0.25]
latent_dim = 4
num_epochs = 1000
batch_size = 32

for multiplier in multiplier_list:
    print(f"Results for multiplier={multiplier}")
    
    # Initialize model
    model = TabularTransformerAE(feature_info, latent_dim=latent_dim)
    optimizer = Adam(model.parameters(), lr=0.001)
    
    # Convert data to tensor
    X_tensor = torch.tensor(X_processed.values, dtype=torch.float32)
    
    # Training loop
    for epoch in range(num_epochs):
        model.train()
        for i in range(0, len(X_tensor), batch_size):
            batch = X_tensor[i:i+batch_size]
            optimizer.zero_grad()
            
            reconstructed = model(batch)
            loss = 0
            
            # Calculate per-feature loss
            start_idx = 0
            for j, (ftype, params) in enumerate(feature_info):
                if ftype == 'categorical':
                    end_idx = start_idx + params['num_classes']
                    loss += F.cross_entropy(reconstructed[:, start_idx:end_idx], 
                                          batch[:, j].long())
                    start_idx = end_idx
                else:
                    end_idx = start_idx + 1
                    if ftype == 'binary':
                        loss += F.binary_cross_entropy_with_logits(
                            reconstructed[:, start_idx], batch[:, j])
                    else:
                        loss += F.mse_loss(reconstructed[:, start_idx], batch[:, j])
                    start_idx = end_idx
            
            loss.backward()
            optimizer.step()
    
    # Generate latent space
    with torch.no_grad():
        latent = model.encode(X_tensor).numpy()
    
    # Apply Forest Diffusion
    forest_model = ForestDiffusionModel(
        X=latent,
        n_t=50,
        duplicate_K=100,
        diffusion_type='flow',
        n_jobs=-1
    )
    
    # Generate synthetic latent samples
    synthetic_latent = forest_model.generate(batch_size=round(multiplier * len(X_tensor)))
    
    # Decode samples
    with torch.no_grad():
        synthetic_tensor = model.decode(torch.tensor(synthetic_latent, dtype=torch.float32))
    
    # Convert to original feature space
    synthetic_df = pd.DataFrame()
    start_idx = 0
    for j, col in enumerate(X_minority.columns):
        ftype, params = feature_info[j]
        
        if ftype == 'categorical':
            end_idx = start_idx + params['num_classes']
            probs = F.softmax(synthetic_tensor[:, start_idx:end_idx], dim=1)
            preds = torch.argmax(probs, dim=1).numpy()
            synthetic_df[col] = label_encoders[col].inverse_transform(preds)
            start_idx = end_idx
        elif ftype == 'binary':
            end_idx = start_idx + 1
            preds = (torch.sigmoid(synthetic_tensor[:, start_idx]) > 0.5).numpy().astype(int)
            synthetic_df[col] = preds
            start_idx = end_idx
        else:
            end_idx = start_idx + 1
            synthetic_df[col] = inverse_transform_numerical(synthetic_tensor, feature_info, scalers, start_idx)
            start_idx = end_idx
    
    # Rest of the evaluation pipeline (same as original)
    synthetic_df['target'] = 1
    augmented_dataset = pd.concat([data, synthetic_df], ignore_index=True)
    
    y_balanced = augmented_dataset.iloc[:, -1] 
    X_balanced= augmented_dataset.iloc[:, :-1]  
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    # Step 7: Train a simple classifier on both original and generated datasets
    clf_orig = RandomForestClassifier(random_state=42)
    clf_orig.fit(X_train_orig, y_train_orig)

    clf_bal = RandomForestClassifier(random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Step 8: Predict and calculate recall and F1 scores
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recalls_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recalls_bal:.4f}")

    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))



Results for multiplier=0.25




#### Oil

In [4]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, recall_score
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier

# Transformer Autoencoder with separate feature handling
class TabularTransformerAE(nn.Module):
    def __init__(self, feature_info, embed_dim=8, latent_dim=32):
        super().__init__()
        self.feature_info = feature_info
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        
        # Feature processing modules
        self.embeddings = nn.ModuleDict()
        self.proj_binary = nn.Linear(1, embed_dim)
        self.proj_numerical = nn.Linear(1, embed_dim)
        
        # Create embeddings for categorical features
        for i, (ftype, params) in enumerate(feature_info):
            if ftype == 'categorical':
                self.embeddings[f'emb_{i}'] = nn.Embedding(params['num_classes'], embed_dim)
                
        # Transformer encoder
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=4, dim_feedforward=64),
            num_layers=2
        )
        
        # Latent projection
        self.num_features = len(feature_info)
        self.latent_proj = nn.Linear(self.num_features * embed_dim, latent_dim)
        
        # Decoder components
        self.decoder_input = nn.Linear(latent_dim, self.num_features * embed_dim)
        self.transformer_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=embed_dim, nhead=4, dim_feedforward=64),
            num_layers=2
        )
        
        # Output heads
        self.heads = nn.ModuleList()
        for ftype, params in feature_info:
            if ftype == 'categorical':
                self.heads.append(nn.Linear(embed_dim, params['num_classes']))
            else:
                self.heads.append(nn.Linear(embed_dim, 1))

    def encode(self, x):
        embeddings = []
        for i, (ftype, params) in enumerate(self.feature_info):
            feature = x[:, i].unsqueeze(1)
            if ftype == 'categorical':
                emb = self.embeddings[f'emb_{i}'](feature.long().squeeze())
            elif ftype == 'binary':
                emb = self.proj_binary(feature)
            else:
                emb = self.proj_numerical(feature)
            embeddings.append(emb.unsqueeze(0))
        
        embeddings = torch.cat(embeddings)
        encoded = self.transformer_encoder(embeddings)
        latent = self.latent_proj(encoded.permute(1, 0, 2).flatten(1))
        return latent

    def decode(self, latent):
        batch_size = latent.size(0)
        x = self.decoder_input(latent)
        x = x.view(batch_size, self.num_features, self.embed_dim).permute(1, 0, 2)
        decoded = self.transformer_decoder(x, x)
        
        outputs = []
        for i, head in enumerate(self.heads):
            outputs.append(head(decoded[i]))
        return torch.cat(outputs, dim=1)

    def forward(self, x):
        latent = self.encode(x)
        return self.decode(latent)

# Data preparation and preprocessing
def preprocess_data(df):
    # Identify feature types
    feature_info = []
    label_encoders = {}
    scalers = {}
    
    categorical = []
    binary = []
    numerical = []
    
    for col in df.columns:
        unique = df[col].nunique()
        if unique > 2:
            categorical.append(col)
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            label_encoders[col] = le
            feature_info.append(('categorical', {'num_classes': unique}))
        elif unique == 2:
            binary.append(col)
            df[col] = df[col].astype(int)
            feature_info.append(('binary', None))
        else:
            numerical.append(col)
            feature_info.append(('numerical', None))
    
    # Scale numerical features
    if numerical:
        scaler = MinMaxScaler()
        df[numerical] = scaler.fit_transform(df[numerical])
        scalers['numerical'] = scaler
    
    return df, feature_info, label_encoders, scalers

# Main execution
dataset = 'oil'
file_path = f'..\\..\\..\\Datasets\\Original Data\\{dataset}.csv'
data = pd.read_csv(file_path)

# Preprocess entire dataset
X_orig = data.iloc[:, :-1]
y_orig = data.iloc[:, -1]

# Process minority class
real_minority = data[y_orig == 1]
X_minority = real_minority.iloc[:, :-1]
y_minority = real_minority.iloc[:, -1]

# Preprocess minority data
X_processed, feature_info, label_encoders, scalers = preprocess_data(X_minority.copy())
input_dim = X_processed.shape[1]

# Training parameters
multiplier_list = [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5, 2.75, 3]
# multiplier_list = [0.25]
latent_dim = 4
num_epochs = 1000
batch_size = 32
# Initialize model
model = TabularTransformerAE(feature_info, latent_dim=latent_dim)
optimizer = Adam(model.parameters(), lr=0.001)

# Convert data to tensor
X_tensor = torch.tensor(X_processed.values, dtype=torch.float32)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for i in range(0, len(X_tensor), batch_size):
        batch = X_tensor[i:i+batch_size]
        optimizer.zero_grad()
        
        reconstructed = model(batch)
        loss = 0
        
        # Calculate per-feature loss
        start_idx = 0
        for j, (ftype, params) in enumerate(feature_info):
            if ftype == 'categorical':
                end_idx = start_idx + params['num_classes']
                loss += F.cross_entropy(reconstructed[:, start_idx:end_idx], 
                                        batch[:, j].long())
                start_idx = end_idx
            else:
                end_idx = start_idx + 1
                if ftype == 'binary':
                    loss += F.binary_cross_entropy_with_logits(
                        reconstructed[:, start_idx], batch[:, j])
                else:
                    loss += F.mse_loss(reconstructed[:, start_idx], batch[:, j])
                start_idx = end_idx
        
        loss.backward()
        optimizer.step()

# Generate latent space
with torch.no_grad():
    latent = model.encode(X_tensor).numpy()

# Apply Forest Diffusion
forest_model = ForestDiffusionModel(
    X=latent,
    n_t=50,
    duplicate_K=100,
    diffusion_type='flow',
    n_jobs=-1
)
for multiplier in multiplier_list:
    print(f"Results for multiplier={multiplier}")
    
    
    
    # Generate synthetic latent samples
    synthetic_latent = forest_model.generate(batch_size=round(multiplier * len(X_tensor)))
    
    # Decode samples
    with torch.no_grad():
        synthetic_tensor = model.decode(torch.tensor(synthetic_latent, dtype=torch.float32))
    
    # Convert to original feature space
    synthetic_df = pd.DataFrame()
    start_idx = 0
    for j, col in enumerate(X_minority.columns):
        ftype, params = feature_info[j]
        
        if ftype == 'categorical':
            end_idx = start_idx + params['num_classes']
            probs = F.softmax(synthetic_tensor[:, start_idx:end_idx], dim=1)
            preds = torch.argmax(probs, dim=1).numpy()
            synthetic_df[col] = label_encoders[col].inverse_transform(preds)
            start_idx = end_idx
        else:
            end_idx = start_idx + 1
            if ftype == 'binary':
                preds = (torch.sigmoid(synthetic_tensor[:, start_idx]) > 0.5).numpy().astype(int)
                synthetic_df[col] = preds
            else:
                vals = synthetic_tensor[:, start_idx].numpy()
                synthetic_df[col] = scalers['numerical'].inverse_transform(vals.reshape(-1, 1))
            start_idx = end_idx
    
    # Rest of the evaluation pipeline (same as original)
    synthetic_df['target'] = 1
    augmented_dataset = pd.concat([data, synthetic_df], ignore_index=True)
    
    y_balanced = augmented_dataset.iloc[:, -1] 
    X_balanced= augmented_dataset.iloc[:, :-1]  
    y_orig = y_orig.replace(-1, 0)
    y_balanced = y_balanced.replace(-1, 0)
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    from xgboost import XGBClassifier  # Replacing RandomForestClassifier with XGBoost
    # Train classifier using XGBoost
    clf_orig = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    clf_orig.fit(X_train_orig, y_train_orig)

    clf_bal = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    clf_bal.fit(X_train_bal, y_train_bal)

    # Predict and evaluate
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recall_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recall_bal:.4f}")
    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))

    # Replace -1 with 0 in the target variables
    y_orig = y_orig.replace(-1, 0)
    y_balanced = y_balanced.replace(-1, 0)
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    # Train an SVM classifier
    from sklearn.svm import SVC
    clf_orig = SVC(kernel='rbf', random_state=42)
    clf_orig.fit(X_train_orig, y_train_orig)
    
    clf_bal = SVC(kernel='rbf', random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Predict and calculate recall and F1 scores
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recall_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recall_bal:.4f}")
    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))




Results for multiplier=0.25


ValueError: non-broadcastable output operand with shape (10,1) doesn't match the broadcast shape (10,2)

Credit card

In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, recall_score
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier

# Transformer Autoencoder with separate feature handling
class TabularTransformerAE(nn.Module):
    def __init__(self, feature_info, embed_dim=8, latent_dim=32):
        super().__init__()
        self.feature_info = feature_info
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        
        # Feature processing modules
        self.embeddings = nn.ModuleDict()
        self.proj_binary = nn.Linear(1, embed_dim)
        self.proj_numerical = nn.Linear(1, embed_dim)
        
        # Create embeddings for categorical features
        for i, (ftype, params) in enumerate(feature_info):
            if ftype == 'categorical':
                self.embeddings[f'emb_{i}'] = nn.Embedding(params['num_classes'], embed_dim)
                
        # Transformer encoder
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=4, dim_feedforward=64),
            num_layers=2
        )
        
        # Latent projection
        self.num_features = len(feature_info)
        self.latent_proj = nn.Linear(self.num_features * embed_dim, latent_dim)
        
        # Decoder components
        self.decoder_input = nn.Linear(latent_dim, self.num_features * embed_dim)
        self.transformer_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=embed_dim, nhead=4, dim_feedforward=64),
            num_layers=2
        )
        
        # Output heads
        self.heads = nn.ModuleList()
        for ftype, params in feature_info:
            if ftype == 'categorical':
                self.heads.append(nn.Linear(embed_dim, params['num_classes']))
            else:
                self.heads.append(nn.Linear(embed_dim, 1))

    def encode(self, x):
        embeddings = []
        for i, (ftype, params) in enumerate(self.feature_info):
            feature = x[:, i].unsqueeze(1)
            if ftype == 'categorical':
                emb = self.embeddings[f'emb_{i}'](feature.long().squeeze())
            elif ftype == 'binary':
                emb = self.proj_binary(feature)
            else:
                emb = self.proj_numerical(feature)
            embeddings.append(emb.unsqueeze(0))
        
        embeddings = torch.cat(embeddings)
        encoded = self.transformer_encoder(embeddings)
        latent = self.latent_proj(encoded.permute(1, 0, 2).flatten(1))
        return latent

    def decode(self, latent):
        batch_size = latent.size(0)
        x = self.decoder_input(latent)
        x = x.view(batch_size, self.num_features, self.embed_dim).permute(1, 0, 2)
        decoded = self.transformer_decoder(x, x)
        
        outputs = []
        for i, head in enumerate(self.heads):
            outputs.append(head(decoded[i]))
        return torch.cat(outputs, dim=1)

    def forward(self, x):
        latent = self.encode(x)
        return self.decode(latent)

# Data preparation and preprocessing
def preprocess_data(df):
    # Identify feature types
    feature_info = []
    label_encoders = {}
    scalers = {}
    
    categorical = []
    binary = []
    numerical = []
    
    for col in df.columns:
        unique = df[col].nunique()
        if unique > 2:
            categorical.append(col)
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            label_encoders[col] = le
            feature_info.append(('categorical', {'num_classes': unique}))
        elif unique == 2:
            binary.append(col)
            df[col] = df[col].astype(int)
            feature_info.append(('binary', None))
        else:
            numerical.append(col)
            feature_info.append(('numerical', None))
    
    # Scale numerical features
    if numerical:
        scaler = MinMaxScaler()
        df[numerical] = scaler.fit_transform(df[numerical])
        scalers['numerical'] = scaler
    
    return df, feature_info, label_encoders, scalers

# Main execution
dataset = 'creditcard'
file_path = f'..\\..\\..\\..\\Extra_Datasets\\{dataset}.csv'
data = pd.read_csv(file_path)

# Preprocess entire dataset
X_orig = data.iloc[:, :-1]
y_orig = data.iloc[:, -1]

# Process minority class
real_minority = data[y_orig == 1]
X_minority = real_minority.iloc[:, :-1]
y_minority = real_minority.iloc[:, -1]

# Preprocess minority data
X_processed, feature_info, label_encoders, scalers = preprocess_data(X_minority.copy())
input_dim = X_processed.shape[1]

# Training parameters
multiplier_list = [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5, 2.75, 3]
# multiplier_list = [0.25]
latent_dim = 4
num_epochs = 1000
batch_size = 32
# Initialize model
model = TabularTransformerAE(feature_info, latent_dim=latent_dim)
optimizer = Adam(model.parameters(), lr=0.001)

# Convert data to tensor
X_tensor = torch.tensor(X_processed.values, dtype=torch.float32)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for i in range(0, len(X_tensor), batch_size):
        batch = X_tensor[i:i+batch_size]
        optimizer.zero_grad()
        
        reconstructed = model(batch)
        loss = 0
        
        # Calculate per-feature loss
        start_idx = 0
        for j, (ftype, params) in enumerate(feature_info):
            if ftype == 'categorical':
                end_idx = start_idx + params['num_classes']
                loss += F.cross_entropy(reconstructed[:, start_idx:end_idx], 
                                        batch[:, j].long())
                start_idx = end_idx
            else:
                end_idx = start_idx + 1
                if ftype == 'binary':
                    loss += F.binary_cross_entropy_with_logits(
                        reconstructed[:, start_idx], batch[:, j])
                else:
                    loss += F.mse_loss(reconstructed[:, start_idx], batch[:, j])
                start_idx = end_idx
        
        loss.backward()
        optimizer.step()

# Generate latent space
with torch.no_grad():
    latent = model.encode(X_tensor).numpy()

# Apply Forest Diffusion
forest_model = ForestDiffusionModel(
    X=latent,
    n_t=50,
    duplicate_K=100,
    diffusion_type='flow',
    n_jobs=-1
)
for multiplier in multiplier_list:
    print(f"Results for multiplier={multiplier}")
    
    
    
    # Generate synthetic latent samples
    synthetic_latent = forest_model.generate(batch_size=round(multiplier * len(X_tensor)))
    
    # Decode samples
    with torch.no_grad():
        synthetic_tensor = model.decode(torch.tensor(synthetic_latent, dtype=torch.float32))
    
    # Convert to original feature space
    synthetic_df = pd.DataFrame()
    start_idx = 0
    for j, col in enumerate(X_minority.columns):
        ftype, params = feature_info[j]
        
        if ftype == 'categorical':
            end_idx = start_idx + params['num_classes']
            probs = F.softmax(synthetic_tensor[:, start_idx:end_idx], dim=1)
            preds = torch.argmax(probs, dim=1).numpy()
            synthetic_df[col] = label_encoders[col].inverse_transform(preds)
            start_idx = end_idx
        else:
            end_idx = start_idx + 1
            if ftype == 'binary':
                preds = (torch.sigmoid(synthetic_tensor[:, start_idx]) > 0.5).numpy().astype(int)
                synthetic_df[col] = preds
            else:
                vals = synthetic_tensor[:, start_idx].numpy()
                synthetic_df[col] = scalers['numerical'].inverse_transform(vals.reshape(-1, 1))
            start_idx = end_idx
    
    # Rest of the evaluation pipeline (same as original)
    synthetic_df['Class'] = 1
    augmented_dataset = pd.concat([data, synthetic_df], ignore_index=True)
    
    y_balanced = augmented_dataset.iloc[:, -1] 
    X_balanced= augmented_dataset.iloc[:, :-1]  
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    # Step 7: Train a simple classifier on both original and generated datasets
    from xgboost import XGBClassifier  # Replacing RandomForestClassifier with XGBoost
    # Train classifier using XGBoost
    clf_orig = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    clf_orig.fit(X_train_orig, y_train_orig)

    clf_bal = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    clf_bal.fit(X_train_bal, y_train_bal)

    # Predict and evaluate
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recall_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recall_bal:.4f}")
    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))
    from sklearn.svm import SVC
    clf_orig = SVC(kernel='rbf', random_state=42)
    clf_orig.fit(X_train_orig, y_train_orig)
    
    clf_bal = SVC(kernel='rbf', random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Predict and calculate recall and F1 scores
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recall_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recall_bal:.4f}")
    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))



Results for multiplier=0.25
Recall score (original data): 0.8015
Recall score (generated data): 0.9191
Classification Report (original data):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.95      0.80      0.87       136

    accuracy                           1.00     85443
   macro avg       0.97      0.90      0.93     85443
weighted avg       1.00      1.00      1.00     85443

Classification Report (generated data):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.94      0.92      0.93       136

    accuracy                           1.00     85443
   macro avg       0.97      0.96      0.96     85443
weighted avg       1.00      1.00      1.00     85443

Results for multiplier=0.5
Recall score (original data): 0.8015
Recall score (generated data): 0.9265
Classification Report (original data):
               precis