# Base Code
## Contains the model & classifier
#### Done for reduced diabetes

In [33]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier
from ForestDiffusion import ForestDiffusionModel  # Assuming ForestDiffusionModel is available

# Define Autoencoder
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded


# Load the dataset
dataset='reduced_diabetes'
file_path = f'..\\..\\..\\Datasets\\Original Data\\{dataset}.csv'
data = pd.read_csv(file_path)  # Replace with your dataset path

# Separate features and target

X_orig = data.iloc[:, :-1]  
y_orig = data.iloc[:, -1] 



real_minortiy = data[y_orig == 1]
X = real_minortiy.iloc[:, :-1]  # Assuming last column is the target
y = real_minortiy.iloc[:, -1]  # Target variable (if needed for evaluation)
# Preprocess features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_orig_scaled=scaler.fit_transform(X_orig)
# Convert to PyTorch tensor
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
dim_list=[4]
for dim in dim_list:
    print(f"# Result Metrics for Simple Autoencoder+ForestDiffusion for {dataset} dataset")
    print(f'for latent space dimensions={dim}')
    # Initialize Autoencoder
    input_dim = X_tensor.shape[1]
    latent_dim = dim  # Low-dimensional space
    autoencoder = Autoencoder(input_dim, latent_dim)
    criterion = nn.MSELoss()
    optimizer = Adam(autoencoder.parameters(), lr=0.01)

    # Train the Autoencoder
    num_epochs = 2000
    batch_size = 32
    for epoch in range(num_epochs):
        autoencoder.train()
        for i in range(0, X_tensor.size(0), batch_size):
            batch = X_tensor[i:i+batch_size]
            optimizer.zero_grad()
            encoded, decoded = autoencoder(batch)
            loss = criterion(decoded, batch)
            loss.backward()
            optimizer.step()
        # if (epoch + 1) % 10 == 0:
        #     print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Get Latent Space Representation
    autoencoder.eval()
    with torch.no_grad():
        latent_space = autoencoder.encoder(X_tensor).numpy()  # Encoded dataset in latent space

    # Apply ForestDiffusionModel
    forest_model = ForestDiffusionModel(
        X=latent_space,
        label_y=None,         # No target labels required for diffusion
        n_t=50,               # Number of timesteps
        duplicate_K=100,      # Augmentation factor
        diffusion_type='flow',# Flow-based diffusion
        n_jobs=-1             # Use all available CPU cores
    )

    # Generate synthetic samples
    batch_size = 100  # Adjust the batch size for your needs
    synthetic_latent_space = forest_model.generate(batch_size=len(X))

    # Decode the synthetic samples back into feature space
    synthetic_tensor = torch.tensor(synthetic_latent_space, dtype=torch.float32)
    with torch.no_grad():
        synthetic_features = autoencoder.decoder(synthetic_tensor).numpy()  # Decoded synthetic features
    # scale it back
    synthetic_features=scaler.inverse_transform(synthetic_features)
    # Combine original and synthetic datasets
    synthetic_df = pd.DataFrame(synthetic_features, columns=X.columns)  # Convert synthetic features to DataFrame
    synthetic_df['target'] = 1  # Assign a label for synthetic data (if needed)

    original_df = pd.DataFrame(X_orig, columns=X.columns)
    original_df['target'] = y_orig.values  # Append original target

    # Concatenate original and synthetic datasets
    augmented_dataset = pd.concat([original_df, synthetic_df], ignore_index=True)

    # Save the augmented dataset
    augmented_dataset.to_csv('augmented_dataset__(simple)autoencoder-forest_reduced_diabetes.csv', index=False)

    # print("Augmented dataset created and saved successfully.")

    # Step 6: Split the dataset into training and test sets (original and balanced)
    X_balanced= augmented_dataset.iloc[:, :-1]  
    y_balanced = augmented_dataset.iloc[:, -1] 
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    # Step 7: Train a simple classifier on both original and generated datasets
    clf_orig = RandomForestClassifier(random_state=42)
    clf_orig.fit(X_train_orig, y_train_orig)

    clf_bal = RandomForestClassifier(random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Step 8: Predict and calculate recall and F1 scores
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recalls_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recalls_bal:.4f}")

    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))



# Result Metrics for Simple Autoencoder+ForestDiffusion for reduced_diabetes dataset
for latent space dimensions=4
Recall score (original data): 0.1765
Recall score (generated data): 0.8235
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.91      0.97      0.94       151
           1       0.43      0.18      0.25        17

    accuracy                           0.89       168
   macro avg       0.67      0.57      0.60       168
weighted avg       0.86      0.89      0.87       168

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.98      0.99      0.98       151
           1       0.88      0.82      0.85        17

    accuracy                           0.97       168
   macro avg       0.93      0.91      0.92       168
weighted avg       0.97      0.97      0.97       168



# Reduced_Diabetes 
#### Looping through Percentages

In [31]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier
from ForestDiffusion import ForestDiffusionModel  # Assuming ForestDiffusionModel is available

# Define Autoencoder
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded


# Load the dataset
dataset='reduced_diabetes'
file_path = f'..\\..\\..\\Datasets\\Original Data\\{dataset}.csv'
data = pd.read_csv(file_path)  # Replace with your dataset path

# Separate features and target

X_orig = data.iloc[:, :-1]  
y_orig = data.iloc[:, -1] 



real_minortiy = data[y_orig == 1]
X = real_minortiy.iloc[:, :-1]  # Assuming last column is the target
y = real_minortiy.iloc[:, -1]  # Target variable (if needed for evaluation)
# Preprocess features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_orig_scaled=scaler.fit_transform(X_orig)
# Convert to PyTorch tensor
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
dim=4
multiplier_list=[0.25,0.5,0.75,1,1.25,1.5,1.75,2,2.25,2.5,2.75,3]
for multiplier in multiplier_list:
    print(f"# Result Metrics for Simple Autoencoder(latent space dimensions={dim})+ForestDiffusion for {dataset} dataset at augmentation ratio = {int(100*multiplier)} % ")
    # Initialize Autoencoder
    input_dim = X_tensor.shape[1]
    latent_dim = dim  # Low-dimensional space
    autoencoder = Autoencoder(input_dim, latent_dim)
    criterion = nn.MSELoss()
    optimizer = Adam(autoencoder.parameters(), lr=0.01)

    # Train the Autoencoder
    num_epochs = 2000
    batch_size = 32
    for epoch in range(num_epochs):
        autoencoder.train()
        for i in range(0, X_tensor.size(0), batch_size):
            batch = X_tensor[i:i+batch_size]
            optimizer.zero_grad()
            encoded, decoded = autoencoder(batch)
            loss = criterion(decoded, batch)
            loss.backward()
            optimizer.step()
        # if (epoch + 1) % 10 == 0:
        #     print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Get Latent Space Representation
    autoencoder.eval()
    with torch.no_grad():
        latent_space = autoencoder.encoder(X_tensor).numpy()  # Encoded dataset in latent space

    # Apply ForestDiffusionModel
    forest_model = ForestDiffusionModel(
        X=latent_space,
        label_y=None,         # No target labels required for diffusion
        n_t=50,               # Number of timesteps
        duplicate_K=100,      # Augmentation factor
        diffusion_type='flow',# Flow-based diffusion
        n_jobs=-1             # Use all available CPU cores
    )

    # Generate synthetic samples
    batch_size = 100  # Adjust the batch size for your needs
    synthetic_latent_space = forest_model.generate(batch_size=round(multiplier*len(X)))

    # Decode the synthetic samples back into feature space
    synthetic_tensor = torch.tensor(synthetic_latent_space, dtype=torch.float32)
    with torch.no_grad():
        synthetic_features = autoencoder.decoder(synthetic_tensor).numpy()  # Decoded synthetic features
    # scale it back
    synthetic_features=scaler.inverse_transform(synthetic_features)
    # Combine original and synthetic datasets
    synthetic_df = pd.DataFrame(synthetic_features, columns=X.columns)  # Convert synthetic features to DataFrame
    synthetic_df['target'] = 1  # Assign a label for synthetic data (if needed)

    original_df = pd.DataFrame(X_orig, columns=X.columns)
    original_df['target'] = y_orig.values  # Append original target

    # Concatenate original and synthetic datasets
    augmented_dataset = pd.concat([original_df, synthetic_df], ignore_index=True)

    # Save the augmented dataset
    # augmented_dataset.to_csv('augmented_dataset__(simple)autoencoder-forest_reduced_diabetes.csv', index=False)

    # print("Augmented dataset created and saved successfully.")

    # Step 6: Split the dataset into training and test sets (original and balanced)
    X_balanced= augmented_dataset.iloc[:, :-1]  
    y_balanced = augmented_dataset.iloc[:, -1] 
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    # Step 7: Train a simple classifier on both original and generated datasets
    clf_orig = RandomForestClassifier(random_state=42)
    clf_orig.fit(X_train_orig, y_train_orig)

    clf_bal = RandomForestClassifier(random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Step 8: Predict and calculate recall and F1 scores
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recalls_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recalls_bal:.4f}")

    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))



# Result Metrics for Simple Autoencoder(latent space dimensions=4)+ForestDiffusion for reduced_diabetes dataset at augmentation ratio = 25 % 
Recall score (original data): 0.1765
Recall score (generated data): 0.6471
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.91      0.97      0.94       151
           1       0.43      0.18      0.25        17

    accuracy                           0.89       168
   macro avg       0.67      0.57      0.60       168
weighted avg       0.86      0.89      0.87       168

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.96      0.99      0.98       151
           1       0.92      0.65      0.76        17

    accuracy                           0.96       168
   macro avg       0.94      0.82      0.87       168
weighted avg       0.96      0.96      0.96       168

# Result Metrics for Simple Autoencoder(latent s

Here is the table displaying the recall scores for both classes (0 and 1) in the generated data for each augmentation ratio:

| Augmentation Ratio (%) | Recall Score (Class 0) | Recall Score (Class 1) |
|------------------------|------------------------|------------------------|
| 25                     | 0.99                   | 0.65                   |
| 50                     | 0.99                   | 0.47                   |
| 75                     | 0.99                   | 0.82                   |
| 100                    | 0.99                   | 0.82                   |
| 125                    | 0.98                   | 0.94                   |
| 150                    | 0.99                   | 0.76                   |
| 175                    | 0.99                   | 0.76                   |
| 200                    | 0.97                   | 0.88                   |
| 225                    | 0.99                   | 0.94                   |
| 250                    | 0.96                   | 0.71                   |
| 275                    | 0.95                   | 0.88                   |
| 300                    | 0.98                   | 0.88                   |

# Diabetes 
#### Looping through Percentages

In [32]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier
from ForestDiffusion import ForestDiffusionModel  # Assuming ForestDiffusionModel is available

# Define Autoencoder
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded


# Load the dataset
dataset='mammography'
file_path = f'..\\..\\..\\Datasets\\Original Data\\{dataset}.csv'
data = pd.read_csv(file_path)  # Replace with your dataset path

# Separate features and target

X_orig = data.iloc[:, :-1]  
y_orig = data.iloc[:, -1] 



real_minortiy = data[y_orig == 1]
X = real_minortiy.iloc[:, :-1]  # Assuming last column is the target
y = real_minortiy.iloc[:, -1]  # Target variable (if needed for evaluation)
# Preprocess features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_orig_scaled=scaler.fit_transform(X_orig)
# Convert to PyTorch tensor
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
dim=3
multiplier_list=[0.25,0.5,0.75,1,1.25,1.5,1.75,2,2.25,2.5,2.75,3]
for multiplier in multiplier_list:
    print(f"# Result Metrics for Simple Autoencoder(latent space dimensions={dim})+ForestDiffusion for {dataset} dataset at augmentation ratio = {int(100*multiplier)} % ")
    # Initialize Autoencoder
    input_dim = X_tensor.shape[1]
    latent_dim = dim  # Low-dimensional space
    autoencoder = Autoencoder(input_dim, latent_dim)
    criterion = nn.MSELoss()
    optimizer = Adam(autoencoder.parameters(), lr=0.01)

    # Train the Autoencoder
    num_epochs = 2000
    batch_size = 32
    for epoch in range(num_epochs):
        autoencoder.train()
        for i in range(0, X_tensor.size(0), batch_size):
            batch = X_tensor[i:i+batch_size]
            optimizer.zero_grad()
            encoded, decoded = autoencoder(batch)
            loss = criterion(decoded, batch)
            loss.backward()
            optimizer.step()
        # if (epoch + 1) % 10 == 0:
        #     print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Get Latent Space Representation
    autoencoder.eval()
    with torch.no_grad():
        latent_space = autoencoder.encoder(X_tensor).numpy()  # Encoded dataset in latent space

    # Apply ForestDiffusionModel
    forest_model = ForestDiffusionModel(
        X=latent_space,
        label_y=None,         # No target labels required for diffusion
        n_t=50,               # Number of timesteps
        duplicate_K=100,      # Augmentation factor
        diffusion_type='flow',# Flow-based diffusion
        n_jobs=-1             # Use all available CPU cores
    )

    # Generate synthetic samples
    batch_size = 100  # Adjust the batch size for your needs
    synthetic_latent_space = forest_model.generate(batch_size=round(multiplier*len(X)))

    # Decode the synthetic samples back into feature space
    synthetic_tensor = torch.tensor(synthetic_latent_space, dtype=torch.float32)
    with torch.no_grad():
        synthetic_features = autoencoder.decoder(synthetic_tensor).numpy()  # Decoded synthetic features
    # scale it back
    synthetic_features=scaler.inverse_transform(synthetic_features)
    # Combine original and synthetic datasets
    synthetic_df = pd.DataFrame(synthetic_features, columns=X.columns)  # Convert synthetic features to DataFrame
    synthetic_df['target'] = 1  # Assign a label for synthetic data (if needed)

    original_df = pd.DataFrame(X_orig, columns=X.columns)
    original_df['target'] = y_orig.values  # Append original target

    # Concatenate original and synthetic datasets
    augmented_dataset = pd.concat([original_df, synthetic_df], ignore_index=True)

    # Save the augmented dataset
    # augmented_dataset.to_csv('augmented_dataset__(simple)autoencoder-forest_reduced_diabetes.csv', index=False)

    # print("Augmented dataset created and saved successfully.")

    # Step 6: Split the dataset into training and test sets (original and balanced)
    X_balanced= augmented_dataset.iloc[:, :-1]  
    y_balanced = augmented_dataset.iloc[:, -1] 
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    # Step 7: Train a simple classifier on both original and generated datasets
    clf_orig = RandomForestClassifier(random_state=42)
    clf_orig.fit(X_train_orig, y_train_orig)

    clf_bal = RandomForestClassifier(random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Step 8: Predict and calculate recall and F1 scores
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recalls_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recalls_bal:.4f}")

    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))



# Result Metrics for Simple Autoencoder(latent space dimensions=3)+ForestDiffusion for mammography dataset at augmentation ratio = 25 % 
Recall score (original data): 0.5946
Recall score (generated data): 0.6892
Classification Report (original data):
               precision    recall  f1-score   support

          -1       0.99      1.00      0.99      3281
           1       0.92      0.59      0.72        74

    accuracy                           0.99      3355
   macro avg       0.95      0.80      0.86      3355
weighted avg       0.99      0.99      0.99      3355

Classification Report (generated data):
               precision    recall  f1-score   support

          -1       0.99      1.00      1.00      3281
           1       0.96      0.69      0.80        74

    accuracy                           0.99      3355
   macro avg       0.98      0.84      0.90      3355
weighted avg       0.99      0.99      0.99      3355

# Result Metrics for Simple Autoencoder(latent space 

Here is a table listing the recall scores for each class (`Class 0` and `Class 1`) based on the augmentation ratios:

| Augmentation Ratio (%) | Recall (Original Data) - Class 0 | Recall (Original Data) - Class 1 | Recall (Generated Data) - Class 0 | Recall (Generated Data) - Class 1 |
|-------------------------|-----------------------------------|-----------------------------------|------------------------------------|------------------------------------|
| 25                      | 0.97                            | 0.18                            | 0.99                               | 0.65                               |
| 50                      | 0.97                            | 0.18                            | 0.99                               | 0.47                               |
| 75                      | 0.97                            | 0.18                            | 0.99                               | 0.82                               |
| 100                     | 0.97                            | 0.18                            | 0.99                               | 0.82                               |
| 125                     | 0.97                            | 0.18                            | 0.98                               | 0.94                               |
| 150                     | 0.97                            | 0.18                            | 0.99                               | 0.76                               |
| 175                     | 0.97                            | 0.18                            | 0.99                               | 0.76                               |
| 200                     | 0.97                            | 0.18                            | 0.97                               | 0.88                               |
| 225                     | 0.97                            | 0.18                            | 0.99                               | 0.94                               |



# Coil_2000 
#### Looping through Percentages

In [36]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier
from ForestDiffusion import ForestDiffusionModel  # Assuming ForestDiffusionModel is available

# Define Autoencoder
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded


# Load the dataset
dataset='coil_2000'
file_path = f'..\\..\\..\\Datasets\\Original Data\\{dataset}.csv'
data = pd.read_csv(file_path)  # Replace with your dataset path

# Separate features and target

X_orig = data.iloc[:, :-1]  
y_orig = data.iloc[:, -1] 



real_minortiy = data[y_orig == 1]
X = real_minortiy.iloc[:, :-1]  # Assuming last column is the target
y = real_minortiy.iloc[:, -1]  # Target variable (if needed for evaluation)
# Preprocess features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_orig_scaled=scaler.fit_transform(X_orig)
# Convert to PyTorch tensor
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
dim=20
multiplier_list=[0.25,0.5,0.75,1,1.25,1.5,1.75,2,2.25,2.5,2.75,3]
for multiplier in multiplier_list:
    print(f"# Result Metrics for Simple Autoencoder(latent space dimensions={dim})+ForestDiffusion for {dataset} dataset at augmentation ratio = {int(100*multiplier)} % ")
    # Initialize Autoencoder
    input_dim = X_tensor.shape[1]
    latent_dim = dim  # Low-dimensional space
    autoencoder = Autoencoder(input_dim, latent_dim)
    criterion = nn.MSELoss()
    optimizer = Adam(autoencoder.parameters(), lr=0.01)

    # Train the Autoencoder
    num_epochs = 2000
    batch_size = 32
    for epoch in range(num_epochs):
        autoencoder.train()
        for i in range(0, X_tensor.size(0), batch_size):
            batch = X_tensor[i:i+batch_size]
            optimizer.zero_grad()
            encoded, decoded = autoencoder(batch)
            loss = criterion(decoded, batch)
            loss.backward()
            optimizer.step()
        # if (epoch + 1) % 10 == 0:
        #     print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Get Latent Space Representation
    autoencoder.eval()
    with torch.no_grad():
        latent_space = autoencoder.encoder(X_tensor).numpy()  # Encoded dataset in latent space

    # Apply ForestDiffusionModel
    forest_model = ForestDiffusionModel(
        X=latent_space,
        label_y=None,         # No target labels required for diffusion
        n_t=50,               # Number of timesteps
        duplicate_K=100,      # Augmentation factor
        diffusion_type='flow',# Flow-based diffusion
        n_jobs=-1             # Use all available CPU cores
    )

    # Generate synthetic samples
    batch_size = 100  # Adjust the batch size for your needs
    synthetic_latent_space = forest_model.generate(batch_size=round(multiplier*len(X)))

    # Decode the synthetic samples back into feature space
    synthetic_tensor = torch.tensor(synthetic_latent_space, dtype=torch.float32)
    with torch.no_grad():
        synthetic_features = autoencoder.decoder(synthetic_tensor).numpy()  # Decoded synthetic features
    # scale it back
    synthetic_features=scaler.inverse_transform(synthetic_features)
    # Combine original and synthetic datasets
    synthetic_df = pd.DataFrame(synthetic_features, columns=X.columns)  # Convert synthetic features to DataFrame
    synthetic_df['target'] = 1  # Assign a label for synthetic data (if needed)

    original_df = pd.DataFrame(X_orig, columns=X.columns)
    original_df['target'] = y_orig.values  # Append original target

    # Concatenate original and synthetic datasets
    augmented_dataset = pd.concat([original_df, synthetic_df], ignore_index=True)

    # Save the augmented dataset
    # augmented_dataset.to_csv('augmented_dataset__(simple)autoencoder-forest_reduced_diabetes.csv', index=False)

    # print("Augmented dataset created and saved successfully.")

    # Step 6: Split the dataset into training and test sets (original and balanced)
    X_balanced= augmented_dataset.iloc[:, :-1]  
    y_balanced = augmented_dataset.iloc[:, -1] 
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    # Step 7: Train a simple classifier on both original and generated datasets
    clf_orig = RandomForestClassifier(random_state=42)
    clf_orig.fit(X_train_orig, y_train_orig)

    clf_bal = RandomForestClassifier(random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Step 8: Predict and calculate recall and F1 scores
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recalls_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recalls_bal:.4f}")

    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))



# Result Metrics for Simple Autoencoder(latent space dimensions=20)+ForestDiffusion for coil_2000 dataset at augmentation ratio = 25 % 
Recall score (original data): 0.0440
Recall score (generated data): 0.3681
Classification Report (original data):
               precision    recall  f1-score   support

          -1       0.94      0.98      0.96      2765
           1       0.14      0.04      0.07       182

    accuracy                           0.92      2947
   macro avg       0.54      0.51      0.51      2947
weighted avg       0.89      0.92      0.90      2947

Classification Report (generated data):
               precision    recall  f1-score   support

          -1       0.96      0.99      0.97      2765
           1       0.68      0.37      0.48       182

    accuracy                           0.95      2947
   macro avg       0.82      0.68      0.73      2947
weighted avg       0.94      0.95      0.94      2947

# Result Metrics for Simple Autoencoder(latent space d

Here’s a table showing the recall scores for each class at various augmentation ratios (up to 300%) for the `Simple Autoencoder(latent space dimensions=20)+ForestDiffusion` model applied to the `coil_2000` dataset:

| Augmentation Ratio (%) | Recall (-1) | Recall (1) |
|-------------------------|-------------|------------|
| 25                     | 0.99        | 0.37       |
| 50                     | 0.99        | 0.42       |
| 75                     | 0.99        | 0.40       |
| 100                    | 0.99        | 0.42       |
| 125                    | 0.99        | 0.37       |
| 150                    | 0.99        | 0.38       |
| 175                    | 0.99        | 0.46       |
| 200                    | 0.99        | 0.43       |
| 225                    | 0.99        | 0.45       |
| 250                    | 0.99        | 0.47       |
| 275                    | 0.99        | 0.49       |
| 300                    | 0.99        | 0.50       |



# coil_2000
#### Optimum no. of latent dimensions

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier
from ForestDiffusion import ForestDiffusionModel  # Assuming ForestDiffusionModel is available

# Define Autoencoder
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded


# Load the dataset
dataset='coil_2000'
file_path = f'..\\..\\..\\Datasets\\Original Data\\{dataset}.csv'
data = pd.read_csv(file_path)  # Replace with your dataset path

# Separate features and target

X_orig = data.iloc[:, :-1]  
y_orig = data.iloc[:, -1] 



real_minortiy = data[y_orig == 1]
X = real_minortiy.iloc[:, :-1]  # Assuming last column is the target
y = real_minortiy.iloc[:, -1]  # Target variable (if needed for evaluation)
# Preprocess features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_orig_scaled=scaler.fit_transform(X_orig)
# Convert to PyTorch tensor
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
list=[5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45]
for dim in list:
    print(f"# Result Metrics for Simple Autoencoder(latent space dimensions={dim})+ForestDiffusion for {dataset} dataset at hidden dimensions = {dim} ")
    # Initialize Autoencoder
    input_dim = X_tensor.shape[1]
    latent_dim = dim  # Low-dimensional space
    autoencoder = Autoencoder(input_dim, latent_dim)
    criterion = nn.MSELoss()
    optimizer = Adam(autoencoder.parameters(), lr=0.01)

    # Train the Autoencoder
    num_epochs = 2000
    batch_size = 32
    for epoch in range(num_epochs):
        autoencoder.train()
        for i in range(0, X_tensor.size(0), batch_size):
            batch = X_tensor[i:i+batch_size]
            optimizer.zero_grad()
            encoded, decoded = autoencoder(batch)
            loss = criterion(decoded, batch)
            loss.backward()
            optimizer.step()
        # if (epoch + 1) % 10 == 0:
        #     print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Get Latent Space Representation
    autoencoder.eval()
    with torch.no_grad():
        latent_space = autoencoder.encoder(X_tensor).numpy()  # Encoded dataset in latent space

    # Apply ForestDiffusionModel
    forest_model = ForestDiffusionModel(
        X=latent_space,
        label_y=None,         # No target labels required for diffusion
        n_t=50,               # Number of timesteps
        duplicate_K=100,      # Augmentation factor
        diffusion_type='flow',# Flow-based diffusion
        n_jobs=-1             # Use all available CPU cores
    )

    # Generate synthetic samples
    batch_size = 100  # Adjust the batch size for your needs
    synthetic_latent_space = forest_model.generate(batch_size=round(len(X)))

    # Decode the synthetic samples back into feature space
    synthetic_tensor = torch.tensor(synthetic_latent_space, dtype=torch.float32)
    with torch.no_grad():
        synthetic_features = autoencoder.decoder(synthetic_tensor).numpy()  # Decoded synthetic features
    # scale it back
    synthetic_features=scaler.inverse_transform(synthetic_features)
    # Combine original and synthetic datasets
    synthetic_df = pd.DataFrame(synthetic_features, columns=X.columns)  # Convert synthetic features to DataFrame
    synthetic_df['target'] = 1  # Assign a label for synthetic data (if needed)

    original_df = pd.DataFrame(X_orig, columns=X.columns)
    original_df['target'] = y_orig.values  # Append original target

    # Concatenate original and synthetic datasets
    augmented_dataset = pd.concat([original_df, synthetic_df], ignore_index=True)

    # Save the augmented dataset
    # augmented_dataset.to_csv('augmented_dataset__(simple)autoencoder-forest_reduced_diabetes.csv', index=False)

    # print("Augmented dataset created and saved successfully.")

    # Step 6: Split the dataset into training and test sets (original and balanced)
    X_balanced= augmented_dataset.iloc[:, :-1]  
    y_balanced = augmented_dataset.iloc[:, -1] 
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    # Step 7: Train a simple classifier on both original and generated datasets
    clf_orig = RandomForestClassifier(random_state=42)
    clf_orig.fit(X_train_orig, y_train_orig)

    clf_bal = RandomForestClassifier(random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Step 8: Predict and calculate recall and F1 scores
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recalls_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recalls_bal:.4f}")

    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))



# Result Metrics for Simple Autoencoder(latent space dimensions=5)+ForestDiffusion for coil_2000 dataset at hidden dimensions = 5 
Recall score (original data): 0.0440
Recall score (generated data): 0.4231
Classification Report (original data):
               precision    recall  f1-score   support

          -1       0.94      0.98      0.96      2765
           1       0.14      0.04      0.07       182

    accuracy                           0.92      2947
   macro avg       0.54      0.51      0.51      2947
weighted avg       0.89      0.92      0.90      2947

Classification Report (generated data):
               precision    recall  f1-score   support

          -1       0.96      0.99      0.97      2765
           1       0.69      0.42      0.52       182

    accuracy                           0.95      2947
   macro avg       0.83      0.71      0.75      2947
weighted avg       0.95      0.95      0.95      2947

# Result Metrics for Simple Autoencoder(latent space dimens

Here is the complete table with recall scores for hidden dimensions up to 45:

| **Hidden Dimensions** | **Recall (-1)** (Original) | **Recall (1)** (Original) | **Recall (-1)** (Generated) | **Recall (1)** (Generated) |
|------------------------|---------------------------|---------------------------|-----------------------------|-----------------------------|
| 5                      | 0.98                      | 0.04                      | 0.99                        | 0.42                        |
| 7                      | 0.98                      | 0.04                      | 0.99                        | 0.43                        |
| 9                      | 0.98                      | 0.04                      | 0.99                        | 0.42                        |
| 11                     | 0.98                      | 0.04                      | 0.99                        | 0.42                        |
| 13                     | 0.98                      | 0.04                      | 0.99                        | 0.42                        |
| 15                     | 0.98                      | 0.04                      | 0.98                        | 0.42                        |
| 17                     | 0.98                      | 0.04                      | 0.99                        | 0.42                        |
| 19                     | 0.98                      | 0.04                      | 0.99                        | 0.42                        |
| 21                     | 0.98                      | 0.04                      | 0.99                        | 0.42                        |
| 23                     | 0.98                      | 0.04                      | 0.99                        | 0.43                        |
| 25                     | 0.98                      | 0.04                      | 0.99                        | 0.43                        |
| 27                     | 0.98                      | 0.04                      | 0.99                        | 0.42                        |
| 29                     | 0.98                      | 0.04                      | 0.99                        | 0.43                        |
| 31                     | 0.98                      | 0.04                      | 0.99                        | 0.43                        |
| 33                     | 0.98                      | 0.04                      | 0.99                        | 0.43                        |
| 35                     | 0.98                      | 0.04                      | 0.99                        | 0.43                        |
| 37                     | 0.98                      | 0.04                      | 0.99                        | 0.43                        |
| 39                     | 0.98                      | 0.04                      | 0.99                        | 0.43                        |
| 41                     | 0.98                      | 0.04                      | 0.99                        | 0.43                        |
| 43                     | 0.98                      | 0.04                      | 0.99                        | 0.43                        |
| 45                     | 0.98                      | 0.04                      | 0.99                        | 0.43                        |



##### So I will use half dimensions as original data

# Mammography
#### Looping through Percentages

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier
from ForestDiffusion import ForestDiffusionModel  # Assuming ForestDiffusionModel is available

# Define Autoencoder
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded


# Load the dataset
dataset='Mammography'
file_path = f'..\\..\\..\\Datasets\\Original Data\\{dataset}.csv'
data = pd.read_csv(file_path)  # Replace with your dataset path

# Separate features and target

X_orig = data.iloc[:, :-1]  
y_orig = data.iloc[:, -1] 



real_minortiy = data[y_orig == 1]
X = real_minortiy.iloc[:, :-1]  # Assuming last column is the target
y = real_minortiy.iloc[:, -1]  # Target variable (if needed for evaluation)
# Preprocess features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_orig_scaled=scaler.fit_transform(X_orig)
# Convert to PyTorch tensor
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
dim=3
multiplier_list=[0.25,0.5,0.75,1,1.25,1.5,1.75,2,2.25,2.5,2.75,3]
for multiplier in multiplier_list:
    print(f"# Result Metrics for Simple Autoencoder(latent space dimensions={dim})+ForestDiffusion for {dataset} dataset at augmentation ratio = {int(100*multiplier)} % ")
    # Initialize Autoencoder
    input_dim = X_tensor.shape[1]
    latent_dim = dim  # Low-dimensional space
    autoencoder = Autoencoder(input_dim, latent_dim)
    criterion = nn.MSELoss()
    optimizer = Adam(autoencoder.parameters(), lr=0.01)

    # Train the Autoencoder
    num_epochs = 2000
    batch_size = 32
    for epoch in range(num_epochs):
        autoencoder.train()
        for i in range(0, X_tensor.size(0), batch_size):
            batch = X_tensor[i:i+batch_size]
            optimizer.zero_grad()
            encoded, decoded = autoencoder(batch)
            loss = criterion(decoded, batch)
            loss.backward()
            optimizer.step()
        # if (epoch + 1) % 10 == 0:
        #     print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Get Latent Space Representation
    autoencoder.eval()
    with torch.no_grad():
        latent_space = autoencoder.encoder(X_tensor).numpy()  # Encoded dataset in latent space

    # Apply ForestDiffusionModel
    forest_model = ForestDiffusionModel(
        X=latent_space,
        label_y=None,         # No target labels required for diffusion
        n_t=50,               # Number of timesteps
        duplicate_K=100,      # Augmentation factor
        diffusion_type='flow',# Flow-based diffusion
        n_jobs=-1             # Use all available CPU cores
    )

    # Generate synthetic samples
    batch_size = 100  # Adjust the batch size for your needs
    synthetic_latent_space = forest_model.generate(batch_size=round(multiplier*len(X)))

    # Decode the synthetic samples back into feature space
    synthetic_tensor = torch.tensor(synthetic_latent_space, dtype=torch.float32)
    with torch.no_grad():
        synthetic_features = autoencoder.decoder(synthetic_tensor).numpy()  # Decoded synthetic features
    # scale it back
    synthetic_features=scaler.inverse_transform(synthetic_features)
    # Combine original and synthetic datasets
    synthetic_df = pd.DataFrame(synthetic_features, columns=X.columns)  # Convert synthetic features to DataFrame
    synthetic_df['target'] = 1  # Assign a label for synthetic data (if needed)

    original_df = pd.DataFrame(X_orig, columns=X.columns)
    original_df['target'] = y_orig.values  # Append original target

    # Concatenate original and synthetic datasets
    augmented_dataset = pd.concat([original_df, synthetic_df], ignore_index=True)

    # Save the augmented dataset
    # augmented_dataset.to_csv('augmented_dataset__(simple)autoencoder-forest_reduced_diabetes.csv', index=False)

    # print("Augmented dataset created and saved successfully.")

    # Step 6: Split the dataset into training and test sets (original and balanced)
    X_balanced= augmented_dataset.iloc[:, :-1]  
    y_balanced = augmented_dataset.iloc[:, -1] 
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    # Step 7: Train a simple classifier on both original and generated datasets
    clf_orig = RandomForestClassifier(random_state=42)
    clf_orig.fit(X_train_orig, y_train_orig)

    clf_bal = RandomForestClassifier(random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Step 8: Predict and calculate recall and F1 scores
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recalls_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recalls_bal:.4f}")

    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))



# Result Metrics for Simple Autoencoder(latent space dimensions=3)+ForestDiffusion for Mammography dataset at augmentation ratio = 25 % 
Recall score (original data): 0.5946
Recall score (generated data): 0.7027
Classification Report (original data):
               precision    recall  f1-score   support

          -1       0.99      1.00      0.99      3281
           1       0.92      0.59      0.72        74

    accuracy                           0.99      3355
   macro avg       0.95      0.80      0.86      3355
weighted avg       0.99      0.99      0.99      3355

Classification Report (generated data):
               precision    recall  f1-score   support

          -1       0.99      1.00      1.00      3281
           1       0.98      0.70      0.82        74

    accuracy                           0.99      3355
   macro avg       0.99      0.85      0.91      3355
weighted avg       0.99      0.99      0.99      3355

# Result Metrics for Simple Autoencoder(latent space 

# Reduced Cardio Train
#### Looping through Percentages

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier
from ForestDiffusion import ForestDiffusionModel  # Assuming ForestDiffusionModel is available

# Define Autoencoder
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded


# Load the dataset
dataset='reduced_cardio_train'
file_path = f'..\\..\\..\\Datasets\\Original Data\\{dataset}.csv'
data = pd.read_csv(file_path)  # Replace with your dataset path

# Separate features and target

X_orig = data.iloc[:, :-1]  
y_orig = data.iloc[:, -1] 



real_minortiy = data[y_orig == 1]
X = real_minortiy.iloc[:, :-1]  # Assuming last column is the target
y = real_minortiy.iloc[:, -1]  # Target variable (if needed for evaluation)
# Preprocess features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_orig_scaled=scaler.fit_transform(X_orig)
# Convert to PyTorch tensor
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
dim=5
multiplier_list=[0.25,0.5,0.75,1,1.25,1.5,1.75,2,2.25,2.5,2.75,3]
for multiplier in multiplier_list:
    print(f"# Result Metrics for Simple Autoencoder(latent space dimensions={dim})+ForestDiffusion for {dataset} dataset at augmentation ratio = {int(100*multiplier)} % ")
    # Initialize Autoencoder
    input_dim = X_tensor.shape[1]
    latent_dim = dim  # Low-dimensional space
    autoencoder = Autoencoder(input_dim, latent_dim)
    criterion = nn.MSELoss()
    optimizer = Adam(autoencoder.parameters(), lr=0.01)

    # Train the Autoencoder
    num_epochs = 2000
    batch_size = 32
    for epoch in range(num_epochs):
        autoencoder.train()
        for i in range(0, X_tensor.size(0), batch_size):
            batch = X_tensor[i:i+batch_size]
            optimizer.zero_grad()
            encoded, decoded = autoencoder(batch)
            loss = criterion(decoded, batch)
            loss.backward()
            optimizer.step()
        # if (epoch + 1) % 10 == 0:
        #     print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Get Latent Space Representation
    autoencoder.eval()
    with torch.no_grad():
        latent_space = autoencoder.encoder(X_tensor).numpy()  # Encoded dataset in latent space

    # Apply ForestDiffusionModel
    forest_model = ForestDiffusionModel(
        X=latent_space,
        label_y=None,         # No target labels required for diffusion
        n_t=50,               # Number of timesteps
        duplicate_K=100,      # Augmentation factor
        diffusion_type='flow',# Flow-based diffusion
        n_jobs=-1             # Use all available CPU cores
    )

    # Generate synthetic samples
    batch_size = 100  # Adjust the batch size for your needs
    synthetic_latent_space = forest_model.generate(batch_size=round(multiplier*len(X)))

    # Decode the synthetic samples back into feature space
    synthetic_tensor = torch.tensor(synthetic_latent_space, dtype=torch.float32)
    with torch.no_grad():
        synthetic_features = autoencoder.decoder(synthetic_tensor).numpy()  # Decoded synthetic features
    # scale it back
    synthetic_features=scaler.inverse_transform(synthetic_features)
    # Combine original and synthetic datasets
    synthetic_df = pd.DataFrame(synthetic_features, columns=X.columns)  # Convert synthetic features to DataFrame
    synthetic_df['target'] = 1  # Assign a label for synthetic data (if needed)

    original_df = pd.DataFrame(X_orig, columns=X.columns)
    original_df['target'] = y_orig.values  # Append original target

    # Concatenate original and synthetic datasets
    augmented_dataset = pd.concat([original_df, synthetic_df], ignore_index=True)

    # Save the augmented dataset
    # augmented_dataset.to_csv('augmented_dataset__(simple)autoencoder-forest_reduced_diabetes.csv', index=False)

    # print("Augmented dataset created and saved successfully.")

    # Step 6: Split the dataset into training and test sets (original and balanced)
    X_balanced= augmented_dataset.iloc[:, :-1]  
    y_balanced = augmented_dataset.iloc[:, -1] 
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    # Step 7: Train a simple classifier on both original and generated datasets
    clf_orig = RandomForestClassifier(random_state=42)
    clf_orig.fit(X_train_orig, y_train_orig)

    clf_bal = RandomForestClassifier(random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Step 8: Predict and calculate recall and F1 scores
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recalls_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recalls_bal:.4f}")

    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))



# Result Metrics for Simple Autoencoder(latent space dimensions=5)+ForestDiffusion for reduced_cardio_train dataset at augmentation ratio = 25 % 
Recall score (original data): 0.0840
Recall score (generated data): 0.6950
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.90      0.98      0.94     10517
           1       0.37      0.08      0.14      1190

    accuracy                           0.89     11707
   macro avg       0.64      0.53      0.54     11707
weighted avg       0.85      0.89      0.86     11707

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.97      0.99      0.98     10517
           1       0.88      0.69      0.78      1190

    accuracy                           0.96     11707
   macro avg       0.92      0.84      0.88     11707
weighted avg       0.96      0.96      0.96     11707

# Result Metrics for Simple Autoencoder(late

# Reduced Smart Grid Stability Dataset 
#### Looping through Percentages

In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier
from ForestDiffusion import ForestDiffusionModel  # Assuming ForestDiffusionModel is available

# Define Autoencoder
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded


# Load the dataset
dataset='reduced_smart_grid_stability'
file_path = f'..\\..\\..\\Datasets\\Original Data\\{dataset}.csv'
data = pd.read_csv(file_path)  # Replace with your dataset path

# Separate features and target

X_orig = data.iloc[:, :-1]  
y_orig = data.iloc[:, -1] 



real_minortiy = data[y_orig == 1]
X = real_minortiy.iloc[:, :-1]  # Assuming last column is the target
y = real_minortiy.iloc[:, -1]  # Target variable (if needed for evaluation)
# Preprocess features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_orig_scaled=scaler.fit_transform(X_orig)
# Convert to PyTorch tensor
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
dim=6
multiplier_list=[0.25,0.5,0.75,1,1.25,1.5,1.75,2,2.25,2.5,2.75,3]
for multiplier in multiplier_list:
    print(f"# Result Metrics for Simple Autoencoder(latent space dimensions={dim})+ForestDiffusion for {dataset} dataset at augmentation ratio = {int(100*multiplier)} % ")
    # Initialize Autoencoder
    input_dim = X_tensor.shape[1]
    latent_dim = dim  # Low-dimensional space
    autoencoder = Autoencoder(input_dim, latent_dim)
    criterion = nn.MSELoss()
    optimizer = Adam(autoencoder.parameters(), lr=0.01)

    # Train the Autoencoder
    num_epochs = 2000
    batch_size = 32
    for epoch in range(num_epochs):
        autoencoder.train()
        for i in range(0, X_tensor.size(0), batch_size):
            batch = X_tensor[i:i+batch_size]
            optimizer.zero_grad()
            encoded, decoded = autoencoder(batch)
            loss = criterion(decoded, batch)
            loss.backward()
            optimizer.step()
        # if (epoch + 1) % 10 == 0:
        #     print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Get Latent Space Representation
    autoencoder.eval()
    with torch.no_grad():
        latent_space = autoencoder.encoder(X_tensor).numpy()  # Encoded dataset in latent space

    # Apply ForestDiffusionModel
    forest_model = ForestDiffusionModel(
        X=latent_space,
        label_y=None,         # No target labels required for diffusion
        n_t=50,               # Number of timesteps
        duplicate_K=100,      # Augmentation factor
        diffusion_type='flow',# Flow-based diffusion
        n_jobs=-1             # Use all available CPU cores
    )

    # Generate synthetic samples
    batch_size = 100  # Adjust the batch size for your needs
    synthetic_latent_space = forest_model.generate(batch_size=round(multiplier*len(X)))

    # Decode the synthetic samples back into feature space
    synthetic_tensor = torch.tensor(synthetic_latent_space, dtype=torch.float32)
    with torch.no_grad():
        synthetic_features = autoencoder.decoder(synthetic_tensor).numpy()  # Decoded synthetic features
    # scale it back
    synthetic_features=scaler.inverse_transform(synthetic_features)
    # Combine original and synthetic datasets
    synthetic_df = pd.DataFrame(synthetic_features, columns=X.columns)  # Convert synthetic features to DataFrame
    synthetic_df['target'] = 1  # Assign a label for synthetic data (if needed)

    original_df = pd.DataFrame(X_orig, columns=X.columns)
    original_df['target'] = y_orig.values  # Append original target

    # Concatenate original and synthetic datasets
    augmented_dataset = pd.concat([original_df, synthetic_df], ignore_index=True)

    # Save the augmented dataset
    # augmented_dataset.to_csv('augmented_dataset__(simple)autoencoder-forest_reduced_diabetes.csv', index=False)

    # print("Augmented dataset created and saved successfully.")

    # Step 6: Split the dataset into training and test sets (original and balanced)
    X_balanced= augmented_dataset.iloc[:, :-1]  
    y_balanced = augmented_dataset.iloc[:, -1] 
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    # Step 7: Train a simple classifier on both original and generated datasets
    clf_orig = RandomForestClassifier(random_state=42)
    clf_orig.fit(X_train_orig, y_train_orig)

    clf_bal = RandomForestClassifier(random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Step 8: Predict and calculate recall and F1 scores
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recalls_bal = recall_score(y_test_orig, y_pred_bal)

    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recalls_bal:.4f}")

    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))



# Result Metrics for Simple Autoencoder(latent space dimensions=6)+ForestDiffusion for reduced_smart_grid_stability dataset at augmentation ratio = 25 % 
Recall score (original data): 0.4338
Recall score (generated data): 0.8344
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.92      1.00      0.96      1912
           1       0.96      0.43      0.60       302

    accuracy                           0.92      2214
   macro avg       0.94      0.72      0.78      2214
weighted avg       0.92      0.92      0.91      2214

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.97      1.00      0.99      1912
           1       0.99      0.83      0.90       302

    accuracy                           0.98      2214
   macro avg       0.98      0.92      0.95      2214
weighted avg       0.98      0.98      0.98      2214

# Result Metrics for Simple Autoenco