In [1]:
import sys
import os

# Define the path to your folder
folder_path = f'..\\..\\..\\Models\\AutoDiffusion' 

# Add the folder to sys.path
sys.path.append(folder_path)

In [2]:
import numpy as np
#import process_edited as pce
import process_GQ as pce
import autoencoder as ae
import diffusion as diff
import TabDDPMdiff as TabDiff
import pandas as pd
import torch
import os
import time
import numpy as np

In [3]:
# Function to adjust the correlations of the generated data
def adjust_correlation(X, target_corr_matrix):
    L = np.linalg.cholesky(target_corr_matrix)
    X_centered = X - np.mean(X, axis=0)
    X_transformed = np.dot(X_centered, L.T)
    return X_transformed + np.mean(X, axis=0)



In [4]:
strings_set = {'diabetes','oil','yeast_ml8_dataset','creditcard_sampled','HTRU','mammography'}
Model = 'AutoDiff'
# dataset = 'diabetes'
metrics_list = []
for dataset in strings_set:
    print(f"Result Metrics for AutoDiff Autoencoder & ForestDIffusion for {dataset} dataset")
    file_path = f'..\\..\\..\\Datasets\\Original Data\\{dataset}.csv'
    # Read dataframe
    # print(file_path)
    real_df = pd.read_csv(file_path)
    #real_df = real_df.drop('url', axis=1)
    # # Step 2: Inspect the data and check for class imbalance
    # # Assuming the last column is the label, and the rest are features
    X = real_df.iloc[:, :-1].values  # Features
    y = real_df.iloc[:, -1].values  # Labels (binary classification)
    #  # Separate the minority class
    # Find the minority class

    real_minortiy = real_df[y == 1]

    threshold = 0.01 # Threshold for mixed-type variables
    parser = pce.DataFrameParser().fit(real_minortiy, threshold)
    ################################################################################################################
    # Auto-encoder hyper-parameters
    device = 'cuda' #@param {'type':'string'}
    n_epochs = 2000 #@param {'type':'integer'}
    eps = 1e-5 #@param {type:"number"}
    weight_decay = 1e-6 #@param {'type':'number'}
    maximum_learning_rate = 1e-2 #@param {'type':'number'}
    lr = 2e-4 #@param {'type':'number'}
    hidden_size = 250
    num_layers = 3
    batch_size = real_minortiy.shape[0] # Full batch

    ds = ae.train_autoencoder(real_minortiy, hidden_size, num_layers, lr, weight_decay, n_epochs, batch_size, threshold)
    latent_features = ds[1].detach()

    from ForestDiffusion import ForestDiffusionModel


    # Convert to NumPy array
    array = latent_features.detach().cpu().numpy()

    correlation_matrix = pd.DataFrame(array).corr()
    forest_model = ForestDiffusionModel(array, label_y=None, n_t=50, duplicate_K=100, bin_indexes=[], cat_indexes=[], int_indexes=[], diffusion_type='flow', n_jobs=-1)
    minority_fake = forest_model.generate(batch_size=len(real_minortiy)) # Adjust the batch size to create a balanced dataset


    # Step 6: Adjust the synthetic data to match the original data's correlation matrix
    X_minority_fake_adjusted = adjust_correlation(minority_fake, correlation_matrix)


    sample=torch.tensor(X_minority_fake_adjusted, dtype=torch.float32)
    sample.shape
    gen_output = ds[0](sample, ds[2], ds[3])
    gen_df = pce.convert_to_table(real_minortiy, gen_output, threshold)

    output_directory =  f'..\\..\\..\\Datasets\\Synthetic Data\\'
    filename = f'{Model}+Forest_{dataset}_Synthetic.csv'
    output_file = os.path.join(output_directory, filename)
    gen_df.to_csv(output_file, index=False) 


    # Select a random sample of the generated data
    selected_samples = gen_df.sample(n=min(100,gen_df.shape[0]), random_state=42)  # For reproducibility
    # Syn _df will be the dataset after augmentation
    syn_df = pd.concat([real_df, selected_samples], ignore_index=True)


    augmented_output_directory =  f'..\\..\\..\\Datasets\\Augmented Data\\'
    filename = f'{Model}+Forest_{dataset}_Augmented.csv'
    augmented_output_file = os.path.join(augmented_output_directory, filename)
    syn_df.to_csv(augmented_output_file, index=False) 

    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
    import numpy as np
    import matplotlib.pyplot as plt
    from ForestDiffusion import ForestDiffusionModel
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import recall_score, f1_score

    # real_df = pd.read_csv(filename)
    # syn_filename = f'{string}/{Model}_{string}_Augmented.csv'

    # augmented_df = pd.read_csv(syn_filename)
    augmented_df=syn_df

    X = real_df.iloc[:, :-1].values  # Features
    y = real_df.iloc[:, -1].values 
    # Check and print the original class distribution
    unique, counts = np.unique(y, return_counts=True)
    class_dist_before = dict(zip(unique, counts))
    print(f"Class distribution before augmentation: {class_dist_before}")# Labels (binary classification)

    X_balanced = augmented_df.iloc[:, :-1].values  # Features
    y_balanced = augmented_df.iloc[:, -1].values  # Labels (binary classification)

    # Check and print the Augmented class distribution
    unique, counts = np.unique(y_balanced, return_counts=True)
    class_dist_after = dict(zip(unique, counts))
    print(f"Class distribution after augmentation: {class_dist_after}")

    # Step 6: Split the dataset into training and test sets (original and balanced)
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X, y, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    # Step 7: Train a simple classifier on both original and generated datasets
    clf_orig = RandomForestClassifier(random_state=42)
    clf_orig.fit(X_train_orig, y_train_orig)

    clf_bal = RandomForestClassifier(random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Step 8: Predict and calculate recall and F1 scores
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    prec_orig = precision_score(y_test_orig, y_pred_orig)
    prec_bal = precision_score(y_test_orig, y_pred_bal)

    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recalls_bal = recall_score(y_test_orig, y_pred_bal)

    f1_orig = f1_score(y_test_orig, y_pred_orig)
    f1_bal = f1_score(y_test_orig, y_pred_bal)

    # Step 9: Print and store the performance metrics
    # Store metrics in a dictionary
    metrics = {
    "Dataset": dataset,
    "Precision_Original": prec_orig,
    "Precision_Generated": prec_bal,
    "Recall_Original": recall_orig,
    "Recall_Generated": recalls_bal,
    "F1_Original": f1_orig,   
    "F1_Generated": f1_bal,
    "Num_Fake_Samples": len(augmented_df) - len(real_df),
    "Synthetic/Original_Ratio":100*(len(augmented_df) - len(real_df))/len(real_minortiy)
    }

    # Append the dictionary to the list
    metrics_list.append(metrics)

    print(f"Precision score (original data): {prec_orig:.4f}")
    print(f"Precision score (generated data): {prec_bal:.4f}")
    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recalls_bal:.4f}")
    print(f"F1 score (original data): {f1_orig:.4f}")
    print(f"F1 score (generated data): {f1_bal:.4f}")
    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))


    print(f"Number of fake samples generated: {len(augmented_df)-len(real_df)}")

Result Metrics for AutoDiff Autoencoder & ForestDIffusion for HTRU dataset


  0%|          | 0/2000 [00:00<?, ?it/s]

Class distribution before augmentation: {0: 16259, 1: 1639}
Class distribution after augmentation: {0.0: 16259, 1.0: 1739}
Precision score (original data): 0.9376
Precision score (generated data): 0.9727
Recall score (original data): 0.8354
Recall score (generated data): 0.8807
F1 score (original data): 0.8836
F1 score (generated data): 0.9244
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.98      0.99      0.99      4884
           1       0.94      0.84      0.88       486

    accuracy                           0.98      5370
   macro avg       0.96      0.91      0.94      5370
weighted avg       0.98      0.98      0.98      5370

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      4884
           1       0.97      0.88      0.92       486

    accuracy                           0.99      5370
   macro avg       0.98     

  0%|          | 0/2000 [00:00<?, ?it/s]

Class distribution before augmentation: {0: 500, 1: 268}
Class distribution after augmentation: {0.0: 500, 1.0: 368}
Precision score (original data): 0.6386
Precision score (generated data): 0.7647
Recall score (original data): 0.6625
Recall score (generated data): 0.8125
F1 score (original data): 0.6503
F1 score (generated data): 0.7879
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.82      0.80      0.81       151
           1       0.64      0.66      0.65        80

    accuracy                           0.75       231
   macro avg       0.73      0.73      0.73       231
weighted avg       0.76      0.75      0.75       231

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.90      0.87      0.88       151
           1       0.76      0.81      0.79        80

    accuracy                           0.85       231
   macro avg       0.83      0.84 

  0%|          | 0/2000 [00:00<?, ?it/s]

Class distribution before augmentation: {-1: 10923, 1: 260}
Class distribution after augmentation: {-1.0: 10923, 1.0: 360}
Precision score (original data): 0.9167
Precision score (generated data): 0.9310
Recall score (original data): 0.5946
Recall score (generated data): 0.7297
F1 score (original data): 0.7213
F1 score (generated data): 0.8182
Classification Report (original data):
               precision    recall  f1-score   support

          -1       0.99      1.00      0.99      3281
           1       0.92      0.59      0.72        74

    accuracy                           0.99      3355
   macro avg       0.95      0.80      0.86      3355
weighted avg       0.99      0.99      0.99      3355

Classification Report (generated data):
               precision    recall  f1-score   support

          -1       0.99      1.00      1.00      3281
           1       0.93      0.73      0.82        74

    accuracy                           0.99      3355
   macro avg       0.96     

  0%|          | 0/2000 [00:00<?, ?it/s]

Class distribution before augmentation: {0: 4000, 1: 50}
Class distribution after augmentation: {0.0: 4000, 1.0: 100}
Precision score (original data): 0.8667
Precision score (generated data): 0.8750
Recall score (original data): 0.7647
Recall score (generated data): 0.8235
F1 score (original data): 0.8125
F1 score (generated data): 0.8485
Classification Report (original data):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1198
           1       0.87      0.76      0.81        17

    accuracy                           1.00      1215
   macro avg       0.93      0.88      0.90      1215
weighted avg       0.99      1.00      0.99      1215

Classification Report (generated data):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1198
           1       0.88      0.82      0.85        17

    accuracy                           1.00      1215
   macro avg       0.94      0.91

  0%|          | 0/2000 [00:00<?, ?it/s]

Class distribution before augmentation: {-1: 2239, 1: 178}
Class distribution after augmentation: {-1.0: 2239, 1.0: 278}
Precision score (original data): 0.0000
Precision score (generated data): 1.0000
Recall score (original data): 0.0000
Recall score (generated data): 0.5510
F1 score (original data): 0.0000
F1 score (generated data): 0.7105
Classification Report (original data):
               precision    recall  f1-score   support

          -1       0.93      1.00      0.97       677
           1       0.00      0.00      0.00        49

    accuracy                           0.93       726
   macro avg       0.47      0.50      0.48       726
weighted avg       0.87      0.93      0.90       726

Classification Report (generated data):
               precision    recall  f1-score   support

          -1       0.97      1.00      0.98       677
           1       1.00      0.55      0.71        49

    accuracy                           0.97       726
   macro avg       0.98      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/2000 [00:00<?, ?it/s]

LinAlgError: Matrix is not positive definite

In [5]:
metrics_df = pd.DataFrame(metrics_list)
metrics_df.head()

Unnamed: 0,Dataset,Precision_Original,Precision_Generated,Recall_Original,Recall_Generated,F1_Original,F1_Generated,Num_Fake_Samples,Synthetic/Original_Ratio
0,HTRU,0.937644,0.972727,0.835391,0.880658,0.883569,0.924406,100,6.101281
1,diabetes,0.638554,0.764706,0.6625,0.8125,0.650307,0.787879,100,37.313433
2,mammography,0.916667,0.931034,0.594595,0.72973,0.721311,0.818182,100,38.461538
3,creditcard_sampled,0.866667,0.875,0.764706,0.823529,0.8125,0.848485,50,100.0
4,yeast_ml8_dataset,0.0,1.0,0.0,0.55102,0.0,0.710526,100,56.179775
