# This notebook tries to augment data using forest diffusion on datasets

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import numpy as np
import matplotlib.pyplot as plt
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, f1_score


metrics_list = []
# Step 1: Load the CSV file
#file_path = 'creditcard.csv'  # Update this path to your local CSV file
# strings_set = {'diabetes','oil','yeast_ml8_dataset','creditcard_sampled','HTRU','mammography'}
strings_set = {'diabetes','oil','creditcard_sampled','HTRU','mammography'}
for dataset in strings_set:
    print(f"# Result Metrics for Vanilla ForestDiffusion for {dataset} dataset")
    file_path = f'..\\..\\..\\Datasets\\Original Data\\{dataset}.csv'
    
    data = pd.read_csv(file_path)

    # Step 2: Inspect the data and check for class imbalance
    # Assuming the last column is the label, and the rest are features
    X = data.iloc[:, :-1].values  # Features
    y = data.iloc[:, -1].values  # Labels (binary classification)

    # Check and print the original class distribution
    unique, counts = np.unique(y, return_counts=True)
    class_dist_before = dict(zip(unique, counts))
    print(f"Class distribution before augmentation: {class_dist_before}")

    # # Step 3: Plot the original imbalanced data (first two features for visualization)
    # plt.figure(figsize=(10, 5))
    # plt.subplot(1, 2, 1)
    # plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', label='Original Data', s=1)
    # plt.title('Original Imbalanced Data')
    # plt.xlabel('Feature 1')
    # plt.ylabel('Feature 2')
    # plt.show()

    # Separate the minority class
    X_minority = X[y == 1]
    y_minority = y[y==1]
    # Identify integer columns
    int_columns = data.select_dtypes(include=['int']).columns
    int_indexes = []
    for col in int_columns:
        col_index = data.columns.get_loc(col)
        int_indexes.append(col_index)
    import pandas as pd
    # Step 4: Upsample the minority class using ForestDiffusionModel
    forest_model = ForestDiffusionModel(X_minority, label_y=y_minority, n_t=50, duplicate_K=100, bin_indexes=[], cat_indexes=[], int_indexes=[], diffusion_type='flow', n_jobs=-1)
    Xy_minority_fake = forest_model.generate(batch_size=100 )  # Adjust the batch size to create a balanced dataset
    # Add generated samples to the main imbalanced dataset
    X_minority_fake = Xy_minority_fake[:, :-1]   # Features
    y_minority_fake = Xy_minority_fake[:, -1] # Labels (binary classification)
    X_balanced = np.concatenate((X, X_minority_fake), axis=0)
    y_balanced = np.concatenate((y, y_minority_fake), axis=0)
    
    # # Step 5: Plot the generated data (first two features for visualization)
    # plt.subplot(1, 2, 2)
    # plt.scatter(X_balanced[:, 0], X_balanced[:, 1], c=y_balanced, cmap='viridis', label='Generated Data', s=1)
    # plt.title('Data After Generation')
    # plt.xlabel('Feature 1')
    # plt.ylabel('Feature 2')
    # plt.show()

    # Check and print the class distribution after augmentation
    unique_bal, counts_bal = np.unique(y_balanced, return_counts=True)
    class_dist_after = dict(zip(unique_bal, counts_bal))
    print(f"Class distribution after augmentation: {class_dist_after}")

    # Step 6: Split the dataset into training and test sets (original and balanced)
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X, y, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    # Step 7: Train a simple classifier on both original and generated datasets
    clf_orig = RandomForestClassifier(random_state=42)
    clf_orig.fit(X_train_orig, y_train_orig)

    clf_bal = RandomForestClassifier(random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Step 8: Predict and calculate recall and F1 scores
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    prec_orig = precision_score(y_test_orig, y_pred_orig)
    prec_bal = precision_score(y_test_orig, y_pred_bal)


    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recalls_bal = recall_score(y_test_orig, y_pred_bal)

    f1_orig = f1_score(y_test_orig, y_pred_orig)
    f1_bal = f1_score(y_test_orig, y_pred_bal)

    # Step 9: Print the performance metrics
    metrics = {
        "Dataset": dataset,
        "Precision_Original": prec_orig,
        "Recall_Original": recall_orig,
        "F1_Original": f1_orig,
        "Precision_Generated": prec_bal,
        "Recall_Generated": recalls_bal,
        "F1_Generated": f1_bal,
        "Num_Fake_Samples": len(X_balanced) - len(X),
        "Synthetic/Original_Ratio":100*(len(X_balanced) - len(X))/len(Xy_minority_fake)
    }

    # Append the dictionary to the list
    metrics_list.append(metrics)
    
    
    
    print(f"Precision score (original data): {prec_orig:.4f}")
    print(f"Precision score (generated data): {prec_bal:.4f}")
    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recalls_bal:.4f}")
    print(f"F1 score (original data): {f1_orig:.4f}")
    print(f"F1 score (generated data): {f1_bal:.4f}")
    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))

    # Step 10: Print the number of fake samples generated
    print(f"Number of fake samples generated: {len(X_minority_fake)}")
    


# Result Metrics for Vanilla ForestDiffusion for HTRU dataset
Class distribution before augmentation: {0: 16259, 1: 1639}
Class distribution after augmentation: {0.0: 16259, 1.0: 1739}
Precision score (original data): 0.9376
Precision score (generated data): 0.9703
Recall score (original data): 0.8354
Recall score (generated data): 0.8745
F1 score (original data): 0.8836
F1 score (generated data): 0.9199
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.98      0.99      0.99      4884
           1       0.94      0.84      0.88       486

    accuracy                           0.98      5370
   macro avg       0.96      0.91      0.94      5370
weighted avg       0.98      0.98      0.98      5370

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      4884
           1       0.97      0.87      0.92       486

    accuracy        

In [4]:
# Convert the list of dictionaries into a DataFrame
metrics_df = pd.DataFrame(metrics_list)

# Save the DataFrame to a CSV file
metrics_df.to_csv("Vanilla_Forest_different_datasets_metric.csv", index=False)

# Print the DataFrame
print(metrics_df)

              Dataset  Precision_Original  Recall_Original  F1_Original  \
0         mammography            0.916667         0.594595     0.721311   
1                 oil            0.600000         0.272727     0.375000   
2  creditcard_sampled            0.866667         0.764706     0.812500   
3                HTRU            0.937644         0.835391     0.883569   
4            diabetes            0.638554         0.662500     0.650307   

   Precision_Generated  Recall_Generated  F1_Generated  Num_Fake_Samples  \
0             0.964286          0.729730      0.830769               100   
1             0.900000          0.818182      0.857143               100   
2             0.882353          0.882353      0.882353               100   
3             0.970320          0.874486      0.919913               100   
4             0.735632          0.800000      0.766467               100   

   Synthetic/Original_Ratio  
0                     100.0  
1                     100.0  
2 

In [11]:
metrics_df[metrics_df['Dataset']=='creditcard_sampled'] 

Unnamed: 0,Dataset,Precision_Original,Recall_Original,F1_Original,Precision_Generated,Recall_Generated,F1_Generated,Num_Fake_Samples,Synthetic/Original_Ratio
2,creditcard_sampled,0.866667,0.764706,0.8125,0.882353,0.882353,0.882353,100,100.0


In [9]:
metrics_df

Unnamed: 0,Dataset,Precision_Original,Recall_Original,F1_Original,Precision_Generated,Recall_Generated,F1_Generated,Num_Fake_Samples,Synthetic/Original_Ratio
0,mammography,0.916667,0.594595,0.721311,0.964286,0.72973,0.830769,100,100.0
1,oil,0.6,0.272727,0.375,0.9,0.818182,0.857143,100,100.0
2,creditcard_sampled,0.866667,0.764706,0.8125,0.882353,0.882353,0.882353,100,100.0
3,HTRU,0.937644,0.835391,0.883569,0.97032,0.874486,0.919913,100,100.0
4,diabetes,0.638554,0.6625,0.650307,0.735632,0.8,0.766467,100,100.0


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import numpy as np
import matplotlib.pyplot as plt
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, f1_score


metrics_list = []
# Step 1: Load the CSV file
#file_path = 'creditcard.csv'  # Update this path to your local CSV file
# strings_set = {'diabetes','oil','yeast_ml8_dataset','creditcard_sampled','HTRU','mammography'}
strings_set = {'reduced_diabetes'}
for dataset in strings_set:
    print(f"# Result Metrics for Vanilla ForestDiffusion for {dataset} dataset")
    file_path = f'..\\..\\..\\Datasets\\Original Data\\{dataset}.csv'
    
    data = pd.read_csv(file_path)

    # Step 2: Inspect the data and check for class imbalance
    # Assuming the last column is the label, and the rest are features
    X = data.iloc[:, :-1].values  # Features
    y = data.iloc[:, -1].values  # Labels (binary classification)

    # Check and print the original class distribution
    unique, counts = np.unique(y, return_counts=True)
    class_dist_before = dict(zip(unique, counts))
    print(f"Class distribution before augmentation: {class_dist_before}")

    # # Step 3: Plot the original imbalanced data (first two features for visualization)
    # plt.figure(figsize=(10, 5))
    # plt.subplot(1, 2, 1)
    # plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', label='Original Data', s=1)
    # plt.title('Original Imbalanced Data')
    # plt.xlabel('Feature 1')
    # plt.ylabel('Feature 2')
    # plt.show()

    # Separate the minority class
    X_minority = X[y == 1]
    y_minority = y[y==1]
    # Identify integer columns
    int_columns = data.select_dtypes(include=['int']).columns
    int_indexes = []
    for col in int_columns:
        col_index = data.columns.get_loc(col)
        int_indexes.append(col_index)
    import pandas as pd
    # Step 4: Upsample the minority class using ForestDiffusionModel
    forest_model = ForestDiffusionModel(X_minority, label_y=y_minority, n_t=50, duplicate_K=100, bin_indexes=[], cat_indexes=[], int_indexes=[], diffusion_type='flow', n_jobs=-1)
    Xy_minority_fake = forest_model.generate(batch_size=60)  # Adjust the batch size to create a balanced dataset
    # Add generated samples to the main imbalanced dataset
    X_minority_fake = Xy_minority_fake[:, :-1]   # Features
    y_minority_fake = Xy_minority_fake[:, -1] # Labels (binary classification)
    X_balanced = np.concatenate((X, X_minority_fake), axis=0)
    y_balanced = np.concatenate((y, y_minority_fake), axis=0)
    
    # # Step 5: Plot the generated data (first two features for visualization)
    # plt.subplot(1, 2, 2)
    # plt.scatter(X_balanced[:, 0], X_balanced[:, 1], c=y_balanced, cmap='viridis', label='Generated Data', s=1)
    # plt.title('Data After Generation')
    # plt.xlabel('Feature 1')
    # plt.ylabel('Feature 2')
    # plt.show()

    # Check and print the class distribution after augmentation
    unique_bal, counts_bal = np.unique(y_balanced, return_counts=True)
    class_dist_after = dict(zip(unique_bal, counts_bal))
    print(f"Class distribution after augmentation: {class_dist_after}")

    # Step 6: Split the dataset into training and test sets (original and balanced)
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X, y, test_size=0.3, random_state=42)
    X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    # Step 7: Train a simple classifier on both original and generated datasets
    clf_orig = RandomForestClassifier(random_state=42)
    clf_orig.fit(X_train_orig, y_train_orig)

    clf_bal = RandomForestClassifier(random_state=42)
    clf_bal.fit(X_train_bal, y_train_bal)

    # Step 8: Predict and calculate recall and F1 scores
    y_pred_orig = clf_orig.predict(X_test_orig)
    y_pred_bal = clf_bal.predict(X_test_orig)

    prec_orig = precision_score(y_test_orig, y_pred_orig)
    prec_bal = precision_score(y_test_orig, y_pred_bal)


    recall_orig = recall_score(y_test_orig, y_pred_orig)
    recalls_bal = recall_score(y_test_orig, y_pred_bal)

    f1_orig = f1_score(y_test_orig, y_pred_orig)
    f1_bal = f1_score(y_test_orig, y_pred_bal)

    # Step 9: Print the performance metrics
    metrics = {
        "Dataset": dataset,
        "Precision_Original": prec_orig,
        "Recall_Original": recall_orig,
        "F1_Original": f1_orig,
        "Precision_Generated": prec_bal,
        "Recall_Generated": recalls_bal,
        "F1_Generated": f1_bal,
        "Num_Fake_Samples": len(X_balanced) - len(X),
        "Synthetic/Original_Ratio":100*(len(X_balanced) - len(X))/len(Xy_minority_fake)
    }

    # Append the dictionary to the list
    metrics_list.append(metrics)
    
    
    
    print(f"Precision score (original data): {prec_orig:.4f}")
    print(f"Precision score (generated data): {prec_bal:.4f}")
    print(f"Recall score (original data): {recall_orig:.4f}")
    print(f"Recall score (generated data): {recalls_bal:.4f}")
    print(f"F1 score (original data): {f1_orig:.4f}")
    print(f"F1 score (generated data): {f1_bal:.4f}")
    print("Classification Report (original data):\n", classification_report(y_test_orig, y_pred_orig))
    print("Classification Report (generated data):\n", classification_report(y_test_orig, y_pred_bal))

    # Step 10: Print the number of fake samples generated
    print(f"Number of fake samples generated: {len(X_minority_fake)}")
    

# Result Metrics for Vanilla ForestDiffusion for reduced_diabetes dataset
Class distribution before augmentation: {0: 500, 1: 60}
Class distribution after augmentation: {0.0: 500, 1.0: 120}
Precision score (original data): 0.4286
Precision score (generated data): 0.8750
Recall score (original data): 0.1765
Recall score (generated data): 0.8235
F1 score (original data): 0.2500
F1 score (generated data): 0.8485
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.91      0.97      0.94       151
           1       0.43      0.18      0.25        17

    accuracy                           0.89       168
   macro avg       0.67      0.57      0.60       168
weighted avg       0.86      0.89      0.87       168

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.98      0.99      0.98       151
           1       0.88      0.82      0.85        17

    accuracy   

In [5]:
import pandas as pd
import numpy as np

# Assuming data is the existing dataframe with column names
column_names = list(data.columns[:-1])  # Exclude the target column name
target_column_name = data.columns[-1]  # Name of the target column

# Create the DataFrame from X_balanced
X_balanced_df = pd.DataFrame(X_balanced, columns=column_names)

# Add the target column
X_balanced_df[target_column_name] = y_balanced

X_balanced_df.to_csv('augmented_dataset__only-forest_reduced_diabetes.csv', index=False)