In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# Load the dataset. The last column holds the class label; every column before
# it is a feature.
# NOTE(review): read_csv treats the first row as a header here -- confirm that
# HTRU_2.csv really ships with a header row.
data = pd.read_csv('HTRU_2.csv')
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Report how many samples each class has before any augmentation.
unique, counts = np.unique(y, return_counts=True)
class_dist_before = {label: n_rows for label, n_rows in zip(unique, counts)}
print(f"Class distribution before augmentation: {class_dist_before}")

# Feature rows of the minority class (label 1).
minority_mask = y == 1
X_minority = X[minority_mask]

# Stratified 70/30 hold-out split of the original data.
X_train_orig, X_test, y_train_orig, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Baseline: random forest fitted on the un-augmented training split and scored
# on the held-out test split.
clf_orig = RandomForestClassifier(random_state=42).fit(X_train_orig, y_train_orig)
y_pred_orig = clf_orig.predict(X_test)

recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


# ---------------------------------------------------------------------------
# Resize the minority class of the TRAINING split only, retrain, and score on
# the untouched test split (X_test / y_test).
#
# Resampling must never see the test rows: if SMOTE is fitted on the full data
# and the result is re-split, test rows (and synthetic points interpolated from
# them) end up in the training set and the reported scores are inflated.
# ---------------------------------------------------------------------------

# Class sizes in the training split. Label 1 is the minority class; every other
# label counts as majority.
minority_class_count = int(np.sum(y_train_orig == 1))
majority_class_count = int(np.sum(y_train_orig != 1))

# Target number of minority samples: the current training minority plus 50%.
# Edit this value to drive the reduce / oversample / no-op branches below.
desired_minority_class_size = minority_class_count + minority_class_count // 2

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class by keeping a random subset of it.
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Seeded generator so the selected subset is the same on every run.
    reduced_minority_indices = np.random.default_rng(42).choice(
        minority_indices,
        size=desired_minority_class_size,
        replace=False,
    )

    # These are positions inside the training arrays, so index those arrays
    # (not the full X / y).
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X_train_orig[combined_indices]
    y_balanced = y_train_orig[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample the training minority up to the desired size.
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X_train_orig, y_train_orig)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X_train_orig)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # Desired size equals the current size: train on the original training split.
    X_balanced, y_balanced = X_train_orig, y_train_orig
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# The resampled data IS the training set; evaluation always uses the original
# hold-out split. X_test_bal / y_test_bal are kept as aliases so any later cell
# that reads them still works.
X_train_bal, y_train_bal = X_balanced, y_balanced
X_test_bal, y_test_bal = X_test, y_test

# Train a Random Forest Classifier after augmentation
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")

In [None]:
# =============================================================================
# AutoDiff auto-encoder + ForestDiffusion augmentation sweep.
#
# For every dataset and every augmentation ratio:
#   1. train the auto-encoder on the minority rows (label 1),
#   2. fit a ForestDiffusion model on the auto-encoder's latent features,
#   3. generate synthetic minority rows and decode them back to table form,
#   4. compare minority recall of a random forest trained on the original
#      training split vs. the training split + synthetic rows, both scored on
#      the same held-out real test split.
# One recall-vs-ratio plot is saved per dataset.
# =============================================================================

# --- Imports used by this cell (hoisted out of the loops) --------------------
# NOTE(review): `pce` and `ae` are not imported anywhere in the visible part of
# this notebook; they must already be in the kernel namespace (presumably
# imported by another cell) before this cell runs.
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from ForestDiffusion import ForestDiffusionModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split

# --- Experiment configuration -------------------------------------------------
strings_set = ['diabetes','mammography','creditcard_sampled','spambase','reduced_diabetes','reduced_spambase']
# strings_set = {'diabetes'}
Model = 'AutoDiff'
metrics_list = []
epoch=2000
sample_size_percent_list = [50,100,200]

# Data locations, built with os.path.join so they work with any path separator.
ORIGINAL_DATA_DIR = os.path.join('..', '..', '..', 'Datasets', 'Original Data')
EXTRA_DATA_DIR = os.path.join('..', '..', '..', '..', 'Extra_Datasets')
augmented_output_directory = os.path.join('..', '..', '..', 'Datasets', 'Augmented Data')

# "reduced_*" variants read the base CSV and drop a fixed number of minority
# rows before the generator is trained: name -> (base CSV name, rows to drop).
REDUCED_DATASETS = {
    'reduced_diabetes': ('diabetes', 68),
    'reduced_spambase': ('spambase', 813),
}

# Auto-encoder hyper-parameters (loop-invariant, so set once).
# `device`, `eps` and `maximum_learning_rate` are not passed to
# ae.train_autoencoder in this cell; they are kept for other cells / Colab forms.
threshold = 0.01 # Threshold for mixed-type variables
device = 'cuda' #@param {'type':'string'}
n_epochs = epoch #@param {'type':'integer'}
eps = 1e-5 #@param {type:"number"}
weight_decay = 1e-6 #@param {'type':'number'}
maximum_learning_rate = 1e-2 #@param {'type':'number'}
lr = 2e-4 #@param {'type':'number'}
hidden_size = 250
num_layers = 3

for dataset in strings_set:

    # Minority recall after augmentation, one entry per augmentation ratio.
    recall_list =[]
    for sample_size_percent in sample_size_percent_list:
        print(f"Recall Score for AutoDiff Autoencoder & ForestDIffusion for {dataset} dataset")

        # ---- Locate and load the real data ----------------------------------
        source_name, minority_rows_to_drop = REDUCED_DATASETS.get(dataset, (dataset, 0))
        if dataset == 'creditcard':
            # NOTE(review): never taken with the current `strings_set`, which
            # lists 'creditcard_sampled' -- confirm which location is intended.
            file_path = os.path.join(EXTRA_DATA_DIR, f'{dataset}.csv')
        else:
            file_path = os.path.join(ORIGINAL_DATA_DIR, f'{source_name}.csv')
        real_df = pd.read_csv(file_path)

        # The last column is the (binary) label, the rest are features.
        X = real_df.iloc[:, :-1].values  # Features
        y = real_df.iloc[:, -1].values  # Labels (binary classification)

        # Minority rows (label 1) used to train the generator.
        # NOTE(review): for the "reduced_*" datasets only this generator input
        # is reduced; `real_df` used for classification keeps every row --
        # confirm that this is the intended experiment.
        real_minortiy = real_df[y == 1]
        if minority_rows_to_drop:
            real_minortiy = real_minortiy.sample(
                n=len(real_minortiy) - minority_rows_to_drop, random_state=42
            )

        # Not used further in this cell; kept in case other cells read `parser`.
        parser = pce.DataFrameParser().fit(real_minortiy, threshold)

        # ---- Train the auto-encoder and the latent-space generator ----------
        # NOTE(review): the generator is fitted on ALL minority rows, including
        # rows that later fall into the test split. Fitting it on training rows
        # only would remove this remaining (indirect) leakage, but changes the
        # sample counts of the experiment, so it is left as-is here.
        batch_size = real_minortiy.shape[0] # Full batch
        ds = ae.train_autoencoder(real_minortiy, hidden_size, num_layers, lr, weight_decay, n_epochs, batch_size, threshold)
        latent_features = ds[1].detach()

        # ForestDiffusion takes a NumPy array.
        array = latent_features.cpu().numpy()
        forest_model = ForestDiffusionModel(array, label_y=None, n_t=50, duplicate_K=100, bin_indexes=[], cat_indexes=[], int_indexes=[], diffusion_type='flow', n_jobs=-1)

        # ---- Generate synthetic minority rows --------------------------------
        # Each generate() call yields len(real_minortiy) rows, so ratios above
        # 100% need several batches.
        div=math.ceil(sample_size_percent/100)
        generated_batches = []
        for i in range (div):
            minority_fake = forest_model.generate(batch_size=len(real_minortiy))
            sample=torch.tensor(minority_fake, dtype=torch.float32)
            # ds[0] is called on the latent sample with ds[2], ds[3]; its output
            # is converted back to a table shaped like `real_minortiy`.
            gen_output = ds[0](sample, ds[2], ds[3])
            batch_df = pce.convert_to_table(real_minortiy, gen_output, threshold)
            generated_batches.append(batch_df)
        # Concatenate once instead of growing a DataFrame inside the loop.
        gen_df = pd.concat(generated_batches, ignore_index=True) if generated_batches else pd.DataFrame()

        # Keep exactly the number of synthetic rows the ratio asks for.
        num_samples_to_generate = len(real_minortiy)*sample_size_percent // 100
        selected_samples = gen_df.sample(n=num_samples_to_generate, random_state=42)  # For reproducibility

        # Full augmented table (all real rows + selected synthetic rows). Used
        # for the distribution report and the optional CSV export only.
        syn_df = pd.concat([real_df, selected_samples], ignore_index=True)
        augmented_df=syn_df

        filename = f'{Model}+Forest_{dataset}_Augmented.csv'
        augmented_output_file = os.path.join(augmented_output_directory, filename)
        # syn_df.to_csv(augmented_output_file, index=False)

        # ---- Class distributions ---------------------------------------------
        unique, counts = np.unique(y, return_counts=True)
        class_dist_before = dict(zip(unique, counts))
        print(f"Class distribution before augmentation: {class_dist_before}")

        X_balanced = augmented_df.iloc[:, :-1].values  # Features
        y_balanced = augmented_df.iloc[:, -1].values  # Labels (binary classification)

        unique, counts = np.unique(y_balanced, return_counts=True)
        class_dist_after = dict(zip(unique, counts))
        print(f"Class distribution after augmentation: {class_dist_after}")

        # ---- Train / test split ----------------------------------------------
        # Split the REAL data once. Row positions are split in the same call so
        # the real training rows can be looked up in `real_df` afterwards.
        row_positions = np.arange(len(real_df))
        (X_train_orig, X_test_orig,
         y_train_orig, y_test_orig,
         train_positions, test_positions) = train_test_split(
            X, y, row_positions, test_size=0.3, random_state=42
        )

        # Augmented training set = real training rows + synthetic rows.
        # Splitting the augmented table independently (as was done before)
        # places real test rows into the training set, which inflates the score
        # measured on X_test_orig.
        train_aug_df = pd.concat(
            [real_df.iloc[train_positions], selected_samples], ignore_index=True
        )
        X_train_bal = train_aug_df.iloc[:, :-1].values
        y_train_bal = train_aug_df.iloc[:, -1].values
        # Both models are scored on the same real hold-out rows; aliases kept
        # for any later cell that reads these names.
        X_test_bal, y_test_bal = X_test_orig, y_test_orig

        # ---- Fit baseline and augmented classifiers --------------------------
        clf_orig = RandomForestClassifier(random_state=42)
        clf_orig.fit(X_train_orig, y_train_orig)

        clf_bal = RandomForestClassifier(random_state=42)
        clf_bal.fit(X_train_bal, y_train_bal)

        # ---- Score both on the real test split -------------------------------
        y_pred_orig = clf_orig.predict(X_test_orig)
        y_pred_bal = clf_bal.predict(X_test_orig)

        prec_orig = precision_score(y_test_orig, y_pred_orig)
        prec_bal = precision_score(y_test_orig, y_pred_bal)

        recall_orig = recall_score(y_test_orig, y_pred_orig)
        recalls_bal = recall_score(y_test_orig, y_pred_bal)

        # ---- Record the metrics for this (dataset, ratio) pair ---------------
        metrics = {
        "Dataset": dataset,
        "Augmentaion Ratio":f'{sample_size_percent}%',
        "Recall_Original": recall_orig,
        "Recall_Generated": recalls_bal,

        }
        metrics_list.append(metrics)
        recall_list.append(recalls_bal)

    # ---- Plot recall vs augmentation ratio for this dataset ------------------
    # A fresh figure per dataset so curves never accumulate across datasets.
    fig, ax = plt.subplots()
    ax.plot(sample_size_percent_list, recall_list, marker='o', linestyle='-', color='b', label='Recall Score (Minor Class)')
    ax.set_title(f'Recall Score vs Augmentation Ratio \n at {epoch} epochs \n for {dataset} dataset')
    ax.set_xlabel('Augmentation Ratio')
    ax.set_ylabel('Recall Score (Augmented Data)')
    ax.legend(loc="upper left")
    ax.grid(True)

    # Save before showing: show() may release the figure in notebook backends.
    diagram_name=f'Recall Score vs Augmentation Ratio for {dataset} dataset(AutoDiffandForest).png'
    fig.savefig(diagram_name)
    plt.show()
    plt.close(fig)

# Convert the list of metric dictionaries into a DataFrame and display it.
metrics_df = pd.DataFrame(metrics_list)
metrics_df