In [6]:
"""Data balancing via SMOTE

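Reads our dataset and synthesizes an enhanced dataset from the original data. This allows us to
publish the synthetic data without compromising the confidentiality of the clinical data, and to compensate for the
class imbalance in the original samples.

NOTE(review): over-samplers normally return the original rows together with the synthetic ones, so the
exported files may still contain the original clinical records -- verify before publishing.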
"""

# %%
import pandas as pd
from imblearn.over_sampling import (
    ADASYN,
    SMOTE,
    BorderlineSMOTE,
    KMeansSMOTE,
    SVMSMOTE,
    RandomOverSampler,
)

# Name of the target (label) column; it is split off from the features
# before resampling.
predictor_col = "HOMA-IR alterado"

# Seed handed to every sampler as `random_state`.
RANDOM_STATE = 10

# Load the dataset.
dataset = pd.read_csv("data.csv")

In [7]:
def disasemble(dataset, target_col=None):
    """Encode the categorical columns as integers and split off the target.

    The encoding is applied to a copy, so the caller's frame is left
    untouched. (Assigning the encoded columns straight onto the argument
    would silently rewrite the module-level ``dataset`` on the first call.)

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the columns "Género", "ATPII/AHA/IDF", "aleator" and
        the target column.
    target_col : str, optional
        Name of the target column. When omitted, the module-level
        ``predictor_col`` is used.

    Returns
    -------
    tuple
        ``(y_df, X_df)`` -- note the order: first the target column cast to
        ``category``, then the frame with the target column dropped.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``dataset``.
    """
    if target_col is None:
        # Looked up at call time so the global may be defined after this function.
        target_col = predictor_col

    # Label -> integer code for each categorical column. Values that are not
    # listed in a mapping are left as they are by ``Series.replace``.
    encodings = {
        "Género": {"M": 0, "F": 1},
        "ATPII/AHA/IDF": {"no": 0, "si": 1},
        "aleator": {"Control": 0, "PKU 1": 1, "PKU 2": 2},
    }

    # Work on a copy: the caller's DataFrame must not be modified.
    encoded = dataset.copy()
    for column, mapping in encodings.items():
        encoded[column] = encoded[column].replace(mapping)

    y_df = encoded[target_col].astype("category")
    X_df = encoded.drop(columns=target_col)

    return y_df, X_df

def reasemble(X_resampled, y_resampled):
    """Join features and target column-wise and decode the integer codes.

    This is the inverse of the encoding applied by ``disasemble``: the codes
    in "Género", "ATPII/AHA/IDF" and "aleator" are turned back into their
    text labels. Values that are not one of the listed codes are left as-is.

    Parameters
    ----------
    X_resampled : pandas.DataFrame
        Feature frame containing the three encoded columns.
    y_resampled : pandas.Series
        Target column; it is appended as the last column of the result.

    Returns
    -------
    pandas.DataFrame
        A new frame with the features followed by the target, with the
        categorical columns holding labels instead of codes.
    """
    # Integer code -> label for each categorical column.
    decoders = {
        "Género": {0: "M", 1: "F"},
        "ATPII/AHA/IDF": {0: "no", 1: "si"},
        "aleator": {0: "Control", 1: "PKU 1", 2: "PKU 2"},
    }

    # Put the target next to the features (column-wise concatenation).
    merged = pd.concat([X_resampled, y_resampled], axis=1)

    for column_name, code_to_label in decoders.items():
        merged[column_name] = merged[column_name].replace(code_to_label)

    return merged

In [12]:
# Separate the target labels from the feature columns.
y_df, X_df = disasemble(dataset)

# --- Samplers ---------------------------------------------------------------
# Every sampler receives the same seed and is fitted on the same (X_df, y_df).
# Each `*_unmerged` name holds the pair returned by `fit_resample`: element 0
# is the resampled features, element 1 the resampled target.
adasyn = ADASYN(random_state=RANDOM_STATE, n_neighbors=7)
df_ADASYN_unmerged = adasyn.fit_resample(X_df, y_df)

smote = SMOTE(random_state=RANDOM_STATE, k_neighbors=7)
df_SMOTE_unmerged = smote.fit_resample(X_df, y_df)

borderline_smote = BorderlineSMOTE(
    random_state=RANDOM_STATE, k_neighbors=7, m_neighbors=10
)
df_BorderlineSMOTE_unmerged = borderline_smote.fit_resample(X_df, y_df)

svm_smote = SVMSMOTE(random_state=RANDOM_STATE, k_neighbors=7, m_neighbors=10)
df_SVMSMOTE_unmerged = svm_smote.fit_resample(X_df, y_df)

# KMeansSMOTE (random_state=RANDOM_STATE, k_neighbors=7) is disabled: the
# original author flagged it as BROKEN. If it is re-enabled, register its own
# result below rather than the SVMSMOTE one.

random_over_sampler = RandomOverSampler(random_state=RANDOM_STATE)
df_RandomOverSampler_unmerged = random_over_sampler.fit_resample(X_df, y_df)

# Sampler name -> resampled pair; the name becomes part of the output file name.
samplers_named = {
    "ADASYN": df_ADASYN_unmerged,
    "SMOTE": df_SMOTE_unmerged,
    "BorderlineSMOTE": df_BorderlineSMOTE_unmerged,
    "SVMSMOTE": df_SVMSMOTE_unmerged,
    "RandomOverSampler": df_RandomOverSampler_unmerged,
}
# --- End of samplers --------------------------------------------------------

# Write one CSV per sampler with the re-assembled (decoded) data.
for sampler, resampled_pair in samplers_named.items():
    X_resampled, y_resampled = resampled_pair[0], resampled_pair[1]
    resampled_data = reasemble(X_resampled, y_resampled)
    resampled_data.to_csv(f"resampled_data_{sampler}.csv", index=False)