In [2]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE

# Augmentation pipeline: load the raw table, balance the classes with SMOTE,
# then enlarge the balanced set with noisy ("jittered") copies and save it.
#
# NOTE(review): every row of the output is derived from the full input table.
# If the augmented file is later split into train/test sets, near-duplicate
# rows will land on both sides -- confirm a hold-out split is made beforehand.

FEATURE_COLUMNS = ["SDNN", "rMSSD", "pNN50"]
LABEL_COLUMN = "Class"
RANDOM_SEED = 42  # shared by SMOTE and the jitter so reruns give the same file
# Standard deviation of the Gaussian jitter, in the features' own units.
# NOTE(review): the features are not rescaled here, so the same absolute noise
# is applied to every column -- confirm this magnitude is intended.
NOISE_STD = 0.01

# 1. Load the original data
data = pd.read_csv("../Data/breast-cancer-dataset.csv")

X = data[FEATURE_COLUMNS]  # features
y = data[LABEL_COLUMN]     # label

print("Data asli:", data.shape, " | Distribusi kelas:")
print(y.value_counts())

# 2. SMOTE to balance the classes
smote = SMOTE(random_state=RANDOM_SEED)
X_res, y_res = smote.fit_resample(X, y)

balanced = pd.concat(
    [
        pd.DataFrame(X_res, columns=X.columns),
        pd.Series(y_res, name=LABEL_COLUMN),
    ],
    axis=1,
)
print("Setelah SMOTE:", balanced.shape, " | Distribusi kelas:")
print(balanced[LABEL_COLUMN].value_counts())

# 3. Enlarge the data with jittering (e.g. 10x)
multiplier = 10
# A dedicated, seeded generator makes the noise reproducible; the previous
# unseeded global np.random call produced a different dataset on every run.
rng = np.random.default_rng(RANDOM_SEED)
noise_shape = (len(balanced), len(FEATURE_COLUMNS))
jittered_copies = []

for _ in range(multiplier):
    temp = balanced.copy()
    # Fresh noise per copy so the copies are not identical to one another.
    noise = rng.normal(0, NOISE_STD, noise_shape)
    temp[FEATURE_COLUMNS] = temp[FEATURE_COLUMNS] + noise
    jittered_copies.append(temp)

augmented = pd.concat(jittered_copies, ignore_index=True)

# 4. Save the result
augmented.to_csv("breast-cancer-dataset-augmented.csv", index=False)

print("Dataset akhir hasil augmentasi:", augmented.shape)
print("Distribusi kelas:")
print(augmented[LABEL_COLUMN].value_counts())


Data asli: (37, 4)  | Distribusi kelas:
Class
1    19
0    18
Name: count, dtype: int64
Setelah SMOTE: (38, 4)  | Distribusi kelas:
Class
1    19
0    19
Name: count, dtype: int64
Dataset akhir hasil augmentasi: (380, 4)
Distribusi kelas:
Class
1    190
0    190
Name: count, dtype: int64
