In [None]:
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTENC
from sklearn.preprocessing import OneHotEncoder

# Seed passed as `random_state` to the SMOTENC resampler so the oversampling is repeatable.
RANDOM_SEED = 42

In [None]:
# Path of the training split CSV; it is read into `train_df` in the next cell.
train_file_name = "train.csv"

In [None]:
# Load the raw training split. The bare name on the last line renders the
# frame as the cell output.
train_df = pd.read_csv(train_file_name)
train_df

In [None]:
# Column used as the prediction target (y) for every split.
label_column = "POD 6M retear"

# Columns removed from every split before modelling.
# "일련번호" is Korean for "serial number" and "수술일자" for "surgery date".
drop_columns = ["일련번호", "수술일자", "POD 2M retear (no:0,retear:1)"]

# Everything that must not appear in the feature matrix:
# the dropped columns plus the label itself.
X_drop_columns = [*drop_columns, label_column]

In [None]:
# Features: everything except the dropped columns and the label.
X_train = train_df.drop(X_drop_columns, axis="columns")
# Label kept as a one-column DataFrame (not a Series), hence the list selector.
y_train = train_df.loc[:, [label_column]]

In [None]:
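# Disabled experiment: plain SMOTE, superseded by the SMOTENC cell further down.
# NOTE: `SMOTE` is not imported and `train_df_X` / `train_df_y` are not defined
# anywhere in this notebook, so these lines cannot be re-enabled as written.
# smote = SMOTE(random_state=RANDOM_SEED)
# X_train, y_train = smote.fit_resample(train_df_X, train_df_y)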

In [None]:
# Original feature order; used later to restore the column layout after resampling.
columns = X_train.columns.tolist()

In [None]:
# Hospital indicator columns are recognised by the substring "Hospital" in their name.
hospital_onehot_columns = list(filter(lambda name: "Hospital" in name, columns))
hospital_onehot_columns

In [None]:
# Maps each categorical variable's base name to the one-hot columns that encode it.
onehot_groups = dict(Hospital=hospital_onehot_columns)
onehot_groups

In [None]:
# Collapse every one-hot column group into a single integer-coded column so the
# resampler can treat it as one categorical feature.
X_onehot_cat_train_dict = {}
for group_name, group_columns in onehot_groups.items():
    onehot_matrix = X_train[group_columns].to_numpy()  # rows = samples, cols = group members
    row_totals = onehot_matrix.sum(axis=-1)
    active_position = onehot_matrix.argmax(axis=-1).astype(int)
    # argmax returns 0 for an all-zero row, which would be indistinguishable from
    # "first column set"; such rows get the sentinel -1 instead.
    X_onehot_cat_train_dict[group_name] = np.where(row_totals != 0, active_position, -1)

In [None]:
# One integer-coded column per one-hot group (currently only "Hospital").
# The frame reuses X_train's index because it is later concatenated column-wise
# with slices of X_train, and pd.concat aligns on the index: with a default
# RangeIndex here and any other index on X_train, the rows would be misaligned
# and padded with NaN instead of being placed side by side.
X_onehot_cat_train = pd.DataFrame(X_onehot_cat_train_dict, index=X_train.index)
X_onehot_cat_train

In [None]:
# Numeric feature names, picked by position in `columns`.
# NOTE(review): these slices depend on the exact column order of the training CSV
# after dropping `X_drop_columns`; confirm the indices whenever the schema changes.
num_columns = [*columns[:1], *columns[3:8], *columns[25:-8]]
num_columns

In [None]:
# Categorical features: whatever is neither numeric nor a Hospital one-hot column
# (the Hospital group is handled separately as one integer-coded column).
cat_columns = [
    name
    for name in columns
    if not (name in num_columns or "Hospital" in name)
]
cat_columns

In [None]:
# Numeric part of the training features.
X_num_train = X_train.loc[:, num_columns]
X_num_train

In [None]:
# Categorical part: the plain categorical columns followed by the integer-coded
# one-hot groups.
X_cat_train = pd.concat(
    [X_train.loc[:, cat_columns], X_onehot_cat_train],
    axis="columns",
)
X_cat_train

In [None]:
# Resampler input: numeric columns first, categorical columns last. The
# categorical index list built in the next cell relies on this ordering.
X_ordinal_train = pd.concat(
    [X_num_train, X_cat_train],
    axis="columns",
)
X_ordinal_train

In [None]:
# Positional indices of the categorical columns inside X_ordinal_train:
# they occupy the trailing block right after the numeric columns.
n_num_columns = X_num_train.shape[1]
n_cat_columns = X_cat_train.shape[1]
cat_idx = [n_num_columns + offset for offset in range(n_cat_columns)]
cat_idx

In [None]:
# Oversample with SMOTENC, telling it which column positions are categorical;
# the seed keeps the synthetic samples repeatable.
smote_nc = SMOTENC(
    random_state=RANDOM_SEED,
    categorical_features=cat_idx,
)
X_res_train, y_res_train = smote_nc.fit_resample(X_ordinal_train, y_train)

In [None]:
# Display the resampled feature frame returned by SMOTENC.
X_res_train

In [None]:
# Display the resampled labels returned by SMOTENC.
y_res_train

In [None]:
# Numeric columns of the resampled data.
X_res_num_train = X_res_train.loc[:, num_columns]
X_res_num_train

In [None]:
# Plain categorical columns of the resampled data. The integer-coded "Hospital"
# column is not part of `cat_columns` and is expanded separately.
X_res_cat_train = X_res_train.loc[:, cat_columns]
X_res_cat_train

In [None]:
# Display the integer-coded "Hospital" column of the resampled data
# (-1 marks rows where no hospital indicator was set before encoding).
X_res_train[["Hospital"]]

In [None]:
# Re-expand the integer-coded "Hospital" column into its original one-hot columns.
#
# The category list is pinned to the codes 0..n-1 (the position of each original
# one-hot column) instead of being inferred from the resampled data. Inference
# breaks in two cases: a hospital that never occurs yields too few output
# columns, and the sentinel code -1 (rows where no hospital column was set)
# yields one too many. Either way the width stops matching
# `hospital_onehot_columns` and building the DataFrame raises.
# With handle_unknown="ignore" the -1 sentinel encodes back to an all-zero row.
hospital_codes = list(range(len(hospital_onehot_columns)))
onehot = OneHotEncoder(
    categories=[hospital_codes],
    handle_unknown="ignore",
    sparse_output=False,
)
hospital_onehot_encoded = onehot.fit_transform(X_res_train[["Hospital"]])

# Carry over X_res_train's index so the column-wise concat in the next cell
# lines the rows up with the other resampled slices.
X_res_onehot_cat_train = pd.DataFrame(
    hospital_onehot_encoded,
    columns=hospital_onehot_columns,
    index=X_res_train.index,
)
X_res_onehot_cat_train

In [None]:
# Reassemble the augmented feature frame from its three parts, then put the
# columns back into the original training order.
X_train_aug = pd.concat(
    [X_res_num_train, X_res_cat_train, X_res_onehot_cat_train],
    axis="columns",
).loc[:, columns]
X_train_aug

In [None]:
# Persist the augmented training features and labels, without the index column.
for frame, path in ((X_train_aug, "X_train.csv"), (y_res_train, "y_train.csv")):
    frame.to_csv(path, index=False)

In [None]:
# Export the un-augmented training split alongside the augmented one.
# The file is read from `train_file_name`, the same path the augmented data was
# built from, rather than from a second hard-coded "train.csv". That way both
# exports always come from the same source file if the path is changed.
orig_train_df = pd.read_csv(train_file_name)

X_train_orig = orig_train_df.drop(columns=X_drop_columns)
y_train_orig = orig_train_df[[label_column]]
X_train_orig.to_csv("X_train_orig.csv", index=False)
y_train_orig.to_csv("y_train_orig.csv", index=False)

In [None]:
# Validation split: same feature/label separation as the training split,
# with no resampling applied.
val_df = pd.read_csv("val.csv")

y_val = val_df.loc[:, [label_column]]
X_val = val_df.drop(X_drop_columns, axis="columns")
X_val.to_csv("X_val.csv", index=False)
y_val.to_csv("y_val.csv", index=False)

In [None]:
# Test split: same feature/label separation as the training split,
# with no resampling applied.
test_df = pd.read_csv("test.csv")

y_test = test_df.loc[:, [label_column]]
X_test = test_df.drop(X_drop_columns, axis="columns")
X_test.to_csv("X_test.csv", index=False)
y_test.to_csv("y_test.csv", index=False)