In [None]:
import ast
import math

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer, FunctionTransformer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

# Seed passed as random_state to train_test_split further down.
RANDOM_SEED = 42

In [None]:
# Read the cleaned CSV, then drop the "cancer 및 기타" column.
file_name = "arcr_cleaned.csv"
df = pd.read_csv(file_name)
df = df.drop(columns=["cancer 및 기타"])

In [None]:
# Column names in file order; every group below is a positional slice of this list.
columns = list(df.columns)

In [None]:
# The first 15 columns form the "static" group.
static_columns = columns[:15]
static_columns

In [None]:
# Everything between the static group and the last 8 columns forms the "seq" group.
seq_columns = columns[15:-8]

In [None]:
# Print the seq columns in consecutive chunks for visual inspection.
print(seq_columns[0:12])
print(seq_columns[12:19])
print(seq_columns[19:31])
print(seq_columns[31:43])
print(seq_columns[43:])

In [None]:
# The last 8 columns form the Goutallier group.
goutallier_columns = columns[-8:]
goutallier_columns

In [None]:
# Sanity check (displays True/False): the three positional groups account for every column.
len(columns) == len(static_columns) + len(seq_columns) + len(goutallier_columns)

In [None]:
# Second partition of the same column list, again by position.
# NOTE(review): the names suggest these groups mirror the dtypes pandas inferred — confirm against df.dtypes.
# Positions 0-2 and 10.
object_columns = columns[:3] + columns[10:11]
object_columns

In [None]:
# Positions 3-7 and 11.
integer_columns = columns[3:8] + columns[11:12]
integer_columns

In [None]:
# Positions 8-9 and everything from 12 onward (this includes the seq and Goutallier columns).
float_columns = columns[8:10] + columns[12:]
float_columns

In [None]:
# Sanity check (displays True/False): the three groups account for every column.
len(columns) == len(object_columns) + len(integer_columns) + len(float_columns)

In [None]:
label_column = "POD 6M retear"
# Positional row numbers (usable with iloc) of the rows labelled 1 and 0 respectively.
pos_indices = (df[label_column] == 1).to_numpy().nonzero()[0]
neg_indices = (df[label_column] == 0).to_numpy().nonzero()[0]
pos_indices.size, neg_indices.size

In [None]:
# Number of rows drawn from EACH class for the balanced held-out test set.
size = 50
# Draw from a generator seeded with RANDOM_SEED so the test set is identical on every
# Restart & Run All; the previous unseeded np.random.choice gave a different test set
# (and therefore different train/val sets) on each run.
rng = np.random.default_rng(RANDOM_SEED)
test_pos_indices = rng.choice(pos_indices, size=size, replace=False)
test_neg_indices = rng.choice(neg_indices, size=size, replace=False)
# Positive rows first, then negative rows.
test_indices = np.concatenate([test_pos_indices, test_neg_indices])

In [None]:
# Row positions 0..len(df)-1; every position not held out for test goes to the train/val pool.
all_indices = np.arange(len(df))
trainval_indices = np.setdiff1d(all_indices, test_indices)

In [None]:
# Materialize the held-out test rows as an independent copy.
test_df = df.iloc[test_indices].copy()

In [None]:
# Fraction of the train/val pool that goes to validation.
train_val_split_ratio = 0.15
trainval_df = df.iloc[trainval_indices]
# One array in, two splits out: the frame used to be passed twice with the second pair
# of outputs thrown away. With the same random_state and stratify the row assignment
# is unchanged.
train_df, val_df = train_test_split(
  trainval_df,
  test_size=train_val_split_ratio,
  random_state=RANDOM_SEED,
  stratify=trainval_df[label_column],  # keep the label ratio equal in both splits
)

In [None]:
# Label distribution of the training split.
train_df[label_column].value_counts()

In [None]:
# Label distribution of the validation split.
val_df[label_column].value_counts()

In [None]:
# Label distribution of the held-out test split.
test_df[label_column].value_counts()

In [None]:
# Trial run: median-impute the smoking column with add_indicator=True and show the result.
# NOTE(review): the step is named "cat_imputer" but uses the median; for a 1/2-coded
# column "most_frequent" may be what is intended — confirm.
smoke_column = "흡연여부 (비흡연:1,흡연:2)"
smoke_pipe = Pipeline(steps=[
  ("cat_imputer", SimpleImputer(strategy="median", add_indicator=True))
])

smoke_pipe.fit_transform(train_df[[smoke_column]])

In [None]:
# Trial run: the same median imputation with indicators on the Goutallier columns.
goutallier_pipe = Pipeline(steps=[
  ("num_imputer", SimpleImputer(strategy="median", add_indicator=True))
])

goutallier_pipe.fit_transform(train_df[goutallier_columns])

In [None]:
# Trial run: dense one-hot encoding of the hospital column.
hospital_column = "소속병원"
hospital_pipe = Pipeline(steps=[
  ("one_hot", OneHotEncoder(sparse_output=False))
])

hospital_encoded = hospital_pipe.fit_transform(train_df[[hospital_column]])
# Width of the one-hot block; reused later to name the hospital columns.
_, hospital_encoded_dim= hospital_encoded.shape
hospital_encoded

In [None]:
# Name of the multi-label disease column; its cells are parsed with parse_list.
disease_column = "DM:1,HTN:2,CHD:3,CVA:4,dyslipidemia:5,Hyperthyroidism:6,Hypothyroidism:7,Osteoporosis:8,Cancer이름기타이름"
def parse_list(s):
  """Parse one cell of a comma-separated column into a Python list.

  Args:
    s: A string such as "1,3,5", a bare scalar (e.g. 3), or a missing value.

  Returns:
    The list of Python literals found in `s`; an empty list when `s` is
    missing (None, NaN or pd.NA).

  Raises:
    ValueError, SyntaxError: if the text is not a comma-separated sequence
      of Python literals (raised by ast.literal_eval).
  """
  # pd.isna covers None and pd.NA as well as float NaN; math.isnan raised TypeError
  # on the first two. Strings are never treated as missing.
  if not isinstance(s, str) and pd.isna(s):
    return []
  # Wrapping in brackets lets literal_eval safely parse "1,2" -> [1, 2] and "3" -> [3].
  return ast.literal_eval(f"[{s}]")

# Preview: multi-hot encode the parsed disease lists of the training split and show the first 20 rows.
MultiLabelBinarizer().fit_transform(train_df[disease_column].apply(parse_list))[:20]

In [None]:
class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):
  """Adapter that lets MultiLabelBinarizer be used as a pipeline step on one column.

  MultiLabelBinarizer's fit/transform accept only the label collection; this class
  supplies the (X, y=None) signatures expected inside Pipeline / ColumnTransformer.

  Attributes:
    mlb_: The fitted MultiLabelBinarizer. Created by fit(), following the
      scikit-learn convention that trailing-underscore attributes hold learned
      state, so an unfitted instance no longer looks fitted.
  """

  def _to_series_of_lists(self, X):
    """Reduce a single-column DataFrame or (n, 1) array to its one column.

    Any other input (e.g. a Series) is returned unchanged.

    Raises:
      ValueError: if a DataFrame with a column count other than one is given.
    """
    if isinstance(X, pd.DataFrame):
      if X.shape[1] != 1:
        raise ValueError("Expect single column for multilabel feature")
      X = X.iloc[:, 0]
    elif isinstance(X, np.ndarray):
      if X.ndim == 2 and X.shape[1] == 1:
        X = X[:, 0]
    return X

  def fit(self, X, y=None):
    """Learn the label vocabulary from X (an iterable of label collections); y is ignored."""
    X = self._to_series_of_lists(X)
    # A fresh binarizer on every fit: refitting never carries state over.
    self.mlb_ = MultiLabelBinarizer().fit(X)
    return self

  def transform(self, X):
    """Return the multi-hot indicator matrix for X using the vocabulary learned in fit."""
    X = self._to_series_of_lists(X)
    return self.mlb_.transform(X)

  def get_feature_names_out(self, input_features=None):
    """Return one name per learned class, formatted as "value_<class>"."""
    return np.array([f"value_{v}" for v in self.mlb_.classes_], dtype=object)

In [None]:
# Trial run of the disease branch on the training split:
# parse every cell into a list, then multi-hot encode the lists.
to_list = FunctionTransformer(lambda col: col.apply(parse_list), validate=False)
disease_pipe = Pipeline(steps=[
  ("to_list", to_list),
  ("multi_bin", MultiLabelBinarizerTransformer())
])

disease_encoded = disease_pipe.fit_transform(train_df[disease_column])
# Width of the multi-hot block; reused later to name the disease columns.
_, disease_encoded_dim= disease_encoded.shape
disease_encoded

In [None]:
# Fresh, unfitted copies of the per-column pipelines for the combined transformer.

# Smoking status: median imputation plus a missing-value indicator.
smoke_pipe = Pipeline(steps=[
  ("cat_imputer", SimpleImputer(strategy="median", add_indicator=True))
])

# Goutallier columns: median imputation plus missing-value indicators.
goutallier_pipe = Pipeline(steps=[
  ("num_imputer", SimpleImputer(strategy="median", add_indicator=True))
])

# Hospital: dense one-hot encoding. handle_unknown="ignore" encodes a hospital that
# appears only in the validation/test split as an all-zero row instead of raising
# during transform; encoding of hospitals seen in training is unchanged.
hospital_pipe = Pipeline(steps=[
  ("one_hot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

# Disease: parse each cell into a list, then multi-hot encode.
# NOTE: the lambda keeps this transformer from being pickled; use a named function
# if column_pipe ever needs to be persisted.
to_list = FunctionTransformer(
  lambda col: col.apply(parse_list),
  validate=False,
  feature_names_out="one-to-one"
)
disease_pipe = Pipeline(steps=[
  ("to_list", to_list),
  ("multi_bin", MultiLabelBinarizerTransformer())
])

# Output column order follows the transformer order below, then the untouched columns.
column_pipe = ColumnTransformer(
  [
    ("smoke_pipe", smoke_pipe, [smoke_column]),
    ("goutallier_pipe", goutallier_pipe, goutallier_columns),
    ("hospital_pipe", hospital_pipe, [hospital_column]),
    # Bare string (not a list): the branch receives a 1-D Series, as col.apply expects.
    ("disease_pipe", disease_pipe, disease_column),
  ],
  remainder="passthrough"
)

In [None]:
# Fit the preprocessing on the training split only, then apply it unchanged to validation and test.
encoded_train = column_pipe.fit_transform(train_df)
encoded_val = column_pipe.transform(val_df)
encoded_test = column_pipe.transform(test_df)

In [None]:
# Columns that have a dedicated branch in column_pipe, listed in transformer order.
target_columns = [smoke_column, *goutallier_columns, hospital_column, disease_column]
target_columns

In [None]:
def columns_with_missing_flag(columns):
  """Return `columns` followed by one "<name> Missing flag" entry per column.

  Args:
    columns: List of column names.

  Returns:
    A new list: the original names, then the flag names in the same order.
  """
  flag_names = []
  for name in columns:
    flag_names.append(f"{name} Missing flag")
  return columns + flag_names

# Placeholder names for the encoded blocks, numbered by output position.
hospital_one_hot_columns = [f"Hospital {idx}" for idx in range(hospital_encoded_dim)]
disease_multi_bin_columns = [f"Disease {idx}" for idx in range(disease_encoded_dim)]

# Hand-built names for the transformed part of the output, in column_pipe order:
# smoke (+ flag), Goutallier (+ flags), hospital one-hot, disease multi-hot.
# NOTE(review): this assumes every imputed column produced an indicator column — confirm.
processed_columns = [
  *columns_with_missing_flag([smoke_column]),
  *columns_with_missing_flag(goutallier_columns),
  *hospital_one_hot_columns,
  *disease_multi_bin_columns,
]
processed_columns

In [None]:
# Print each hand-built name next to the name column_pipe reports, for a visual cross-check.
# zip stops at the shorter sequence, so only the transformed columns are compared.
for a, b in zip(processed_columns, column_pipe.get_feature_names_out()):
  print(a, " : ", b)

In [None]:
# Columns without a dedicated branch; column_pipe forwards them via remainder="passthrough".
passthrough_columns = [column for column in columns if column not in target_columns]
passthrough_columns

In [None]:
# Sanity check (displays True/False): the encoded width equals the number of hand-built names.
encoded_train.shape[-1] == len(processed_columns + passthrough_columns)

In [None]:
# Full header for the encoded matrices: transformed columns first, passthrough columns after.
processed_passthrough_columns = processed_columns + passthrough_columns
# Wrap each encoded matrix in a DataFrame carrying that header.
encoded_train_df, encoded_val_df, encoded_test_df = (
  pd.DataFrame(matrix, columns=processed_passthrough_columns)
  for matrix in (encoded_train, encoded_val, encoded_test)
)

In [None]:
# Header of the encoded frame; the groups below are positional slices of it.
encoded_columns = list(encoded_train_df.columns)

In [None]:
# Static group, pieced together from three slices:
#   [33:45] the first 12 passthrough columns,
#   [:2]    the smoking column and its missing flag,
#   [18:33] the hospital one-hot and disease multi-hot columns.
# NOTE(review): these hard-coded offsets assume the hospital and disease blocks
# together span exactly 15 columns — confirm whenever the training data changes.
encoded_static_columns = encoded_columns[33:45] + encoded_columns[:2] + encoded_columns[18:33]
encoded_static_columns

In [None]:
# Everything from position 45 onward.
encoded_seq_columns = encoded_columns[45:]
encoded_seq_columns

In [None]:
# Positions 2-17: the Goutallier columns followed by their missing flags.
encoded_goutallier_columns = encoded_columns[2:18]
encoded_goutallier_columns

In [None]:
# Sanity check (displays True/False): the three groups account for every encoded column.
len(encoded_columns) == len(encoded_static_columns) + len(encoded_seq_columns) + len(encoded_goutallier_columns)

In [None]:
# Final column layout: static group, then seq group, then Goutallier group.
reordered_columns = [*encoded_static_columns, *encoded_seq_columns, *encoded_goutallier_columns]

def reorder_column(df):
  """Return a new frame holding the columns of `df` in `reordered_columns` order."""
  reordered = df.loc[:, reordered_columns]
  reordered.columns = reordered_columns
  return reordered

encoded_train_df, encoded_val_df, encoded_test_df = map(
  reorder_column, (encoded_train_df, encoded_val_df, encoded_test_df)
)

In [None]:
# Label distribution after encoding and reordering, training split.
encoded_train_df[label_column].value_counts()

In [None]:
# Same, validation split.
encoded_val_df[label_column].value_counts()

In [None]:
# Same, test split.
encoded_test_df[label_column].value_counts()

In [None]:
# Columns to standardize: the float group with its first five entries moved to the end.
# This order becomes the scaler's feature order.
# The label is filtered out so that a 0/1 target sitting inside float_columns is never
# standardized; when the label is not in the list the filter changes nothing.
scale_target_columns = [
  column
  for column in float_columns[5:] + float_columns[:5]
  if column != label_column
]
scale_target_columns

In [None]:
# Work on copies so the encoded frames stay untouched.
scaled_train_df, scaled_val_df, scaled_test_df = (
  frame.copy() for frame in (encoded_train_df, encoded_val_df, encoded_test_df)
)

# Learn mean/variance from the training split only, then apply them to all three splits.
scaler = StandardScaler()
scaler.fit(encoded_train_df[scale_target_columns])
for scaled_frame in (scaled_train_df, scaled_val_df, scaled_test_df):
  scaled_frame[scale_target_columns] = scaler.transform(scaled_frame[scale_target_columns])

In [None]:
# Write each scaled split to its own CSV, without the index column.
for split_df, csv_path in (
  (scaled_train_df, "train.csv"),
  (scaled_val_df, "val.csv"),
  (scaled_test_df, "test.csv"),
):
  split_df.to_csv(csv_path, index=False)

In [None]:
import joblib

# Persist the fitted scaler so the same standardization can be applied outside this notebook.
# Only the scaler is saved; column_pipe is not persisted here.
joblib.dump(scaler, "scaler.pkl")

In [None]:
# Round-trip check: load the file that was just written.
# joblib.load unpickles — only load files from a trusted source.
loaded_scaler = joblib.load("scaler.pkl")

In [None]:
# Names and order of the columns the scaler was fitted on.
loaded_scaler.get_feature_names_out()