In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import keras
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras import layers
from keras.layers import Layer, BatchNormalization, Activation, InputLayer
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout
from keras.layers import RandomContrast, RandomFlip, RandomRotation, RandomZoom
from keras.layers import Concatenate, GlobalAveragePooling2D, GlobalMaxPooling2D
from keras.layers import Resizing, Rescaling
from keras.losses import CategoricalCrossentropy
from keras.optimizers import Adam, AdamW, SGD
from keras.metrics import CategoricalAccuracy, TopKCategoricalAccuracy
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import L2, L1
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load both CSV splits. The test file has no "Transported" column, so a
# placeholder (all False) is attached to give both frames the same 14 columns.
df_train_raw = pd.read_csv("dataset/spaceship-titanic/train.csv")  # (8693, 14)
df_test_raw = pd.read_csv("dataset/spaceship-titanic/test.csv").assign(
    Transported=False
)  # (4277, 13) on disk -> (4277, 14) with the placeholder

In [None]:
# Stack train rows first, then test rows, so positional slicing can separate
# them again later. ignore_index=True gives the combined frame a unique
# 0..N-1 index; without it the labels of both inputs repeat, which makes any
# index-aligned assignment on df_total fragile.
df_total = pd.concat([df_train_raw, df_test_raw], ignore_index=True)  # (12970, 14)

In [None]:
# def print_col_features(df_total, selected_col):
#     print(f"\nAbout {selected_col}")
#     print(f"is unique: {df_total[selected_col].is_unique}")
#     if df_total[selected_col].is_unique is False:
#         print(f"values: {df_total[selected_col].value_counts()}")
#         print(f"desc: {df_total[selected_col].describe()}")

In [None]:
# Column names of the raw data, annotated with the author's exploration notes
# (value counts per category, or a count for high-cardinality columns).
feature_cols = [
    "PassengerId",  # unique
    "HomePlanet",  # Earth 6865, Europa 3133, Mars 2684
    "CryoSleep",  # False 8079, True 4581
    "Cabin",  # cnt: 9825 (presumably distinct values -- TODO confirm)
    "Destination",  # TRAPPIST-1e 8871, 55 Cancri e 2641, PSO J318.5-22 1184
    "Age",  # cnt: 80 (presumably distinct values -- TODO confirm)
    "VIP",  # False 12401, True 273
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
    "Name",
    "Transported",
]

In [None]:
# Break "Cabin" apart on "/" and keep only the first component as the deck
# code (A B C D E F G T); the original "Cabin" column is then removed.
cabin_values = df_total["Cabin"].str.split(pat="/", expand=True)
df_total = df_total.assign(Cabin_deck=cabin_values[0]).drop(columns=["Cabin"])

In [None]:
# Replace the deck letters with integer codes 0, 1, 2, ... assigned in sorted
# (alphabetical) order of the letters that actually occur.
deck_mapping = dict(
    (deck, idx)
    for idx, deck in enumerate(sorted(df_total["Cabin_deck"].dropna().unique()))
)
df_total["Cabin_deck"] = (
    df_total["Cabin_deck"]
    .map(deck_mapping)  # letter -> int; missing decks stay NaN
    .fillna(-1)  # missing deck gets the sentinel -1
    .astype(int)
)

df_total.head(20)

In [None]:
# Sanity check: one boolean per column, True where a column label repeats an
# earlier one. Every entry is expected to be False.
df_total.columns.duplicated()

In [None]:
# from md_util import write_summary_md

# write_summary_md(df_total)

In [None]:
# Columns that need log scaling; the open question is whether these
# columns are actually correlated with one another.
feature_log_scale = [
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
]

In [None]:
def draw_log1p_corr(df, cols):
    """Plot a heatmap of Pearson correlations between log1p-transformed columns.

    Args:
        df: DataFrame holding the columns to compare.
        cols: Labels of the columns to include in the correlation matrix.

    Missing values are treated as 0 for this plot only; ``df`` itself is not
    modified. The figure is shown and nothing is returned.
    """
    # Use the caller's selection. The previous version overwrote ``cols`` with
    # the global ``feature_log_scale``, so the argument was silently ignored.
    cols = list(cols)

    # Temporary NaN handling so every row contributes to the correlation;
    # fillna returns a new frame, so the caller's data is left untouched.
    X = df[cols].fillna(0)

    X_log = np.log1p(X)

    corr = X_log.corr(method="pearson")  # "spearman" is a possible alternative
    plt.figure(figsize=(6, 5))
    sns.heatmap(corr, annot=True, fmt=".2f", square=True)
    plt.title("Correlation (log1p)")
    plt.show()

In [None]:
# Show the log1p correlation heatmap for the columns listed in feature_log_scale.
draw_log1p_corr(df_total, feature_log_scale)

In [None]:
cols = feature_log_scale
# NaN count for each selected column.
display(df_total[cols].isna().sum())
# Fraction of rows in which every one of the selected columns is NaN.
df_total[cols].isna().all(axis=1).mean()

In [None]:
# The experimental flag has to be imported before IterativeImputer itself.
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

cols = feature_log_scale
X = df_total[cols].copy()

# Step 1: move to log1p space; NaNs stay NaN.
X_log = np.log1p(X)

# Step 2: iterative (MICE-style) imputation in log space, with a Bayesian
# ridge regressor as the per-column estimator.
imp = IterativeImputer(estimator=BayesianRidge(), max_iter=20, random_state=42)
X_log_imp = pd.DataFrame(
    imp.fit_transform(X_log),
    index=X.index,
    columns=cols,
)

# Step 3: back to the original scale; negative values are clipped to 0.
X_imp = np.expm1(X_log_imp).clip(lower=0)

# Step 4: write back, replacing only the cells that were missing.
df_total[cols] = df_total[cols].mask(df_total[cols].isna(), X_imp)

In [None]:
# display(df_train_raw.shape)
# display(df_train_raw.head())
# display(df_test_raw.shape)
# display(df_test_raw.head())

In [None]:
# Columns to one-hot encode, and identifier-like columns to discard.
categorical_cols = ["HomePlanet", "CryoSleep", "VIP", "Destination"]
drop_cols = ["PassengerId", "Name"]

# Remove the identifier columns, then replace every remaining NaN with 0
# (this applies to the categorical columns as well as the numeric ones).
df_total_drops = df_total.drop(drop_cols, axis="columns")
df_total_filled = df_total_drops.fillna(value=0)

# One-hot encode the categorical columns, keeping every level.
# Note: get_dummies places the new dummy columns after the untouched columns,
# so "Transported" does not end up as the last column of the result.
df_onehot_encoded = pd.get_dummies(
    data=df_total_filled, columns=categorical_cols, drop_first=False
)
display("after : one hot encoding")
display(df_onehot_encoded.head())
display("NaN count total:", df_onehot_encoded.isna().value_counts())

target_col = "Transported"  # name of the label column

# Features are every column except the label; both are float32 NumPy arrays.
X = df_onehot_encoded.drop(columns=[target_col]).astype("float32").to_numpy()
y = df_onehot_encoded[target_col].astype("float32").to_numpy()

In [None]:
# Central knobs for splitting and training.
CONFIGURATION = {
    # Row counts are read from the raw frames so the positional split follows
    # the data that was actually loaded (8693 / 4277 for the original CSVs).
    "TRAIN_CNT": len(df_train_raw),
    "TEST_CNT": len(df_test_raw),
    "TRAIN_RATIO": 0.8,
    "VAL_RATIO": 0.2,  # share of the training rows held out for validation
    "RANDOM_SEED": 44,
    "SHUFFLE": True,
    "BATCH_SIZE": 32,
    "EPOCHS": 100,
    "INITIAL_LEARNING_RATE": 1e-3,
    "PATIENCE": 10,  # early-stopping patience, in epochs
}
# Total number of rows (train + test) as a plain int. A stray trailing comma
# previously turned this into a one-element tuple.
DATASET_SIZE = len(df_onehot_encoded)

In [None]:
# Build the float32 tensor consumed by split_xy, which treats the LAST column
# as the label. pd.get_dummies appends the dummy columns after the untouched
# ones, so the label column has to be moved to the end explicitly; otherwise
# the label stays among the features and a dummy column becomes the target.
label_last_cols = [c for c in df_onehot_encoded.columns if c != target_col] + [
    target_col
]
# to_numpy(dtype=...) yields one homogeneous float32 array from the mixed
# bool / int / float columns before it is handed to TensorFlow.
tensor_data = tf.constant(
    df_onehot_encoded[label_last_cols].to_numpy(dtype="float32"), dtype=tf.float32
)

In [None]:
def split_xy(tensor_data):
    """Separate a 2-D array/tensor into features and target.

    The final column is returned as the target; all preceding columns form
    the feature matrix. The shapes of both parts are displayed.

    Returns:
        Tuple ``(X, y)``.
    """
    features = tensor_data[:, :-1]
    target = tensor_data[:, -1]

    display(f"Shape of X: {features.shape}")
    display(f"Shape of y: {target.shape}")
    return features, target


def make_datasets(X, y):
    """Wrap features and targets in a batched, prefetching ``tf.data`` pipeline.

    When ``CONFIGURATION["SHUFFLE"]`` is truthy the examples are shuffled with
    a buffer covering all of ``y`` and the configured random seed. Batch size
    comes from ``CONFIGURATION["BATCH_SIZE"]``.
    """
    dataset = tf.data.Dataset.from_tensor_slices((X, y))

    shuffle_enabled = CONFIGURATION["SHUFFLE"]
    if shuffle_enabled:
        dataset = dataset.shuffle(
            buffer_size=len(y), seed=CONFIGURATION["RANDOM_SEED"]
        )

    batched = dataset.batch(CONFIGURATION["BATCH_SIZE"])
    return batched.prefetch(tf.data.AUTOTUNE)


def split_train_val_test(X, y):
    """Split row-ordered data into train / validation / test parts by position.

    The first ``CONFIGURATION["TRAIN_CNT"]`` rows are the labelled rows; the
    last ``VAL_RATIO`` share of them (rounded down) is held out as validation.
    Every row from ``TRAIN_CNT`` onward is returned as the test part. No
    shuffling happens here.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    train_cnt = CONFIGURATION["TRAIN_CNT"]
    val_n = int(train_cnt * CONFIGURATION["VAL_RATIO"])
    train_n = train_cnt - val_n

    # Three contiguous ranges: [0, train_n) | [train_n, train_cnt) | [train_cnt, end)
    X_train, y_train = X[:train_n], y[:train_n]
    X_val, y_val = X[train_n:train_cnt], y[train_n:train_cnt]
    X_test, y_test = X[train_cnt:], y[train_cnt:]

    return X_train, y_train, X_val, y_val, X_test, y_test

In [None]:
# Rebinds X / y (NumPy arrays from the one-hot cell) to slices of tensor_data:
# split_xy takes the last column as y and the remaining columns as X.
X, y = split_xy(tensor_data)
# Positional split into train / validation / test parts.
X_train, y_train, X_val, y_val, X_test, y_test = split_train_val_test(X, y)
# Each part becomes a batched tf.data pipeline. make_datasets shuffles whenever
# CONFIGURATION["SHUFFLE"] is set, so val_ds and test_ds are shuffled as well.
train_ds = make_datasets(X_train, y_train)
val_ds = make_datasets(X_val, y_val)
test_ds = make_datasets(X_test, y_test)

Metrics

In [None]:
def mae_raw_from_log(y_true_log, y_pred_log):
    """Mean absolute error measured on the raw scale for log1p-space inputs.

    Both arguments are mapped back with ``expm1``; the back-transformed
    predictions are floored at 0 before the absolute differences are averaged.
    """
    true_raw = tf.math.expm1(y_true_log)
    pred_raw = tf.maximum(tf.math.expm1(y_pred_log), 0.0)
    abs_error = tf.abs(true_raw - pred_raw)
    return tf.reduce_mean(abs_error)

Making Model

In [None]:
D = int(X.shape[1])  # number of input features

# Small fully-connected binary classifier. The last layer has no activation,
# so it emits a raw logit; the loss below is configured with from_logits=True.
inputs = keras.Input(shape=(D,), dtype=tf.float32)
x = layers.Dense(128, activation="relu")(inputs)
x = layers.Dense(64, activation="relu")(x)
x = layers.Dense(32, activation="relu")(x)
outputs = layers.Dense(1)(x)

model = keras.Model(inputs, outputs)
model.compile(
    # Learning rate comes from the shared configuration (1e-3), like PATIENCE below.
    optimizer=keras.optimizers.AdamW(CONFIGURATION["INITIAL_LEARNING_RATE"]),  # type: ignore
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[
        keras.metrics.BinaryAccuracy(name="accuracy"),
        keras.metrics.AUC(name="auc"),
    ],
)
# A flat list of callbacks. A stray trailing comma previously wrapped this
# list in a one-element tuple.
callbacks = [
    keras.callbacks.TerminateOnNaN(),
    # Stop once val_loss has not improved for PATIENCE epochs and roll back
    # to the best weights seen.
    EarlyStopping(
        monitor="val_loss",
        patience=CONFIGURATION["PATIENCE"],
        restore_best_weights=True,
    ),
    # Persist the full model whenever val_loss reaches a new best.
    ModelCheckpoint(
        filepath="best_model.h5",
        monitor="val_loss",
        save_best_only=True,
        save_weights_only=False,
    ),
]
# Train.
# NOTE(review): CONFIGURATION["EPOCHS"] is 100 but 50 is used here -- confirm
# which value is intended.
history = model.fit(train_ds, validation_data=val_ds, epochs=50, callbacks=callbacks)

Evaluation

In [None]:
# Training vs. validation loss per epoch.
plt.plot(history.history["loss"], label="train")
plt.plot(history.history["val_loss"], label="val_loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.title("model loss")
plt.legend()
plt.show()

Predict

In [None]:
# Predict: the model outputs logits, so apply a sigmoid to obtain
# probabilities and threshold them at 0.5 for the boolean label.
predict_result = tf.sigmoid(model.predict(X_test)).numpy().flatten()
predict_result_binary = predict_result >= 0.5

# Submission file: one row per test passenger with the predicted label.
submission_df = df_test_raw[["PassengerId"]].assign(
    Transported=predict_result_binary
)
submission_df.to_csv("spaceship_titanic_submission.csv", index=False)