In [None]:
import kagglehub
import os
from pathlib import Path
import pandas as pd
import tensorflow as tf
import keras
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split
from keras import layers
from keras.models import Model
from keras.layers import Layer, BatchNormalization, Activation, InputLayer
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout
from keras.losses import CategoricalCrossentropy
from keras.optimizers import Adam
from keras.metrics import CategoricalAccuracy, TopKCategoricalAccuracy
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import L2, L1

Prepare Dataset

In [None]:
# Download the latest version of the dataset via kagglehub and keep the
# location it reports.
dataset_handle = "harishkumardatalab/medical-insurance-price-prediction"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
# Locate the CSV inside the downloaded dataset directory.
csv_name = "Medical_insurance.csv"
root = Path(path)
dataset_path = root / csv_name
print("다운로드 경로:", dataset_path)

# Load the data and show a preview, its shape, and the row-wise null patterns.
df = pd.read_csv(dataset_path)
display(
    df.head(),
    df.shape,
    df.isnull().value_counts(),
)

Visualization

In [None]:
# sns.pairplot(df[df.columns], diag_kind="kde")

In [None]:
categorical_cols = ["sex", "region", "smoker"]

# One-hot encode the categorical columns, keeping every level
# (drop_first=False, so no reference level is removed).
df_onehot_encoded = pd.get_dummies(
    df,
    columns=categorical_cols,
    drop_first=False,
)
display("after : one hot encoding")
display(df_onehot_encoded.head())

# Reorder columns: take "charges" out and append it again so the target
# ends up as the last column.
col_charges = df_onehot_encoded.pop("charges")
df_onehot_encoded["charges"] = col_charges
display("after : column 재정의")
display(df_onehot_encoded.head())

In [None]:
# Central place for the tunable settings used by the notebook.
CONFIGURATION = {
    # Split fractions. split_train_val_test reads VAL_RATIO and TEST_RATIO;
    # the training split receives whatever rows remain.
    "TRAIN_RATIO": 0.7,
    "VAL_RATIO": 0.15,
    "TEST_RATIO": 0.15,
    # Seed passed to the tf.data shuffle.
    "RANDOM_SEED": 44,
    "SHUFFLE": True,
    "BATCH_SIZE": 32,
    # Training hyper-parameters.
    "EPOCHS": 100,
    "INITIAL_LEARNING_RATE": 1e-3,
    "PATIENCE": 10,
}
# Number of rows in the encoded frame. The original `(len(...),)` had a stray
# trailing comma that turned this into a 1-tuple instead of an integer.
DATASET_SIZE = len(df_onehot_encoded)

In [None]:
# Log-transform the target and convert the whole frame to a float32 tensor.
#
# A new frame is built instead of overwriting df_onehot_encoded["charges"], so
# re-running this cell does not apply log1p a second time.
# Every column is cast to float32 explicitly: with recent pandas versions
# get_dummies yields bool columns, and a frame mixing bool and numeric columns
# can turn into an object array that TensorFlow cannot convert.
df_model = df_onehot_encoded.assign(
    charges=np.log1p(df_onehot_encoded["charges"].astype("float32"))
).astype("float32")

tensor_data = tf.constant(df_model.to_numpy(), dtype=tf.float32)

In [None]:
def split_xy(tensor_data):
    """Split a 2-D table into features and target.

    Every column except the last one is returned as the feature matrix; the
    last column is returned as the target vector. Both shapes are displayed.

    Args:
        tensor_data: 2-D array-like supporting ``[:, a:b]`` slicing.

    Returns:
        Tuple ``(X, y)``.
    """
    features = tensor_data[:, :-1]
    target = tensor_data[:, -1]

    for label, part in (("X", features), ("y", target)):
        display(f"Shape of {label}: {part.shape}")

    return features, target


def make_datasets(X, y, shuffle=None):
    """Wrap ``(X, y)`` in a batched, prefetching ``tf.data.Dataset``.

    Args:
        X: Feature rows.
        y: Targets, one per row of ``X``.
        shuffle: Whether to shuffle the examples before batching. ``None``
            (the default) falls back to ``CONFIGURATION["SHUFFLE"]``, which is
            the behaviour this function had before the parameter existed.

    Returns:
        A dataset yielding ``(features, target)`` batches of
        ``CONFIGURATION["BATCH_SIZE"]`` examples.
    """
    if shuffle is None:
        shuffle = CONFIGURATION["SHUFFLE"]

    ds = tf.data.Dataset.from_tensor_slices((X, y))
    # A shuffle buffer of size 0 is invalid, so skip shuffling an empty split.
    if shuffle and len(y) > 0:
        ds = ds.shuffle(buffer_size=len(y), seed=CONFIGURATION["RANDOM_SEED"])

    return ds.batch(CONFIGURATION["BATCH_SIZE"]).prefetch(tf.data.AUTOTUNE)


def split_train_val_test(X, y):
    """Split ``(X, y)`` by row position into train / validation / test datasets.

    The validation and test sizes come from ``CONFIGURATION["VAL_RATIO"]`` and
    ``CONFIGURATION["TEST_RATIO"]``; the training split takes the remaining
    leading rows.

    Only the training dataset is shuffled. Validation and test datasets keep
    their row order so that the output of ``model.predict`` can be matched
    against the labels one-to-one.

    NOTE(review): rows are split by position without a prior shuffle; confirm
    the source file is not ordered in a way that biases the splits.

    Returns:
        Tuple ``(train_ds, val_ds, test_ds)``.
    """
    total = int(X.shape[0])
    val_n = int(total * CONFIGURATION["VAL_RATIO"])
    test_n = int(total * CONFIGURATION["TEST_RATIO"])
    train_n = total - val_n - test_n
    val_end = train_n + val_n

    train_ds = make_datasets(X[:train_n], y[:train_n])
    val_ds = make_datasets(X[train_n:val_end], y[train_n:val_end], shuffle=False)
    test_ds = make_datasets(X[val_end:], y[val_end:], shuffle=False)
    return train_ds, val_ds, test_ds

In [None]:
# Separate features from the target (last column), then build the batched
# train / validation / test datasets. X is reused later to size the model input.
X, y = split_xy(tensor_data)
train_ds, val_ds, test_ds = split_train_val_test(X, y)

Metrics

In [None]:
def mae_raw_from_log(y_true_log, y_pred_log):
    """Mean absolute error on the original scale for log1p-space inputs.

    Both inputs are mapped back with ``expm1``; predictions are clamped at
    zero before the absolute differences are averaged.

    Args:
        y_true_log: Targets in log1p space.
        y_pred_log: Predictions in log1p space.

    Returns:
        Scalar tensor holding the mean absolute error.
    """
    # Flatten both sides so a (batch,) target and a (batch, 1) prediction are
    # compared element-wise instead of broadcasting to (batch, batch).
    y_true_raw = tf.math.expm1(tf.reshape(y_true_log, [-1]))
    y_pred_raw = tf.math.expm1(tf.reshape(y_pred_log, [-1]))
    y_pred_raw = tf.maximum(y_pred_raw, 0.0)
    return tf.reduce_mean(tf.abs(y_true_raw - y_pred_raw))

Making Model

In [None]:
# Seed the random number generators so weight initialisation is repeatable.
keras.utils.set_random_seed(CONFIGURATION["RANDOM_SEED"])

# 1) Architecture: three ReLU hidden layers and a single linear output unit.
D = int(X.shape[1])  # number of input features

inputs = keras.Input(shape=(D,), dtype=tf.float32)
x = layers.Dense(128, activation="relu")(inputs)
x = layers.Dense(64, activation="relu")(x)
x = layers.Dense(32, activation="relu")(x)
outputs = layers.Dense(1)(x)

# 2) Compile. The target is log1p(charges), so the MSE loss and the "mae"
#    metric are both measured in log space.
model = keras.Model(inputs, outputs)
model.compile(
    optimizer=keras.optimizers.Adam(
        learning_rate=CONFIGURATION["INITIAL_LEARNING_RATE"]
    ),
    loss="mse",
    metrics=[keras.metrics.MeanAbsoluteError(name="mae")],
)

# 3) Train. Hyper-parameters come from CONFIGURATION instead of being
#    hard-coded; training stops once val_loss has not improved for PATIENCE
#    epochs and the best weights are restored.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=CONFIGURATION["PATIENCE"],
    restore_best_weights=True,
)
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=CONFIGURATION["EPOCHS"],
    callbacks=[early_stopping],
)

Evaluation

In [None]:
# Training and validation loss per epoch on a single set of axes.
fig, ax = plt.subplots()
for series_name in ("loss", "val_loss"):
    ax.plot(history.history[series_name])
ax.set_title("model loss")
ax.set_ylabel("loss")
ax.set_xlabel("epoch")
ax.legend(["train", "val_loss"])
plt.show()

In [None]:
def get_pred_raw(model, ds):
    """Predict on ``ds`` and map the log1p-space outputs back to the raw scale.

    Args:
        model: Object exposing ``predict(ds)`` that returns an array of
            log1p-space predictions.
        ds: Input passed straight to ``model.predict``.

    Returns:
        Array with the same shape as the model output, holding
        ``expm1(prediction)`` clamped at zero.
    """
    y_pred_log = model.predict(ds)
    # Clamp at zero, matching mae_raw_from_log: a log-space prediction below
    # zero would otherwise map to a negative raw value.
    return np.maximum(np.expm1(y_pred_log), 0.0)


def get_pred_eval(model, ds):
    """Run ``model.evaluate`` on ``ds`` and return its result unchanged."""
    results = model.evaluate(ds)
    return results


def get_mae_rmse(y_pred, y_true):
    """Compute MAE and RMSE between predictions and targets.

    Both inputs are flattened first, so a ``(n, 1)`` prediction array and a
    ``(n,)`` target vector are compared element-wise rather than being
    broadcast into an ``(n, n)`` matrix of differences.

    Args:
        y_pred: Predicted values (array-like).
        y_true: Reference values (array-like).

    Returns:
        Tuple ``(mae, rmse)``.
    """
    errors = np.asarray(y_pred).reshape(-1) - np.asarray(y_true).reshape(-1)
    mae = np.mean(np.abs(errors))
    rmse = np.sqrt(np.mean(errors ** 2))
    return mae, rmse

In [None]:
# Evaluate on the test dataset. The targets are log1p(charges), so the
# reported loss and MAE are in log space.
get_pred_eval(model, test_ds)