# Notebook 기본 세팅

In [1]:
# Constant declarations

# Name of the marker file used to identify the project root directory.
ROOT_MARKER = "pyproject.toml"

# File name of the NanumBarunGothic font used to display Korean text.
# matplotlib's font_manager has to be given the location of the actual font file.
KOREAN_FONT_FILE = "NanumBarunGothic.ttf"

# matplotlib selects a font by its font-family name,
# so this is the family name of that file rather than the file itself.
KOREAN_FONT_FAMILY = "NanumBarunGothic"

# Note
# Font family vs. font file:
# a font family is a group of typefaces that share similar design characteristics.
#
# For example, the 'NanumBarunGothic' family can include several styles such as Regular, Bold and Italic.
# A font file (.ttf, .otf, ...), on the other hand, is the actual file that stores one style of such a font.
#
# To keep the font size small, this project uses only the Regular style, NanumBarunGothic.ttf.

In [2]:
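# Locate the project root (the directory that contains ROOT_MARKER) and keep it in ROOT_DIR.
# NOTE(review): the original intent was to add the root to sys.path to simplify imports,
# but nothing in this cell modifies sys.path - confirm that `src` is importable some other way.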
from pathlib import Path


def find_project_root() -> Path:
    """
    Locate the project root by searching upward for the ROOT_MARKER file.

    The search starts at the resolved current working directory and then visits
    every ancestor in turn, including the filesystem root itself.

    :return: Path of the closest directory that contains ROOT_MARKER.
    :raises FileNotFoundError: if none of the visited directories contains ROOT_MARKER.
    """
    start = Path().resolve()

    # Path.parents ends with (and includes) the filesystem root, so the root
    # directory is checked too; a `while path != path.parent` loop exits before
    # ever looking inside it.
    for candidate in (start, *start.parents):
        if (candidate / ROOT_MARKER).exists():
            return candidate

    raise FileNotFoundError("프로젝트 루트 디렉토리를 찾을 수 없습니다.")


# Resolve the project root once, when this cell runs; later cells build paths from it.
ROOT_DIR = find_project_root()

In [3]:
# matplotlib 의 한글 font 설정
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt


# Directory, relative to the project root, from which the Korean font file is loaded.
FONTS_DATA_DIR = ROOT_DIR / "notebooks" / "fonts"


def setup_korean_font():
    """
    Register the Korean font file with matplotlib and select it for plots.

    Adds FONTS_DATA_DIR / KOREAN_FONT_FILE to matplotlib's font manager, then sets
    the "font.family" rcParam to KOREAN_FONT_FAMILY and "axes.unicode_minus" to False.
    """
    fm.fontManager.addfont(FONTS_DATA_DIR / KOREAN_FONT_FILE)

    # Font settings
    plt.rcParams.update(
        {
            "font.family": KOREAN_FONT_FAMILY,
            "axes.unicode_minus": False,
        }
    )


# Apply the Korean font configuration as soon as this cell runs.
setup_korean_font()

# Automated Pipeline 에 필요한 함수들 구현해보기

- data 준비
- wandb 통합
- 여러 모델 학습 가능하도록 하기
- 학습된 모델 평가 지표 뽑기
- 학습된 모델 평가하기
- 최종 모델 선택하기

## Dataset 준비

In [4]:
from sklearn.model_selection import train_test_split


# Storage key of the feature dataset that prepare_data() reads and splits.
FEATURE_DATASET_STORAGE_KEY = "features/20250531-41872183-2035-4ee9-9dc0-fae636081fb8/weather-features.csv"
# Column that prepare_data() separates from the features and uses as the target (y).
TARGET_COLUMN = "weather"


def prepare_data(random_state: int) -> dict[str, str]:
    """
    Split the feature dataset into train (60%), validation (20%) and test (20%) parts,
    upload every part to cloud storage, and return the resulting storage keys.

    :param random_state: seed forwarded to both train_test_split calls.
    :return: mapping from split name ("train_x", "val_x", "test_x", "train_y",
        "val_y", "test_y") to the storage key returned by the upload.
    """
    import uuid
    from pathlib import Path

    from src.libs.storage import Storage

    storage = Storage.create()
    dataset = storage.read_as_dataframe(FEATURE_DATASET_STORAGE_KEY)

    inputs = dataset.drop(TARGET_COLUMN, axis=1)
    target = dataset[TARGET_COLUMN]

    # Step 1: hold out 20% as the test set; the other 80% is train + validation.
    rest_x, test_x, rest_y, test_y = train_test_split(inputs, target, test_size=0.2, random_state=random_state)

    # Step 2: take 25% of the remainder as the validation set (75% stays for training).
    train_x, val_x, train_y, val_y = train_test_split(rest_x, rest_y, test_size=0.25, random_state=random_state)

    # Overall: 60% train, 20% validation, 20% test of the original data.
    splits = {
        "train_x": train_x,
        "val_x": val_x,
        "test_x": test_x,
        "train_y": train_y,
        "val_y": val_y,
        "test_y": test_y,
    }

    # All parts of one split share a single, freshly generated sub-directory.
    dataset_directory = Path(FEATURE_DATASET_STORAGE_KEY).parent.name
    sub_directory = f"{dataset_directory}/split-{uuid.uuid4()}"

    return {
        split_name: storage.upload_feature_df(frame, filename=split_name, sub_directory=sub_directory)
        for split_name, frame in splits.items()
    }

In [5]:
# Split the dataset once with a fixed seed; the trailing expression displays the returned keys.
dataset_keys = prepare_data(random_state=42)
dataset_keys

[2025-06-05 23:18:49] INFO [src.libs.storage._check_and_log_response] Success to read features/20250531-41872183-2035-4ee9-9dc0-fae636081fb8/weather-features.csv
[2025-06-05 23:18:51] INFO [src.libs.storage._check_and_log_response] Success to upload features/20250531-41872183-2035-4ee9-9dc0-fae636081fb8/split-db42ad24-04a9-4a97-a474-09eb5adc2516/train_x.csv
[2025-06-05 23:18:51] INFO [src.libs.storage._check_and_log_response] Success to upload features/20250531-41872183-2035-4ee9-9dc0-fae636081fb8/split-db42ad24-04a9-4a97-a474-09eb5adc2516/val_x.csv
[2025-06-05 23:18:52] INFO [src.libs.storage._check_and_log_response] Success to upload features/20250531-41872183-2035-4ee9-9dc0-fae636081fb8/split-db42ad24-04a9-4a97-a474-09eb5adc2516/test_x.csv
[2025-06-05 23:18:52] INFO [src.libs.storage._check_and_log_response] Success to upload features/20250531-41872183-2035-4ee9-9dc0-fae636081fb8/split-db42ad24-04a9-4a97-a474-09eb5adc2516/train_y.csv
[2025-06-05 23:18:52] INFO [src.libs.storage._che

{'train_x': 'features/20250531-41872183-2035-4ee9-9dc0-fae636081fb8/split-db42ad24-04a9-4a97-a474-09eb5adc2516/train_x.csv',
 'val_x': 'features/20250531-41872183-2035-4ee9-9dc0-fae636081fb8/split-db42ad24-04a9-4a97-a474-09eb5adc2516/val_x.csv',
 'test_x': 'features/20250531-41872183-2035-4ee9-9dc0-fae636081fb8/split-db42ad24-04a9-4a97-a474-09eb5adc2516/test_x.csv',
 'train_y': 'features/20250531-41872183-2035-4ee9-9dc0-fae636081fb8/split-db42ad24-04a9-4a97-a474-09eb5adc2516/train_y.csv',
 'val_y': 'features/20250531-41872183-2035-4ee9-9dc0-fae636081fb8/split-db42ad24-04a9-4a97-a474-09eb5adc2516/val_y.csv',
 'test_y': 'features/20250531-41872183-2035-4ee9-9dc0-fae636081fb8/split-db42ad24-04a9-4a97-a474-09eb5adc2516/test_y.csv'}

## Weights & Biases 의 API 사용해보기

In [6]:
from src.evaluation.metrics import evaluate_model
from src.libs.storage import Storage
from src.models.random_forest import RandomForestModel
from src.tracker.wandb import WandbTracker


def poc_wandb():
    """
    Proof of concept: train, evaluate and register a random-forest model in one tracked run.

    Reads the train/validation splits referenced by the module-level ``dataset_keys``,
    fits a RandomForestModel with its default parameters, logs the validation metrics,
    and registers the fitted model together with its metadata.

    :return: the value returned by ``tracker.register_model``.
    """
    storage = Storage.create()
    tracker = WandbTracker.create()

    train_x = storage.read_as_dataframe(dataset_keys["train_x"])
    val_x = storage.read_as_dataframe(dataset_keys["val_x"])
    # Flatten the (presumably single-column) target frames to 1-D arrays, the same
    # way train() and evaluate() prepare their targets before fit / evaluate_model.
    train_y = storage.read_as_dataframe(dataset_keys["train_y"]).to_numpy().ravel()
    val_y = storage.read_as_dataframe(dataset_keys["val_y"]).to_numpy().ravel()

    model_params = RandomForestModel.default_params()

    tracker.start_experiment(
        experiment_name="train-as-random-forest", params=model_params, job_type="training", tags=["weather"]
    )
    model = RandomForestModel(model_params)
    model.fit(train_x, train_y)

    pred_y = model.predict(val_x)
    metrics = evaluate_model(val_y, pred_y)

    tracker.log_metrics(metrics)

    # Record where the data came from so the registered model can be traced back to it.
    dataset_metadata = {"storage": {"name": "ncloud", "bucket": storage.bucket, "dataset_keys": dataset_keys}}
    model_ref = tracker.register_model(
        model,
        model_name="random-forest",
        metadata={
            "framework": "sklearn",
            "params": model_params,
            "dataset": dataset_metadata,
            "metrics": metrics,
        },
    )
    tracker.end_experiment()
    return model_ref

## train 용 함수 만들기

In [7]:
def train(train_x_key: str, train_y_key: str, experiment_name: str) -> tuple[str, str]:
    """
    Fit a random-forest model with its default parameters inside a new tracked experiment
    and register the fitted model.

    :param train_x_key: storage key of the training features.
    :param train_y_key: storage key of the training targets.
    :param experiment_name: name given to the experiment that is started.
    :return: ``(run_id, model_reference)`` - the tracker's run id and the value
        returned by ``tracker.register_model``.
    """
    from src.libs.storage import Storage
    from src.models.random_forest import RandomForestModel
    from src.tracker.wandb import WandbTracker

    storage = Storage.create()
    features = storage.read_as_dataframe(train_x_key)
    # The targets are flattened to a 1-D array before fitting.
    targets_frame = storage.read_as_dataframe(train_y_key)
    targets = targets_frame.to_numpy().ravel()

    tracker = WandbTracker.create()
    hyperparams = RandomForestModel.default_params()

    tracker.start_experiment(experiment_name=experiment_name, params=hyperparams, job_type="training")
    run_id = tracker.get_run_id()

    model = RandomForestModel(hyperparams)
    model.fit(features, targets)

    # Metadata stored with the registered model: where the training data lives
    # and which parameters were used.
    storage_info = {
        "name": "ncloud",
        "bucket": storage.bucket,
        "train_x": train_x_key,
        "train_y": train_y_key,
    }
    registration_metadata = {
        "framework": "sklearn",
        "datasets": {"storage": storage_info},
        "params": hyperparams,
    }
    model_reference = tracker.register_model(
        model,
        model_name="random-forest",
        metadata=registration_metadata,
    )

    tracker.end_experiment()
    return run_id, model_reference

In [8]:
# Train on the training split; the trailing expression displays the returned (run id, model reference) pair.
wandb_run_id, model_artifact_ref = train(
    dataset_keys["train_x"], dataset_keys["train_y"], "random-forest-default-params"
)
wandb_run_id, model_artifact_ref

[2025-06-05 23:18:53] INFO [src.libs.storage._check_and_log_response] Success to read features/20250531-41872183-2035-4ee9-9dc0-fae636081fb8/split-db42ad24-04a9-4a97-a474-09eb5adc2516/train_x.csv
[2025-06-05 23:18:54] INFO [src.libs.storage._check_and_log_response] Success to read features/20250531-41872183-2035-4ee9-9dc0-fae636081fb8/split-db42ad24-04a9-4a97-a474-09eb5adc2516/train_y.csv
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/joyuiyeong/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjoyuiyeong[0m ([33mjandar-tech[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


('qgrnpnp9', 'random-forest:v5')

## evaluate 용 함수 만들기

In [9]:
def evaluate(val_x_key: str, val_y_key: str, run_id: str, model_ref: str) -> tuple[str, str]:
    """
    Score a registered model on the validation split and log the metrics to an existing run.

    :param val_x_key: storage key of the validation features.
    :param val_y_key: storage key of the validation targets.
    :param run_id: id of the run to resume (with job type "evaluation").
    :param model_ref: reference of the model to load through the tracker.
    :return: ``(run_id, model_ref)``, passed through unchanged.
    """
    from src.evaluation.metrics import evaluate_model
    from src.libs.storage import Storage
    from src.tracker.wandb import WandbTracker

    storage = Storage.create()
    features = storage.read_as_dataframe(val_x_key)
    # The targets are flattened to a 1-D array before being compared with the predictions.
    expected = storage.read_as_dataframe(val_y_key).to_numpy().ravel()

    tracker = WandbTracker.create()
    tracker.resume_experiment(run_id, job_type="evaluation")

    predicted = tracker.load_model(model_ref).predict(features)
    tracker.log_metrics(evaluate_model(expected, predicted))
    tracker.end_experiment()

    return run_id, model_ref

In [10]:
# Evaluate the registered model on the validation split, passing along the run id returned by train().
wandb_run_id, model_artifact_ref = evaluate(
    dataset_keys["val_x"],
    dataset_keys["val_y"],
    wandb_run_id,
    model_artifact_ref,
)
wandb_run_id, model_artifact_ref

[2025-06-05 23:19:04] INFO [src.libs.storage._check_and_log_response] Success to read features/20250531-41872183-2035-4ee9-9dc0-fae636081fb8/split-db42ad24-04a9-4a97-a474-09eb5adc2516/val_x.csv
[2025-06-05 23:19:04] INFO [src.libs.storage._check_and_log_response] Success to read features/20250531-41872183-2035-4ee9-9dc0-fae636081fb8/split-db42ad24-04a9-4a97-a474-09eb5adc2516/val_y.csv


[34m[1mwandb[0m:   1 of 1 files downloaded.  


0,1
mae,▁
r2,▁
rmse,▁

0,1
mae,0.03055
r2,0.99623
rmse,0.06381


('qgrnpnp9', 'random-forest:v5')

## test 용 함수 만들기

In [11]:
def test(test_x_key: str, test_y_key: str, run_id: str, model_ref: str) -> tuple[str, str]:
    """
    Score a registered model on the test split and log the metrics to an existing run.

    :param test_x_key: storage key of the test features.
    :param test_y_key: storage key of the test targets.
    :param run_id: id of the run to resume (with job type "test").
    :param model_ref: reference of the model to load through the tracker.
    :return: ``(run_id, model_ref)``, passed through unchanged.
    """
    from src.evaluation.metrics import evaluate_model
    from src.libs.storage import Storage
    from src.tracker.wandb import WandbTracker

    storage = Storage.create()
    features = storage.read_as_dataframe(test_x_key)
    targets_frame = storage.read_as_dataframe(test_y_key)
    # The targets are flattened to a 1-D array before being compared with the predictions.
    expected = targets_frame.to_numpy().ravel()

    tracker = WandbTracker.create()
    tracker.resume_experiment(run_id, job_type="test")

    loaded_model = tracker.load_model(model_ref)
    predicted = loaded_model.predict(features)

    # NOTE(review): these metrics go to the same run that evaluate() logs to, through the
    # same evaluate_model/log_metrics calls - confirm validation and test values stay
    # distinguishable in the tracker.
    scores = evaluate_model(expected, predicted)
    tracker.log_metrics(scores)

    tracker.end_experiment()
    return run_id, model_ref

In [12]:
# Score the same model on the held-out test split, again passing along the same run id.
wandb_run_id, model_artifact_ref = test(
    dataset_keys["test_x"],
    dataset_keys["test_y"],
    wandb_run_id,
    model_artifact_ref,
)
wandb_run_id, model_artifact_ref

[2025-06-05 23:19:59] INFO [src.libs.storage._check_and_log_response] Success to read features/20250531-41872183-2035-4ee9-9dc0-fae636081fb8/split-db42ad24-04a9-4a97-a474-09eb5adc2516/test_x.csv
[2025-06-05 23:19:59] INFO [src.libs.storage._check_and_log_response] Success to read features/20250531-41872183-2035-4ee9-9dc0-fae636081fb8/split-db42ad24-04a9-4a97-a474-09eb5adc2516/test_y.csv


[34m[1mwandb[0m:   1 of 1 files downloaded.  


0,1
mae,▁
r2,▁
rmse,▁

0,1
mae,0.03059
r2,0.99594
rmse,0.06603


('qgrnpnp9', 'random-forest:v5')