In [1]:
# Constant 선언

# 프로젝트 루트 디렉토리를 식별하기 위한 마커 파일 이름
ROOT_MARKER = "pyproject.toml"

# 한글 표시를 위한 나눔바른고딕 폰트 파일 이름
# matplotlib 의 font_manager 에 실제 폰트 파일의 위치를 넣어주어야 한다.
KOREAN_FONT_FILE = "NanumBarunGothic.ttf"

# matplotlib 에서는 font-family 의 이름으로 font 를 설정한다.
# 그래서 font 파일 그 자체가 아니라, 그 파일의 family 이름을 적어준다.
KOREAN_FONT_FAMILY = "NanumBarunGothic"

# 참고
# Font Family 와 Font File 의 차이는,
# Font Family 는 비슷한 디자인 특성을 공유하는 글꼴 그룹을 의미한다.
#
# 예를 들어 '나눔바른고딕' 폰트 패밀리는 일반(Regular), 굵게(Bold), 기울임(Italic) 등 여러 스타일을 포함할 수 있다.
# 반면, 폰트 파일(.ttf, .otf 등)은 이러한 폰트의 하나의 스타일이 저장된 실제 파일이다.
#
# 이 프로젝트에서는 폰트 용량을 줄이기 위해 일반(Regular) 인 NanumBarunGothic.ttf 만 사용한다.

In [2]:
# 프로젝트 root 를 sys.path 에 추가해서 import 구문을 사용하기 쉽게
import sys
from pathlib import Path


def find_project_root() -> Path:
    """
    pyproject.toml 파일을 기준으로 루트 디렉토리를 찾는다.
    :return: Path: 프로젝트 루트 디렉토리 경로
    """

    current_path = Path().resolve()

    while current_path != current_path.parent:
        if (current_path / ROOT_MARKER).exists():
            return current_path

        current_path = current_path.parent

    raise FileNotFoundError("프로젝트 루트 디렉토리를 찾을 수 없습니다.")


ROOT_DIR = find_project_root()

sys.path.append(str(ROOT_DIR))

In [3]:
# matplotlib 의 한글 font 설정
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt


FONTS_DATA_DIR = ROOT_DIR / "notebooks" / "fonts"


def setup_korean_font():
    font_path = FONTS_DATA_DIR / KOREAN_FONT_FILE
    fm.fontManager.addfont(font_path)

    # 폰트 설정
    plt.rcParams["font.family"] = KOREAN_FONT_FAMILY
    plt.rcParams["axes.unicode_minus"] = False


setup_korean_font()

In [4]:
import pandas as pd

from src.data.loader import AsosDataLoader
from src.libs.storage import Storage


storage = Storage.create()

loader = AsosDataLoader(storage)

In [5]:
df_per_station = loader.load()

[2025-06-06 03:21:42] INFO [src.libs.storage._check_and_log_response] Success to retrieve 
[2025-06-06 03:21:42] INFO [src.data.loader.load] Result of retrieve_in_datasets
[2025-06-06 03:21:42] INFO [src.libs.storage._check_and_log_response] Success to read datasets/20250531/20200601-20250531-100-daegwallyeong.csv
[2025-06-06 03:21:42] INFO [src.libs.storage._check_and_log_response] Success to read datasets/20250531/20200601-20250531-101-chuncheon.csv
[2025-06-06 03:21:42] INFO [src.libs.storage._check_and_log_response] Success to read datasets/20250531/20200601-20250531-102-baengnyeongdo.csv
[2025-06-06 03:21:42] INFO [src.libs.storage._check_and_log_response] Success to read datasets/20250531/20200601-20250531-105-gangneung.csv
[2025-06-06 03:21:42] INFO [src.libs.storage._check_and_log_response] Success to read datasets/20250531/20200601-20250531-106-donghae.csv
[2025-06-06 03:21:42] INFO [src.libs.storage._check_and_log_response] Success to read datasets/20250531/20200601-20250531-

In [6]:
total_df = pd.concat(list(df_per_station.values()))
total_df

Unnamed: 0,stn_id,stn_nm,tm,avg_ta,min_ta,min_ta_hrmt,max_ta,max_ta_hrmt,sum_rn_dur,mi10_max_rn,...,avg_m05_te,avg_m10_te,avg_m15_te,avg_m30_te,avg_m50_te,sum_lrg_ev,sum_sml_ev,n99_rn,iscs,sum_fog_dur
0,100,대관령,2020-06-01,16.2,11.3,2244.0,21.1,1502.0,,0.0,...,,,,,,5.7,8.2,,,
1,100,대관령,2020-06-02,15.4,9.1,600.0,20.6,1508.0,,,...,,,,,,3.8,5.4,,,
2,100,대관령,2020-06-03,19.2,13.6,312.0,26.2,1441.0,,,...,,,,,,5.8,8.2,,,
3,100,대관령,2020-06-04,20.4,15.3,402.0,25.5,1512.0,,,...,,,,,,5.9,8.4,,,
4,100,대관령,2020-06-05,15.5,10.3,2352.0,23.3,1102.0,,0.0,...,,,,,,3.0,4.3,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1821,99,파주,2025-05-27,17.9,10.2,524.0,25.8,1443.0,,,...,,,,,,4.2,5.9,,,
1822,99,파주,2025-05-28,18.6,12.5,538.0,25.9,1440.0,,,...,,,,,,4.2,6.0,,,
1823,99,파주,2025-05-29,19.0,12.8,349.0,26.8,1351.0,,,...,,,,,,4.1,5.8,,,
1824,99,파주,2025-05-30,19.6,12.6,455.0,27.5,1437.0,,,...,,,,,,4.8,6.9,,,


In [7]:
from src.data.imputer import WeatherDataImputer


imputer = WeatherDataImputer()
filled_df = imputer.transform(total_df)
filled_df

Unnamed: 0,stn_id,stn_nm,tm,avg_ta,min_ta,min_ta_hrmt,max_ta,max_ta_hrmt,mi10_max_rn,mi10_max_rn_hrmt,...,max_ps_hrmt,min_ps,min_ps_hrmt,avg_ps,ss_dur,sum_ss_hr,avg_tca,avg_lmac,avg_ts,min_tg
0,100,대관령,2020-06-01,22.0,16.8,2244.0,27.8,1502.0,0.0,0.0,...,2243.0,1004.1,1501.0,1009.4,14.4,11.2,4.4,1.8,26.4,6.8
1,100,대관령,2020-06-02,20.8,14.2,600.0,27.5,1508.0,0.0,0.0,...,559.0,1002.7,1716.0,1008.2,14.4,10.5,4.1,1.9,26.4,6.5
2,100,대관령,2020-06-03,21.2,16.7,312.0,25.2,1441.0,0.0,0.0,...,332.0,997.9,1636.0,1005.0,14.4,1.1,8.6,5.5,24.8,13.3
3,100,대관령,2020-06-04,24.7,20.5,402.0,29.8,1512.0,0.0,0.0,...,2344.0,999.3,1513.0,1004.2,14.4,12.1,5.1,0.9,31.4,12.3
4,100,대관령,2020-06-05,23.8,19.0,2352.0,29.7,1102.0,0.0,0.0,...,2338.0,1004.2,54.0,1006.6,14.4,9.9,6.3,0.1,29.4,7.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1821,99,파주,2025-05-27,18.7,13.1,524.0,25.3,1443.0,0.0,0.0,...,745.0,1016.6,1638.0,1018.7,14.3,11.7,0.0,0.0,26.7,9.9
1822,99,파주,2025-05-28,19.1,14.3,538.0,25.9,1440.0,0.0,0.0,...,735.0,1016.6,1546.0,1018.0,14.3,11.3,2.8,0.1,27.3,12.0
1823,99,파주,2025-05-29,19.2,14.0,349.0,26.7,1351.0,0.0,0.0,...,12.0,1013.2,1729.0,1015.0,14.3,9.7,1.9,1.5,28.6,12.3
1824,99,파주,2025-05-30,18.0,14.3,455.0,24.5,1437.0,0.0,0.0,...,736.0,1011.1,1732.0,1013.0,14.3,11.1,0.6,0.0,27.9,11.5


In [8]:
from src.data.labeler import WeatherLabeler


labeler = WeatherLabeler()
labeled_df = labeler.fit_transform(filled_df)
labeled_df

Unnamed: 0,stn_id,stn_nm,tm,avg_ta,min_ta,min_ta_hrmt,max_ta,max_ta_hrmt,mi10_max_rn,mi10_max_rn_hrmt,...,min_ps,min_ps_hrmt,avg_ps,ss_dur,sum_ss_hr,avg_tca,avg_lmac,avg_ts,min_tg,weather
0,100,대관령,2020-06-01,22.0,16.8,2244.0,27.8,1502.0,0.0,0.0,...,1004.1,1501.0,1009.4,14.4,11.2,4.4,1.8,26.4,6.8,알수없음
1,100,대관령,2020-06-02,20.8,14.2,600.0,27.5,1508.0,0.0,0.0,...,1002.7,1716.0,1008.2,14.4,10.5,4.1,1.9,26.4,6.5,알수없음
2,100,대관령,2020-06-03,21.2,16.7,312.0,25.2,1441.0,0.0,0.0,...,997.9,1636.0,1005.0,14.4,1.1,8.6,5.5,24.8,13.3,흐림
3,100,대관령,2020-06-04,24.7,20.5,402.0,29.8,1512.0,0.0,0.0,...,999.3,1513.0,1004.2,14.4,12.1,5.1,0.9,31.4,12.3,알수없음
4,100,대관령,2020-06-05,23.8,19.0,2352.0,29.7,1102.0,0.0,0.0,...,1004.2,54.0,1006.6,14.4,9.9,6.3,0.1,29.4,7.5,알수없음
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1821,99,파주,2025-05-27,18.7,13.1,524.0,25.3,1443.0,0.0,0.0,...,1016.6,1638.0,1018.7,14.3,11.7,0.0,0.0,26.7,9.9,맑음
1822,99,파주,2025-05-28,19.1,14.3,538.0,25.9,1440.0,0.0,0.0,...,1016.6,1546.0,1018.0,14.3,11.3,2.8,0.1,27.3,12.0,맑음
1823,99,파주,2025-05-29,19.2,14.0,349.0,26.7,1351.0,0.0,0.0,...,1013.2,1729.0,1015.0,14.3,9.7,1.9,1.5,28.6,12.3,맑음
1824,99,파주,2025-05-30,18.0,14.3,455.0,24.5,1437.0,0.0,0.0,...,1011.1,1732.0,1013.0,14.3,11.1,0.6,0.0,27.9,11.5,맑음


In [9]:
from src.data.handler import WeatherDataOutlierHandler


outlier_handler = WeatherDataOutlierHandler()
processed_df = outlier_handler.fit_transform(labeled_df)
processed_df

Unnamed: 0,stn_id,stn_nm,tm,avg_ta,min_ta,min_ta_hrmt,max_ta,max_ta_hrmt,mi10_max_rn,mi10_max_rn_hrmt,...,min_ps,min_ps_hrmt,avg_ps,ss_dur,sum_ss_hr,avg_tca,avg_lmac,avg_ts,min_tg,weather
0,100,대관령,2020-06-01,22.0,16.8,1340.0,27.8,1502.0,0.0,0.0,...,1004.1,1501.0,1009.4,14.4,11.2,4.4,1.8,26.4,6.8,알수없음
1,100,대관령,2020-06-02,20.8,14.2,600.0,27.5,1508.0,0.0,0.0,...,1002.7,1716.0,1008.2,14.4,10.5,4.1,1.9,26.4,6.5,알수없음
2,100,대관령,2020-06-03,21.2,16.7,312.0,25.2,1441.0,0.0,0.0,...,997.9,1636.0,1005.0,14.4,1.1,8.6,5.5,24.8,13.3,흐림
3,100,대관령,2020-06-04,24.7,20.5,402.0,29.8,1512.0,0.0,0.0,...,999.3,1513.0,1004.2,14.4,12.1,5.1,0.9,31.4,12.3,알수없음
4,100,대관령,2020-06-05,23.8,19.0,1340.0,29.7,1102.0,0.0,0.0,...,1004.2,54.0,1006.6,14.4,9.9,6.3,0.1,29.4,7.5,알수없음
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1821,99,파주,2025-05-27,18.7,13.1,524.0,25.3,1443.0,0.0,0.0,...,1016.6,1638.0,1018.7,14.3,11.7,0.0,0.0,26.7,9.9,맑음
1822,99,파주,2025-05-28,19.1,14.3,538.0,25.9,1440.0,0.0,0.0,...,1016.6,1546.0,1018.0,14.3,11.3,2.8,0.1,27.3,12.0,맑음
1823,99,파주,2025-05-29,19.2,14.0,349.0,26.7,1351.0,0.0,0.0,...,1013.2,1729.0,1015.0,14.3,9.7,1.9,1.5,28.6,12.3,맑음
1824,99,파주,2025-05-30,18.0,14.3,455.0,24.5,1437.0,0.0,0.0,...,1011.1,1732.0,1013.0,14.3,11.1,0.6,0.0,27.9,11.5,맑음


In [10]:
processed_df["tm"].head

<bound method NDFrame.head of 0       2020-06-01
1       2020-06-02
2       2020-06-03
3       2020-06-04
4       2020-06-05
           ...    
1821    2025-05-27
1822    2025-05-28
1823    2025-05-29
1824    2025-05-30
1825    2025-05-31
Name: tm, Length: 166156, dtype: object>

In [11]:
from src.data.transformer import WeatherDataTransformer


transformer = WeatherDataTransformer()
features = transformer.fit_transform(processed_df)
features

 np.int64(90)]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  result.loc[non_null_mask, column] = self.label_encoders[column].transform(
 np.int64(489)]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  result.loc[non_null_mask, column] = self.label_encoders[column].transform(
 np.int64(235)]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  result.loc[non_null_mask, column] = self.label_encoders[column].transform(
 np.int64(0)]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  result.loc[non_null_mask, column] = self.label_encoders[column].transform(
 np.int64(0)]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  result.loc[non_null_mask, column] = self.label_encoders[column].transform(
 np.int64(74)]' has dtype incompatible with float64, please explicitly cast to a compatible 

Unnamed: 0,stn_id,stn_nm,tm,avg_ta,min_ta,min_ta_hrmt,max_ta,max_ta_hrmt,mi10_max_rn,mi10_max_rn_hrmt,...,min_ps,min_ps_hrmt,avg_ps,ss_dur,sum_ss_hr,avg_tca,avg_lmac,avg_ts,min_tg,weather
0,0,17,20200601,22.0,16.8,260,27.8,302,0.0,0,...,1004.1,358,1009.4,14.4,11.2,4.4,1.8,26.4,6.8,3
1,0,17,20200602,20.8,14.2,577,27.5,308,0.0,0,...,1002.7,504,1008.2,14.4,10.5,4.1,1.9,26.4,6.5,3
2,0,17,20200603,21.2,16.7,378,25.2,281,0.0,0,...,997.9,463,1005.0,14.4,1.1,8.6,5.5,24.8,13.3,4
3,0,17,20200604,24.7,20.5,438,29.8,312,0.0,0,...,999.3,371,1004.2,14.4,12.1,5.1,0.9,31.4,12.3,3
4,0,17,20200605,23.8,19.0,260,29.7,62,0.0,0,...,1004.2,1169,1006.6,14.4,9.9,6.3,0.1,29.4,7.5,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1821,90,83,20250527,18.7,13.1,533,25.3,283,0.0,0,...,1016.6,465,1018.7,14.3,11.7,0.0,0.0,26.7,9.9,1
1822,90,83,20250528,19.1,14.3,548,25.9,280,0.0,0,...,1016.6,407,1018.0,14.3,11.3,2.8,0.1,27.3,12.0,1
1823,90,83,20250529,19.2,14.0,418,26.7,231,0.0,0,...,1013.2,517,1015.0,14.3,9.7,1.9,1.5,28.6,12.3,1
1824,90,83,20250530,18.0,14.3,496,24.5,277,0.0,0,...,1011.1,520,1013.0,14.3,11.1,0.6,0.0,27.9,11.5,1


In [12]:
features["tm"].head

<bound method NDFrame.head of 0       20200601
1       20200602
2       20200603
3       20200604
4       20200605
          ...   
1821    20250527
1822    20250528
1823    20250529
1824    20250530
1825    20250531
Name: tm, Length: 166156, dtype: int64>

In [13]:
features_copy = features.copy()
features_copy.head()

Unnamed: 0,stn_id,stn_nm,tm,avg_ta,min_ta,min_ta_hrmt,max_ta,max_ta_hrmt,mi10_max_rn,mi10_max_rn_hrmt,...,min_ps,min_ps_hrmt,avg_ps,ss_dur,sum_ss_hr,avg_tca,avg_lmac,avg_ts,min_tg,weather
0,0,17,20200601,22.0,16.8,260,27.8,302,0.0,0,...,1004.1,358,1009.4,14.4,11.2,4.4,1.8,26.4,6.8,3
1,0,17,20200602,20.8,14.2,577,27.5,308,0.0,0,...,1002.7,504,1008.2,14.4,10.5,4.1,1.9,26.4,6.5,3
2,0,17,20200603,21.2,16.7,378,25.2,281,0.0,0,...,997.9,463,1005.0,14.4,1.1,8.6,5.5,24.8,13.3,4
3,0,17,20200604,24.7,20.5,438,29.8,312,0.0,0,...,999.3,371,1004.2,14.4,12.1,5.1,0.9,31.4,12.3,3
4,0,17,20200605,23.8,19.0,260,29.7,62,0.0,0,...,1004.2,1169,1006.6,14.4,9.9,6.3,0.1,29.4,7.5,3


In [14]:
features_copy["year"] = pd.to_datetime(features_copy["tm"], format="%Y%m%d").dt.year
features_copy["year"].head()

0    2020
1    2020
2    2020
3    2020
4    2020
Name: year, dtype: int32

In [15]:
from data.selection import Spliter


prep = Spliter()

In [16]:
train_df, val_df, test_df = prep.split(features_copy)

[2025-06-06 03:27:50] INFO [src.models.prepared_data.split_dataset] Data split - Train: 85902, Val: 33210, Test: 33303


In [17]:
features.head()

Unnamed: 0,stn_id,stn_nm,tm,avg_ta,min_ta,min_ta_hrmt,max_ta,max_ta_hrmt,mi10_max_rn,mi10_max_rn_hrmt,...,min_ps,min_ps_hrmt,avg_ps,ss_dur,sum_ss_hr,avg_tca,avg_lmac,avg_ts,min_tg,weather
0,0,17,20200601,22.0,16.8,260,27.8,302,0.0,0,...,1004.1,358,1009.4,14.4,11.2,4.4,1.8,26.4,6.8,3
1,0,17,20200602,20.8,14.2,577,27.5,308,0.0,0,...,1002.7,504,1008.2,14.4,10.5,4.1,1.9,26.4,6.5,3
2,0,17,20200603,21.2,16.7,378,25.2,281,0.0,0,...,997.9,463,1005.0,14.4,1.1,8.6,5.5,24.8,13.3,4
3,0,17,20200604,24.7,20.5,438,29.8,312,0.0,0,...,999.3,371,1004.2,14.4,12.1,5.1,0.9,31.4,12.3,3
4,0,17,20200605,23.8,19.0,260,29.7,62,0.0,0,...,1004.2,1169,1006.6,14.4,9.9,6.3,0.1,29.4,7.5,3


In [None]:
X_train, y_train = prep.prepare_features_target(train_df)
X_val, y_val = prep.prepare_features_target(val_df)
X_test, y_test = prep.prepare_features_target(test_df)

In [None]:
import time

import lightgbm as lgb
import numpy as np
import pandas as pd
import wandb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import TimeSeriesSplit


class Trainer:
    """LightGBM, XGBoost, RandomForest 모델을 모두 학습하는 클래스"""

    def __init__(self):
        self.model_defs = {
            "lightgbm": lgb.LGBMClassifier(n_estimators=100, random_state=42, verbose=-1),
            "randomforest": RandomForestClassifier(n_estimators=100, random_state=42),
            "xgboost": xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric="mlogloss"),
        }

    def train_all_models(self, X_train: pd.DataFrame, y_train: pd.Series) -> dict:
        """모든 모델 학습 + CV 결과 및 메타데이터 반환"""
        tscv = TimeSeriesSplit(n_splits=2)
        trained_models = {}

        for model_name, model in self.model_defs.items():
            print(f"Training model: {model_name}")
            start_time = time.time()
            cv_scores = []

            # Cross Validation
            for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train)):
                X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

                model.fit(X_tr, y_tr)
                preds = model.predict(X_val)
                f1 = f1_score(y_val, preds, average="macro")

                wandb.log({f"{model_name}_fold{fold + 1}_f1": f1})
                cv_scores.append(f1)

            cv_mean = np.mean(cv_scores)
            cv_std = np.std(cv_scores)

            print(f"{model_name} - CV Mean F1: {cv_mean:.4f}, Std: {cv_std:.4f}")

            # 전체 데이터로 재학습
            model.fit(X_train, y_train)

            elapsed_time = round(time.time() - start_time, 2)

            wandb.log(
                {
                    f"{model_name}_cv_mean_f1": cv_mean,
                    f"{model_name}_cv_std_f1": cv_std,
                    f"{model_name}_train_time_sec": elapsed_time,
                }
            )

            # 모델 + 메타데이터 저장
            trained_models[model_name] = {
                "model": model,
                "cv_scores": cv_scores,
                "cv_mean": cv_mean,
                "cv_std": cv_std,
                "train_time_sec": elapsed_time,
                "model_type": type(model).__name__,
                "retrained": True,
            }

        return trained_models

In [None]:
trainer = Trainer()
trained_models = trainer.train_all_models(X_train, y_train)