# Notebook 기본 세팅

In [2]:
# Constant 선언

# 프로젝트 루트 디렉토리를 식별하기 위한 마커 파일 이름
ROOT_MARKER = "pyproject.toml"

# 한글 표시를 위한 나눔바른고딕 폰트 파일 이름
# matplotlib 의 font_manager 에 실제 폰트 파일의 위치를 넣어주어야 한다.
KOREAN_FONT_FILE = "NanumBarunGothic.ttf"

# matplotlib 에서는 font-family 의 이름으로 font 를 설정한다.
# 그래서 font 파일 그 자체가 아니라, 그 파일의 family 이름을 적어준다.
KOREAN_FONT_FAMILY = "NanumBarunGothic"

# 참고
# Font Family 와 Font File 의 차이는,
# Font Family 는 비슷한 디자인 특성을 공유하는 글꼴 그룹을 의미한다.
#
# 예를 들어 '나눔바른고딕' 폰트 패밀리는 일반(Regular), 굵게(Bold), 기울임(Italic) 등 여러 스타일을 포함할 수 있다.
# 반면, 폰트 파일(.ttf, .otf 등)은 이러한 폰트의 하나의 스타일이 저장된 실제 파일이다.
#
# 이 프로젝트에서는 폰트 용량을 줄이기 위해 일반(Regular) 인 NanumBarunGothic.ttf 만 사용한다.

In [3]:
# 프로젝트 root 를 sys.path 에 추가해서 import 구문을 사용하기 쉽게
from pathlib import Path


def find_project_root() -> Path:
    """
    pyproject.toml 파일을 기준으로 루트 디렉토리를 찾는다.
    :return: Path: 프로젝트 루트 디렉토리 경로
    """

    current_path = Path().resolve()

    while current_path != current_path.parent:
        if (current_path / ROOT_MARKER).exists():
            return current_path

        current_path = current_path.parent

    raise FileNotFoundError("프로젝트 루트 디렉토리를 찾을 수 없습니다.")


ROOT_DIR = find_project_root()

In [4]:
# matplotlib 의 한글 font 설정
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt


FONTS_DATA_DIR = ROOT_DIR / "notebooks" / "fonts"


def setup_korean_font():
    font_path = FONTS_DATA_DIR / KOREAN_FONT_FILE
    fm.fontManager.addfont(font_path)

    # 폰트 설정
    plt.rcParams["font.family"] = KOREAN_FONT_FAMILY
    plt.rcParams["axes.unicode_minus"] = False


setup_korean_font()

# Data Pipeline Sample

In [5]:
import pandas as pd

df = pd.read_csv("../weather_data1.csv")
print(df.head(3))
print(df['stnNm'].unique())
print(df.columns)

  df = pd.read_csv("../weather_data1.csv")


   stnId stnNm          tm  avgTa  minTa  minTaHrmt  maxTa  maxTaHrmt  \
0     90    속초  2020-01-01    0.8   -3.4       24.0    4.8     1330.0   
1     90    속초  2020-01-02    2.8   -2.1       37.0    6.6     1428.0   
2     90    속초  2020-01-03    4.7    0.8      638.0    8.5     1256.0   

   mi10MaxRn  mi10MaxRnHrmt  ...  avgM05Te  avgM10Te  avgM15Te  avgM30Te  \
0        NaN            NaN  ...       NaN       NaN       NaN       NaN   
1        NaN            NaN  ...       NaN       NaN       NaN       NaN   
2        NaN            NaN  ...       NaN       NaN       NaN       NaN   

   avgM50Te  sumLrgEv  sumSmlEv  n99Rn  iscs  sumFogDur  
0       NaN       NaN       NaN    NaN   NaN        NaN  
1       NaN       NaN       NaN    NaN   NaN        NaN  
2       NaN       NaN       NaN    NaN   NaN        NaN  

[3 rows x 62 columns]
['속초' '북춘천' '철원' '동두천' '파주' '대관령' '춘천' '백령도' '북강릉' '강릉' '동해' '서울' '인천'
 '원주' '울릉도' '수원' '영월' '충주' '서산' '울진' '청주' '대전' '추풍령' '안동' '상주' '포항' '군산'
 '대

## 지점별 데이터를 한 개의 DataFrame 로 병합하기

In [6]:
total_df = df.copy()
total_df

Unnamed: 0,stnId,stnNm,tm,avgTa,minTa,minTaHrmt,maxTa,maxTaHrmt,mi10MaxRn,mi10MaxRnHrmt,...,avgM05Te,avgM10Te,avgM15Te,avgM30Te,avgM50Te,sumLrgEv,sumSmlEv,n99Rn,iscs,sumFogDur
0,90,속초,2020-01-01,0.8,-3.4,24.0,4.8,1330.0,,,...,,,,,,,,,,
1,90,속초,2020-01-02,2.8,-2.1,37.0,6.6,1428.0,,,...,,,,,,,,,,
2,90,속초,2020-01-03,4.7,0.8,638.0,8.5,1256.0,,,...,,,,,,,,,,
3,90,속초,2020-01-04,4.0,1.1,2400.0,8.1,1353.0,,,...,,,,,,,,,,
4,90,속초,2020-01-05,3.1,-0.6,547.0,8.8,1350.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99578,235,보령,2024-12-27,0.4,-2.1,739.0,3.8,1554.0,,,...,,,,,,0.9,1.3,,,
99579,235,보령,2024-12-28,-0.5,-2.8,750.0,1.8,1552.0,,,...,,,,,,0.7,1.0,,,
99580,235,보령,2024-12-29,3.5,-0.7,742.0,8.2,1346.0,,,...,,,,,,1.4,1.9,,,
99581,235,보령,2024-12-30,7.9,4.8,20.0,11.5,1418.0,,,...,,,,,,1.4,2.0,,,


## 결측치 처리기

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

# WeatherDataImputer 클래스 정의
class WeatherDataImputer(BaseEstimator, TransformerMixin):
    """기상 데이터의 특정 column 에 대해서 결측치를 처리하는 class"""

    GROUP_BY_COLUMN = "stn_id"

    def __init__(self):
        self.interpolate_cols = [
            "avg_ta", "min_ta", "max_ta", "avg_ts", "avg_ps", "avg_rhm", "avg_pv",
            "avg_ws", "avg_tca", "sum_ss_hr", "avg_lmac", "max_ws", "avg_td",
            "avg_pa", "ss_dur", "sum_ss_hr"
        ]

        self.fillzero_cols = [
            "sum_rn", "mi10_max_rn", "mi10_max_rn_hrmt", "hr1_max_rn", "hr1_max_rn_hrmt"
        ]

        self.cols_to_drop = [
            "avg_cm10_te", "avg_cm20_te", "avg_cm30_te", "avg_cm5_te", "avg_m05_te",
            "avg_m10_te", "avg_m15_te", "avg_m30_te", "avg_m50_te", "dd_mefs",
            "dd_mefs_hrmt", "dd_mes", "dd_mes_hrmt", "hr1_max_icsr", "hr1_max_icsr_hrmt",
            "iscs", "n99_rn", "sum_dpth_fhsc", "sum_fog_dur", "sum_gsr", "sum_lrg_ev",
            "sum_rn_dur", "sum_sml_ev"
        ]

    def fit(self, x, y=None):
        return self

    def transform(self, x, y=None):
        result = total_df.copy()

        # drop: 없는 컬럼은 무시
        result = result.drop(self.cols_to_drop, axis=1, errors='ignore')

        # fillna(0): 존재하는 컬럼만
        for col in self.fillzero_cols:
            if col in result.columns:
                result[col] = result[col].fillna(0)

        # interpolate: 존재하는 컬럼만
        for column in self.interpolate_cols:
            if column in result.columns:
                result = self._fill_single_column_grouped(result, column)

        return result

    def fit_transform(self, x, y=None, **kwargs):
        return self.fit(x).transform(x)

    def _fill_single_column_grouped(self, df: pd.DataFrame, col: str) -> pd.DataFrame:
        result = df.copy()
        for _, group_df in result.groupby(self.GROUP_BY_COLUMN):
            group_df_sorted = group_df.sort_values("tm")
            group_df_sorted[col] = group_df_sorted[col].interpolate(method="linear")
            group_df_sorted[col] = group_df_sorted[col].bfill()
            group_df_sorted[col] = group_df_sorted[col].ffill()
            result.loc[group_df_sorted.index, col] = group_df_sorted[col]
        return result


# ✅ 결측치 처리 수행
imputer = WeatherDataImputer()
filled_df = imputer.transform(total_df)

# ✅ 결과 출력
print(filled_df)

       stnId stnNm          tm  avgTa  minTa  minTaHrmt  maxTa  maxTaHrmt  \
0         90    속초  2020-01-01    0.8   -3.4       24.0    4.8     1330.0   
1         90    속초  2020-01-02    2.8   -2.1       37.0    6.6     1428.0   
2         90    속초  2020-01-03    4.7    0.8      638.0    8.5     1256.0   
3         90    속초  2020-01-04    4.0    1.1     2400.0    8.1     1353.0   
4         90    속초  2020-01-05    3.1   -0.6      547.0    8.8     1350.0   
...      ...   ...         ...    ...    ...        ...    ...        ...   
99578    235    보령  2024-12-27    0.4   -2.1      739.0    3.8     1554.0   
99579    235    보령  2024-12-28   -0.5   -2.8      750.0    1.8     1552.0   
99580    235    보령  2024-12-29    3.5   -0.7      742.0    8.2     1346.0   
99581    235    보령  2024-12-30    7.9    4.8       20.0   11.5     1418.0   
99582    235    보령  2024-12-31    2.1   -2.7     2353.0    6.6     1416.0   

       mi10MaxRn  mi10MaxRnHrmt  ...  avgCm30Te  avgM05Te  avgM10Te  avgM15

## 날시 label(맑음, 흐림, 비, 눈) 을 추가

In [8]:
from src.data.labeler import WeatherLabeler


labeler = WeatherLabeler()
labeled_df = labeler.fit_transform(filled_df)
labeled_df

Unnamed: 0,stnId,stnNm,tm,avgTa,minTa,minTaHrmt,maxTa,maxTaHrmt,mi10MaxRn,mi10MaxRnHrmt,...,avgM05Te,avgM10Te,avgM15Te,avgM30Te,avgM50Te,sumLrgEv,sumSmlEv,n99Rn,sumFogDur,weather
0,90,속초,2020-01-01,0.8,-3.4,24.0,4.8,1330.0,,,...,,,,,,,,,,알수없음
1,90,속초,2020-01-02,2.8,-2.1,37.0,6.6,1428.0,,,...,,,,,,,,,,알수없음
2,90,속초,2020-01-03,4.7,0.8,638.0,8.5,1256.0,,,...,,,,,,,,,,알수없음
3,90,속초,2020-01-04,4.0,1.1,2400.0,8.1,1353.0,,,...,,,,,,,,,,알수없음
4,90,속초,2020-01-05,3.1,-0.6,547.0,8.8,1350.0,,,...,,,,,,,,,,알수없음
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99578,235,보령,2024-12-27,0.4,-2.1,739.0,3.8,1554.0,,,...,,,,,,0.9,1.3,,,알수없음
99579,235,보령,2024-12-28,-0.5,-2.8,750.0,1.8,1552.0,,,...,,,,,,0.7,1.0,,,알수없음
99580,235,보령,2024-12-29,3.5,-0.7,742.0,8.2,1346.0,,,...,,,,,,1.4,1.9,,,알수없음
99581,235,보령,2024-12-30,7.9,4.8,20.0,11.5,1418.0,,,...,,,,,,1.4,2.0,,,알수없음


## 이상치 처리

In [9]:
from src.data.handler import WeatherDataOutlierHandler


outlier_handler = WeatherDataOutlierHandler()
processed_df = outlier_handler.fit_transform(labeled_df)
processed_df

Unnamed: 0,stnId,stnNm,tm,avgTa,minTa,minTaHrmt,maxTa,maxTaHrmt,mi10MaxRn,mi10MaxRnHrmt,...,avgM05Te,avgM10Te,avgM15Te,avgM30Te,avgM50Te,sumLrgEv,sumSmlEv,n99Rn,sumFogDur,weather
0,90,속초,2020-01-01,0.8,-3.4,24.00,4.8,1330.0,,,...,,,,,,,,,,알수없음
1,90,속초,2020-01-02,2.8,-2.1,37.00,6.6,1428.0,,,...,,,,,,,,,,알수없음
2,90,속초,2020-01-03,4.7,0.8,638.00,8.5,1256.0,,,...,,,,,,,,,,알수없음
3,90,속초,2020-01-04,4.0,1.1,1406.75,8.1,1353.0,,,...,,,,,,,,,,알수없음
4,90,속초,2020-01-05,3.1,-0.6,547.00,8.8,1350.0,,,...,,,,,,,,,,알수없음
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99578,235,보령,2024-12-27,0.4,-2.1,739.00,3.8,1554.0,,,...,,,,,,0.9,1.3,,,알수없음
99579,235,보령,2024-12-28,-0.5,-2.8,750.00,1.8,1552.0,,,...,,,,,,0.7,1.0,,,알수없음
99580,235,보령,2024-12-29,3.5,-0.7,742.00,8.2,1346.0,,,...,,,,,,1.4,1.9,,,알수없음
99581,235,보령,2024-12-30,7.9,4.8,20.00,11.5,1418.0,,,...,,,,,,1.4,2.0,,,알수없음


## Scaling 및 Encoding

In [10]:
from src.data.transformer import WeatherDataTransformer


transformer = WeatherDataTransformer()
features = transformer.fit_transform(processed_df)
features

Unnamed: 0,stnId,stnNm,tm,avgTa,minTa,minTaHrmt,maxTa,maxTaHrmt,mi10MaxRn,mi10MaxRnHrmt,...,avgM05Te,avgM10Te,avgM15Te,avgM30Te,avgM50Te,sumLrgEv,sumSmlEv,n99Rn,sumFogDur,weather
0,90,속초,20200101,0.8,-3.4,24.00,4.8,1330.0,,,...,,,,,,,,,,0
1,90,속초,20200102,2.8,-2.1,37.00,6.6,1428.0,,,...,,,,,,,,,,0
2,90,속초,20200103,4.7,0.8,638.00,8.5,1256.0,,,...,,,,,,,,,,0
3,90,속초,20200104,4.0,1.1,1406.75,8.1,1353.0,,,...,,,,,,,,,,0
4,90,속초,20200105,3.1,-0.6,547.00,8.8,1350.0,,,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99578,235,보령,20241227,0.4,-2.1,739.00,3.8,1554.0,,,...,,,,,,0.9,1.3,,,0
99579,235,보령,20241228,-0.5,-2.8,750.00,1.8,1552.0,,,...,,,,,,0.7,1.0,,,0
99580,235,보령,20241229,3.5,-0.7,742.00,8.2,1346.0,,,...,,,,,,1.4,1.9,,,0
99581,235,보령,20241230,7.9,4.8,20.00,11.5,1418.0,,,...,,,,,,1.4,2.0,,,0


In [11]:
features.to_csv("prepared_weather_df.csv", index=False)