In [1]:
import os
import sys
import pandas as pd
import numpy as np
from datetime import datetime
from pathlib import Path
import argparse
import yaml
import logging
from typing import Dict, List, Tuple, Any


In [2]:
from src.data.data_loader import DataLoader
from src.data.preprocessor import DataPreprocessor
from src.models.model_factory import ModelFactory
from src.training.trainer import ClusterTrainer
from src.evaluation.evaluator import ModelEvaluator
from src.utils.logger import setup_logging
from src.utils.config import Config
from src.utils.file_manager import FileManager

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


In [3]:
config = Config("config/config.yaml")
config

<src.utils.config.Config at 0x1664a3b0250>

In [4]:
data_loader = DataLoader(config)

# train_df, test_df, submission_df = self._load_data()
train_df, test_df, submission_df = data_loader.load()

print("Train DataFrame shape:", train_df.shape)
print("Test DataFrame shape:", test_df.shape)
print("Submission DataFrame shape:", submission_df.shape)

print(train_df.head())

Train DataFrame shape: (499301, 11)
Test DataFrame shape: (10, 11)
Submission DataFrame shape: (0, 2)
           tm branch_id    ta     wd   ws  rn_day  rn_hr1    hm    si  ta_chi  \
1  2021010101         A -10.1   78.3  0.5     0.0     0.0  68.2 -99.0    -8.2   
2  2021010102         A -10.2   71.9  0.6     0.0     0.0  69.9 -99.0    -8.6   
3  2021010103         A -10.0  360.0  0.0     0.0     0.0  69.2 -99.0    -8.8   
4  2021010104         A  -9.3  155.9  0.5     0.0     0.0  65.0 -99.0    -8.9   
5  2021010105         A  -9.0   74.3  1.9     0.0     0.0  63.5 -99.0    -9.2   

   heat_demand  
1          281  
2          262  
3          266  
4          285  
5          283  


In [5]:
# train_df, test_df = self._add_cluster_ids(train_df, test_df)

branch_to_cluster = {}
for cluster_id, branches in config.cluster.mapping.items():
    for branch in branches:
        branch_to_cluster[branch] = cluster_id

train_df['cluster_id'] = train_df['branch_id'].map(branch_to_cluster)
test_df['cluster_id'] = test_df['branch_id'].map(branch_to_cluster)

unmapped_train = train_df[train_df['cluster_id'].isna()]
unmapped_test = test_df[test_df['cluster_id'].isna()]

print(train_df.head())
print(unmapped_train)
print(unmapped_test)

           tm branch_id    ta     wd   ws  rn_day  rn_hr1    hm    si  ta_chi  \
1  2021010101         A -10.1   78.3  0.5     0.0     0.0  68.2 -99.0    -8.2   
2  2021010102         A -10.2   71.9  0.6     0.0     0.0  69.9 -99.0    -8.6   
3  2021010103         A -10.0  360.0  0.0     0.0     0.0  69.2 -99.0    -8.8   
4  2021010104         A  -9.3  155.9  0.5     0.0     0.0  65.0 -99.0    -8.9   
5  2021010105         A  -9.0   74.3  1.9     0.0     0.0  63.5 -99.0    -9.2   

   heat_demand  cluster_id  
1          281           2  
2          262           2  
3          266           2  
4          285           2  
5          283           2  
Empty DataFrame
Columns: [tm, branch_id, ta, wd, ws, rn_day, rn_hr1, hm, si, ta_chi, heat_demand, cluster_id]
Index: []
Empty DataFrame
Columns: [tm, branch_id, ta, wd, ws, rn_day, rn_hr1, hm, si, ta_chi, heat_demand, cluster_id]
Index: []


In [6]:
# _train_clusters(train_df, test_df, selected_clusters, predict=predict)

In [7]:
all_valid_true = []
all_valid_pred = []

cluster_id = 2 # 클래스터 2에 대한 것

train_cluster = train_df[train_df['cluster_id'] == cluster_id].copy()
test_cluster = test_df[test_df['cluster_id'] == cluster_id].copy()


# cluster_trainer = ClusterTrainer(config, cluster_id, 1234)
experiment_id = 1234
preprocessor = DataPreprocessor(config)
model_factory = ModelFactory(config)
evaluator = ModelEvaluator(config)
file_manager = FileManager(config)

models = {}
predictions = {}
metrics = {}

# result = cluster_trainer.train_and_predict(train_cluster, test_cluster, predict=predict)
# processed_data = self._preprocess_data(train_data, test_data)

train_data = train_df.replace(-99, np.nan).dropna(subset=[config.data.target_column])
test_data = test_df.replace(-99, np.nan).dropna(subset=[config.data.target_column])

from sklearn.model_selection import train_test_split
target_col = config.data.target_column
X = train_data.drop(columns=[target_col])
y = train_data[target_col]

x_train, x_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=config.split.test_size,
    random_state=config.split.random_state
)

split_data = {
    'x_train': x_train,
    'y_train': y_train,
    'x_valid': x_valid,
    'y_valid': y_valid,
    'x_full': train_data.drop(columns=[target_col]),
    'y_full': train_data[target_col]
}

test_features = test_data.drop(columns=["target"], errors="ignore")
test_keys = test_data[["tm", "branch_id"]].copy()

print(f"split_data['x_train'] \n{split_data['x_train'].head()}")
print(f"split_data['y_train'] \n{split_data['y_train'].head()}")
print(f"split_data['x_valid'] \n{split_data['x_valid'].head()}")
print(f"split_data['y_valid'] \n{split_data['y_valid'].head()}")

split_data['x_train'] 
                tm branch_id    ta     wd   ws  rn_day  rn_hr1    hm    si  \
265654  2021043008         K   8.8  130.7  0.1    19.5     0.0  91.1  0.29   
129140  2023092900         E  18.4    NaN  NaN     0.0     0.0  91.7   NaN   
382048  2022081306         O  24.7    9.5  0.5     0.0     0.0  84.3  0.00   
215258  2021072910         I  27.9  307.2  0.8     0.0     0.0  68.8  1.47   
79148   2021011323         D   1.7  290.3  0.4     0.0     0.0   NaN   NaN   

        ta_chi  cluster_id  
265654     8.8           0  
129140    21.2           0  
382048    26.8           0  
215258    31.0           0  
79148      1.1           2  
split_data['y_train'] 
265654     33.0
129140     13.0
382048      9.0
215258     48.0
79148     378.0
Name: heat_demand, dtype: float64
split_data['x_valid'] 
                tm branch_id    ta     wd   ws  rn_day  rn_hr1    hm    si  \
385273  2022122515         O   0.7  287.7  2.4     0.0     0.0  51.2  1.65   
122365  2022122017

In [8]:
print("split_data['x_train'] 컬럼 목록:", split_data['x_train'].columns.tolist())


split_data['x_train'] 컬럼 목록: ['tm', 'branch_id', 'ta', 'wd', 'ws', 'rn_day', 'rn_hr1', 'hm', 'si', 'ta_chi', 'cluster_id']


In [10]:
processed_train = preprocessor.fit_transform(split_data['x_train'])
processed_valid = preprocessor.transform(split_data['x_valid'])

✅ wd 이동평균 계산 완료
✅ ws 이동평균 계산 완료
✅ rn_hr1 이동평균 계산 완료
✅ rn_day 이동평균 계산 완료
(99856, 12)
✅ wd 이동평균 계산 완료
✅ ws 이동평균 계산 완료
✅ rn_hr1 이동평균 계산 완료
✅ rn_day 이동평균 계산 완료
(99856, 40)
(99856, 40)
(99856, 40)


In [13]:
print(processed_train.shape)
print(processed_valid.shape)

(399422, 47)
(99856, 47)


In [15]:
processed_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
branch_id,399422.0,9.001502,5.477988,0.0,4.0,9.0,14.0,18.0
ta,389040.0,13.544972,10.858695,-19.7,5.2,14.7,22.7,37.8
wd,383059.0,206.603506,104.619329,0.0,119.5,221.7,297.5,360.0
ws,384305.0,1.440735,1.18645,0.0,0.5,1.2,2.1,11.9
rn_day,384532.0,1.927976,9.697678,0.0,0.0,0.0,0.0,326.5
rn_hr1,384107.0,0.154593,1.234645,0.0,0.0,0.0,0.0,92.5
hm,367678.0,66.156798,21.143092,2.5,50.3,67.6,84.0,100.0
si,213055.0,1.091673,0.972342,0.0,0.21,0.85,1.82,3.92
ta_chi,399410.0,13.869262,11.871483,-24.5,4.4,15.0,24.3,37.0
cluster_id,399422.0,1.210454,1.195771,0.0,0.0,1.0,2.0,3.0


In [16]:
processed_train.isna().sum().sort_values(ascending=False)

si_lag_25        186632
si_lag_24        186622
si_lag_2         186383
si_lag_1         186376
si               186367
hm_lag_25         32189
hm_lag_24         32170
hm_lag_2          31780
dew_point         31773
hm_lag_1          31763
hm                31744
wd_ma             19688
ws_ma             18374
rn_hr1_ma         16998
rn_day_ma         16417
wd                16363
rn_hr1            15315
ws                15117
rn_day            14890
ta_lag_25         10844
ta_lag_24         10825
ta_lag_2          10420
ta_lag_1          10401
ta                10382
ta_chi_lag_25       487
ta_chi_lag_24       468
ta_chi_lag_2         50
ta_chi_lag_1         31
ta_chi               12
branch_id             0
cluster_id            0
weekday               0
hour                  0
day                   0
year                  0
month                 0
season                0
wd_group              0
ws_group              0
hour_of_week          0
day_of_month          0
hour_sin        