In [None]:
import os
import operator
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from math import sqrt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
import warnings
warnings.filterwarnings(action='ignore')

# 시각화
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objects as go
plt.style.use(["seaborn-v0_8-muted"])
plt.rc('font', family='Malgun Gothic')
matplotlib.rc('axes',unicode_minus=False)
seaborn_style = [style for style in matplotlib.style.available if "seaborn" in style]
from tqdm import tqdm

# 모델
from pycaret.regression import *
from pycaret.regression import load_model
import joblib

from sklearnex import patch_sklearn
patch_sklearn()
from lightgbm import plot_importance
from lightgbm.sklearn import LGBMRegressor

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import IsolationForest

import optuna
from optuna.samplers import TPESampler
from functools import partial

In [None]:
import os
# Select the visible GPU *before* torch / tensorflow are imported and queried:
# CUDA reads CUDA_VISIBLE_DEVICES when it initialises, so assigning it after
# the device checks below would not affect this process. Use "-1" for CPU only.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import torch
# Check whether PyTorch can see a CUDA device
print("pytorch cuda 여부")
print(torch.cuda.is_available())
print('#'*50)

from tensorflow.python.client import device_lib
# List the devices TensorFlow can see
print("tensorflow cuda 여부")
print(device_lib.list_local_devices())
print('#'*50)

# 모델링

In [None]:
# Preprocessed feature table, indexed by its 'ymdhm' timestamp column
ppd_data = pd.read_csv('../data/ppd_data.csv').set_index("ymdhm")
ppd_data.index = pd.to_datetime(ppd_data.index)

# 2023 ground-truth table, indexed the same way
answer = pd.read_csv("../data/2023_js_answer.csv")
answer['ymdhm'] = pd.to_datetime(answer['ymdhm'])
answer = answer.set_index('ymdhm')

# Baseline model

트리기반의 모델을 이용하여 rmse와 시간을 비교하여 베이스라인 모델을 선정.<br>
Pycaret을 이용하여 간단하게 비교.

In [None]:
# tmp=ppd_data.copy()

# TARGET_BRIDGE = 'wl_jamsu'
# tmp["prev_"+TARGET_BRIDGE]=tmp[TARGET_BRIDGE].copy()
# X_train = tmp.drop(columns=[TARGET_BRIDGE]).shift(1)
# X_train = X_train.iloc[1:]
# y_train = tmp[TARGET_BRIDGE].iloc[1:]
# train=pd.concat([X_train,y_train],axis=1)

# # 모델을 선정하기위해 automl pycaret을 이용하여 선정. -> 한대교만을 기준으로 선정하였음. 
# baseline = setup(data=train, target=TARGET_BRIDGE, fold_strategy = 'kfold',fold=5, data_split_shuffle=True, use_gpu=True, verbose=False)
# baseline_res = compare_models(include = ['lightgbm','xgboost','catboost','rf'], sort='rmse')
# baseline_res

# model train & tune & inference

In [None]:
# Check whether a file exists at the given path
def check_file(path):
    """Return True when `path` refers to an existing regular file, else False."""
    file_exists = os.path.isfile(path)
    return file_exists
# Evaluation metric: RMSE divided by R^2
def metrics(y_true, y_pred):
    """
    Custom evaluation metric passed to LightGBM as `eval_metric`.

    Parameters:
    - y_true: ground-truth target values
    - y_pred: predicted target values

    Returns:
    - tuple ('score', rmse / r_squared, False); the trailing False is the
      "higher is better" flag, i.e. the score is meant to be minimised.
    """
    # np.sqrt(MSE) instead of `squared=False`: that keyword is deprecated in
    # scikit-learn 1.4 and removed in later releases.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r_squared = r2_score(y_true, y_pred)
    # NOTE(review): when r_squared <= 0 the ratio turns negative or infinite,
    # so the score is only meaningful for models with a positive R^2.
    score = rmse / r_squared
    return 'score', score, False

In [None]:
def feature_importance(leadtime,month, max_n):
    """
    Plot the feature importance of a saved, tuned LightGBM model.

    Parameters:
    - leadtime: lead time used in the model directory name
    - month: int month (file prefix becomes "month=<n>") or a ready-made
      prefix string used as-is in the file name
    - max_n: maximum number of features to show

    Returns:
    - fig: the matplotlib Figure that contains the importance plot
    """
    prefix = f"month={month}" if isinstance(month, int) else month
    # NOTE(review): joblib.load unpickles the file; only load models you trust.
    model = joblib.load(f'../model/{TARGET_BRIDGE}/leadtime({leadtime})_{PARAMS_SIZE}/tuned_model({prefix}).pkl')

    # Draw on our own axes. Without `ax=`, plot_importance opens a separate
    # figure and the figure returned from here would stay empty.
    fig, ax = plt.subplots(figsize=(10, 6))
    plot_importance(model, ax=ax, max_num_features=max_n)
    ax.set_title(f'month={month}, leadtime={leadtime} feature importance')
    return fig

def plot_compare(answer,data_combinations,titles=None):
    """
    Compare actual and predicted values for several prediction sets,
    one row of plots per set: 1. scatter plot / 2. line plot.

    Parameters:
    - answer: DataFrame holding the actual values in column TARGET_BRIDGE
    - data_combinations: sequence of DataFrames, each with a 'pred_target' column
    - titles: optional row titles; defaults to the three parameter-size labels.
      Rows beyond the supplied titles get a generic "Combination <n>" title.
    """
    if titles is None:
        titles = ['Parameter:S','Parameter:M','Parameter:L']
    num_combinations = len(data_combinations)

    # squeeze=False always returns a 2-D axes array, so a single row needs no special case
    fig, axs = plt.subplots(num_combinations, 2, figsize=(20, 7*num_combinations), sharey=True,
                            gridspec_kw={'width_ratios': [1, 2]}, squeeze=False)

    for i, pred_data in enumerate(data_combinations):
        actual = answer[TARGET_BRIDGE]
        predicted = pred_data['pred_target']

        # Error (RMSE) of this prediction set
        rmse = np.sqrt(mean_squared_error(actual, predicted))

        ax1, ax2 = axs[i, 0], axs[i, 1]
        row_title = titles[i] if i < len(titles) else f'Combination {i+1}'

        # Scatter plot: actual on x, predicted on y
        ax1.scatter(actual, predicted)
        ax1.set_xlabel('Actual Water Level')
        ax1.set_ylabel('Predicted Water Level')
        ax1.set_title(f'{row_title}')
        ax1.set_aspect('equal', adjustable='box')

        # Line plot: both series over time, with the RMSE in the top-left corner
        ax2.text(0.05, 0.95, '  {:.3f}  '.format(rmse), fontsize=15, ha='left', va='top', transform=ax2.transAxes)
        ax2.plot(answer.index, actual, label='Actual', color='blue', alpha=0.5)
        ax2.plot(answer.index, predicted, label='Predicted', color='red', linestyle='--', alpha=0.5)
        ax2.legend()
        ax2.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
def plot_month(answer_data,pred_data,len_m):
    """
    Show the prediction error month by month.

    1. line plots: actual vs predicted values, one panel per month
    2. bar plot: RMSE per month

    Parameters:
    - answer_data: actual values; must provide the TARGET_BRIDGE column
    - pred_data: predictions; must provide the 'pred_target' column
    - len_m: number of monthly panels to lay out (two per row)
    """
    used_data = pd.concat([answer_data, pred_data], axis=1)
    used_data.index = pd.to_datetime(used_data.index)

    grouped = used_data.groupby(used_data.index.month)
    year = used_data.index[0].year

    # Round the row count up so an odd number of months still gets a panel each
    n_rows = max(1, (len_m + 1) // 2)
    fig, axes = plt.subplots(nrows=n_rows, ncols=2, figsize=(20, max(len_m, 1)*2), squeeze=False)
    axes = axes.ravel()
    month_names = ["January", "February", "March", "April", "May", "June", "July", "August",
                "September", "October", "November", "December"]

    month_rmse = []
    plotted_months = []  # months that actually received a panel, in plot order

    # Line plots
    for (month, m_data), ax in zip(grouped, axes):
        actual = m_data[TARGET_BRIDGE]
        predicted = m_data['pred_target']

        rmse = np.sqrt(mean_squared_error(actual, predicted))
        month_rmse.append(rmse)
        plotted_months.append(month)

        ax.plot(m_data.index, actual, label='Actual', color='blue', alpha=0.5)
        ax.plot(m_data.index, predicted, label='Predicted', color='red', linestyle='--', alpha=0.5)
        ax.set_title(f'{TARGET_BRIDGE}{month_names[month-1]}, {year} - RMSE: {rmse:.2f}')
        ax.set_xlabel('Time')
        ax.set_ylabel('Water Level')
        ax.legend()
        ax.grid(True)

    # Hide panels that received no month
    for spare_ax in axes[len(plotted_months):]:
        spare_ax.set_visible(False)

    # Figure-level title; plt.title would overwrite the last panel's own title
    fig.suptitle(f'{TARGET_BRIDGE} PREDICT')
    fig.tight_layout()
    plt.show()

    # Bar plot; tick labels come from the months that were actually plotted,
    # so the label count always matches the bar count
    plt.figure(figsize=(10, 6))
    plt.bar(range(len(month_rmse)), month_rmse, color='skyblue', edgecolor='black')
    plt.ylabel('RMSE')
    plt.xlabel('Month')
    plt.title(f'RMSE by Month for {year}')
    plt.xticks(range(len(month_rmse)), [month_names[m-1] for m in plotted_months], rotation=45)
    plt.grid(axis='y')
    plt.tight_layout()
    plt.show()
    
# Compare the three models (monthly, flood/non-flood, total) and report which
# one has the lowest test RMSE in each month.
def finalize_modeling(dataset,leadtime,bridge,Params):
    """
    Build a per-month RMSE table for three prediction sets.

    Parameters:
    - dataset: sequence of three prediction DataFrames indexed by timestamp
      (order: monthly, flood/non-flood, total)
    - leadtime: lead time, used only in the printed report header
    - bridge: bridge name; actual values are read from the module-level
      `answer` frame, column 'wl_' + bridge
    - Params: parameter-size label, used only in the printed report header

    Returns:
    - rmse_best: DataFrame indexed by month 1..9 with columns
      ['Monthly', 'Flood', 'Total', 'Best Model']
    - overall_rmse: root of the mean of the squared best-model RMSE values.
      Months are weighted equally, not by sample count. NaN when no month
      could be scored.
    """
    print("#" * 50)
    print(f"{bridge} Predict report for Leadtime {leadtime}-{Params}:\n")

    months = range(1, 10)
    rmse_best = pd.DataFrame(index=months, columns=['Model 1', 'Model 2', 'Model 3', 'Best Model'])
    target = answer['wl_'+bridge]

    # Month of every prediction row, computed once per model. The caller's
    # frames are left untouched (no in-place index replacement).
    model_months = [pd.to_datetime(model.index).month for model in dataset]

    for month in months:
        best_rmse, best_model_name = float('inf'), None
        answer_month = target[target.index.month == month]
        for i, model in enumerate(dataset):
            model_month = model[model_months[i] == month]

            if not model_month.empty and not answer_month.empty:
                rmse = np.sqrt(mean_squared_error(answer_month, model_month))
                rmse_best.loc[month, f'Model {i+1}'] = rmse

                # Strict '<' keeps the earlier model on ties
                if rmse < best_rmse:
                    best_rmse, best_model_name = rmse, f'Model {i+1}'

        rmse_best.loc[month, 'Best Model'] = best_model_name

    # Skip months without any scored model instead of failing on a missing key
    best_model_rmse = [
        rmse_best.loc[month, best_model]
        for month, best_model in rmse_best['Best Model'].items()
        if pd.notna(best_model)
    ]
    if best_model_rmse:
        overall_rmse = np.sqrt(np.mean(np.square(np.asarray(best_model_rmse, dtype=float))))
    else:
        overall_rmse = float('nan')
    rmse_best.columns=['Monthly','Flood','Total','Best Model']

    return rmse_best,overall_rmse


In [None]:
def prepare_datasets(input_data, month_list):
    """
    Build lagged training features and the matching targets.

    Features are every column except TARGET_BRIDGE, shifted down by
    LEADTIME // 10 rows (presumably one row per 10 minutes - confirm against
    the data), so each target row is paired with the features observed
    LEADTIME minutes earlier. Both globals are read at call time.

    Parameters:
    - input_data: training frame containing TARGET_BRIDGE and a 'month' column
    - month_list: months to keep, matched against the (shifted) 'month' feature

    Returns:
    - X_train: lagged feature rows whose 'month' is in month_list
    - y_train: target values at the same index labels
    """
    lag_steps = LEADTIME//10

    # Shift first, then cut the leading rows that have no lagged features
    features = input_data.drop(columns=[TARGET_BRIDGE]).shift(lag_steps).iloc[lag_steps:]
    target = input_data[TARGET_BRIDGE].iloc[lag_steps:]

    in_requested_months = features['month'].isin(month_list)
    features = features[in_requested_months]
    target = target[features.index]

    return features, target

In [None]:
def make_base_model(train_data):
    """
    Fit a baseline LightGBM regressor.

    Behaviour depends on the module-level MODE:
    - MODE == 'train': fit with fixed hyper-parameters on an 80/20 split and
      print the validation score.
    - otherwise: reuse the hyper-parameters of the tuned lead-time-10 model
      saved on disk and refit on the full training data.

    Parameters:
    - train_data: pair (features, target)

    Returns:
    - model: fitted LGBMRegressor
    """
    features, target = train_data[0], train_data[1]

    if MODE != 'train':
        # Reuse the parameters found while tuning the 10-minute lead-time model
        prev_model = joblib.load(f'../model/{TARGET_BRIDGE}/leadtime(10)_{PARAMS_SIZE}/tuned_model({PRIFIX}).pkl')
        model = LGBMRegressor(**prev_model.get_params())
        model.fit(features, target)
        return model

    params = dict(
        num_leaves=1023,
        max_depth=15,
        learning_rate=0.23,
        n_estimators=50,
        min_child_samples=10,
        reg_lambda=7,
        colsample_bytree=0.5,
    )

    X_train, X_valid, y_train, y_valid = train_test_split(features, target, test_size=0.2, random_state=624)

    model = LGBMRegressor(**params,random_state=624,objective= 'rmse',device='gpu',verbose=-1)
    model.fit(X_train,y_train, eval_set=[(X_valid, y_valid)], eval_metric=metrics)

    valid_pred = model.predict(X_valid, num_iteration=model.best_iteration_)
    valid_score = metrics(y_valid, valid_pred)[1]
    print(f'Valid Score: {valid_score}')

    return model

In [None]:
class EarlyStoppingCallback(object):
    """
    Optuna callback that stops a study once the best value has not improved
    for `early_stopping_rounds` consecutive trials.

    Adapted from https://github.com/optuna/optuna/issues/1001.
    Use direction="minimize" when a lower objective is better and
    direction="maximize" when a higher objective is better.

    Raises:
    - ValueError: if `direction` is neither "minimize" nor "maximize"
    """

    def __init__(self, early_stopping_rounds: int, direction: str = "minimize") -> None:
        self.early_stopping_rounds = early_stopping_rounds
        self._iter = 0  # consecutive trials without improvement
        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            # Raise the error; merely constructing it would leave the object
            # without _operator/_score and fail later inside __call__.
            raise ValueError(f"invalid direction: {direction}")

    def __call__(self, study: "optuna.Study", trial: "optuna.Trial") -> None:
        """Do early stopping."""
        if self._operator(study.best_value, self._score):
            # New best value: reset the patience counter
            self._iter = 0
            self._score = study.best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()
            
def weighted_mse(alpha=1):
    """
    Build an asymmetric mean-squared-error function.

    The returned callable takes (label, pred) and averages the squared
    residuals, multiplying by `alpha` wherever label - pred is positive
    (i.e. where the prediction is below the label).
    """
    def weighted_mse_fixed(label, pred):
        err = (label - pred).astype("float")
        squared = err**2
        # Under-predictions carry weight alpha, everything else weight 1
        return np.mean(np.where(err > 0, alpha * squared, squared))

    return weighted_mse_fixed

# Optuna objective function
def objective(trial, X, y):
    """
    Score one Optuna trial: sample LightGBM hyper-parameters from the search
    space selected by the module-level PARAMS_SIZE ('S', 'M' or 'L'), run a
    shuffled 5-fold cross-validation and return the mean weighted MSE.

    Parameters:
    - trial: Optuna trial used to sample the hyper-parameters
    - X: feature DataFrame (indexed positionally via .iloc)
    - y: target Series (indexed positionally via .iloc)

    Returns:
    - mean of the per-fold weighted_mse() losses
    """
    # Bounds per size, as (low, high) pairs in this order:
    # num_leaves, max_depth, learning_rate, n_estimators,
    # min_child_samples, reg_lambda, colsample_bytree
    search_space = {
        'S': (10, 50, 3, 15, 0.01, 0.1, 50, 200, 5, 20, 0.1, 1.0, 0.6, 1.0),
        'M': (20, 70, 5, 20, 0.005, 0.1, 100, 300, 10, 30, 0.05, 0.5, 0.5, 1.0),
        'L': (30, 100, 8, 25, 0.001, 0.1, 150, 400, 15, 40, 0.01, 0.3, 0.4, 1.0),
    }
    (leaves_lo, leaves_hi,
     depth_lo, depth_hi,
     lr_lo, lr_hi,
     est_lo, est_hi,
     child_lo, child_hi,
     lam_lo, lam_hi,
     col_lo, col_hi) = search_space[PARAMS_SIZE]

    params_optuna = {
        "num_leaves": trial.suggest_int('num_leaves', leaves_lo, leaves_hi),
        "max_depth": trial.suggest_int('max_depth', depth_lo, depth_hi),
        "learning_rate": trial.suggest_float('learning_rate', lr_lo, lr_hi),
        "n_estimators": trial.suggest_int('n_estimators', est_lo, est_hi),
        "min_child_samples": trial.suggest_int('min_child_samples', child_lo, child_hi),
        "reg_lambda": trial.suggest_float('reg_lambda', lam_lo, lam_hi),
        "colsample_bytree": trial.suggest_float('colsample_bytree', col_lo, col_hi),
        "device": 'gpu',
    }

    # One estimator object, refit on every fold
    model = LGBMRegressor(**params_optuna,verbose=-1)

    splitter = KFold(n_splits=5, random_state=624, shuffle=True)
    fold_losses = []

    for fit_idx, hold_idx in splitter.split(X, y):
        X_fit, y_fit = X.iloc[fit_idx, :], y.iloc[fit_idx]
        X_hold, y_hold = X.iloc[hold_idx, :], y.iloc[hold_idx]

        model.fit(X_fit, y_fit, eval_set=[(X_hold, y_hold)], eval_metric=metrics)
        hold_pred = model.predict(X_hold)

        fold_losses.append(weighted_mse()(y_hold, hold_pred))

    return np.mean(fold_losses)

def make_tune_model(train_data):
    """
    Tune LightGBM hyper-parameters with Optuna and refit on all training data.

    The study is stored in study_record/<EXP_NAME>.sqlite3 and resumed if it
    already exists. It runs for at most EPOCH trials and stops early after
    15 trials without improvement.

    Parameters:
    - train_data: pair (features, target)

    Returns:
    - best_model: LGBMRegressor fitted with the best parameters found
    """
    # partial binds the extra arguments, since Optuna calls the objective with the trial only
    opt_func = partial(objective, X=train_data[0], y=train_data[1])

    sampler = TPESampler(seed=624)

    # SQLite cannot create missing parent directories, so make sure it exists
    os.makedirs("study_record", exist_ok=True)

    study = optuna.create_study(direction="minimize", # lower objective value is better
                                sampler=sampler,
                                study_name=f"{EXP_NAME}",
                                storage=f"sqlite:///study_record/{EXP_NAME}.sqlite3",
                                load_if_exists=True)
    early_stopping = EarlyStoppingCallback(15, direction='minimize')
    study.optimize(opt_func, n_trials=EPOCH, callbacks=[early_stopping])

    print("Tuned train Score: %.4f" % study.best_value) # best objective value
    print("Tuned params: ", study.best_trial.params) # hyper-parameters of the best trial

    best_params = study.best_params
    best_model = LGBMRegressor(**best_params,random_state=624,objective= 'rmse',device='gpu', verbose=-1)
    best_model.fit(train_data[0], train_data[1])

    return best_model

In [None]:
def get_model_input(data):
    """
    Select the rows used as prediction input for 2023.

    Keeps every row from LEADTIME minutes before 2023-01-01 00:00 onwards
    (the window start depends on the lead time) and removes the
    TARGET_BRIDGE column. Returns a new frame; `data` is not modified.
    """
    window_start = (datetime(2023, 1, 1) - timedelta(minutes=LEADTIME)).strftime('%Y-%m-%d %H:%M:%S')

    from_window_start = data.index >= window_start
    return data[from_window_start].drop([TARGET_BRIDGE],axis=1)

def make_pred(data, model, month_list):
    """
    Predict the whole of 2023 with `model`, then keep only the months this
    model is responsible for.

    Parameters:
    - data: full feature frame (the TARGET_BRIDGE column is dropped inside)
    - model: fitted regressor exposing predict()
    - month_list: months whose predictions are returned

    Returns:
    - DataFrame indexed like the module-level `answer` frame with columns
      'pred_target' and 'month', restricted to month_list
    """
    input_data = get_model_input(data)
    y_pred = model.predict(input_data,verbose=0)

    # The last LEADTIME//10 input rows would predict beyond the end of the
    # data, so they are dropped. Slice by an explicit end position: a plain
    # `[:-steps]` returns an empty array when steps is 0.
    steps = LEADTIME//10
    y_pred = y_pred[:len(y_pred) - steps]

    y_pred=pd.DataFrame(y_pred,index=answer.index,columns=['pred_target'])
    y_pred.index=pd.to_datetime(y_pred.index)
    y_pred['month']=y_pred.index.month
    y_pred=y_pred[y_pred['month'].isin(month_list)]

    return y_pred

In [None]:
def predict_wl_level(train_data,data,month_list):
    """
    Train (or load) the model for the given months and predict 2023.

    Reads the module-level settings TARGET_BRIDGE, PRIFIX, LEADTIME,
    PARAMS_SIZE, MODE and SAVE.

    - No saved model, or MODE == 'test': fit the base model; in 'train' mode
      additionally run Optuna tuning. Save the model when SAVE is set.
    - Saved model exists and MODE is not 'test': load it from disk.

    Parameters:
    - train_data: training frame (data before 2023)
    - data: full frame used to build the prediction input
    - month_list: months handled by this model

    Returns:
    - pred: prediction frame produced by make_pred()
    """
    model_path = f"../model/{TARGET_BRIDGE}/leadtime({LEADTIME})_{PARAMS_SIZE}/tuned_model({PRIFIX}).pkl"

    print("#"*50)
    print(f"{TARGET_BRIDGE} {PRIFIX}({LEADTIME}) predict start")
    monthly_train= prepare_datasets(train_data,month_list)
    if (not check_file(model_path)) or (MODE=='test'):
        model=make_base_model(monthly_train)
        if(MODE=='train'):model=make_tune_model(monthly_train)
        if SAVE:joblib.dump(model, model_path)
    else:
        # A tuned model already exists: reuse it. Without this branch `model`
        # was never assigned and make_pred failed with an unbound local.
        model=joblib.load(model_path)
    pred=make_pred(data,model,month_list)
    if SAVE:pred.to_csv(f'../result/{TARGET_BRIDGE}/leadtime({LEADTIME})_{PARAMS_SIZE}/(lt={LEADTIME},{PRIFIX})_predict.csv')
    print(f"{TARGET_BRIDGE} {PRIFIX}({LEADTIME}) predict end")
    return pred

def train(data,leadtime,model_num,params_size,mode,save=True):
    """
    Train and predict with one of the three modelling strategies.

    Parameters:
    - data: full feature frame with a DatetimeIndex and the TARGET_BRIDGE column
    - leadtime: lead time in minutes
    - model_num: 1 = one model per month (months 1-9),
                 2 = separate non-flood / flood season models,
                 3 = a single model for all months
    - params_size: hyper-parameter search-space size ('S', 'M' or 'L')
    - mode: 'train' runs Optuna tuning after a confirmation prompt;
            other values reuse previously tuned parameters
    - save: whether models and prediction CSVs are written to disk

    Returns:
    - final_result: DataFrame of 2023 predictions ('pred_target'), or None
      when the user answers 0 at the confirmation prompt in 'train' mode

    Raises:
    - ValueError: if model_num is not 1, 2 or 3

    Side effects: sets the module-level EXP_NAME, PRIFIX, LEADTIME, MODEL_NUM,
    PARAMS_SIZE, MODE and SAVE that the helper functions read.
    """
    global EXP_NAME
    global PRIFIX
    global LEADTIME
    global MODEL_NUM
    global PARAMS_SIZE
    global MODE
    global SAVE

    # Fail early with a clear message instead of an opaque concat error later
    if model_num not in (1, 2, 3):
        raise ValueError(f"model_num must be 1, 2 or 3, got {model_num!r}")

    LEADTIME,MODEL_NUM,PARAMS_SIZE,MODE,SAVE=leadtime,model_num,params_size,mode,save

    if(MODE=='train'):
        check_task=int(input(f"Check Option \n PARAMS_SIZE={PARAMS_SIZE}, MODEL_NUM={MODEL_NUM}, 선행시간={LEADTIME}분, 타겟={TARGET_BRIDGE},MODE={MODE} \n IF want continue enter 1 else 0"))
        if(check_task==0):
            print("Check the option")
            return
    os.makedirs(f"../model/{TARGET_BRIDGE}/leadtime({LEADTIME})_{PARAMS_SIZE}",exist_ok=True)
    os.makedirs(f"../result/{TARGET_BRIDGE}/leadtime({LEADTIME})_{PARAMS_SIZE}",exist_ok=True)

    # Add the previous target value as a feature on a new frame, so the
    # caller's DataFrame is not modified.
    data = data.assign(**{"prev_"+TARGET_BRIDGE: data[TARGET_BRIDGE]})
    # Strictly before 2023: label slicing (data[:"2023-01-01 00:00:00"]) is
    # inclusive and would put the first evaluation row into the training set.
    train_data = data[data.index < pd.Timestamp("2023-01-01 00:00:00")]

    print("#"*100)
    print("#"*100)
    print(f"{TARGET_BRIDGE} Leadtime {LEADTIME} modeling start")

    # (label used in the study name, months to train on, file-name prefix)
    if(MODEL_NUM==1):
        # One model per month, results concatenated
        plans=[(str(month),[month],f"month={month}") for month in range(1,10)]
    elif(MODEL_NUM==2):
        # Non-flood season and flood season trained separately, then concatenated
        plans=[('비홍수기',[1,2,3,4,5,10,11,12],'비홍수기'),
               ('홍수기',[6,7,8,9],'홍수기')]
    else:
        # A single model trained on every month
        plans=[('전체',[1,2,3,4,5,6,7,8,9,10,11,12],'전체')]

    tmp_result=[]
    for label,months,prefix in plans:
        EXP_NAME=f"{TARGET_BRIDGE}(lt={LEADTIME},m={label},PARAMS_SIZE={PARAMS_SIZE})"
        PRIFIX=prefix
        tmp_result.append(predict_wl_level(train_data,data,months))

    final_result = pd.concat(tmp_result).drop(columns=['month'])

    return final_result

## 잠수교 모델학습 및 추론(2023 1~9월)

In [None]:
# Name of the target column; also used in the model/result directory paths
TARGET_BRIDGE='wl_jamsu'
# Maximum number of Optuna trials per study (passed as n_trials in make_tune_model)
EPOCH=100

class LeadtimeModel:
    """
    Predictions and scores for one lead time.

    For each parameter size ('S', 'M', 'L') the attribute model_<size> holds a
    list of three prediction frames [monthly, flood/non-flood, total], indexed
    by timestamp with a 'pred_target' column. score_<size> is the per-month
    RMSE table and best_score_<size> the overall RMSE, both from
    finalize_modeling().
    """

    def __init__(self, data, leadtime, mode='test'):
        """
        data=input data
        leadtime=predict leadtime
        mode= 'train' also runs Optuna hyper-parameter tuning; 'test' loads the
              previously tuned parameters and fits with them
        """
        self.data=data
        self.leadtime = leadtime
        self.mode=mode
        # Load saved prediction results when they exist
        try:
            self.model_S=self.load('S')
            self.model_M=self.load('M')
            self.model_L=self.load('L')
        # Train only when result files are missing; other errors are not hidden
        except FileNotFoundError:
            self.train()
        else:
            # Scores are needed after loading as well, not only after training
            self._update_scores()

    # Builds three models (monthly, seasonal, total) per parameter size, nine in total.
    # Training reuses the hyper-parameters found while tuning lead time 10.
    def train(self,save=True):
        """
        save= True writes models and CSVs to disk; False only returns the results.
        """
        self.model_S=self.train_model('S',save)
        self.model_M=self.train_model('M',save)
        self.model_L=self.train_model('L',save)
        self._update_scores()

    def _update_scores(self):
        """Recompute score_* / best_score_* from the current model_* lists."""
        self.score_S,self.best_score_S=self.show_score(self.model_S,'S')
        self.score_M,self.best_score_M=self.show_score(self.model_M,'M')
        self.score_L,self.best_score_L=self.show_score(self.model_L,'L')

    def train_model(self,params_size,save):
        """Run the three strategies for one parameter size; returns [monthly, flood, total]."""
        monthly = train(self.data,leadtime=self.leadtime, model_num=1, params_size=params_size,mode=self.mode,save=save)
        flood = train(self.data,leadtime=self.leadtime, model_num=2, params_size=params_size,mode=self.mode,save=save)
        total = train(self.data,leadtime=self.leadtime, model_num=3, params_size=params_size,mode=self.mode,save=save)

        lt_list = [monthly, flood, total]

        return lt_list

    # Load the saved predictions of each strategy when results already exist
    def load(self,params_size):
        """
        Read the saved prediction CSVs for one parameter size.

        Returns [monthly, flood, total] in the same layout train() produces:
        indexed by 'ymdhm' with a single 'pred_target' column.

        Raises FileNotFoundError when a result file is missing.
        """
        # Same directory layout predict_wl_level() writes to
        result_dir=f"../result/{TARGET_BRIDGE}/leadtime({self.leadtime})_{params_size}"

        def read_pred(prefix,months=None):
            # Read one prediction file, optionally keeping only the given months
            frame=pd.read_csv(f'{result_dir}/(lt={self.leadtime},{prefix})_predict.csv')
            frame['ymdhm'] = pd.to_datetime(frame['ymdhm'])
            if months is not None:
                frame = frame[frame['ymdhm'].dt.month.isin(months)]
            return frame

        def finish(frame):
            # Order by time, drop the helper column, index by timestamp
            return frame.sort_values(by='ymdhm').drop(columns=['month']).set_index('ymdhm')

        monthly = pd.concat([read_pred(f"month={month}",[month]) for month in range(1,10)],ignore_index=True)
        flood = pd.concat([read_pred('비홍수기',[1,2,3,4,5]),read_pred('홍수기',[6,7,8,9])])
        total = read_pred('전체')

        return [finish(monthly),finish(flood),finish(total)]


    def show_plot(self,model):
        """Plot actual vs predicted values for a list of prediction frames."""
        plot_compare(answer,model)

    def show_score(self,model,params):
        """Return (per-month RMSE table, overall RMSE) for a list of prediction frames."""
        # finalize_modeling expects the bridge name without the 'wl_' prefix
        bridge = TARGET_BRIDGE[len('wl_'):] if TARGET_BRIDGE.startswith('wl_') else TARGET_BRIDGE
        result,best_score=finalize_modeling(model,leadtime=self.leadtime,bridge=bridge,Params=params)
        return result,best_score

# 선행시간 10분

## 선행시간 10분 모델학습

In [None]:
leadtime10=LeadtimeModel(ppd_data,10,'test')

## 선행시간 10분 모델결과

In [None]:
### 선행시간 10분 모델 결과 PLOT
leadtime10.show_plot(leadtime10.model_S)
leadtime10.show_plot(leadtime10.model_M)
leadtime10.show_plot(leadtime10.model_L)

In [None]:
leadtime10.score_S
print(f"Best RMSE:{leadtime10.best_score_S}")

leadtime10.score_M
print(f"Best RMSE:{leadtime10.best_score_M}")

leadtime10.score_L
print(f"Best RMSE:{leadtime10.best_score_L}")

# 선행시간 60분

## 선행시간 60분 모델 학습

In [None]:
leadtime60=LeadtimeModel(ppd_data,60)

## 선행시간 60분 모델 결과

In [None]:
### 선행시간 60분 모델 결과 PLOT
leadtime60.show_plot(leadtime60.model_S)
leadtime60.show_plot(leadtime60.model_M)
leadtime60.show_plot(leadtime60.model_L)

In [None]:
leadtime60.score_S
print(f"Best RMSE:{leadtime60.best_score_S}")

leadtime60.score_M
print(f"Best RMSE:{leadtime60.best_score_M}")

leadtime60.score_L
print(f"Best RMSE:{leadtime60.best_score_L}")

# 선행시간 180분

## 선행시간 180분 모델 학습

In [None]:
leadtime180=LeadtimeModel(ppd_data,180)

## 선행시간 180분 모델 결과

In [None]:
### 선행시간 180분 모델 결과 PLOT
leadtime180.show_plot(leadtime180.model_S)
leadtime180.show_plot(leadtime180.model_M)
leadtime180.show_plot(leadtime180.model_L)

In [None]:
leadtime180.score_S
print(f"Best RMSE:{leadtime180.best_score_S}")

leadtime180.score_M
print(f"Best RMSE:{leadtime180.best_score_M}")

leadtime180.score_L
print(f"Best RMSE:{leadtime180.best_score_L}")

# 선행시간 360분

## 선행시간 360분 모델 학습

In [None]:
leadtime360=LeadtimeModel(ppd_data,360)

## 선행시간 360분 모델 결과

In [None]:
### 선행시간 360분 모델 결과 PLOT
leadtime360.show_plot(leadtime360.model_S)
leadtime360.show_plot(leadtime360.model_M)
leadtime360.show_plot(leadtime360.model_L)

In [None]:
leadtime360.score_S
print(f"Best RMSE:{leadtime360.best_score_S}")

leadtime360.score_M
print(f"Best RMSE:{leadtime360.best_score_M}")

leadtime360.score_L
print(f"Best RMSE:{leadtime360.best_score_L}")

# 선행시간 540분

## 선행시간 540분 모델 학습

In [None]:
leadtime540=LeadtimeModel(ppd_data,540)

In [None]:
### 선행시간 540분 모델 결과 PLOT
leadtime540.show_plot(leadtime540.model_S)
leadtime540.show_plot(leadtime540.model_M)
leadtime540.show_plot(leadtime540.model_L)

# 선행시간 720분

## 선행시간 720분 모델 학습

In [None]:
leadtime720=LeadtimeModel(ppd_data,720)

## 선행시간 720분 모델 결과

In [None]:
### 선행시간 720분 모델 결과 PLOT
leadtime720.show_plot(leadtime720.model_S)
leadtime720.show_plot(leadtime720.model_M)
leadtime720.show_plot(leadtime720.model_L)

In [None]:
leadtime720.score_S
print(f"Best RMSE:{leadtime720.best_score_S}")

leadtime720.score_M
print(f"Best RMSE:{leadtime720.best_score_M}")

leadtime720.score_L
print(f"Best RMSE:{leadtime720.best_score_L}")

# 선행시간 1440분

## 선행시간 1440분 모델 학습

In [None]:
leadtime1440=LeadtimeModel(ppd_data,1440)

## 선행시간 1440분 모델 결과

In [None]:
### 선행시간 1440분 모델 결과 PLOT
leadtime1440.show_plot(leadtime1440.model_S)
leadtime1440.show_plot(leadtime1440.model_M)
leadtime1440.show_plot(leadtime1440.model_L)

In [None]:
leadtime1440.score_S
print(f"Best RMSE:{leadtime1440.best_score_S}")

leadtime1440.score_M
print(f"Best RMSE:{leadtime1440.best_score_M}")

leadtime1440.score_L
print(f"Best RMSE:{leadtime1440.best_score_L}")

In [None]:
# Module-level aliases for the per-month RMSE tables of each lead time and
# parameter size. They refer to the same DataFrame objects as the score_*
# attributes; no copy is made. Lead time 540 is not included.
leadtime10_S=leadtime10.score_S
leadtime10_M=leadtime10.score_M
leadtime10_L=leadtime10.score_L

leadtime60_S=leadtime60.score_S
leadtime60_M=leadtime60.score_M
leadtime60_L=leadtime60.score_L

leadtime180_S=leadtime180.score_S
leadtime180_M=leadtime180.score_M
leadtime180_L=leadtime180.score_L

leadtime360_S=leadtime360.score_S
leadtime360_M=leadtime360.score_M
leadtime360_L=leadtime360.score_L

leadtime720_S=leadtime720.score_S
leadtime720_M=leadtime720.score_M
leadtime720_L=leadtime720.score_L

leadtime1440_S=leadtime1440.score_S
leadtime1440_M=leadtime1440.score_M
leadtime1440_L=leadtime1440.score_L

# 전체 선행시간 결과 비교

In [None]:
# Collect the per-month RMSE table of every lead time / parameter size,
# tagging each table with its name and month. The tables are copied first so
# the score_* attributes stay unchanged, and an explicit mapping replaces the
# string lookups through globals().
leadtime=[10,60,180,360,720,1440]
lt_models={
    10: leadtime10,
    60: leadtime60,
    180: leadtime180,
    360: leadtime360,
    720: leadtime720,
    1440: leadtime1440,
}

result=[]
for lt in leadtime:
    df_name = f'leadtime{lt}'
    for size in ('S','M','L'):
        score = getattr(lt_models[lt], f'score_{size}').copy()
        score['name'] = f'{df_name}_{size}'
        score['month'] = score.index
        result.append(score)
result

In [None]:
total_result= pd.concat(result,ignore_index=True)
total_result=total_result[['Monthly','Flood','Total','Best Model','name','month']]
total_result

In [None]:
total_result.pivot_table(index=['month', 'Best Model'], aggfunc='size').unstack(fill_value=0)

# 추가 feature engineering

In [None]:
# 2023 rows where the Jamsu water level exceeds 500, with the three rainfall gauges
filtered_data=ppd_data[(ppd_data.index.year==2023) & (ppd_data['wl_jamsu']>500)][['wl_jamsu','rf_songjeong','rf_daegog','rf_zingwan']]

filtered_index = filtered_data.index

# Timestamps of the selected rows plus the 10, 20 and 30 minutes before each
previous_indices = filtered_index
for minutes_back in (10, 20, 30):
    previous_indices = previous_indices.union(filtered_index - pd.DateOffset(minutes=minutes_back))

# Fetch those rows. isin() skips timestamps that are absent from the data,
# whereas .loc with a missing label raises KeyError.
previous_data = ppd_data[ppd_data.index.isin(previous_indices)]

## 누적강수량 계산

In [None]:
# Work on a copy so the original feature table is left as it is
ppd_data2=ppd_data.copy()
# Rolling rainfall total over the last 144 rows for each gauge
# (presumably 24 hours at a 10-minute interval - TODO confirm).
# Rows before the window is full are NaN.
ppd_data2['rf_zingwan_sum'] =  ppd_data2['rf_zingwan'].rolling(window=144).sum()
ppd_data2['rf_daegog_sum'] = ppd_data2['rf_daegog'].rolling(window=144).sum()
ppd_data2['rf_songjeong_sum'] = ppd_data2['rf_songjeong'].rolling(window=144).sum()

In [None]:
# Replace NaN rolling totals with the cumulative sum of the raw series at the
# same row. This covers the leading rows where the rolling window is not yet full.
ppd_data2['rf_zingwan_sum']= ppd_data2['rf_zingwan_sum'].fillna(ppd_data2['rf_zingwan'].cumsum())
ppd_data2['rf_daegog_sum']= ppd_data2['rf_daegog_sum'].fillna(ppd_data2['rf_daegog'].cumsum())
ppd_data2['rf_songjeong_sum']= ppd_data2['rf_songjeong_sum'].fillna(ppd_data2['rf_songjeong'].cumsum())

In [None]:
# Remove the raw rainfall columns now that the rolling totals replace them.
# errors='ignore' keeps the cell re-runnable: a second run of the in-place
# drop raised KeyError because the columns were already gone.
ppd_data2=ppd_data2.drop(columns=['rf_zingwan','rf_daegog','rf_songjeong'],errors='ignore')
ppd_data2

In [None]:
# Rows where the Jamsu water level exceeds 500, with the accumulated rainfall columns
filtered_data=ppd_data2[ (ppd_data2['wl_jamsu']>500)][['wl_jamsu','rf_songjeong_sum','rf_daegog_sum','rf_zingwan_sum']]

filtered_index = filtered_data.index

# Timestamps of the selected rows plus the 10, 20 and 30 minutes before each
previous_indices = filtered_index
for minutes_back in (10, 20, 30):
    previous_indices = previous_indices.union(filtered_index - pd.DateOffset(minutes=minutes_back))

# Fetch those rows. isin() skips timestamps that are absent from the data,
# whereas .loc with a missing label raises KeyError.
previous_data = ppd_data2[ppd_data2.index.isin(previous_indices)]

# Show the result
a=previous_data[['wl_jamsu','rf_songjeong_sum','rf_daegog_sum','rf_zingwan_sum']]
a

### 이전 결과와 비교

In [None]:
leadtime1440=LeadtimeModel(ppd_data,1440)
leadtime1440.model_S

In [None]:
leadtime1440_2=LeadtimeModel(ppd_data2,1440)
leadtime1440_2.train(save=False)
leadtime1440_2.model_S

In [None]:
leadtime1440.show_plot(leadtime1440.model_S)

In [None]:
leadtime1440_2.show_plot(leadtime1440_2.model_S)