In [53]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from types import SimpleNamespace
from sklearn.preprocessing import MinMaxScaler
import os

In [2]:
config = {
    "learning_rate": 2e-5,
    "epoch": 30,
    "batch_size": 64,
    "hidden_size": 64,
    "num_layers": 2,
    "output_size": 3
}

CFG = SimpleNamespace(**config)

품목_리스트 = ['건고추', '사과', '감자', '배', '깐마늘(국산)', '무', '상추', '배추', '양파', '대파']

# Define Function for Feature Engineering
- 타겟의 필터 조건을 제외한 메타데이터의 필터 조건은 참가자들 각자의 기준에 맞춰 자유롭게 사용가능 
- 밑의 필터 조건은 임의로 제공하는 예시

In [34]:

def process_data(raw_file, 산지공판장_file, 전국도매_file, 품목명, scaler=None):
    raw_data = pd.read_csv(raw_file)
    산지공판장 = pd.read_csv(산지공판장_file)
    전국도매 = pd.read_csv(전국도매_file)

    # 타겟 및 메타데이터 필터 조건 정의
    conditions = {
    '감자': {
        'target': lambda df: (df['품종명'] == '감자 수미') & (df['거래단위'] == '20키로상자') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['감자'], '품종명': ['수미'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['감자'], '품종명': ['수미']}
    },
    '건고추': {
        'target': lambda df: (df['품종명'] == '화건') & (df['거래단위'] == '30 kg') & (df['등급'] == '상품'),
        '공판장': None, 
        '도매': None  
    },
    '깐마늘(국산)': {
        'target': lambda df: (df['거래단위'] == '20 kg') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['마늘'], '품종명': ['깐마늘'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['마늘'], '품종명': ['깐마늘']}
    },
    '대파': {
        'target': lambda df: (df['품종명'] == '대파(일반)') & (df['거래단위'] == '1키로단') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['대파'], '품종명': ['대파(일반)'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['대파'], '품종명': ['대파(일반)']}
    },
    '무': {
        'target': lambda df: (df['거래단위'] == '20키로상자') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['무'], '품종명': ['기타무'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['무'], '품종명': ['무']}
    },
    '배추': {
        'target': lambda df: (df['거래단위'] == '10키로망대') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['배추'], '품종명': ['쌈배추'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['배추'], '품종명': ['배추']}
    },
    '사과': {
        'target': lambda df: (df['품종명'].isin(['홍로', '후지'])) & (df['거래단위'] == '10 개') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['사과'], '품종명': ['후지'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['사과'], '품종명': ['후지']}
    },
    '상추': {
        'target': lambda df: (df['품종명'] == '청') & (df['거래단위'] == '100 g') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['상추'], '품종명': ['청상추'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['상추'], '품종명': ['청상추']}
    },
    '양파': {
        'target': lambda df: (df['품종명'] == '양파') & (df['거래단위'] == '1키로') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['양파'], '품종명': ['기타양파'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['양파'], '품종명': ['양파(일반)']}
    },
    '배': {
        'target': lambda df: (df['품종명'] == '신고') & (df['거래단위'] == '10 개') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['배'], '품종명': ['신고'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['배'], '품종명': ['신고']}
    }
    }

    # 타겟 데이터 필터링
    raw_품목 = raw_data[raw_data['품목명'] == 품목명]
    target_mask = conditions[품목명]['target'](raw_품목)
    filtered_data = raw_품목[target_mask]

    # 다른 품종에 대한 파생변수 생성 및 병합
    other_data = raw_품목[~target_mask]
    unique_combinations = other_data[['품종명', '거래단위', '등급']].drop_duplicates()
    for _, row in unique_combinations.iterrows():
        품종명, 거래단위, 등급 = row['품종명'], row['거래단위'], row['등급']
        mask = (other_data['품종명'] == 품종명) & (other_data['거래단위'] == 거래단위) & (other_data['등급'] == 등급)
        temp_df = other_data[mask]
        for col in ['평년 평균가격(원)', '평균가격(원)']:
            new_col_name = f'{품종명}_{거래단위}_{등급}_{col}'
            filtered_data = filtered_data.merge(temp_df[['시점', col]], on='시점', how='left', suffixes=('', f'_{new_col_name}'))
            filtered_data.rename(columns={f'{col}_{new_col_name}': new_col_name}, inplace=True)

    # 공판장 데이터 병합
    if conditions[품목명]['공판장']:
        filtered_공판장 = 산지공판장
        for key, value in conditions[품목명]['공판장'].items():
            filtered_공판장 = filtered_공판장[filtered_공판장[key].isin(value)]
        filtered_공판장 = filtered_공판장.add_prefix('공판장_').rename(columns={'공판장_시점': '시점'})
        filtered_data = filtered_data.merge(filtered_공판장, on='시점', how='left')

    # 도매 데이터 병합
    if conditions[품목명]['도매']:
        filtered_도매 = 전국도매
        for key, value in conditions[품목명]['도매'].items():
            filtered_도매 = filtered_도매[filtered_도매[key].isin(value)]
        filtered_도매 = filtered_도매.add_prefix('도매_').rename(columns={'도매_시점': '시점'})
        filtered_data = filtered_data.merge(filtered_도매, on='시점', how='left')

    # 결측값 처리 및 수치형 데이터 정규화
    numeric_columns = filtered_data.select_dtypes(include=[np.number]).columns
    filtered_data[numeric_columns] = filtered_data[numeric_columns].fillna(0)
    if scaler is None:
        scaler = MinMaxScaler()
        filtered_data[numeric_columns] = scaler.fit_transform(filtered_data[numeric_columns])
    else:
        filtered_data[numeric_columns] = scaler.transform(filtered_data[numeric_columns])

    return filtered_data, scaler

# Define Custom Dataset Class

In [41]:
class AgriculturePriceDataset(Dataset):
    def __init__(self, dataframe, window_size=9, prediction_length=3, is_test=False):
        self.data = dataframe
        self.window_size = window_size
        self.prediction_length = prediction_length
        self.is_test = is_test
        
        # '평균가격(원)' 관련 열을 찾는 로직 개선
        price_columns = [col for col in self.data.columns if '평균가격(원)' in col]
        
        if len(price_columns) == 1:
            # '평균가격(원)' 열이 1개일 때 사용
            self.price_column = price_columns[0]
        elif len(price_columns) > 1:
            # '평균가격(원)' 열이 여러 개일 때 우선적으로 사용할 열을 명시적으로 지정
            print(f"여러 개의 '평균가격(원)' 열이 발견되었습니다: {price_columns}")
            # 예를 들어, 첫 번째 열을 사용한다고 가정
            self.price_column = price_columns[0]
            print(f"첫 번째 열을 선택합니다: {self.price_column}")
        else:
            raise ValueError("데이터프레임에 '평균가격(원)'과 관련된 열이 존재하지 않습니다.")
        
        self.numeric_columns = self.data.select_dtypes(include=[np.number]).columns.tolist()
        
        self.sequences = []
        if not self.is_test:
            # 가능한 시퀀스를 추출
            for i in range(len(self.data) - self.window_size - self.prediction_length + 1):
                x = self.data[self.numeric_columns].iloc[i:i+self.window_size].values
                y = self.data[self.price_column].iloc[i+self.window_size:i+self.window_size+self.prediction_length].values
                self.sequences.append((x, y))
        else:
            # 테스트 데이터일 때는 시퀀스만 생성 (예측할 타겟 데이터는 없음)
            if len(self.data) >= self.window_size:
                self.sequences = [self.data[self.numeric_columns].iloc[-self.window_size:].values]
            else:
                raise ValueError("테스트 데이터의 길이가 window_size보다 작습니다.")
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        if not self.is_test:
            x, y = self.sequences[idx]
            return torch.FloatTensor(x), torch.FloatTensor(y)
        else:
            # 테스트 데이터에서는 타겟 없이 입력 시퀀스만 반환
            x = self.sequences[idx]
            return torch.FloatTensor(x)



In [43]:
import torch
import torch.nn as nn
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split


# LSTM 모델 정의
class PricePredictionLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, lstm_output_size):
        super(PricePredictionLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm_output_size = lstm_output_size
        
        # LSTM 레이어
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        # LSTM의 마지막 출력을 위한 선형 레이어
        self.fc = nn.Linear(hidden_size, lstm_output_size)
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])  # 마지막 타임스텝의 출력만 사용
        return out


# 모델 평가 함수
def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch_x, batch_y in dataloader:
            batch_x = batch_x.to(next(model.parameters()).device)
            batch_y = batch_y.to(next(model.parameters()).device)
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            total_loss += loss.item()
    return total_loss / len(dataloader)



# LGBM과 LSTM의 Hybrid 모델을 위한 데이터 준비
def prepare_hybrid_data(lstm_model, dataloader):
    lstm_model.eval()
    lstm_features = []
    labels = []
    
    with torch.no_grad():
        for batch_x, batch_y in dataloader:
            batch_x = batch_x.to(next(lstm_model.parameters()).device)
            lstm_output = lstm_model(batch_x)
            lstm_features.append(lstm_output.cpu().numpy())  # LSTM의 출력을 저장
            labels.append(batch_y.cpu().numpy())
    
    lstm_features = np.concatenate(lstm_features, axis=0)
    labels = np.concatenate(labels, axis=0).flatten()  # Ensure labels are 1-D
    
    return lstm_features, labels

# LGBM 학습 함수
def train_lgbm(lstm_model, train_loader, test_loader, num_boost_round=100, early_stopping_rounds=10):
    # LSTM을 사용해 시계열 데이터를 처리하고 특징을 추출
    X_train, y_train = prepare_hybrid_data(lstm_model, train_loader)
    X_test, y_test = prepare_hybrid_data(lstm_model, test_loader)
    
    # Ensure y_train and y_test are 1-D arrays
    y_train = y_train.flatten()  # Ensure it's a 1-D array
    y_test = y_test.flatten()  # Ensure it's a 1-D array
    
    # LGBM 데이터 세트 생성
    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test)
    
    # LGBM 하이퍼파라미터 설정
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'min_data_in_leaf': 20,
        'feature_fraction': 0.9
    }

    # Manually implementing early stopping
    best_val_loss = float('inf')
    best_iteration = 0
    lgbm_model = None  # Initialize model to None

    for iteration in range(num_boost_round):
        lgbm_model = lgb.train(params, train_data, num_boost_round=1, init_model=lgbm_model)
        val_loss = lgbm_model.best_score['valid']['rmse']  # Access the validation score
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_iteration = iteration + 1  # +1 because iteration is zero-based

        
        if iteration + 1 - best_iteration >= early_stopping_rounds:
            print("Early stopping triggered.")
            break
        
    return lgbm_model


# 최종 예측
def predict_hybrid(lstm_model, lgbm_model, dataloader):
    lstm_model.eval()
    lstm_features, _ = prepare_hybrid_data(lstm_model, dataloader)
    
    # LGBM을 통해 최종 예측 수행
    predictions = lgbm_model.predict(lstm_features)
    return predictions


def train_hybrid_model(lstm_model, train_loader, num_epochs, criterion, optimizer):
    lstm_features, lstm_labels = [], []
    for epoch in range(num_epochs):
        lstm_model.train()
        total_loss = 0
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(next(lstm_model.parameters()).device), batch_y.to(next(lstm_model.parameters()).device)
            optimizer.zero_grad()
            outputs = lstm_model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            
            lstm_features.append(outputs.cpu().detach().numpy())
            lstm_labels.append(batch_y.cpu().detach().numpy())
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}')
    
    return lstm_features, lstm_labels


In [54]:
pbar_outer = tqdm(품목_리스트, desc="품목 처리 중", position=0)
for 품목명 in pbar_outer:
    pbar_outer.set_description(f"품목별 전처리 및 모델 학습 -> {품목명}")
    
    # 데이터 전처리 및 스케일링
    train_data, scaler = process_data("./Data/train/train.csv", 
                                      "./Data/train/meta/TRAIN_산지공판장_2018-2021.csv", 
                                      "./Data/train/meta/TRAIN_전국도매_2018-2021.csv", 
                                      품목명)
    품목별_scalers[품목명] = scaler
    dataset = AgriculturePriceDataset(train_data)
    
    # train과 validation으로 분할
    train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)
    
    train_loader = DataLoader(train_data, CFG.batch_size, shuffle=True)
    val_loader = DataLoader(val_data, CFG.batch_size, shuffle=False)

    input_size = len(dataset.numeric_columns)
    
    # LSTM 모델 생성
    lstm_model = PricePredictionLSTM(input_size, CFG.hidden_size, CFG.num_layers, lstm_output_size=1)
    criterion = nn.L1Loss()
    optimizer = torch.optim.Adam(lstm_model.parameters(), CFG.learning_rate)
    
    best_val_loss = float('inf')
    os.makedirs('models', exist_ok=True)
    
    # LSTM 학습
    lstm_features, lstm_labels = [], []
    for epoch in range(CFG.epoch):
        lstm_feat_epoch, lstm_labels_epoch = train_hybrid_model(lstm_model, train_loader, epoch, criterion, optimizer)
        
        # 피처와 라벨의 길이 체크
        assert len(lstm_feat_epoch) == len(lstm_labels_epoch), "피처와 라벨의 길이가 일치하지 않습니다."

        val_loss = evaluate_model(lstm_model, val_loader, criterion)

        lstm_features.extend(lstm_feat_epoch)
        lstm_labels.extend(lstm_labels_epoch)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(lstm_model.state_dict(), f'models/best_lstm_model_{품목명}.pth')

        print(f'Epoch {epoch+1}/{CFG.epoch}, Val Loss: {val_loss:.4f}')

    print(f'Best Validation Loss for {품목명}: {best_val_loss:.4f}')

    # LSTM 모델의 피처와 라벨을 사용하여 LGBM 학습
    lstm_features = np.concatenate(lstm_features)
    lstm_labels = np.concatenate(lstm_labels)

    # 학습용 피처와 라벨 길이 확인
    assert len(lstm_features) == len(lstm_labels), "LGBM 학습을 위한 피처와 라벨의 길이가 일치하지 않습니다."

    # LGBM 모델 학습
    lgbm_model = train_lgbm(lstm_features, lstm_labels, test_loader=val_loader)
    
    품목_predictions = []

    ### 추론
    pbar_inner = tqdm(range(25), desc="테스트 파일 추론 중", position=1, leave=False)
    for i in pbar_inner:
        test_file = f"./Data/test/TEST_{i:02d}.csv"
        산지공판장_file = f"./Data/test/meta/TEST_산지공판장_{i:02d}.csv"
        전국도매_file = f"./Data/test/meta/TEST_전국도매_{i:02d}.csv"

        # 데이터 전처리 및 스케일링
        test_data, _ = process_data(test_file, 산지공판장_file, 전국도매_file, 품목명, scaler=품목별_scalers[품목명])
        test_dataset = AgriculturePriceDataset(test_data, is_test=True)
        test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

        # LSTM 모델 평가 모드로 전환
        if isinstance(lstm_model, torch.nn.Module):  # lstm_model이 torch.nn.Module인지 확인
            lstm_model.eval()  # LSTM 모델을 평가 모드로 전환
        else:
            raise TypeError("lstm_model이 numpy 배열로 잘못 변환되었습니다. PyTorch 모델이어야 합니다.")

        lstm_test_features = []
        
        with torch.no_grad():  # 평가 시에는 no_grad로 메모리 절약
            for batch in test_loader:
                # 배치 데이터를 모델이 있는 디바이스로 전송 (GPU 또는 CPU)
                batch = batch.to(next(lstm_model.parameters()).device)

                # 모델 추론 (예측)
                lstm_output = lstm_model(batch)
                lstm_test_features.append(lstm_output.cpu().numpy())  # numpy로 변환하여 저장

        # LGBM을 사용한 예측
        lstm_test_features = np.concatenate(lstm_test_features)
        predictions = lgbm_model.predict(lstm_test_features)

        # 예측값을 원래 스케일로 복원
        predictions_reshaped = predictions.reshape(-1, 1)

        # 스케일링된 값을 원래 값으로 복원
        price_column_index = test_data.columns.get_loc(test_dataset.price_column)
        price_scaler = MinMaxScaler()
        price_scaler.min_ = 품목별_scalers[품목명].min_[price_column_index]
        price_scaler.scale_ = 품목별_scalers[품목명].scale_[price_column_index]
        predictions_original_scale = price_scaler.inverse_transform(predictions_reshaped)

        # NaN 값이 있는지 확인하고 처리
        if np.isnan(predictions_original_scale).any():
            pbar_inner.set_postfix({"상태": "NaN"})
        else:
            pbar_inner.set_postfix({"상태": "정상"})
            품목_predictions.extend(predictions_original_scale.flatten())

    품목별_predictions[품목명] = 품목_predictions
    pbar_outer.update(1)


품목 처리 중:   0%|          | 0/10 [00:00<?, ?it/s]

여러 개의 '평균가격(원)' 열이 발견되었습니다: ['평년 평균가격(원)', '평균가격(원)', '햇산양건_30 kg_상품_평년 평균가격(원)', '햇산양건_30 kg_상품_평균가격(원)', '햇산화건_30 kg_중품_평년 평균가격(원)', '햇산화건_30 kg_중품_평균가격(원)', '햇산화건_30 kg_상품_평년 평균가격(원)', '햇산화건_30 kg_상품_평균가격(원)', '양건_30 kg_중품_평년 평균가격(원)', '양건_30 kg_중품_평균가격(원)', '양건_30 kg_상품_평년 평균가격(원)', '양건_30 kg_상품_평균가격(원)', '화건_30 kg_중품_평년 평균가격(원)', '화건_30 kg_중품_평균가격(원)', '햇산양건_30 kg_중품_평년 평균가격(원)', '햇산양건_30 kg_중품_평균가격(원)']
첫 번째 열을 선택합니다: 평년 평균가격(원)
Epoch 1/30, Val Loss: 0.8227
Epoch [1/1], Loss: 0.8418
Epoch 2/30, Val Loss: 0.8214
Epoch [1/2], Loss: 0.8397
Epoch [2/2], Loss: 0.8346
Epoch 3/30, Val Loss: 0.8189
Epoch [1/3], Loss: 0.8359
Epoch [2/3], Loss: 0.8387
Epoch [3/3], Loss: 0.8331
Epoch 4/30, Val Loss: 0.8152
Epoch [1/4], Loss: 0.8303
Epoch [2/4], Loss: 0.8301
Epoch [3/4], Loss: 0.8322
Epoch [4/4], Loss: 0.8316
Epoch 5/30, Val Loss: 0.8102
Epoch [1/5], Loss: 0.8276
Epoch [2/5], Loss: 0.8283


  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)


Epoch [3/5], Loss: 0.8232
Epoch [4/5], Loss: 0.8293
Epoch [5/5], Loss: 0.8255
Epoch 6/30, Val Loss: 0.8039
Epoch [1/6], Loss: 0.8200
Epoch [2/6], Loss: 0.8171
Epoch [3/6], Loss: 0.8199
Epoch [4/6], Loss: 0.8193
Epoch [5/6], Loss: 0.8196
Epoch [6/6], Loss: 0.8128
Epoch 7/30, Val Loss: 0.7963
Epoch [1/7], Loss: 0.8125
Epoch [2/7], Loss: 0.8130
Epoch [3/7], Loss: 0.8103
Epoch [4/7], Loss: 0.8125
Epoch [5/7], Loss: 0.8102
Epoch [6/7], Loss: 0.8094
Epoch [7/7], Loss: 0.8057
Epoch 8/30, Val Loss: 0.7873
Epoch [1/8], Loss: 0.8075
Epoch [2/8], Loss: 0.8017
Epoch [3/8], Loss: 0.8044
Epoch [4/8], Loss: 0.8006
Epoch [5/8], Loss: 0.7996
Epoch [6/8], Loss: 0.7994
Epoch [7/8], Loss: 0.7941
Epoch [8/8], Loss: 0.7939
Epoch 9/30, Val Loss: 0.7768
Epoch [1/9], Loss: 0.7984
Epoch [2/9], Loss: 0.7906
Epoch [3/9], Loss: 0.7934
Epoch [4/9], Loss: 0.7855
Epoch [5/9], Loss: 0.7865
Epoch [6/9], Loss: 0.7924
Epoch [7/9], Loss: 0.7876
Epoch [8/9], Loss: 0.7892
Epoch [9/9], Loss: 0.7870
Epoch 10/30, Val Loss: 0.7

AttributeError: 'numpy.ndarray' object has no attribute 'eval'

In [57]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from types import SimpleNamespace
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.arima.model import ARIMA
import os

# Hyperparameter Setting
config = {
    "learning_rate": 2e-5,
    "epoch": 30,
    "batch_size": 64,
    "hidden_size": 64,
    "num_layers": 2,
    "output_size": 3
}

CFG = SimpleNamespace(**config)

품목_리스트 = ['건고추', '사과', '감자', '배', '깐마늘(국산)', '무', '상추', '배추', '양파', '대파']


def process_data(raw_file, 산지공판장_file, 전국도매_file, 품목명, scaler=None):
    raw_data = pd.read_csv(raw_file)
    산지공판장 = pd.read_csv(산지공판장_file)
    전국도매 = pd.read_csv(전국도매_file)

    # 타겟 및 메타데이터 필터 조건 정의
    conditions = {
    '감자': {
        'target': lambda df: (df['품종명'] == '감자 수미') & (df['거래단위'] == '20키로상자') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['감자'], '품종명': ['수미'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['감자'], '품종명': ['수미']}
    },
    '건고추': {
        'target': lambda df: (df['품종명'] == '화건') & (df['거래단위'] == '30 kg') & (df['등급'] == '상품'),
        '공판장': None, 
        '도매': None  
    },
    '깐마늘(국산)': {
        'target': lambda df: (df['거래단위'] == '20 kg') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['마늘'], '품종명': ['깐마늘'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['마늘'], '품종명': ['깐마늘']}
    },
    '대파': {
        'target': lambda df: (df['품종명'] == '대파(일반)') & (df['거래단위'] == '1키로단') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['대파'], '품종명': ['대파(일반)'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['대파'], '품종명': ['대파(일반)']}
    },
    '무': {
        'target': lambda df: (df['거래단위'] == '20키로상자') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['무'], '품종명': ['기타무'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['무'], '품종명': ['무']}
    },
    '배추': {
        'target': lambda df: (df['거래단위'] == '10키로망대') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['배추'], '품종명': ['쌈배추'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['배추'], '품종명': ['배추']}
    },
    '사과': {
        'target': lambda df: (df['품종명'].isin(['홍로', '후지'])) & (df['거래단위'] == '10 개') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['사과'], '품종명': ['후지'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['사과'], '품종명': ['후지']}
    },
    '상추': {
        'target': lambda df: (df['품종명'] == '청') & (df['거래단위'] == '100 g') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['상추'], '품종명': ['청상추'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['상추'], '품종명': ['청상추']}
    },
    '양파': {
        'target': lambda df: (df['품종명'] == '양파') & (df['거래단위'] == '1키로') & (df['등급'] == '상'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['양파'], '품종명': ['기타양파'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['양파'], '품종명': ['양파(일반)']}
    },
    '배': {
        'target': lambda df: (df['품종명'] == '신고') & (df['거래단위'] == '10 개') & (df['등급'] == '상품'),
        '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['배'], '품종명': ['신고'], '등급명': ['상']},
        '도매': {'시장명': ['*전국도매시장'], '품목명': ['배'], '품종명': ['신고']}
    }
    }

        # 타겟 데이터 필터링
    raw_품목 = raw_data[raw_data['품목명'] == 품목명]
    target_mask = conditions[품목명]['target'](raw_품목)
    filtered_data = raw_품목[target_mask]

    # 다른 품종에 대한 파생변수 생성 (생략)
    
    # 공판장 데이터 처리
    if conditions[품목명]['공판장'] is not None:  # None 체크
        filtered_공판장 = 산지공판장
        for key, value in conditions[품목명]['공판장'].items():
            if value is not None:
                filtered_공판장 = filtered_공판장[filtered_공판장[key].isin(value)]
        
        filtered_공판장 = filtered_공판장.add_prefix('공판장_').rename(columns={'공판장_시점': '시점'})
        filtered_data = filtered_data.merge(filtered_공판장, on='시점', how='left')

    # 도매 데이터 처리
    if conditions[품목명]['도매'] is not None:  # None 체크
        filtered_도매 = 전국도매
        for key, value in conditions[품목명]['도매'].items():
            if value is not None:
                filtered_도매 = filtered_도매[filtered_도매[key].isin(value)]
        
        filtered_도매 = filtered_도매.add_prefix('도매_').rename(columns={'도매_시점': '시점'})
        filtered_data = filtered_data.merge(filtered_도매, on='시점', how='left')

    # 수치형 컬럼 처리
    numeric_columns = filtered_data.select_dtypes(include=[np.number]).columns
    filtered_data = filtered_data[['시점'] + list(numeric_columns)]
    filtered_data[numeric_columns] = filtered_data[numeric_columns].fillna(0)

    # 정규화 적용
    if scaler is None:
        scaler = MinMaxScaler()
        filtered_data[numeric_columns] = scaler.fit_transform(filtered_data[numeric_columns])
    else:
        filtered_data[numeric_columns] = scaler.transform(filtered_data[numeric_columns])

    return filtered_data, scaler


# Define LSTM Model
class PricePredictionLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(PricePredictionLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# Define Dataset Class
class AgriculturePriceDataset(Dataset):
    def __init__(self, dataframe, window_size=9, prediction_length=3, is_test=False):
        self.data = dataframe
        self.window_size = window_size
        self.prediction_length = prediction_length
        self.is_test = is_test
        self.price_column = [col for col in self.data.columns if '평균가격(원)' in col and len(col.split('_')) == 1][0]
        self.numeric_columns = self.data.select_dtypes(include=[np.number]).columns.tolist()
        self.sequences = []
        if not self.is_test:
            for i in range(len(self.data) - self.window_size - self.prediction_length + 1):
                x = self.data[self.numeric_columns].iloc[i:i+self.window_size].values
                y = self.data[self.price_column].iloc[i+self.window_size:i+self.window_size+self.prediction_length].values
                self.sequences.append((x, y))
        else:
            self.sequences = [self.data[self.numeric_columns].values]
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        if not self.is_test:
            x, y = self.sequences[idx]
            return torch.FloatTensor(x), torch.FloatTensor(y)
        else:
            return torch.FloatTensor(self.sequences[idx])

# Train the LSTM model
def train_model(model, train_loader, criterion, optimizer, num_epochs):
    model.train()
    total_loss = 0
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

def evaluate_model(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            total_loss += loss.item()
    return total_loss / len(test_loader)

# Apply ARIMA model for time series forecasting
def apply_arima(data, order=(5,1,0)):
    arima_model = ARIMA(data, order=order)
    arima_fit = arima_model.fit()
    return arima_fit

# Calculate residuals
def calculate_residuals(actual, predicted):
    residuals = actual - predicted
    return residuals

# Hybrid model combining ARIMA and LSTM
pbar_outer = tqdm(품목_리스트, desc="품목 처리 중", position=0)
for 품목명 in pbar_outer:
    pbar_outer.set_description(f"품목별 전처리 및 모델 학습 -> {품목명}")
    train_data, scaler = process_data("./Data/train/train.csv", 
                                      "./Data/train/meta/TRAIN_산지공판장_2018-2021.csv", 
                                      "./Data/train/meta/TRAIN_전국도매_2018-2021.csv", 
                                      품목명)
    
    # Split dataset for ARIMA
    train_arima_data = train_data['평균가격(원)']
    
    # 1. ARIMA 모델 적용
    arima_model = apply_arima(train_arima_data)
    arima_predictions = arima_model.fittedvalues

    # 2. 잔차 계산 (Residuals)
    residuals = calculate_residuals(train_arima_data, arima_predictions)

    # 3. LSTM 모델을 위한 데이터 전처리 (잔차 학습)
    residuals = residuals.dropna()  # ARIMA 초기 학습 기간 동안 생기는 NaN 값 제거
    residuals_scaled = scaler.fit_transform(residuals.values.reshape(-1, 1))
    dataset = AgriculturePriceDataset(pd.DataFrame(residuals_scaled), is_test=False)
    
    # Train/Validation Split for LSTM
    train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)
    train_loader = DataLoader(train_data, CFG.batch_size, shuffle=True)
    val_loader = DataLoader(val_data, CFG.batch_size, shuffle=False)

    input_size = len(dataset.numeric_columns)
    
    # LSTM 모델 생성
    model = PricePredictionLSTM(input_size, CFG.hidden_size, CFG.num_layers, CFG.output_size)
    criterion = nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), CFG.learning_rate)

    best_val_loss = float('inf')
    os.makedirs('models', exist_ok=True)

    for epoch in range(CFG.epoch):
        train_loss = train_model(model, train_loader, criterion, optimizer, CFG.epoch)
        val_loss = evaluate_model(model, val_loader, criterion)
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), f'models/best_model_{품목명}.pth')
        
        print(f'Epoch {epoch+1}/{CFG.epoch}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    print(f'Best Validation Loss for {품목명}: {best_val_loss:.4f}')

    # 4. 최종 예측: ARIMA 예측 + LSTM 잔차 예측
    test_file = f"./Data/test/TEST_00.csv"
    test_data, _ = process_data(test_file, "", "", 품목명, scaler=scaler)
    arima_test_predictions = arima_model.forecast(steps=len(test_data))

    test_dataset = AgriculturePriceDataset(test_data, is_test=True)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

    model.eval()
    lstm_residuals = []
    with torch.no_grad():
        for batch in test_loader:
            output = model(batch)
            lstm_residuals.append(output.numpy())

    lstm_residuals_array = np.concatenate(lstm_residuals)

    # 최종 예측값 계산
    final_predictions = arima_test_predictions + lstm_residuals_array

    print(f'Final Predictions for {품목명}: {final_predictions}')


품목 처리 중:   0%|          | 0/10 [00:00<?, ?it/s]

TypeError: argument of type 'int' is not iterable