In [1]:
### 블록 1: 임포트 및 기본 함수들

import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings('ignore')

# Korean font setup so Hangul titles/labels render in matplotlib figures.
# NOTE(review): 'AppleGothic' is a macOS font family; on other platforms a
# different Korean-capable family presumably has to be configured here.
plt.rc('font', family='AppleGothic')
# Render the minus sign with an ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# Load the data.
# The directory can be overridden with the RIDERSHIP_DATA_DIR environment
# variable so the notebook is runnable on machines other than the author's;
# when the variable is unset the original location is used unchanged.
base_path = os.environ.get(
    'RIDERSHIP_DATA_DIR',
    '/Users/foxrainswap/Desktop/데이터/재차인원/1112/합본',
)
train_data = pd.read_csv(os.path.join(base_path, '트레이닝셋_정규화x_이상치제거.csv'))
test_data = pd.read_csv(os.path.join(base_path, '테스트셋_정규화x.csv'))

# Convert the date column from text to datetime64.
train_data['날짜'] = pd.to_datetime(train_data['날짜'])
test_data['날짜'] = pd.to_datetime(test_data['날짜'])

def prepare_data(data):
    """Reshape wide hourly data into a long table ordered by station and time.

    Parameters
    ----------
    data : pandas.DataFrame
        Wide table with the id columns '날짜', '정류장순번', '정류장명', '요일'
        and one column per hour named '04시' ... '23시', '00시' ... '03시'.

    Returns
    -------
    pandas.DataFrame
        One row per (date, station, hour) with the added columns
        '시간' (hour label), '재차인원' (value), 'datetime' (timestamp of the
        hour) and '요일구분' ('토요일', '일요일', or '평일' for everything
        else), sorted by '정류장순번' then 'datetime'.

    Notes
    -----
    The hour columns start at '04시' and wrap past midnight to '03시'. The
    hours that come after '23시' in that sequence are stamped on the *next*
    calendar day so that sorting by 'datetime' reproduces the column order
    (04, 05, ..., 23, 00, 01, 02, 03) instead of moving 00-03 to the front
    of the day.
    """
    time_columns = ['04시', '05시', '06시', '07시', '08시', '09시', '10시',
                    '11시', '12시', '13시', '14시', '15시', '16시', '17시',
                    '18시', '19시', '20시', '21시', '22시', '23시', '00시',
                    '01시', '02시', '03시']

    melted_data = pd.melt(data,
                          id_vars=['날짜', '정류장순번', '정류장명', '요일'],
                          value_vars=time_columns,
                          var_name='시간',
                          value_name='재차인원')

    # Numeric hour of day taken from labels such as '04시' -> 4.
    hour_of_day = melted_data['시간'].str.replace('시', '', regex=False).astype(int)
    first_hour = int(time_columns[0].replace('시', ''))

    # Hours smaller than the first column's hour sit after midnight in the
    # column sequence, so they belong to the following calendar day.
    rolls_over = (hour_of_day < first_hour).astype(int)

    base_date = pd.to_datetime(melted_data['날짜']).dt.normalize()
    melted_data['datetime'] = (
        base_date
        + pd.to_timedelta(hour_of_day, unit='h')
        + pd.to_timedelta(rolls_over, unit='D')
    )

    # Day-type bucket: Saturday and Sunday keep their own label, every other
    # value of '요일' becomes '평일' (weekday).
    melted_data['요일구분'] = melted_data['요일'].map({
        '토요일': '토요일',
        '일요일': '일요일'
    }).fillna('평일')

    melted_data = melted_data.sort_values(['정류장순번', 'datetime'])

    return melted_data

def create_sequences(data, seq_length):
    """Build sliding input windows and their next-step targets.

    For every start position ``s`` in ``range(len(data) - seq_length)`` the
    window ``data[s:s + seq_length]`` becomes one input sample and
    ``data[s + seq_length]`` becomes its target.

    Parameters
    ----------
    data : sequence or numpy.ndarray
        Time-ordered observations.
    seq_length : int
        Number of consecutive observations in each input window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` with one entry per window. Both arrays are empty when
        ``data`` has no more than ``seq_length`` elements.
    """
    window_starts = range(len(data) - seq_length)
    windows = [data[start:start + seq_length] for start in window_starts]
    targets = [data[start + seq_length] for start in window_starts]
    return np.array(windows), np.array(targets)

def create_lstm_model(seq_length):
    """Build and compile the two-layer LSTM regressor.

    The network stacks an LSTM(100) that returns the full sequence, an
    LSTM(50), and a single-unit Dense output. It is compiled with Adam
    (learning rate 0.001) and mean-squared-error loss.

    Parameters
    ----------
    seq_length : int
        Number of time steps per input window; the input shape is
        ``(seq_length, 1)``.

    Returns
    -------
    Sequential
        The compiled, untrained model.
    """
    stacked_layers = [
        LSTM(100, activation='relu', input_shape=(seq_length, 1), return_sequences=True),
        LSTM(50, activation='relu'),
        Dense(1),
    ]
    network = Sequential(stacked_layers)

    optimizer = Adam(learning_rate=0.001)
    network.compile(optimizer=optimizer, loss='mse')
    return network

def post_process_predictions(predictions):
    """Return a copy of the predictions with every value below 1 set to 0.

    Negative values are below 1 as well, so a single threshold also clamps
    them to 0. The input object is left unmodified.
    """
    cleaned = predictions.copy()
    below_one = cleaned < 1
    cleaned[below_one] = 0
    return cleaned

def calculate_evaluation_metrics(y_true, y_pred):
    """Compute MAE, RMSE and SMAPE on post-processed predictions.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Raw model predictions; they are passed through
        ``post_process_predictions`` before any metric is computed.

    Returns
    -------
    dict
        Keys 'MAE', 'RMSE', 'MAPE' and 'SMAPE'. 'MAPE' is a constant 0
        placeholder (the metric is not used). 'SMAPE' is expressed on the
        0-200 scale.

    Notes
    -----
    Both inputs are flattened to 1-D first. Without that, a column vector
    ``(n, 1)`` combined with a flat ``(n,)`` array would broadcast to an
    ``(n, n)`` matrix in the SMAPE arithmetic and silently produce a wrong
    value.
    """
    actual = np.asarray(y_true, dtype=float).ravel()

    # 1. Post-process the predictions (values below 1 become 0).
    processed_pred = post_process_predictions(np.asarray(y_pred, dtype=float).ravel())

    # 2. Basic error metrics.
    mae = mean_absolute_error(actual, processed_pred)
    rmse = np.sqrt(mean_squared_error(actual, processed_pred))

    # 3. SMAPE on the post-processed predictions.
    #    When actual and prediction are both 0 the term counts as an exact
    #    hit (0). The division is only evaluated where the denominator is
    #    non-zero, so no 0/0 is ever computed.
    numerator = np.abs(actual - processed_pred)
    denominator = np.abs(actual) + np.abs(processed_pred)
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    smape = np.mean(ratio) * 200

    return {
        'MAE': mae,
        'RMSE': rmse,
        'MAPE': 0,  # MAPE is intentionally unused
        'SMAPE': smape
    }

def convert_predictions_to_daily(predictions, test_station, time_columns):
    """Reshape a flat prediction vector into one row per complete day.

    Parameters
    ----------
    predictions : numpy.ndarray
        1-D array of consecutive predictions.
    test_station :
        Not used by this function; kept for call compatibility.
    time_columns : sequence
        Hour labels; its length is the number of slots per day.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_days, len(time_columns))``. Trailing predictions
        that do not fill a whole day are dropped.
    """
    slots_per_day = len(time_columns)
    full_days = len(predictions) // slots_per_day
    usable = predictions[:full_days * slots_per_day]
    return usable.reshape(full_days, slots_per_day)

In [2]:
### 블록 2: plot_learning_curves 함수와 analyze_all_stations 함수

def plot_learning_curves(history, station_id, station_name, day_type):
    """Plot training vs. validation loss and save the figure as a PNG.

    The image is written to
    ``learning_curves/learning_curve_station_{station_id}_{day_type}.png``
    relative to the current working directory; the directory is created when
    missing.

    Parameters
    ----------
    history :
        Object whose ``history`` attribute is a mapping containing the
        'loss' and 'val_loss' series (as returned by ``model.fit``).
    station_id, station_name, day_type :
        Used for the figure title; ``station_id`` and ``day_type`` also form
        the file name.

    Raises
    ------
    KeyError
        If 'loss' or 'val_loss' is missing from ``history.history``.
    """
    # Explicit figure/axes objects instead of the global pyplot state, so the
    # exact figure created here is the one that gets saved and closed.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(history.history['loss'], label='Training Loss')
        ax.plot(history.history['val_loss'], label='Validation Loss')
        ax.set_title(f'정류장 {station_id} ({station_name}) - {day_type}\n학습 곡선')
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Loss')
        ax.legend()
        ax.grid(True)
        fig.tight_layout()

        # Save the figure. exist_ok avoids the check-then-create race.
        save_dir = 'learning_curves'
        os.makedirs(save_dir, exist_ok=True)
        fig.savefig(f'{save_dir}/learning_curve_station_{station_id}_{day_type}.png')
    finally:
        # This function is called once per model inside a long loop; always
        # release the figure, even when plotting or saving fails.
        plt.close(fig)

def _select_station_series(prepared, station_id, day_type):
    """Return one station's rows for one day type, ordered by 'datetime'."""
    selected = prepared[
        (prepared['요일구분'] == day_type) &
        (prepared['정류장순번'] == station_id)
    ]
    return selected.sort_values('datetime').reset_index(drop=True)


def analyze_all_stations(train_data, test_data, seq_length=24):
    """Train and evaluate one LSTM per (station, day type) combination.

    For every station found in ``train_data`` and each of the day types
    '평일', '토요일', '일요일' the function trains a model on the training
    series, predicts the test series, stores the mean prediction per hour
    and records MAE/RMSE/SMAPE for the operating hours ('전체') and for the
    peak hours ('첨두'). A failure for one combination is reported on stdout
    and does not stop the remaining combinations.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Wide hourly tables accepted by ``prepare_data``.
    seq_length : int, default 24
        Number of past observations fed to the model for each prediction.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(weekday_df, saturday_df, sunday_df, evaluation_df)``. The same
        frames are also written to 'lstm_predictions_weekday.csv',
        'lstm_predictions_saturday.csv', 'lstm_predictions_sunday.csv' and
        'lstm_evaluation.csv' in the current working directory.

    Notes
    -----
    * Prediction ``i`` is produced from test rows ``i .. i + seq_length - 1``
      and therefore targets test row ``i + seq_length``. Actual values and
      hour labels are taken from that target row.
    * Hours are identified by each row's own '시간' label rather than by
      position modulo 24, so the result does not depend on the row order
      within a day or on ``seq_length`` being a multiple of 24.
    * A new ``EarlyStopping`` callback is created for every fit so that no
      best-loss state carries over from a previously trained model.
    """
    time_columns = ['04시', '05시', '06시', '07시', '08시', '09시', '10시',
                    '11시', '12시', '13시', '14시', '15시', '16시', '17시',
                    '18시', '19시', '20시', '21시', '22시', '23시', '00시',
                    '01시', '02시', '03시']

    # Hours during which the service operates; other hours are reported as 0.
    operating_hours = ['04시', '05시', '06시', '07시', '08시', '09시', '10시',
                       '11시', '12시', '13시', '14시', '15시', '16시', '17시',
                       '18시', '19시', '20시', '21시', '22시', '23시', '00시']

    # Peak hours (morning and evening).
    peak_hours = ['17시', '18시', '19시', '07시', '08시', '09시']

    prepared_train = prepare_data(train_data)
    prepared_test = prepare_data(test_data)

    # One result list per day type; insertion order fixes the processing order.
    results_by_day_type = {'평일': [], '토요일': [], '일요일': []}
    evaluation_results = []

    all_stations = train_data['정류장순번'].unique()
    total_stations = len(all_stations)

    for idx, station_id in enumerate(all_stations, 1):
        print(f"\n정류장 처리 중... ({idx}/{total_stations})")
        station_name = train_data[train_data['정류장순번'] == station_id]['정류장명'].iloc[0]

        for day_type, day_type_results in results_by_day_type.items():
            try:
                # Select the series for this station and day type.
                train_station = _select_station_series(prepared_train, station_id, day_type)
                test_station = _select_station_series(prepared_test, station_id, day_type)

                train_values = train_station['재차인원'].values.reshape(-1, 1)
                test_values = test_station['재차인원'].values.reshape(-1, 1)

                # At least one full window plus one target is required.
                if len(train_values) <= seq_length or len(test_values) <= seq_length:
                    raise ValueError(
                        f"시퀀스 생성에 필요한 데이터가 부족합니다 (seq_length={seq_length})"
                    )

                # Build the sliding windows.
                X_train, y_train = create_sequences(train_values, seq_length)
                X_test, y_test = create_sequences(test_values, seq_length)

                # Fresh callback per model: its best-loss bookkeeping must not
                # be inherited from the model trained before this one.
                early_stopping = EarlyStopping(
                    monitor='val_loss',
                    patience=20,
                    restore_best_weights=True,
                    verbose=0
                )

                # Build and train the LSTM.
                model = create_lstm_model(seq_length)
                history = model.fit(
                    X_train, y_train,
                    epochs=100,
                    batch_size=32,
                    validation_split=0.2,
                    callbacks=[early_stopping],
                    verbose=0
                )

                plot_learning_curves(history, station_id, station_name, day_type)

                # Number of epochs actually run before early stopping.
                epochs_trained = len(history.history['loss'])
                print(f"Epochs trained: {epochs_trained}")

                predictions = model.predict(X_test, verbose=0).flatten()

                # Ground truth and hour label of the row each prediction targets.
                actual = y_test.reshape(-1)
                target_hours = test_station['시간'].iloc[seq_length:].reset_index(drop=True)

                # Mean prediction per hour label across all predicted days.
                hourly_mean = pd.Series(predictions).groupby(target_hours).mean()

                result_dict = {
                    '정류장순번': station_id,
                    '정류장명': station_name
                }
                for hour in time_columns:
                    if hour in operating_hours:
                        result_dict[hour] = hourly_mean.get(hour, np.nan)
                    else:
                        result_dict[hour] = 0
                day_type_results.append(result_dict)

                # Evaluation over the operating hours.
                operating_mask = target_hours.isin(operating_hours).to_numpy()
                operating_metrics = calculate_evaluation_metrics(
                    actual[operating_mask], predictions[operating_mask]
                )
                evaluation_results.append({
                    '정류장순번': station_id,
                    '정류장명': station_name,
                    '요일구분': day_type,
                    '구분': '전체',
                    **operating_metrics
                })

                # Evaluation over the peak hours.
                peak_mask = target_hours.isin(peak_hours).to_numpy()
                peak_metrics = calculate_evaluation_metrics(
                    actual[peak_mask], predictions[peak_mask]
                )
                evaluation_results.append({
                    '정류장순번': station_id,
                    '정류장명': station_name,
                    '요일구분': day_type,
                    '구분': '첨두',
                    **peak_metrics
                })

            except Exception as e:
                # Best effort: report the failing combination and move on.
                print(f"오류 발생 - 정류장: {station_id}, 구분: {day_type}")
                print(f"오류 내용: {str(e)}")
                continue

    # Convert the collected records to DataFrames.
    weekday_df = pd.DataFrame(results_by_day_type['평일'])
    saturday_df = pd.DataFrame(results_by_day_type['토요일'])
    sunday_df = pd.DataFrame(results_by_day_type['일요일'])
    evaluation_df = pd.DataFrame(evaluation_results)

    # Persist the results.
    weekday_df.to_csv('lstm_predictions_weekday.csv', index=False)
    saturday_df.to_csv('lstm_predictions_saturday.csv', index=False)
    sunday_df.to_csv('lstm_predictions_sunday.csv', index=False)
    evaluation_df.to_csv('lstm_evaluation.csv', index=False)

    return weekday_df, saturday_df, sunday_df, evaluation_df

In [3]:
### 블록 3: 실행 코드

# Run the full analysis and report which result files were written.
print("\nLSTM 모델 실행 중...")
weekday_df, saturday_df, sunday_df, evaluation_df = analyze_all_stations(train_data, test_data)

print("\n분석 완료!")
print("결과 파일이 저장되었습니다:")
print("\n".join(
    f"- {file_name}"
    for file_name in (
        "lstm_predictions_weekday.csv",
        "lstm_predictions_saturday.csv",
        "lstm_predictions_sunday.csv",
        "lstm_evaluation.csv",
    )
))


LSTM 모델 실행 중...

정류장 처리 중... (1/52)
Epochs trained: 22
Epochs trained: 27
Epochs trained: 36

정류장 처리 중... (2/52)
Epochs trained: 20
Epochs trained: 20
Epochs trained: 20

정류장 처리 중... (3/52)
Epochs trained: 20
Epochs trained: 20
Epochs trained: 20

정류장 처리 중... (4/52)
Epochs trained: 20
Epochs trained: 20
Epochs trained: 20

정류장 처리 중... (5/52)
Epochs trained: 20
Epochs trained: 20
Epochs trained: 20

정류장 처리 중... (6/52)
Epochs trained: 20
Epochs trained: 20
Epochs trained: 20

정류장 처리 중... (7/52)
Epochs trained: 20
Epochs trained: 20
Epochs trained: 20

정류장 처리 중... (8/52)
Epochs trained: 20
Epochs trained: 20
Epochs trained: 20

정류장 처리 중... (9/52)
Epochs trained: 20
Epochs trained: 20
Epochs trained: 20

정류장 처리 중... (10/52)
Epochs trained: 20
Epochs trained: 20
Epochs trained: 20

정류장 처리 중... (11/52)
Epochs trained: 20
Epochs trained: 20
Epochs trained: 20

정류장 처리 중... (12/52)
Epochs trained: 20
Epochs trained: 20
Epochs trained: 20

정류장 처리 중... (13/52)
Epochs trained: 20
Epochs trained: 