### GA RMSE

In [1]:
import numpy as np

def roulette_wheel_selection(population, fitness_scores):
    """
    Pick one parent by fitness-proportionate (roulette-wheel) sampling.

    Lower RMSE is better, so every score is inverted before it is turned
    into a selection probability: the smaller the RMSE, the larger the
    slice of the wheel.

    Args:
        population: Current population (array of weight vectors).
        fitness_scores: RMSE of each individual (lower is better). Any
            array-like is accepted, e.g. a plain list or a NumPy array.
    Returns:
        The selected parent, i.e. ``population[selected_index]``.
    """
    # Coerce to a float array so list input works with the arithmetic below.
    scores = np.asarray(fitness_scores, dtype=float)

    # The small epsilon keeps the inversion finite when an RMSE is exactly 0.
    inverse_fitness = 1.0 / (scores + 1e-6)
    probabilities = inverse_fitness / inverse_fitness.sum()

    selected_index = np.random.choice(len(population), p=probabilities)
    return population[selected_index]

def _ensemble_rmse(weights, model_predictions, true_values):
    """Return the RMSE between the weighted ensemble prediction and the truth."""
    # Contract the model axis (axis 1) against the weight vector.
    ensemble_prediction = np.tensordot(model_predictions, weights, axes=(1, 0))
    return np.sqrt(np.mean((true_values - ensemble_prediction) ** 2))


def genetic_algorithm_with_elitism(model_predictions, true_values, population_size=100, generations=100, mutation_rate=0.1):
    """
    Search for ensemble weights that minimise RMSE with a genetic algorithm.

    Each individual is a non-negative weight vector that sums to 1. Every
    generation produces ``population_size`` children (roulette-wheel parent
    selection, single-point crossover, optional Gaussian mutation), merges
    them with the current population and keeps the ``population_size``
    individuals with the lowest RMSE.

    Args:
        model_predictions: Model predictions, shape (n_samples, models, time_window, 2).
        true_values: Ground truth, shape (n_samples, time_window, 2).
        population_size: Number of individuals kept per generation.
        generations: Number of generations to run.
        mutation_rate: Probability that a child is mutated.
    Returns:
        The best weight vector found (first row of the final, sorted population).
    """
    num_models = model_predictions.shape[1]

    # Initial population: Dirichlet samples are non-negative and sum to 1.
    population = np.random.dirichlet(np.ones(num_models), size=population_size)

    # Fitness is deterministic, so it is evaluated once per individual and
    # carried over for survivors instead of being recomputed every generation.
    fitness_scores = np.array(
        [_ensemble_rmse(individual, model_predictions, true_values) for individual in population]
    )

    for generation in range(generations):
        new_population = []
        while len(new_population) < population_size:
            # Parent selection
            parent1 = roulette_wheel_selection(population, fitness_scores)
            parent2 = roulette_wheel_selection(population, fitness_scores)

            # Single-point crossover. With one model there is no interior cut
            # point (np.random.randint(1, 1) would raise), so copy the parent.
            if num_models > 1:
                crossover_point = np.random.randint(1, num_models)
                child = np.concatenate((parent1[:crossover_point], parent2[crossover_point:]))
            else:
                child = parent1.copy()

            # Mutation: Gaussian noise, clipped back into [0, 1]
            if np.random.rand() < mutation_rate:
                mutation_vector = np.random.normal(0, 0.1, size=num_models)
                child = np.clip(child + mutation_vector, 0, 1)

            # Normalisation. A child whose weights are all zero cannot be
            # normalised (0 / 0 would give NaN weights), so discard and redraw.
            total = np.sum(child)
            if total <= 0:
                continue
            new_population.append(child / total)

        new_population = np.array(new_population)
        new_fitness = np.array(
            [_ensemble_rmse(individual, model_predictions, true_values) for individual in new_population]
        )

        # Elitism: keep the best half of parents + children
        combined_population = np.vstack((population, new_population))
        combined_fitness = np.concatenate((fitness_scores, new_fitness))

        sorted_indices = np.argsort(combined_fitness)[:population_size]
        population = combined_population[sorted_indices]
        fitness_scores = combined_fitness[sorted_indices]

        print(f"Generation {generation + 1}/{generations}, Best RMSE: {fitness_scores[0]}")

    # The population is sorted by RMSE, so the first row is the best individual.
    best_weights = population[0]
    print(f"Optimal Weights: {best_weights}")
    return best_weights


### 한 달씩 GA를 적용해 가중치 찾기

In [9]:
import os
import numpy as np
import pandas as pd
from scipy.optimize import differential_evolution

def load_and_process_validation_data(input_directory, month, models=5, time_window=48):
    """
    Load every model's validation CSV for one month and cut it into windows.

    For model ``k`` the file
    ``<input_directory>/model<k>/val_month_<month>_model_<k>_results.csv`` is
    read. Rows are grouped into consecutive windows of ``time_window`` rows;
    trailing rows that do not fill a whole window are dropped. The true
    values are taken from model 1's file only.

    Args:
        input_directory (str): Directory containing one sub-directory per model.
        month (int): Month to process.
        models (int): Number of models (default=5).
        time_window (int): Rows per sample window (default=48).

    Returns:
        model_predictions (np.ndarray): (n_samples, models, time_window, 2)
        true_values (np.ndarray): (n_samples, time_window, 2)

    Raises:
        FileNotFoundError: If a model's result file does not exist.
    """
    def to_windows(u_column, v_column, sample_count):
        # Truncate to whole windows, then pair U and V on the last axis.
        usable = sample_count * time_window
        shape = (sample_count, time_window, 1)
        return np.concatenate(
            [u_column[:usable].reshape(shape), v_column[:usable].reshape(shape)],
            axis=2,
        )

    windows_per_model = []

    for model_idx in range(1, models + 1):
        csv_path = os.path.join(
            input_directory,
            f"model{model_idx}",
            f"val_month_{month}_model_{model_idx}_results.csv",
        )
        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"File not found: {csv_path}")

        frame = pd.read_csv(csv_path)

        # Predicted U / V components of this model
        u_pred = frame[f"Model {model_idx} Val Pred U"].values
        v_pred = frame[f"Model {model_idx} Val Pred V"].values

        # The true values are read from model 1's file only
        truth_columns = None
        if model_idx == 1:
            truth_columns = (frame["True U"].values, frame["True V"].values)

        n_samples = len(u_pred) // time_window
        windows_per_model.append(to_windows(u_pred, v_pred, n_samples))  # (n_samples, time_window, 2)

        if truth_columns is not None:
            true_values = to_windows(*truth_columns, n_samples)  # (n_samples, time_window, 2)

    # Put the model axis second: (n_samples, models, time_window, 2)
    model_predictions = np.stack(windows_per_model, axis=1)

    return model_predictions, true_values

In [10]:
def load_and_process_test_data(input_directory, month, models=5, time_window=48):
    """
    Load every model's test CSV for one month and cut it into windows.

    For model ``k`` the file
    ``<input_directory>/model<k>/test_month_<month>_model_<k>_results.csv`` is
    read. Rows are grouped into consecutive windows of ``time_window`` rows;
    trailing rows that do not fill a whole window are dropped. The true
    values are taken from model 1's file only.

    Args:
        input_directory (str): Directory containing one sub-directory per model.
        month (int): Month to process.
        models (int): Number of models (default=5).
        time_window (int): Rows per sample window (default=48).

    Returns:
        model_predictions (np.ndarray): (n_samples, models, time_window, 2)
        true_values (np.ndarray): (n_samples, time_window, 2)

    Raises:
        FileNotFoundError: If a model's result file does not exist.
    """
    def to_windows(u_column, v_column, sample_count):
        # Truncate to whole windows, then pair U and V on the last axis.
        usable = sample_count * time_window
        shape = (sample_count, time_window, 1)
        return np.concatenate(
            [u_column[:usable].reshape(shape), v_column[:usable].reshape(shape)],
            axis=2,
        )

    windows_per_model = []

    for model_idx in range(1, models + 1):
        csv_path = os.path.join(
            input_directory,
            f"model{model_idx}",
            f"test_month_{month}_model_{model_idx}_results.csv",
        )
        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"File not found: {csv_path}")

        frame = pd.read_csv(csv_path)

        # Predicted U / V components of this model
        u_pred = frame[f"Model {model_idx} Test Pred U"].values
        v_pred = frame[f"Model {model_idx} Test Pred V"].values

        # The true values are read from model 1's file only
        truth_columns = None
        if model_idx == 1:
            truth_columns = (frame["True U"].values, frame["True V"].values)

        n_samples = len(u_pred) // time_window
        windows_per_model.append(to_windows(u_pred, v_pred, n_samples))  # (n_samples, time_window, 2)

        if truth_columns is not None:
            true_values = to_windows(*truth_columns, n_samples)

    # Put the model axis second: (n_samples, models, time_window, 2)
    model_predictions = np.stack(windows_per_model, axis=1)

    return model_predictions, true_values

In [12]:
from sklearn.metrics import mean_squared_error
def calculate_ensemble_rmse(model_predictions, true_values, weights):
    """
    Compute the per-time-step and overall RMSE of a weighted ensemble.

    The RMSE of a time step is taken over all samples and both output
    components of that step. The overall value is the mean of the per-step
    RMSEs, computed from the unrounded values so that rounding error does not
    accumulate; everything is rounded to 4 decimals only on return.

    Args:
        model_predictions (np.ndarray): Model predictions (n_samples, models, time_window, 2)
        true_values (np.ndarray): Ground truth (n_samples, time_window, 2)
        weights (np.ndarray): Ensemble weights (models,)

    Returns:
        ensemble_rmse_dict (dict): RMSE per time step, keyed by time-step index
        overall_rmse (float): Mean of the per-time-step RMSEs

    Raises:
        ValueError: If there are no samples, or if ``true_values`` does not
            have the shape of the ensemble prediction.
    """
    model_predictions = np.asarray(model_predictions)
    true_values = np.asarray(true_values)
    n_samples, _n_models, _time_window, _ = model_predictions.shape
    if n_samples == 0:
        raise ValueError("model_predictions contains no samples")

    # Weighted sum over the model axis -> (n_samples, time_window, 2)
    ensemble_predictions = np.tensordot(model_predictions, weights, axes=(1, 0))

    # Fail loudly instead of letting NumPy broadcast mismatched shapes.
    if true_values.shape != ensemble_predictions.shape:
        raise ValueError(
            f"true_values has shape {true_values.shape}, "
            f"expected {ensemble_predictions.shape}"
        )

    # One RMSE per time step: average the squared error over the sample
    # axis (0) and the output axis (2).
    squared_error = (true_values - ensemble_predictions) ** 2
    step_rmse = np.sqrt(squared_error.mean(axis=(0, 2)))

    ensemble_rmse_dict = {
        time_step: round(float(rmse), 4) for time_step, rmse in enumerate(step_rmse)
    }
    overall_rmse = round(float(step_rmse.mean()), 4)

    return ensemble_rmse_dict, overall_rmse

In [13]:
# Per-month accumulators: optimised weights and (per-step RMSE dict, overall RMSE) pairs.
weights_per_month, rmse_results = [], []
# Directory that holds one "model<k>" sub-directory per model.
input_directory = "ga_s3_1211"

In [14]:
# For every month: fit GA weights on the validation split, then score them on the test split.
for month in range(1, 13):
    # Load validation data and optimise the ensemble weights
    model_predictions_val, true_values_val = load_and_process_validation_data(input_directory, month)
    weights = genetic_algorithm_with_elitism(model_predictions_val, true_values_val)
    print(f"Optimized Weights for Month {month}:", weights)
    weights_per_month.append(weights)

    # Load test data and evaluate the weighted ensemble
    model_predictions_test, true_values_test = load_and_process_test_data(input_directory, month)
    ensemble_rmse_dict, overall_rmse = calculate_ensemble_rmse(
        model_predictions_test, true_values_test, weights
    )
    print(f"Month {month} RMSE per Time Step:", ensemble_rmse_dict)
    print(f"Month {month} Overall RMSE:", overall_rmse)
    rmse_results.append((ensemble_rmse_dict, overall_rmse))

# Save the weights: one row per month, "Month" as the last column
weights_df = pd.DataFrame(
    weights_per_month,
    columns=[f"Model_{idx}_Weight" for idx in range(1, len(weights_per_month[0]) + 1)],
)
weights_df["Month"] = [f"Month_{number}" for number in range(1, 13)]
weights_df.to_csv('weights_per_month.csv', index=False)

# Save the RMSE results: Month, Time_<step>..., Overall_RMSE
rmse_df = pd.DataFrame(
    [
        dict(
            [("Month", f"Month_{number}")]
            + [(f"Time_{step}", value) for step, value in step_rmse.items()]
            + [("Overall_RMSE", month_overall)]
        )
        for number, (step_rmse, month_overall) in enumerate(rmse_results, start=1)
    ]
)
rmse_df.to_csv('rmse_results.csv', index=False)

print("All monthly weights and RMSE results saved to CSV files.")


Generation 1/100, Best RMSE: 2.2460976640703136
Generation 2/100, Best RMSE: 2.2460976640703136
Generation 3/100, Best RMSE: 2.245675377745121
Generation 4/100, Best RMSE: 2.245269842802991
Generation 5/100, Best RMSE: 2.245269842802991
Generation 6/100, Best RMSE: 2.245213901692857
Generation 7/100, Best RMSE: 2.2448820829134624
Generation 8/100, Best RMSE: 2.2448820829134624
Generation 9/100, Best RMSE: 2.243940128628167
Generation 10/100, Best RMSE: 2.243940128628167
Generation 11/100, Best RMSE: 2.243940128628167
Generation 12/100, Best RMSE: 2.243877896348772
Generation 13/100, Best RMSE: 2.2438568018918557
Generation 14/100, Best RMSE: 2.243838567645796
Generation 15/100, Best RMSE: 2.243801350418525
Generation 16/100, Best RMSE: 2.243801350418525
Generation 17/100, Best RMSE: 2.243801350418525
Generation 18/100, Best RMSE: 2.243801350418525
Generation 19/100, Best RMSE: 2.243796386802421
Generation 20/100, Best RMSE: 2.2437963633655458
Generation 21/100, Best RMSE: 2.24379616248

### Uniform RMSE

In [15]:
def calculate_uniform_ensemble_rmse(model_predictions, true_values):
    """
    Compute the ensemble RMSE with equal weights for every model.

    This is the weighted-ensemble evaluation with each weight set to
    ``1 / n_models`` (0.2 for five models), so it delegates to
    ``calculate_ensemble_rmse`` rather than repeating the per-step RMSE loop.

    Args:
        model_predictions (np.ndarray): Model predictions (n_samples, models, time_window, 2)
        true_values (np.ndarray): Ground truth (n_samples, time_window, 2)

    Returns:
        ensemble_rmse_dict (dict): RMSE per time step
        overall_rmse (float): Mean of the per-time-step RMSEs
    """
    n_models = model_predictions.shape[1]

    # Equal weight for every model, e.g. [0.2, 0.2, 0.2, 0.2, 0.2]
    uniform_weights = np.full(n_models, 1 / n_models)

    return calculate_ensemble_rmse(model_predictions, true_values, uniform_weights)


In [16]:
# Evaluate the equal-weight ensemble on every month's test data.
input_directory = "ga_s3_1211"
uniform_rmse_results = []

for month in range(1, 13):
    # Load this month's test predictions and true values
    model_predictions, true_values = load_and_process_test_data(input_directory, month)

    # RMSE of the uniform ensemble
    uniform_rmse_dict, overall_uniform_rmse = calculate_uniform_ensemble_rmse(
        model_predictions, true_values
    )

    # One result row per month: Month, Time_<step>..., Overall_RMSE
    row = {"Month": f"Month_{month}"}
    row.update((f"Time_{step}", value) for step, value in uniform_rmse_dict.items())
    row["Overall_RMSE"] = overall_uniform_rmse
    uniform_rmse_results.append(row)

    print(f"Month {month} Uniform Ensemble RMSE:", uniform_rmse_dict)
    print(f"Month {month} Overall Uniform RMSE: {overall_uniform_rmse}")

# Save the uniform RMSE results as CSV
uniform_rmse_df = pd.DataFrame(uniform_rmse_results)
uniform_rmse_df.to_csv("uniform_rmse_results.csv", index=False)
print("Uniform RMSE 결과가 uniform_rmse_results.csv에 저장되었습니다.")

Month 1 Uniform Ensemble RMSE: {0: 1.3568, 1: 1.5291, 2: 1.839, 3: 1.9232, 4: 1.7602, 5: 1.8073, 6: 1.8303, 7: 1.9827, 8: 1.8948, 9: 1.9772, 10: 2.0431, 11: 2.05, 12: 1.8393, 13: 2.0254, 14: 2.3006, 15: 1.7512, 16: 1.5818, 17: 1.9206, 18: 1.8047, 19: 1.9474, 20: 2.0783, 21: 1.7997, 22: 1.7628, 23: 1.7257, 24: 1.7783, 25: 1.8349, 26: 2.1735, 27: 2.0437, 28: 1.9757, 29: 1.8247, 30: 1.8613, 31: 1.9959, 32: 2.0315, 33: 2.1871, 34: 2.3082, 35: 2.3615, 36: 2.2142, 37: 2.2777, 38: 2.5443, 39: 2.1045, 40: 1.8867, 41: 2.1583, 42: 2.1427, 43: 2.3895, 44: 2.3272, 45: 2.1278, 46: 2.0978, 47: 2.3519}
Month 1 Overall Uniform RMSE: 1.9902
Month 2 Uniform Ensemble RMSE: {0: 1.2072, 1: 1.1723, 2: 1.4427, 3: 1.2589, 4: 1.3412, 5: 1.5198, 6: 1.4521, 7: 1.4345, 8: 1.4871, 9: 1.7484, 10: 2.0442, 11: 2.573, 12: 2.6423, 13: 2.412, 14: 2.3664, 15: 2.5534, 16: 2.664, 17: 2.4691, 18: 2.5296, 19: 2.5834, 20: 2.7808, 21: 2.5075, 22: 2.2321, 23: 2.1948, 24: 1.9467, 25: 1.7848, 26: 1.8433, 27: 1.6932, 28: 1.6095, 2

In [21]:
# Mean RMSE of each time step across the months, and the mean over all time steps.
# The per-step columns are selected by name ("Time_*") instead of by position,
# so the result does not depend on where "Month" and "Overall_RMSE" sit in the frame.
uniform_time_columns = [col for col in uniform_rmse_df.columns if col.startswith("Time_")]
average_time_rmse = uniform_rmse_df[uniform_time_columns].mean(axis=0)
overall_average_rmse = average_time_rmse.mean()

# Print the results
print("시간별 RMSE의 월별 평균:")
print(average_time_rmse)
print(f"Uniform 전체 48시간 평균 RMSE: {overall_average_rmse}")

# Optional: save the results to disk
# average_time_rmse.to_csv("average_time_rmse.csv", header=["Average_RMSE"], index_label="Time_Step")
# with open("overall_average_rmse.txt", "w") as f:
#     f.write(f"Overall Average RMSE: {overall_average_rmse}\n")

시간별 RMSE의 월별 평균:
Time_0     1.469183
Time_1     1.628275
Time_2     1.666442
Time_3     1.706700
Time_4     1.641383
Time_5     1.686233
Time_6     1.702158
Time_7     1.851858
Time_8     1.867183
Time_9     1.903542
Time_10    1.974325
Time_11    2.038483
Time_12    1.995117
Time_13    2.005942
Time_14    2.068108
Time_15    2.036292
Time_16    2.038483
Time_17    2.137967
Time_18    2.225258
Time_19    2.466392
Time_20    2.467267
Time_21    2.315558
Time_22    2.127433
Time_23    2.086967
Time_24    2.123892
Time_25    2.162342
Time_26    2.044525
Time_27    2.040650
Time_28    2.046858
Time_29    2.025358
Time_30    2.007358
Time_31    2.117850
Time_32    2.206475
Time_33    2.286983
Time_34    2.365383
Time_35    2.387767
Time_36    2.314025
Time_37    2.254542
Time_38    2.415800
Time_39    2.464025
Time_40    2.371158
Time_41    2.414867
Time_42    2.560967
Time_43    2.759608
Time_44    2.758533
Time_45    2.658983
Time_46    2.598683
Time_47    2.685617
dtype: float64
Uniform 

---

### 12개월 평균 가중치로 최종 가중치 구하기

In [17]:
import pandas as pd

# Load the per-month weights
weights_df = pd.read_csv("weights_per_month.csv")

# Average each model's weight over the months. The "Month" label column is
# dropped by name rather than by assuming it is the last column.
average_weights = weights_df.drop(columns="Month").mean(axis=0)

# Print the result
print("12개월 동안의 평균 가중치:")
print(average_weights)

# Keep the averages as a two-column data frame
average_weights_df = average_weights.reset_index()
average_weights_df.columns = ["Model", "Average Weight"]

# Optional: save the result as CSV
# average_weights_df.to_csv("average_weights.csv", index=False)


12개월 동안의 평균 가중치:
Model_1_Weight    0.165995
Model_2_Weight    0.712146
Model_3_Weight    0.034547
Model_4_Weight    0.060618
Model_5_Weight    0.026694
dtype: float64


### 위의 평균 가중치를 적용해 최종 RMSE 계산

In [25]:
# Evaluate the 12-month average weights on every month's test data.
# NOTE: these values are the printed output of the average-weights cell, copied
# by hand (6 decimals); re-copy them if the monthly weights are regenerated.
average_weights = [0.165995, 0.712146, 0.034547, 0.060618, 0.026694]
final_results = []
input_directory = "ga_s3_1211"

for month in range(1, 13):
    # Load this month's test predictions and true values
    model_predictions, true_values = load_and_process_test_data(input_directory, month)

    # Reuse the shared evaluation instead of repeating the per-step RMSE loop
    step_rmse, overall_rmse = calculate_ensemble_rmse(model_predictions, true_values, average_weights)

    # Re-key the per-step values as "Time_<step>" for the result table
    ensemble_rmse_dict = {f"Time_{time_step}": rmse for time_step, rmse in step_rmse.items()}

    # Store one row per month
    final_results.append({
        "Month": f"Month_{month}",
        **ensemble_rmse_dict,
        "Overall_RMSE": overall_rmse
    })

    print(f"Month {month} 시간별 RMSE:", ensemble_rmse_dict)
    print(f"Month {month} Overall RMSE: {overall_rmse}")

# Save the results as CSV
final_results_df = pd.DataFrame(final_results)
final_results_df.to_csv("final_rmse_results.csv", index=False)
print("최종 RMSE 결과가 final_rmse_results.csv에 저장되었습니다.")

# Mean RMSE of each time step across the months, and the mean over all time steps
time_columns = [col for col in final_results_df.columns if col.startswith("Time_")]
time_rmse_avg = final_results_df[time_columns].mean(axis=0)
overall_avg_rmse = time_rmse_avg.mean()

# Print the results
print("시간별 RMSE 평균:")
print(time_rmse_avg)
print(f"GA 적용 전체 평균 RMSE: {overall_avg_rmse}")

# Optional: save the results to disk
# time_rmse_avg.to_csv("average_time_rmse.csv", header=["Average RMSE"], index_label="Time")
# with open("overall_average_rmse.txt", "w") as f:
#     f.write(f"Overall Average RMSE: {overall_avg_rmse}\n")

# print("시간별 평균 RMSE가 average_time_rmse.csv에 저장되었습니다.")
# print("전체 평균 RMSE가 overall_average_rmse.txt에 저장되었습니다.")


Month 1 시간별 RMSE: {'Time_0': 1.3884, 'Time_1': 1.5013, 'Time_2': 1.7896, 'Time_3': 1.869, 'Time_4': 1.6338, 'Time_5': 1.7531, 'Time_6': 1.6927, 'Time_7': 1.9069, 'Time_8': 1.7507, 'Time_9': 1.8117, 'Time_10': 1.9341, 'Time_11': 1.8618, 'Time_12': 1.721, 'Time_13': 1.8538, 'Time_14': 2.0994, 'Time_15': 1.5897, 'Time_16': 1.6011, 'Time_17': 1.9977, 'Time_18': 1.8516, 'Time_19': 1.9691, 'Time_20': 2.0956, 'Time_21': 1.8409, 'Time_22': 1.8128, 'Time_23': 1.7428, 'Time_24': 1.8365, 'Time_25': 1.8461, 'Time_26': 2.1833, 'Time_27': 2.0136, 'Time_28': 1.8267, 'Time_29': 1.7566, 'Time_30': 1.7296, 'Time_31': 1.8499, 'Time_32': 1.8368, 'Time_33': 1.9711, 'Time_34': 2.2142, 'Time_35': 2.216, 'Time_36': 2.15, 'Time_37': 2.2299, 'Time_38': 2.451, 'Time_39': 2.0191, 'Time_40': 1.7944, 'Time_41': 2.0676, 'Time_42': 2.0355, 'Time_43': 2.2335, 'Time_44': 2.1139, 'Time_45': 1.9002, 'Time_46': 1.8999, 'Time_47': 2.1363}
Month 1 Overall RMSE: 1.9038
Month 2 시간별 RMSE: {'Time_0': 1.0983, 'Time_1': 1.0381, '