# Investigate optimal number of areas

Result files follow the naming pattern `../results/opt_num_areas/pca_kmeans_{i}.csv`, where `{i}` is the run index.

In [None]:
from plotter import Plotter

# Results directory, given relative to the notebook's working directory instead
# of an absolute, user-specific path, so the notebook also runs on other
# machines. This is the same relative location this notebook passes to
# `update_arima_rows`.
# NOTE(review): assumes the kernel is started from a sibling of `notebooks/`
# (or from `notebooks/` itself) — confirm.
RESULTS_DIR = '../notebooks/results/opt_num_areas'
plotter = Plotter(RESULTS_DIR)

In [None]:
# Load the 'pca_kmeans' results through the Plotter helper with interpolation
# switched off.
# NOTE(review): presumably this collects every `pca_kmeans*` CSV from the
# plotter's directory into one table — confirm against Plotter.load_csv_files.
data = plotter.load_csv_files('pca_kmeans', interpolate=False)
# Show how many entries were loaded.
len(data)

195

In [26]:
# Display the loaded results for a visual sanity check.
data

Unnamed: 0,feature_group,model,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,elapsed_time_mean,elapsed_time_std,index
0,all_areas,BASELINE,320.120884,56.789735,243.175919,39.282158,3.191535e+15,4.508538e+15,-1.670175e-01,3.767701e-01,0.013662,0.009004,1
1,all_areas,LGBM,266.684386,52.513860,208.489219,42.696733,1.455611e+16,1.803161e+16,1.891980e-01,2.734375e-01,76.203726,2.017945,1
2,all_areas,ARIMA,242.967711,50.983298,209.283589,28.461245,7.577771e+18,1.069418e+19,-3.483876e+06,4.926945e+06,44.303931,6.448263,1
3,all_areas,NEURALPROPHET,262.464377,63.922764,214.328216,52.986464,9.998825e+15,1.175260e+16,1.714782e-01,4.594503e-01,124.600465,4.481718,1
4,all_areas,TimeGPT,354.507352,79.714232,293.836998,78.829859,2.732731e+16,2.945914e+16,-5.026382e-01,7.628535e-01,5.258429,1.017437,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,all_areas,BASELINE,320.120884,56.789735,243.175919,39.282158,3.191535e+15,4.508538e+15,-1.670175e-01,3.767701e-01,0.022135,0.001461,39
191,all_areas,LGBM,225.192434,41.785158,175.087811,29.523573,8.333185e+15,9.248635e+15,4.202197e-01,1.973280e-01,88.079636,0.419915,39
192,all_areas,ARIMA,222.357400,55.044662,175.234092,37.889352,1.483785e+16,1.527420e+16,4.573769e-01,2.018354e-01,417.336340,47.374562,39
193,all_areas,NEURALPROPHET,328.620791,39.619133,264.546651,34.769499,1.004428e+16,1.197010e+16,-2.068373e-01,2.565842e-01,212.069456,1.650720,39


In [30]:
# RMSE plot for the area-count investigation. BASELINE and TimeGPT are passed
# as models to skip, the y-range is limited to [0, 450], and `export_path`
# points at a PDF under ./opt_num_areas.
plotter.plot_metrics(
    'RMSE',
    'Optimal number of areas included',
    y_lim=[0, 450],
    models_to_skip=['BASELINE', 'TimeGPT'],
    export_path='./opt_num_areas/RMSE_plot.pdf',
)

In [27]:
import pandas as pd
import numpy as np
import glob
import os
import re

def update_arima_rows(directory, pattern_prefix, interpolated_df):
    """Insert or refresh the ARIMA / ``all_areas`` row of each matching result CSV.

    Every file matching ``<directory>/<pattern_prefix>*.csv`` is read, patched
    and written back to the same path, i.e. the files are modified in place.
    The run index of a file is the first ``_<digits>`` group in its file name.
    The metric values are taken from the first row of ``interpolated_df`` whose
    ``model`` is ``'ARIMA'``, whose ``feature_group`` is ``'all_areas'`` and
    whose ``index`` equals that run index. Files without a parsable index, or
    without such a row in ``interpolated_df``, are skipped and left untouched.

    After patching, rows are ordered BASELINE, LGBM, ARIMA, NEURALPROPHET,
    TimeGPT. Rows of any other model keep their name and are placed after the
    known models; rows sharing a model keep their original relative order.

    Parameters
    ----------
    directory : str
        Directory that holds the CSV files.
    pattern_prefix : str
        File-name prefix; ``'*.csv'`` is appended to build the glob pattern.
    interpolated_df : pandas.DataFrame
        Source of the ARIMA metrics. Must contain the columns ``model``,
        ``feature_group`` and ``index``. Metric columns missing from it are
        written as NaN when a new row is added, but raise ``KeyError`` when
        an existing row is updated.

    Returns
    -------
    None
        Progress is reported with ``print``; results are written to disk.
    """
    file_pattern = os.path.join(directory, pattern_prefix + '*.csv')
    # Sorted so files are processed (and logged) in a deterministic order.
    files = sorted(glob.glob(file_pattern))

    print(f"Found {len(files)} files matching pattern '{pattern_prefix}*.csv'")

    metric_cols = [
        'RMSE_mean', 'RMSE_std',
        'MAE_mean', 'MAE_std',
        'MAPE_mean', 'MAPE_std',
        'R2_mean', 'R2_std',
        'elapsed_time_mean', 'elapsed_time_std'
    ]

    model_order = ['BASELINE', 'LGBM', 'ARIMA', 'NEURALPROPHET', 'TimeGPT']
    # Sort rank of every known model; unknown models are ranked after them.
    model_rank = {name: pos for pos, name in enumerate(model_order)}

    for file in files:
        basename = os.path.basename(file)
        # The first "_<digits>" group in the file name is taken as the run index.
        match = re.search(r'_(\d+)', basename)

        if not match:
            print(f"Could not extract index from {basename}. Skipping.")
            continue

        index_value = int(match.group(1))
        print(f"Processing {basename} with index {index_value}")

        df_csv = pd.read_csv(file)
        # Remembered so the saved file keeps exactly the original columns.
        column_order = df_csv.columns.tolist()

        # Extract the correct ARIMA row from interpolation
        arima_row = interpolated_df[
            (interpolated_df['model'] == 'ARIMA') &
            (interpolated_df['feature_group'] == 'all_areas') &
            (interpolated_df['index'].astype(int) == index_value)
        ]

        if arima_row.empty:
            print(f"No ARIMA data for index {index_value}. Skipping.")
            continue

        # If several rows match, the first one wins.
        arima_row = arima_row.iloc[0]

        arima_idx = df_csv[
            (df_csv['model'] == 'ARIMA') &
            (df_csv['feature_group'] == 'all_areas')
        ].index

        if arima_idx.empty:
            print(f"No ARIMA row found in {basename}. Adding new row.")

            new_row = {col: arima_row.get(col, np.nan) for col in metric_cols}
            new_row.update({
                'model': 'ARIMA',
                'feature_group': 'all_areas'
            })

            # Fill in any other missing columns from the original file
            for col in column_order:
                if col not in new_row:
                    new_row[col] = np.nan

            new_row_df = pd.DataFrame([new_row])[column_order]
            df_csv = pd.concat([df_csv, new_row_df], ignore_index=True)

        else:
            print(f"Updating existing ARIMA row in {basename}.")
            for col in metric_cols:
                df_csv.loc[arima_idx, col] = arima_row[col]

        # Reorder rows by desired model order. A sort key is used instead of
        # converting the column to a Categorical: that conversion would turn
        # every model name outside `model_order` into NaN, and the name would
        # be lost when the file is saved. The stable sort keeps the original
        # relative order of rows that share a model.
        df_csv = df_csv.sort_values(
            'model',
            key=lambda col: col.map(model_rank).fillna(len(model_order)),
            kind='stable',
        ).reset_index(drop=True)

        # Ensure column order is preserved
        df_csv = df_csv[column_order]

        df_csv.to_csv(file, index=False)
        print(f"Saved updated file: {basename}\n")


In [29]:
# Patch the ARIMA rows of every `pearson*.csv` result file in place.
# NOTE(review): `data` was loaded from the 'pca_kmeans' files with
# interpolate=False, yet it is used here as `interpolated_df` to patch the
# 'pearson' files — confirm this is the intended source.
update_arima_rows(
    directory='../notebooks/results/opt_num_areas',
    pattern_prefix='pearson',
    interpolated_df=data,
)

Found 39 files matching pattern 'pearson*.csv'
Processing pearson_10_20250516_015540.csv with index 10
No ARIMA row found in pearson_10_20250516_015540.csv. Adding new row.
Saved updated file: pearson_10_20250516_015540.csv

Processing pearson_11_20250516_020748.csv with index 11
No ARIMA row found in pearson_11_20250516_020748.csv. Adding new row.
Saved updated file: pearson_11_20250516_020748.csv

Processing pearson_12_20250516_022027.csv with index 12
No ARIMA row found in pearson_12_20250516_022027.csv. Adding new row.
Saved updated file: pearson_12_20250516_022027.csv

Processing pearson_13_20250516_023244.csv with index 13
No ARIMA row found in pearson_13_20250516_023244.csv. Adding new row.
Saved updated file: pearson_13_20250516_023244.csv

Processing pearson_14_20250516_024509.csv with index 14
No ARIMA row found in pearson_14_20250516_024509.csv. Adding new row.
Saved updated file: pearson_14_20250516_024509.csv

Processing pearson_15_20250516_025738.csv with index 15
No ARIM