In [10]:
# Mount Google Drive at /content/drive (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
cd '/content/drive/MyDrive/TFG-AnalisisResultados'

/content/drive/MyDrive/TFG-AnalisisResultados


In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob

# LD data stats (across all runs)

In [13]:
# Load the combined logs stored in LD_SGP_combined_logs.parquet.
df = pd.read_parquet('./LD_SGP_combined_logs.parquet')

# One group per run: a run is identified by its (acquisition, seed) pair.
grouped = df.groupby(by=['acquisition', 'seed'])

# Accumulator for the per-run summary rows (filled by the loop in the next cell).
summary_records = list()


In [14]:

# Build one summary row per run, i.e. per (acquisition, seed) group.
# The accumulator is (re)created here so that re-running this cell on its own
# does not append a second copy of every row to the table.
summary_records = []

for (acq, seed), run_df in grouped:
    run_df = run_df.sort_values('iteration')
    last_iter = run_df['iteration'].max()

    # Row(s) logged at the last iteration of this run; the first one supplies
    # the final hypervolume.
    final_rows = run_df.loc[run_df['iteration'] == last_iter]

    summary_records.append({
        "acquisition": acq,
        "seed": seed,
        # Largest iteration index seen in the run.
        # NOTE(review): an earlier comment stated that 'iteration' is
        # 0-indexed; if so, the number of iterations is last_iter + 1 - confirm.
        "n_iterations": last_iter,
        "final_hv": final_rows['hypervolume'].values[0],
        "hv_mean": run_df['hypervolume'].mean(),
        "hv_std": run_df['hypervolume'].std(),
        "hv_min": run_df['hypervolume'].min(),
        "hv_max": run_df['hypervolume'].max(),
        "time_mean": run_df['time_sec'].mean(),
        "time_std": run_df['time_sec'].std(),
        "cumulative_time": run_df['cumulative_time_sec'].max()
        # You could also include analysis of candidate_norms here if desired
    })

# One row per run, in groupby order.
per_run_summary = pd.DataFrame(summary_records)



In [15]:
# Show the per-run table built in the previous cell: one row per (acquisition, seed) run.
per_run_summary

Unnamed: 0,acquisition,seed,n_iterations,final_hv,hv_mean,hv_std,hv_min,hv_max,time_mean,time_std,cumulative_time
0,Sobol,0,19,139.496081,138.578774,0.9711674,137.309983,139.496081,15.257263,0.978898,289.888
1,Sobol,1,17,133.400483,131.507436,4.065773,120.852504,133.400483,14.415294,1.229921,245.06
2,Sobol,2,8,133.773161,132.031679,3.224604,126.795874,133.773161,13.49425,1.728283,107.954
3,Sobol,3,21,134.908709,131.600681,4.376753,124.811543,134.908709,18.252,1.878074,383.292
4,Sobol,4,16,139.540562,137.152746,2.567352,133.942776,139.540562,13.999313,0.890437,223.989
5,qParEGO,0,6,138.747143,138.747143,3.113442e-14,138.747143,138.747143,12.838833,8.0293,77.033
6,qParEGO,1,13,134.166199,133.651981,0.4969097,133.159397,134.166199,19.291846,5.590101,250.794
7,qParEGO,2,10,137.832878,137.677362,0.2866626,137.132738,137.832878,15.0462,6.137377,150.462
8,qParEGO,3,12,136.493243,136.300198,0.2665395,135.786782,136.493243,20.444667,6.644832,245.336
9,qParEGO,4,19,140.819905,140.317694,0.5760663,138.737717,140.819905,18.596474,5.190724,353.333


In [16]:
from scipy.spatial.distance import cdist
import numpy as np

def spacing_metric(pareto):
    """Measure how unevenly the points of a Pareto front are spaced.

    The front is read from ``pareto[0][0]``; that element must provide
    ``.tolist()`` and convert to a float array with one point per row.
    For every point the Euclidean distance to its closest *other* point is
    taken, and the function returns the root-mean-square deviation of those
    nearest-neighbour distances from their mean (0 means perfectly even
    spacing).

    Returns ``np.nan`` when the front holds fewer than two points, because
    no nearest neighbour exists in that case.
    """
    points = np.array(pareto[0][0].tolist(), dtype=float)
    n_points = len(points)
    if n_points < 2:
        return np.nan

    pairwise = cdist(points, points)

    # A point must not count as its own nearest neighbour: replace the
    # zero self-distances on the diagonal by +inf before taking row minima.
    is_self = np.eye(n_points, dtype=bool)
    nearest = np.where(is_self, np.inf, pairwise).min(axis=1)

    deviation = nearest - nearest.mean()
    return np.sqrt(np.mean(deviation ** 2))


In [17]:
# Summarise every LD dataset: per-run statistics first, then statistics across
# runs for each acquisition function, written to
# '<prefix>_acquisition_summary.csv' next to the input file.
#
# Input and output paths are both built from an explicit dataset prefix, so
# the output name no longer depends on how many underscores the path contains.
LOG_SUFFIX = '_combined_logs.parquet'
SUMMARY_SUFFIX = '_acquisition_summary.csv'

# Statistics computed across runs for each acquisition function
# (identical for every dataset, hence defined once outside the loop).
AGG_SPEC = {
    'n_iterations': ['mean', 'std', 'min', 'max'],
    'pareto_spacing': ['mean', 'std', 'min', 'max'],
    'final_hv': ['mean', 'std', 'min', 'max'],
    'hv_mean': ['mean', 'std'],
    'hv_std': ['mean', 'std'],
    'hv_min': ['mean'],
    'hv_max': ['mean'],
    'time_mean': ['mean', 'std'],
    'time_std': ['mean', 'std'],
    'cumulative_time': ['mean', 'std', 'min', 'max']
}

for name in ['./LD_SGP', './LD_MGP', './LD_MGPR1']:
    df_path = name + LOG_SUFFIX
    df = pd.read_parquet(df_path)
    summary_records = []

    # One group per run: a run is identified by its (acquisition, seed) pair.
    grouped = df.groupby(['acquisition', 'seed'])

    for (acq, seed), run_df in grouped:
        run_df = run_df.sort_values('iteration')
        last_iter = run_df['iteration'].max()

        # Row(s) logged at the last iteration of this run.
        final_front = run_df.loc[run_df['iteration'] == last_iter]

        # spacing_metric reads element [0][0] of its argument, i.e. the
        # 'pareto_front' value stored in the first final-iteration row.
        objectives = np.stack(final_front[['pareto_front']].values)
        spacing = spacing_metric(objectives)

        summary_records.append({
            "acquisition": acq,
            "seed": seed,
            # Largest iteration index seen in the run.
            # NOTE(review): an earlier comment stated that 'iteration' is
            # 0-indexed; if so, the iteration count is last_iter + 1 - confirm.
            "n_iterations": last_iter,
            "pareto_spacing": spacing,
            "final_hv": final_front['hypervolume'].values[0],
            "hv_mean": run_df['hypervolume'].mean(),
            "hv_std": run_df['hypervolume'].std(),
            "hv_min": run_df['hypervolume'].min(),
            "hv_max": run_df['hypervolume'].max(),
            "time_mean": run_df['time_sec'].mean(),
            "time_std": run_df['time_sec'].std(),
            "cumulative_time": run_df['cumulative_time_sec'].max()
            # You could also include analysis of candidate_norms here if desired
        })

    per_run_summary = pd.DataFrame(summary_records)

    # Aggregate the per-run rows for each acquisition function.
    final_summary = per_run_summary.groupby('acquisition').agg(AGG_SPEC).reset_index()

    # Flatten the two-level column index into '<column>_<stat>'; the strip
    # turns the ('acquisition', '') pair back into plain 'acquisition'.
    final_summary.columns = ['_'.join(col).strip('_') for col in final_summary.columns]

    # Save the summary for this dataset.
    final_summary.to_csv(name + SUMMARY_SUFFIX, index=False)

# After the loop, per_run_summary and final_summary refer to the last dataset only.

In [18]:
# final_summary is reassigned on every pass of the loop in the previous cell,
# so this displays the summary of the last file processed
# ('./LD_MGPR1_combined_logs.parquet') only.
final_summary

Unnamed: 0,acquisition,n_iterations_mean,n_iterations_std,n_iterations_min,n_iterations_max,pareto_spacing_mean,pareto_spacing_std,pareto_spacing_min,pareto_spacing_max,final_hv_mean,...,hv_min_mean,hv_max_mean,time_mean_mean,time_mean_std,time_std_mean,time_std_std,cumulative_time_mean,cumulative_time_std,cumulative_time_min,cumulative_time_max
0,Sobol,16.2,4.969909,8,21,22.577237,13.319037,14.09397,45.950889,136.223799,...,128.742536,136.223799,15.23104,2.14093,1.321552,0.341913,248.1924,82.169988,108.614,319.772
1,qParEGO,10.4,2.302173,8,14,12.414136,8.456425,0.471404,24.041609,137.233272,...,133.71218,137.233272,19.957684,1.193345,5.810661,1.191422,208.4728,52.953394,160.819,289.459
2,qQEHVI,13.8,4.38178,9,17,20.278483,17.615632,1.000018,39.335098,136.950289,...,133.792692,136.950289,22.084482,3.652946,5.581979,1.314068,297.2634,81.705592,194.14,404.306
3,qQNEHVI,16.0,14.053469,8,41,26.190703,16.101168,8.697943,51.777985,136.866626,...,136.1574,136.866626,18.893887,2.162497,5.125518,0.925731,325.8758,338.838144,142.353,930.353


# LD NEURO data stats (across all runs)

In [19]:
# Summarise every LD_NEURO dataset: per-run statistics first, then statistics
# across runs for each acquisition function, written to
# '<prefix>_acquisition_summary.csv' next to the input file.
#
# Input and output paths are both built from an explicit dataset prefix, so
# the output name no longer depends on how many underscores the path contains
# (the previous version needed a hand-tuned slice length for these files).
LOG_SUFFIX = '_combined_logs.parquet'
SUMMARY_SUFFIX = '_acquisition_summary.csv'

# Statistics computed across runs for each acquisition function
# (identical for every dataset, hence defined once outside the loop).
AGG_SPEC = {
    'n_iterations': ['mean', 'std', 'min', 'max'],
    'pareto_spacing': ['mean', 'std', 'min', 'max'],
    'final_hv': ['mean', 'std', 'min', 'max'],
    'hv_mean': ['mean', 'std'],
    'hv_std': ['mean', 'std'],
    'hv_min': ['mean'],
    'hv_max': ['mean'],
    'time_mean': ['mean', 'std'],
    'time_std': ['mean', 'std'],
    'cumulative_time': ['mean', 'std', 'min', 'max']
}

for name in ['./LD_NEURO_SGP', './LD_NEURO_MGP', './LD_NEURO_MGPR1']:
    df_path = name + LOG_SUFFIX
    df = pd.read_parquet(df_path)
    summary_records = []

    # One group per run: a run is identified by its (acquisition, seed) pair.
    grouped = df.groupby(['acquisition', 'seed'])

    for (acq, seed), run_df in grouped:
        run_df = run_df.sort_values('iteration')
        last_iter = run_df['iteration'].max()

        # Row(s) logged at the last iteration of this run.
        final_front = run_df.loc[run_df['iteration'] == last_iter]

        # spacing_metric reads element [0][0] of its argument, i.e. the
        # 'pareto_front' value stored in the first final-iteration row.
        objectives = np.stack(final_front[['pareto_front']].values)
        spacing = spacing_metric(objectives)

        summary_records.append({
            "acquisition": acq,
            "seed": seed,
            # Largest iteration index seen in the run.
            # NOTE(review): an earlier comment stated that 'iteration' is
            # 0-indexed; if so, the iteration count is last_iter + 1 - confirm.
            "n_iterations": last_iter,
            "pareto_spacing": spacing,
            "final_hv": final_front['hypervolume'].values[0],
            "hv_mean": run_df['hypervolume'].mean(),
            "hv_std": run_df['hypervolume'].std(),
            "hv_min": run_df['hypervolume'].min(),
            "hv_max": run_df['hypervolume'].max(),
            "time_mean": run_df['time_sec'].mean(),
            "time_std": run_df['time_sec'].std(),
            "cumulative_time": run_df['cumulative_time_sec'].max()
            # You could also include analysis of candidate_norms here if desired
        })

    per_run_summary = pd.DataFrame(summary_records)

    # Aggregate the per-run rows for each acquisition function.
    final_summary = per_run_summary.groupby('acquisition').agg(AGG_SPEC).reset_index()

    # Flatten the two-level column index into '<column>_<stat>'; the strip
    # turns the ('acquisition', '') pair back into plain 'acquisition'.
    final_summary.columns = ['_'.join(col).strip('_') for col in final_summary.columns]

    # Save the summary for this dataset.
    final_summary.to_csv(name + SUMMARY_SUFFIX, index=False)

# After the loop, per_run_summary and final_summary refer to the last dataset only.

# HD data stats (across all runs)

In [20]:
# Summarise every HD dataset: per-run statistics first, then statistics across
# runs for each acquisition function, written to
# '<prefix>_acquisition_summary.csv' next to the input file.
#
# Input and output paths are both built from an explicit dataset prefix, so
# the output name no longer depends on how many underscores the path contains.
LOG_SUFFIX = '_combined_logs.parquet'
SUMMARY_SUFFIX = '_acquisition_summary.csv'

# Statistics computed across runs for each acquisition function
# (identical for every dataset, hence defined once outside the loop).
AGG_SPEC = {
    'n_iterations': ['mean', 'std', 'min', 'max'],
    'pareto_spacing': ['mean', 'std', 'min', 'max'],
    'final_hv': ['mean', 'std', 'min', 'max'],
    'hv_mean': ['mean', 'std'],
    'hv_std': ['mean', 'std'],
    'hv_min': ['mean'],
    'hv_max': ['mean'],
    'time_mean': ['mean', 'std'],
    'time_std': ['mean', 'std'],
    'cumulative_time': ['mean', 'std', 'min', 'max']
}

for name in ['./HD_SGP', './HD_MGP', './HD_MGPR1']:
    df_path = name + LOG_SUFFIX
    df = pd.read_parquet(df_path)
    summary_records = []

    # One group per run: a run is identified by its (acquisition, seed) pair.
    grouped = df.groupby(['acquisition', 'seed'])

    for (acq, seed), run_df in grouped:
        run_df = run_df.sort_values('iteration')
        last_iter = run_df['iteration'].max()

        # Row(s) logged at the last iteration of this run.
        final_front = run_df.loc[run_df['iteration'] == last_iter]

        # spacing_metric reads element [0][0] of its argument, i.e. the
        # 'pareto_front' value stored in the first final-iteration row.
        objectives = np.stack(final_front[['pareto_front']].values)
        spacing = spacing_metric(objectives)

        summary_records.append({
            "acquisition": acq,
            "seed": seed,
            # Largest iteration index seen in the run.
            # NOTE(review): an earlier comment stated that 'iteration' is
            # 0-indexed; if so, the iteration count is last_iter + 1 - confirm.
            "n_iterations": last_iter,
            "pareto_spacing": spacing,
            "final_hv": final_front['hypervolume'].values[0],
            "hv_mean": run_df['hypervolume'].mean(),
            "hv_std": run_df['hypervolume'].std(),
            "hv_min": run_df['hypervolume'].min(),
            "hv_max": run_df['hypervolume'].max(),
            "time_mean": run_df['time_sec'].mean(),
            "time_std": run_df['time_sec'].std(),
            "cumulative_time": run_df['cumulative_time_sec'].max()
            # You could also include analysis of candidate_norms here if desired
        })

    per_run_summary = pd.DataFrame(summary_records)

    # Aggregate the per-run rows for each acquisition function.
    final_summary = per_run_summary.groupby('acquisition').agg(AGG_SPEC).reset_index()

    # Flatten the two-level column index into '<column>_<stat>'; the strip
    # turns the ('acquisition', '') pair back into plain 'acquisition'.
    final_summary.columns = ['_'.join(col).strip('_') for col in final_summary.columns]

    # Save the summary for this dataset.
    final_summary.to_csv(name + SUMMARY_SUFFIX, index=False)

# After the loop, per_run_summary and final_summary refer to the last dataset only.