# Sobol sample selection for AlphaPEM + Simulations

**Purpose**:  
This script loads the full Sobol-sampled parameter space from `../data/raw/sobol_sampling_configurations_cmpl.pkl`, splits it into thirds,  selects the *middle third* (as assigned to me, Nathaly), and randomly samples `n=4000` configurations  
from it to be evaluated with the AlphaPEM model.

The resulting polarization curves are saved to: `../data/raw/sobol_sample_middle_4000.pkl`

In [20]:
import os
import sys
import pickle
import random
import pandas as pd
import time

In [5]:
sys.path.append(os.path.abspath("../external/AlphaPEM_v1.0/"))

from configuration.settings import current_density_parameters, physical_parameters, computing_parameters, operating_inputs
from modules.display_modules import plot_lambda
from model.AlphaPEM import AlphaPEM

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.sampling.sampler import get_polarisation_curve_samples, build_fixed_parameters, sample_parameters, PARAMETER_RANGES

First, load the full Sobol sample.

In [7]:
sobol_path = '../data/raw/sobol_sampling_configurations_cmpl.pkl'
with open(sobol_path, 'rb') as f:
    sobol_df = pickle.load(f)

print("Variables in Sobol DataFrame:", sobol_df.columns.tolist())
print("Total samples available:", len(sobol_df))

sobol_df.head()

Variables in Sobol DataFrame: ['Tfc', 'Pa_des', 'Sc', 'Phi_c_des', 'epsilon_gdl', 'tau', 'epsilon_mc', 'epsilon_c', 'e', 'Re', 'i0_c_ref', 'kappa_co', 'kappa_c', 'a_slim', 'b_slim', 'a_switch', 'Pc_des']
Total samples available: 38000


Unnamed: 0,Tfc,Pa_des,Sc,Phi_c_des,epsilon_gdl,tau,epsilon_mc,epsilon_c,e,Re,i0_c_ref,kappa_co,kappa_c,a_slim,b_slim,a_switch,Pc_des
0,335.0159,215267.894967,2.784156,0.607013,0.663508,2.960479,0.268524,0.237199,4.0,2e-06,449.366354,18.059504,73.490571,0.043931,0.924738,0.877939,195267.894967
1,335.930971,215267.894967,2.784156,0.607013,0.663508,2.960479,0.268524,0.237199,4.0,2e-06,449.366354,18.059504,73.490571,0.043931,0.924738,0.877939,195267.894967
2,335.0159,164712.670632,2.784156,0.607013,0.663508,2.960479,0.268524,0.237199,4.0,2e-06,449.366354,18.059504,73.490571,0.043931,0.924738,0.877939,144712.670632
3,335.0159,215267.894967,2.837283,0.607013,0.663508,2.960479,0.268524,0.237199,4.0,2e-06,449.366354,18.059504,73.490571,0.043931,0.924738,0.877939,195267.894967
4,335.0159,215267.894967,2.784156,0.34041,0.663508,2.960479,0.268524,0.237199,4.0,2e-06,449.366354,18.059504,73.490571,0.043931,0.924738,0.877939,195267.894967


Now, extract the middle third of the dataset.

In [11]:
total = len(sobol_df)
one_third = total // 3

lower_df = sobol_df.iloc[:one_third].copy()
middle_df = sobol_df.iloc[one_third:2 * one_third].copy()
upper_df = sobol_df.iloc[2 * one_third:].copy()

print("Middle third size:", len(middle_df))

Middle third size: 12666


For this execution, I will only be generating 4000 "samples" from the AlphaPEM simulator.

In [14]:
n = 4000
sampled_df = middle_df.sample(n=n, random_state=42)
remaining_middle_df = middle_df.drop(index=sampled_df.index)
sampled_df = sampled_df.reset_index(drop=True)

For traceability, I will save all components to sobol_sampling_design folder.

In [15]:
save_dir = '../data/raw/sobol_sampling_design'
os.makedirs(save_dir, exist_ok=True)

# Save our division 
lower_df.to_pickle(os.path.join(save_dir, 'sobol_third_lower.pkl'))     # Dejvis
middle_df.to_pickle(os.path.join(save_dir, 'sobol_third_middle.pkl'))   # Nathaly
upper_df.to_pickle(os.path.join(save_dir, 'sobol_third_upper.pkl'))     # Camila

# Save my sampled configs (only 4000 from middle part)
sampled_config_path = os.path.join(save_dir, 'middle_sampled_4000_configs.pkl')
sampled_df.to_pickle(sampled_config_path)

# Save middle-third configs *not yet sampled*
not_sampled_path = os.path.join(save_dir, 'middle_not_sampled_yet_until_07.06.2025_nathaly.pkl')
remaining_middle_df.to_pickle(not_sampled_path)

print(f"Saved to folder: {save_dir}")
print(f"- Sampled 4000 configs: {sampled_config_path}")
print(f"- Remaining middle third (not yet sampled): {not_sampled_path}")

Saved to folder: ../data/raw/sobol_sampling_design
- Sampled 4000 configs: ../data/raw/sobol_sampling_design\middle_sampled_4000_configs.pkl
- Remaining middle third (not yet sampled): ../data/raw/sobol_sampling_design\middle_not_sampled_yet_until_07.06.2025_nathaly.pkl


In [None]:
save_dir_results = '../data/raw/sobol_sampling_design/results'
os.makedirs(save_dir_results, exist_ok=True)

output_path = '../data/raw/sobol_sample_middle_4000.pkl'
results = get_polarisation_curve_samples(
    sampled_parameters=sampled_df.to_dict(orient='records'),
    fixed_parameters=build_fixed_parameters(),
    save_path=output_path
)

print(f"Saved sampled results to: {output_path}")

In [None]:
test_fraction = 0.001
test_n = int(len(sampled_df) * test_fraction)
test_df = sampled_df.sample(n=test_n, random_state=123).reset_index(drop=True)

print(f"\nRunning test with {test_n} configurations (~0.01% of 4000)")




Running test with 4 configurations (~0.01% of 4000)


In [24]:
# Time the simulation
start_time = time.time()

test_output_path = os.path.join(save_dir_results, 'test_sampled_5pct_results.pkl')
test_results = get_polarisation_curve_samples(
    sampled_parameters=test_df.to_dict(orient='records'),
    fixed_parameters=build_fixed_parameters(),
    save_path=test_output_path
)

end_time = time.time()
elapsed_time = end_time - start_time
avg_time = elapsed_time / test_n

print(f"\nTest complete.")
print(f"Total time: {elapsed_time:.2f} seconds")
print(f"Average per simulation: {avg_time:.2f} seconds")
print(f"Results saved to: {test_output_path}")


📁 Final save complete: ../data/raw/sobol_sampling_design/results\test_sampled_5pct_results.pkl with 4 samples.

Test complete.
Total time: 413.60 seconds
Average per simulation: 103.40 seconds
Results saved to: ../data/raw/sobol_sampling_design/results\test_sampled_5pct_results.pkl


In [26]:
with open(test_output_path, 'rb') as f:
    loaded_results = pickle.load(f)

print(f"Loaded results type: {type(loaded_results)}")
if isinstance(loaded_results, list):
    print(f"Number of results: {len(loaded_results)}")
    print(f"Sample keys from one entry: {loaded_results[0].keys() if loaded_results else 'No results'}")

Loaded results type: <class 'pandas.core.frame.DataFrame'>


In [28]:
import multiprocessing
print(f"Number of CPU cores available: {multiprocessing.cpu_count()}")


Number of CPU cores available: 8
