# Overview

This notebook shows how to extract the relevant dataset (target position, GVD, TOD, number of accelerated ions between 5 and 20 MeV) from the raw BELLA data, stored at NERSC under `/global/cfs/cdirs/m3239/ip2data/bella_pw_data/`.

This notebook should be run at NERSC, in order to have access to the relevant data.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tqdm
import re
import torch
from uspas_ml.utils import transformer
# NOTE(review): this cell originally ended with a bare, incomplete `import`
# statement, which is a SyntaxError and prevented the cell from running.
# It is dropped here; if a module was meant to be imported, add it explicitly.

# Functions to extract data

In [None]:
def extract_df( scan_number ):
    """
    Extract the dataset (target position, GVD, TOD, number of accelerated ions) from the raw BELLA data, for a given scan.

    The input parameters of each shot are read from the scan's s file; the
    proton number of each shot is obtained by integrating the corresponding
    spectrum file with `total_proton_number`.

    Parameters
    ----------
    scan_number : int
        The identification number of the scan to extract.

    Returns
    -------
    pd.DataFrame
        The dataset extracted from the raw data, with one row per shot of the scan.
        Columns: 'z_target (m)' (target position relative to the laser focus),
        'TOD (s^3)', 'GVD (???)' (raw value from the s file) and 'n_protons (1/sr)'.

    Raises
    ------
    ValueError
        If the z position is absent from the s file and cannot be read
        from the ECS Live dump either.
    """

    # Open s file: records input parameters for each shot of the scan
    date = "03-Mar/24_0328"
    path = "/global/cfs/cdirs/m3239/ip2data/bella_pw_data/PW/Y2024"
    s_file_location = f"{path}/{date}/analysis/s{scan_number}.txt"
    s_file = pd.read_csv(s_file_location, sep='\t')
    shot_column = 'DDG-AA-ShotCntrl shot #'

    # Extract z position
    laser_focus_position = 12 # mm
    z_column = 'IP2-TC-ESP5 Position.Axis 2 Alias:Preplasma Z (MFA)'
    if z_column in s_file:
        z_target = s_file[z_column]
    else:
        print('Scan %d: z location not found ; reading it in the ECS Live dumps' %scan_number)
        file_path = f"{path}/{date}/ECS Live dumps/Scan{scan_number}.txt"
        device_name = 'IP2-TC-ESP5'
        property_name = 'Position.Axis 2'
        z0 = extract_device_property(file_path, device_name, property_name)
        # extract_device_property reports a failure by returning a message string;
        # fail with that message instead of a confusing formatting TypeError below
        if isinstance(z0, str):
            raise ValueError(f"Scan {scan_number}: {z0}")
        print('Found: %.3f' %z0)
        z_target = z0 + 0*s_file[shot_column] # make array of the same length
    # Motors are in mm. Here we convert to meters.
    z_target = (z_target - laser_focus_position)*1e-3

    # Extract TOD (third-order dispersion)
    if 'DAZZLER-LT-3rdorder value1' in s_file:
        tod = s_file['DAZZLER-LT-3rdorder value1']*(1.e-15)**3 # in fs^3. Here we convert to s^3.
    else: # if not stored, it is 0 by default
        print('Scan %d: TOD not found ; assuming 0' %scan_number)
        tod = 0*s_file[shot_column] # make array of the same length

    # Extract GVD
    # NOTE(review): this is the raw 'Compression' stage value; its units are not
    # specified here (hence the '???' in the column name) - to be confirmed.
    gvd = s_file['STAGE-1BL-Compression Position.Axis 1 Alias:Compression']

    # Extract total proton number for each shot in the scan
    shot_numbers = s_file[shot_column].astype('int')
    n_protons = np.zeros( len(shot_numbers) )
    date_tag = date.split('/')[-1]          # "24_0328", prefix of the spectrum file names
    scan_tag = str(scan_number).zfill(3)
    spectrum_dir = f"{path}/{date}/analysis/superfacility/Scan{scan_tag}"
    # Pass the total explicitly: tqdm cannot infer a length from `enumerate`
    for i, shot_number in tqdm.tqdm( enumerate(shot_numbers), total=len(shot_numbers) ):
        spectrum_file = f"{spectrum_dir}/{date_tag}_Scan{scan_tag}_{str(shot_number).zfill(3)}.txt"
        n_protons[i] = total_proton_number( spectrum_file )

    return pd.DataFrame( {'z_target (m)': z_target,
                          'TOD (s^3)': tod,
                          'GVD (???)': gvd,
                          'n_protons (1/sr)': n_protons} )

In [None]:
def total_proton_number( spectrum_file, E_min=5, E_max=20 ):
    """
    Compute the total number of protons between E_min and E_max,
    from a data file containing the proton spectrum.

    The file is read as whitespace-separated columns, skipping its first line.
    The spectrum is integrated with a rectangle rule: each sample is weighted by
    the distance to the next energy sample (the last sample gets zero weight).
    Both bounds are exclusive: only samples with E_min < energy < E_max count.

    Parameters
    ----------
    spectrum_file : str
        Path to the spectrum file.
    E_min : float, optional
        Minimum energy, in MeV.
    E_max : float, optional
        Maximum energy, in MeV.

    Returns
    -------
    float
        Total number of protons between E_min and E_max in 1/sr.
    """
    # Read the spectrum file.
    # `sep=r'\s+'` is the documented equivalent of the deprecated `delim_whitespace=True`.
    spectrum = pd.read_csv(spectrum_file, sep=r'\s+', skiprows=1,
                           names=['#energy (MeV)', 'number (1/MeV/sr)', 'Grayvalue', 'TP spectrometer'])
    energy = spectrum['#energy (MeV)']

    # Compute energy steps (forward difference), padded with a trailing 0
    # to make it the same length as the other arrays
    dE = np.append( np.diff(energy.to_numpy()), 0 )

    # Integrate the spectrum between E_min and E_max
    selection = (energy > E_min) & (energy < E_max)
    n_tot = ( spectrum['number (1/MeV/sr)'] * dE )[selection].sum()

    return n_tot

In [None]:
def extract_device_property(file_path, device_name, property_name):
    """
    Low-level function to extract data from the ECS Live dumps.

    The file is searched for a section of the form::

        [Device <number>]
        Device Name = "<device_name>"
        ...
        <property_name> = "<value>"

    where the section ends at the first blank line (or at the end of the file).

    Parameters
    ----------
    file_path : str
        Path to the ECS Live dump file.
    device_name : str
        Name of the device, matched literally.
    property_name : str
        Name of the property to read in that device section, matched literally.

    Returns
    -------
    float or str
        The property value converted to float. If the device or the property
        is not found, a human-readable error message (str) is returned instead,
        so callers should check the type of the result.
    """
    with open(file_path, 'r') as file:
        content = file.read()

    # Regular expression to match the device section.
    # The name is escaped so that characters such as '.' are matched literally;
    # `\Z` also accepts a section that ends the file without a blank line.
    device_pattern = re.compile(
        rf'\[Device \d+\]\nDevice Name = "{re.escape(device_name)}"(.*?)(?:\n\n|\Z)',
        re.DOTALL)
    device_match = device_pattern.search(content)

    if device_match:
        device_content = device_match.group(1)

        # Regular expression to match the specific property (name escaped as above)
        property_pattern = re.compile(rf'{re.escape(property_name)} = "(.*?)"')
        property_match = property_pattern.search(device_content)

        if property_match:
            return float(property_match.group(1))
        else:
            return f"Property {property_name} not found in device {device_name}."
    else:
        return f"Device {device_name} not found in the file."

# Data extraction

In [None]:
# Extract all available scans
df_44 = extract_df(44) # GVD scan
df_51 = extract_df(51) # TOD scan
df_61 = extract_df(61) # Z scan
df_62 = extract_df(62) # stability scan at z = 100 um
df_63 = extract_df(63) # stability scan at z = 50 um

In [None]:
# Merge all data into one dataset
# Each per-scan frame is indexed from 0, so a plain concat would repeat the same
# index labels once per scan; `ignore_index=True` gives every shot a unique row label.
df = pd.concat( [ df_44, df_51, df_61, df_62, df_63 ], ignore_index=True )

In [None]:
# Save as csv file
# Write the merged dataset (row index included) to the working directory
df.to_csv(path_or_buf='experimental_data.csv')

# Data visualization

In [None]:
# 3D scatter of the proton number versus TOD and target position.
# No `plt.clf()` beforehand: `plt.figure()` creates a new figure anyway, so
# clearing first only leaves an extra, empty figure behind.
ax = plt.figure().add_subplot(projection='3d')

# TOD is rescaled from s^3 to fs^3 and z from m to um for plotting
ax.scatter( (1.e15)**3*df['TOD (s^3)'], 1.e6*df['z_target (m)'], df['n_protons (1/sr)'], c=df['n_protons (1/sr)'])
ax.view_init(elev=40., azim=40, roll=0)
ax.set_xlabel('TOD (fs^3)')
ax.set_ylabel('z_target (um)')
# pyplot has no `zlabel` function; the z label is set on the 3D axes
ax.set_zlabel('Number of protons (1/sr)')

In [None]:
# GVD scan (df_44): proton number versus the GVD setting, markers only
plt.plot(
    df_44['GVD (???)'],
    df_44['n_protons (1/sr)'],
    marker='o',
    linestyle='None',
)
plt.xlabel('GVD (???)')
plt.title('GVD scan')
plt.ylabel('Number of protons (1/sr)')

In [None]:
# Z scan (df_61): proton number versus target position, markers only
plt.plot(
    df_61['z_target (m)'],
    df_61['n_protons (1/sr)'],
    marker='o',
    linestyle='None',
)
plt.xlabel('z_target (m)')
plt.title('Z scan')
plt.ylabel('Number of protons (1/sr)')

In [None]:
# TOD scan (df_51): proton number versus third-order dispersion, markers only
plt.plot(
    df_51['TOD (s^3)'],
    df_51['n_protons (1/sr)'],
    marker='o',
    linestyle='None',
)
plt.xlabel('TOD (s^3)')
plt.title('TOD scan')
plt.ylabel('Number of protons (1/sr)')

<h1>Data Split into Test and Training</h1>

In [None]:
#Split all data to traing and testing with 0 as the divide (this catches a little bit of the downcurve on the graph)
test_set_1 = df[df['z_target (m)'] < 0]
training_set_1 = df[df['z_target (m)'] >= 0]

#split thee training and test data into x and y
z_training_set_1 = 1.e6*training_set_1['z_target (m)']
TOD_training_set_1 = (1.e15)**3*training_set_1['TOD (s^3)']
protons_training_set_1 = training_set_1['n_protons (1/sr)']

z_test_set_1 = 1.e6*test_set_1['z_target (m)']
TOD_test_set_1 = (1.e15)**3*test_set_1['TOD (s^3)']
protons_test_set_1 = test_set_1['n_protons (1/sr)']

In [None]:
# Split 2: rows with z_target <= 40e-6 m go to the test set, the others to the training set
test_set_2 = df.loc[df['z_target (m)'] <= 40e-6]
training_set_2 = df.loc[df['z_target (m)'] > 40e-6]

# Same x / y split and rescaling as for split 1:
# z multiplied by 1e6 (m -> um), TOD by (1e15)**3 (s^3 -> fs^3)
z_training_set_2 = training_set_2['z_target (m)'] * 1.e6
TOD_training_set_2 = training_set_2['TOD (s^3)'] * (1.e15)**3
protons_training_set_2 = training_set_2['n_protons (1/sr)']

z_test_set_2 = test_set_2['z_target (m)'] * 1.e6
TOD_test_set_2 = test_set_2['TOD (s^3)'] * (1.e15)**3
protons_test_set_2 = test_set_2['n_protons (1/sr)']

In [None]:
# 3D scatter of split 1: training points in red, test points in blue.
# No `plt.clf()` beforehand: `plt.figure()` creates a new figure anyway, so
# clearing first only leaves an extra, empty figure behind.
ax = plt.figure().add_subplot(projection='3d')

ax.scatter( TOD_training_set_1, z_training_set_1,protons_training_set_1, c='r',alpha=0.3)
ax.scatter( TOD_test_set_1, z_test_set_1,protons_test_set_1, c='b', alpha=0.3)
ax.view_init(elev=40., azim=40, roll=0)
# The plotted TOD and z values are the rescaled ones (fs^3 and um)
ax.set_xlabel('TOD (fs^3)')
ax.set_ylabel('z_target (um)')
# pyplot has no `zlabel` function; the z label is set on the 3D axes
ax.set_zlabel('Number of protons (1/sr)')

<h1>Visualizing Data Split</h1>

In [None]:
#plot training set 1
plt.scatter(z_training_set_1, protons_training_set_1, label='training data')
plt.scatter(1.e6*df['z_target (m)'],df['n_protons (1/sr)'], s=50, facecolors='none', edgecolors='r', label='Expt data')
plt.title("Z_target Training data set 1")
plt.xlabel('z_target (m)')
plt.ylabel('Number of protons (1/sr)')
plt.legend()
plt.savefig("training_set_1.png")

In [None]:
#plot test set 1
plt.scatter(z_test_set_1, protons_test_set_1, label='test data')
plt.scatter(1.e6*df['z_target (m)'],df['n_protons (1/sr)'], s=50, facecolors='none', edgecolors='r', label='expt data')
plt.title("Z_target Test data set 1")
plt.xlabel('z_target (m)')
plt.ylabel('Number of protons (1/sr)')
plt.legend()
plt.savefig("test_set_1.png")

In [None]:
# Plot training set 2 on top of the full experimental dataset
plt.scatter(z_training_set_2, protons_training_set_2, label='training data')
plt.scatter(1.e6*df['z_target (m)'],df['n_protons (1/sr)'], s=50, facecolors='none', edgecolors='r', label='expt data')
plt.title("Z_target Training data set 2")
# The plotted positions are multiplied by 1e6 (m -> um), so the axis is in um, not m
plt.xlabel('z_target (um)')
plt.ylabel('Number of protons (1/sr)')
plt.legend()
plt.savefig("training_set_2.png")

In [None]:
# Plot test set 2 on top of the full experimental dataset
plt.scatter(z_test_set_2, protons_test_set_2, label='test data')
plt.scatter(1.e6*df['z_target (m)'],df['n_protons (1/sr)'], s=50, facecolors='none', edgecolors='r', label='expt data')
plt.title("Z_target Test data set 2")
# The plotted positions are multiplied by 1e6 (m -> um), so the axis is in um, not m
plt.xlabel('z_target (um)')
plt.ylabel('Number of protons (1/sr)')
plt.legend()
plt.savefig("test_set_2.png")

<h1>Saving Split Data to CSV files</h1>

In [None]:
#save training x and y into csv
training_set_1_df = pd.DataFrame({
    'z_target (m)': z_training_set_1,
    'TOD (s^3)' : TOD_training_set_1,
    'n_protons (1/sr)': protons_training_set_1
})

training_set_1_df.to_csv('training_set_1.csv')

In [None]:
#save test x and y to csv file
test_set_1_df = pd.DataFrame( {
    'z_target (m)': z_test_set_1,
    'TOD (s^3)': TOD_test_set_1,
    'n_protons (1/sr)': protons_test_set_1} )

test_set_1_df.to_csv('test_set_1.csv')

In [None]:
# Gather the training inputs (z, TOD) and output (proton number) of split 2
# into one table and save it as csv.
# NOTE(review): z_training_set_2 and TOD_training_set_2 hold rescaled values
# (z multiplied by 1e6, TOD by (1e15)**3), yet the columns are still named
# '(m)' and '(s^3)'. Left unchanged because readers of this csv may depend on
# the column names - confirm the intended units.
training_set_2_df = pd.DataFrame(
    data={
        'z_target (m)': z_training_set_2,
        'TOD (s^3)': TOD_training_set_2,
        'n_protons (1/sr)': protons_training_set_2,
    }
)

training_set_2_df.to_csv(path_or_buf='training_set_2.csv')

In [None]:
# Gather the test inputs (z, TOD) and output (proton number) of split 2
# into one table and save it as csv.
# NOTE(review): z_test_set_2 and TOD_test_set_2 hold rescaled values
# (z multiplied by 1e6, TOD by (1e15)**3), yet the columns are still named
# '(m)' and '(s^3)'. Left unchanged because readers of this csv may depend on
# the column names - confirm the intended units.
test_set_2_df = pd.DataFrame(
    data={
        'z_target (m)': z_test_set_2,
        'TOD (s^3)': TOD_test_set_2,
        'n_protons (1/sr)': protons_test_set_2,
    }
)

test_set_2_df.to_csv(path_or_buf='test_set_2.csv')