In [None]:
'''This script loads datasets produced by the Cassini CDA instrument, assigns column labels,
handles missing values, and combines the data from multiple volumes into a single DataFrame.'''

import pandas as pd
from pathlib import Path

# --- column positions according to the LBL-files ---
# One (start, end) span per field, in the same order as the column labels.
# pandas.read_fwf treats each span as a 0-based, half-open interval, so
# (0, 10) covers characters 0..9 of a line.  Consecutive spans leave a
# one-character gap (presumably the field separator -- confirm against the
# LBL files).
colspecs = [
    (0, 10),     # EVENT_ID
    (11, 28),    # EVENT_TIME
    (29, 43),    # EVENT_JULIAN_DATE
    (44, 52),    # QP_AMPLITUDE
    (53, 54),    # QP_SIGNAL_FLAG
    (55, 63),    # QI_AMPLITUDE
    (64, 65),    # QI_SIGNAL_FLAG
    (66, 74),    # QT_AMPLITUDE
    (75, 76),    # QT_SIGNAL_FLAG
    (77, 85),    # QC_AMPLITUDE
    (86, 87),    # QC_SIGNAL_FLAG
    (88, 96),    # QI_RISE_TIME
    (97, 105),   # QT_RISE_TIME
    (106, 114),  # QC_RISE_TIME
    (115, 116),  # TARGET_FLAG
    (117, 124),  # SPACECRAFT_RA
    (125, 132),  # SPACECRAFT_DEC
    (133, 139),  # SPACECRAFT_SUN_DISTANCE
    (140, 147),  # SPACECRAFT_SIII_LONG
    (148, 155),  # SPACECRAFT_SIII_LAT
    (156, 164),  # SPACECRAFT_SAT_DIST
    (165, 171),  # SC_X_VEL
    (172, 178),  # SC_Y_VEL
    (179, 185),  # SC_Z_VEL
    (186, 193),  # DETECTOR_RA
    (194, 201),  # DETECTOR_DEC
    (202, 204),  # COUNTER_NUMBER
    (205, 208),  # EVENT_QUALITY
    (209, 214),  # PARTICLE_SPEED
    (215, 219),  # PARTICLE_SPEED_FACTOR
    (220, 228),  # PARTICLE_MASS
    (229, 233),  # PARTICLE_MASS_FACTOR
    (234, 242),  # PARTICLE_CHARGE
    (243, 251),  # PARTICLE_CHARGE_ERROR
    (252, 253),  # SPECTRUM_FLAG
]

# Column labels, one per fixed-width field, listed in the order the fields
# appear on each line of the table.
colnames = [
    # EVENT_* fields
    "EVENT_ID",
    "EVENT_TIME",
    "EVENT_JULIAN_DATE",
    # amplitude / signal-flag pairs for the QP, QI, QT and QC channels
    "QP_AMPLITUDE",
    "QP_SIGNAL_FLAG",
    "QI_AMPLITUDE",
    "QI_SIGNAL_FLAG",
    "QT_AMPLITUDE",
    "QT_SIGNAL_FLAG",
    "QC_AMPLITUDE",
    "QC_SIGNAL_FLAG",
    # rise times
    "QI_RISE_TIME",
    "QT_RISE_TIME",
    "QC_RISE_TIME",
    "TARGET_FLAG",
    # SPACECRAFT_* and SC_* fields
    "SPACECRAFT_RA",
    "SPACECRAFT_DEC",
    "SPACECRAFT_SUN_DISTANCE",
    "SPACECRAFT_SIII_LONG",
    "SPACECRAFT_SIII_LAT",
    "SPACECRAFT_SAT_DIST",
    "SC_X_VEL",
    "SC_Y_VEL",
    "SC_Z_VEL",
    # DETECTOR_* fields
    "DETECTOR_RA",
    "DETECTOR_DEC",
    "COUNTER_NUMBER",
    "EVENT_QUALITY",
    # PARTICLE_* fields
    "PARTICLE_SPEED",
    "PARTICLE_SPEED_FACTOR",
    "PARTICLE_MASS",
    "PARTICLE_MASS_FACTOR",
    "PARTICLE_CHARGE",
    "PARTICLE_CHARGE_ERROR",
    "SPECTRUM_FLAG",
]

# --- Missing value conventions ---
# Per-column sentinel that stands for "no value"; the mapping is handed to
# pandas.read_fwf as `na_values`, so a matching entry is read as NaN.
# Columns without an entry here (the *_SIGNAL_FLAG columns and TARGET_FLAG)
# get no column-specific sentinel.
missing_values = dict(
    # EVENT_* fields
    EVENT_ID=-999999999,
    EVENT_TIME="9999-999T99:99:99",
    EVENT_JULIAN_DATE=-999999.999999,
    # amplitudes and rise times
    QP_AMPLITUDE=-9.9E-99,
    QI_AMPLITUDE=-9.9E-99,
    QT_AMPLITUDE=-9.9E-99,
    QC_AMPLITUDE=-9.9E-99,
    QI_RISE_TIME=-9.9E-99,
    QT_RISE_TIME=-9.9E-99,
    QC_RISE_TIME=-9.9E-99,
    # SPACECRAFT_* and SC_* fields
    SPACECRAFT_RA=-999.99,
    SPACECRAFT_DEC=-999.99,
    SPACECRAFT_SUN_DISTANCE=9.9999,  # positive sentinel, unlike its neighbours
    SPACECRAFT_SIII_LONG=-999.99,
    SPACECRAFT_SIII_LAT=-999.99,
    SPACECRAFT_SAT_DIST=-9999.99,
    SC_X_VEL=-99.99,
    SC_Y_VEL=-99.99,
    SC_Z_VEL=-99.99,
    # DETECTOR_* fields, counter and quality
    DETECTOR_RA=-999.99,
    DETECTOR_DEC=-999.99,
    COUNTER_NUMBER=-9,
    EVENT_QUALITY=-9,
    # PARTICLE_* fields
    PARTICLE_SPEED=-99.9,
    PARTICLE_SPEED_FACTOR=-9.9,
    PARTICLE_MASS=-9.9E-99,
    PARTICLE_MASS_FACTOR=-9.9,
    PARTICLE_CHARGE=-9.9E-99,
    PARTICLE_CHARGE_ERROR=-9.9E-99,
    SPECTRUM_FLAG=9,  # positive sentinel
)

# --- accumulators: one DataFrame per event table, plus every table path found ---
all_dfs = []
all_files = []

# --- volumes to load ---
volumes = [
    Path(p)
    for p in (
        "Dataset/COCDA_0011",
        "Dataset/COCDA_0012",
        "Dataset/COCDA_0013",
    )
]

for vol in volumes:
    # Search the volume's DATA tree recursively; sorting keeps the load order
    # (and therefore the row order of the combined frame) deterministic.
    files = sorted(vol.glob("DATA/**/CDAEVENTS_*.TAB"))
    all_files += files

    if not files:
        # A volume without event tables is reported but does not abort the run.
        print(f"[warning] no event file found in {vol}/DATA")
        continue

    for f in files:
        # Parse the fixed-width table; per-column sentinels become NaN.
        df = pd.read_fwf(
            f,
            colspecs=colspecs,
            names=colnames,
            na_values=missing_values,
        )
        df = df.assign(
            # year / day-of-year timestamps -> datetime; unparsable ones -> NaT
            EVENT_TIME=pd.to_datetime(
                df["EVENT_TIME"], errors="coerce", format="%Y-%jT%H:%M:%S"
            ),
            # provenance: which volume and which file each row came from
            SOURCE_VOLUME=vol.name,
            SOURCE_FILE=f.name,
        )
        all_dfs.append(df)

# Nothing loaded at all means the paths are wrong; fail loudly.
if not all_dfs:
    raise RuntimeError("No tables were loaded. Check paths or volume names.")

# Stack all tables into one frame with a fresh 0..N-1 index.
final_df = pd.concat(all_dfs, ignore_index=True)

print(
    f"Loaded {len(final_df):,} events from {len(all_files)} files "
    f"across volumes {[v.name for v in volumes]}."
)

# --- Check the head ---
print(
    final_df[
        [
            "EVENT_TIME",
            "EVENT_QUALITY",
            "QP_AMPLITUDE",
            "QI_AMPLITUDE",
            "QT_AMPLITUDE",
        ]
    ].head()
)


Loaded 89,790 events from 3 files across volumes ['COCDA_0011', 'COCDA_0012', 'COCDA_0013'].
           EVENT_TIME  EVENT_QUALITY  QP_AMPLITUDE  QI_AMPLITUDE  QT_AMPLITUDE
0 2005-04-01 00:31:35            NaN           0.0  3.100000e-15  1.100000e-14
1 2005-04-01 00:32:59            NaN           0.0  1.400000e-14  0.000000e+00
2 2005-04-01 00:45:26            NaN           0.0  0.000000e+00  1.000000e-14
3 2005-04-01 00:46:29            NaN           0.0  6.600000e-14  6.300000e-15
4 2005-04-01 00:48:06            NaN           0.0  0.000000e+00  5.500000e-15
