## Data loading script

This script loads the event and dust counter files contained in the volumes issued by the Cassini CDA instrument. It assigns column labels and handles the missing-value constants defined in the .lbl files of the database. All volumes are combined into a single compact .csv file for later processing.

In [17]:
# Packages, plus the column positions, column labels and missing-value constants as defined by the .lbl files

import pandas as pd
import re
from pathlib import Path


# Character extents of every field in a CDAEVENTS*.TAB row, passed to
# pandas.read_fwf as ``colspecs`` (half-open [start, end) intervals).
# One entry per column name, in the same order as the event column labels.
colspecs_eventlist = [
    (0, 10),     # EVENT_ID
    (11, 28),    # EVENT_TIME
    (29, 43),    # EVENT_JULIAN_DATE
    (44, 52),    # QP_AMPLITUDE
    (53, 54),    # QP_SIGNAL_FLAG
    (55, 63),    # QI_AMPLITUDE
    (64, 65),    # QI_SIGNAL_FLAG
    (66, 74),    # QT_AMPLITUDE
    (75, 76),    # QT_SIGNAL_FLAG
    (77, 85),    # QC_AMPLITUDE
    (86, 87),    # QC_SIGNAL_FLAG
    (88, 96),    # QI_RISE_TIME
    (97, 105),   # QT_RISE_TIME
    (106, 114),  # QC_RISE_TIME
    (115, 116),  # TARGET_FLAG
    (117, 124),  # SPACECRAFT_RA
    (125, 132),  # SPACECRAFT_DEC
    (133, 139),  # SPACECRAFT_SUN_DISTANCE
    (140, 147),  # SPACECRAFT_SIII_LONG
    (148, 155),  # SPACECRAFT_SIII_LAT
    (156, 164),  # SPACECRAFT_SAT_DIST
    (165, 171),  # SC_X_VEL
    (172, 178),  # SC_Y_VEL
    (179, 185),  # SC_Z_VEL
    (186, 193),  # DETECTOR_RA
    (194, 201),  # DETECTOR_DEC
    (202, 204),  # COUNTER_NUMBER
    (205, 208),  # EVENT_QUALITY
    (209, 214),  # PARTICLE_SPEED
    (215, 219),  # PARTICLE_SPEED_FACTOR
    (220, 228),  # PARTICLE_MASS
    (229, 233),  # PARTICLE_MASS_FACTOR
    (234, 242),  # PARTICLE_CHARGE
    (243, 251),  # PARTICLE_CHARGE_ERROR
    (252, 253),  # SPECTRUM_FLAG
]

# Character extents of every field in a CDACOUNTER*.TAB row, passed to
# pandas.read_fwf as ``colspecs``.  The row starts with a 17-character
# time stamp, followed by twenty 8-character counter fields that begin at
# offset 18 and repeat every 9 characters (8 data + 1 separator).
colspecs_counter = [(0, 17)] + [
    (18 + 9 * slot, 26 + 9 * slot) for slot in range(20)
]


# Column labels for the event table, in file order (one per event colspec).
colnames_eventlist = [
    # Event identification
    "EVENT_ID", "EVENT_TIME", "EVENT_JULIAN_DATE",
    # Signal amplitudes, each followed by its flag
    "QP_AMPLITUDE", "QP_SIGNAL_FLAG",
    "QI_AMPLITUDE", "QI_SIGNAL_FLAG",
    "QT_AMPLITUDE", "QT_SIGNAL_FLAG",
    "QC_AMPLITUDE", "QC_SIGNAL_FLAG",
    # Signal rise times
    "QI_RISE_TIME", "QT_RISE_TIME", "QC_RISE_TIME",
    "TARGET_FLAG",
    # Spacecraft position
    "SPACECRAFT_RA", "SPACECRAFT_DEC", "SPACECRAFT_SUN_DISTANCE",
    "SPACECRAFT_SIII_LONG", "SPACECRAFT_SIII_LAT", "SPACECRAFT_SAT_DIST",
    # Spacecraft velocity components
    "SC_X_VEL", "SC_Y_VEL", "SC_Z_VEL",
    # Detector pointing
    "DETECTOR_RA", "DETECTOR_DEC",
    "COUNTER_NUMBER", "EVENT_QUALITY",
    # Derived particle properties
    "PARTICLE_SPEED", "PARTICLE_SPEED_FACTOR",
    "PARTICLE_MASS", "PARTICLE_MASS_FACTOR",
    "PARTICLE_CHARGE", "PARTICLE_CHARGE_ERROR",
    "SPECTRUM_FLAG",
]

# Column labels for the counter table: the time stamp followed by the
# twenty counter columns COUNTER_0 ... COUNTER_19.
colnames_counter = ["TIME"] + [f"COUNTER_{slot}" for slot in range(20)]


# Per-column sentinel values of the event table that pandas.read_fwf should
# turn into NaN (passed as ``na_values``).  Columns that share a sentinel are
# grouped; the key order is the same as the column order of the table.
# Flag columns have no entry here.
missing_values_eventlist = {
    "EVENT_ID": -999999999,
    "EVENT_TIME": "9999-999T99:99:99",
    "EVENT_JULIAN_DATE": -999999.999999,
    # Signal amplitudes and rise times
    **dict.fromkeys(
        (
            "QP_AMPLITUDE", "QI_AMPLITUDE", "QT_AMPLITUDE", "QC_AMPLITUDE",
            "QI_RISE_TIME", "QT_RISE_TIME", "QC_RISE_TIME",
        ),
        -9.9E-99,
    ),
    **dict.fromkeys(("SPACECRAFT_RA", "SPACECRAFT_DEC"), -999.99),
    # NOTE(review): the only positive floating-point sentinel in this
    # table - confirm against the .lbl file.
    "SPACECRAFT_SUN_DISTANCE": 9.9999,
    **dict.fromkeys(("SPACECRAFT_SIII_LONG", "SPACECRAFT_SIII_LAT"), -999.99),
    "SPACECRAFT_SAT_DIST": -9999.99,
    **dict.fromkeys(("SC_X_VEL", "SC_Y_VEL", "SC_Z_VEL"), -99.99),
    **dict.fromkeys(("DETECTOR_RA", "DETECTOR_DEC"), -999.99),
    **dict.fromkeys(("COUNTER_NUMBER", "EVENT_QUALITY"), -9),
    "PARTICLE_SPEED": -99.9,
    "PARTICLE_SPEED_FACTOR": -9.9,
    "PARTICLE_MASS": -9.9E-99,
    "PARTICLE_MASS_FACTOR": -9.9,
    **dict.fromkeys(("PARTICLE_CHARGE", "PARTICLE_CHARGE_ERROR"), -9.9E-99),
    "SPECTRUM_FLAG": 9,
}

# Per-column sentinel values of the counter table that pandas.read_fwf should
# turn into NaN (passed as ``na_values``).  TIME is mapped to None; every
# counter column uses -99.
missing_values_counter = {
    "TIME": None,
    **{f"COUNTER_{slot}": -99 for slot in range(20)},
}

In [23]:
def _load_fwf_tables(files, colspecs, names, na_values, time_column, volume_name):
    """Read fixed-width .TAB files into data frames tagged with their origin.

    Each file is parsed with ``pd.read_fwf``; ``time_column`` is converted
    from the ``YYYY-DDDTHH:MM:SS`` (year / day-of-year) text format to
    datetimes, with unparseable stamps becoming NaT.  The columns
    SOURCE_VOLUME and SOURCE_FILE record where each row came from.
    Returns a list with one data frame per file.
    """
    frames = []
    for path in files:
        frame = pd.read_fwf(
            path, colspecs=colspecs, names=names, na_values=na_values
        )
        frame[time_column] = pd.to_datetime(
            frame[time_column], errors="coerce", format="%Y-%jT%H:%M:%S"
        )
        frame["SOURCE_VOLUME"] = volume_name
        frame["SOURCE_FILE"] = path.name
        frames.append(frame)
    return frames


all_counter_dfs = []
all_event_dfs = []
all_files = []

# Find the volume folders (COCDA_ followed by four digits) below the data
# set root.  iterdir() yields entries in arbitrary order, so sort them to
# make the load order reproducible.
root = Path("../Dataset")
vol_pattern = re.compile(r"COCDA_\d{4}")
volumes = sorted(
    d for d in root.iterdir() if d.is_dir() and vol_pattern.fullmatch(d.name)
)

for vol in volumes:
    # Find the counter and event tables anywhere below the DATA folder
    counter_files = sorted(vol.glob("DATA/**/CDACOUNTER*.TAB"))
    event_files = sorted(vol.glob("DATA/**/CDAEVENTS*.TAB"))
    all_files.extend(counter_files)
    all_files.extend(event_files)

    # Load both kinds of file into two separate lists of data frames
    all_counter_dfs.extend(
        _load_fwf_tables(
            counter_files, colspecs_counter, colnames_counter,
            missing_values_counter, "TIME", vol.name,
        )
    )
    all_event_dfs.extend(
        _load_fwf_tables(
            event_files, colspecs_eventlist, colnames_eventlist,
            missing_values_eventlist, "EVENT_TIME", vol.name,
        )
    )

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so fail early with a clear message when either kind of table is missing.
if not all_counter_dfs and not all_event_dfs:
    raise RuntimeError("No tables were loaded. Check paths or volume names.")
if not all_counter_dfs:
    raise RuntimeError(
        "No CDACOUNTER*.TAB tables were loaded. Check paths or volume names."
    )
if not all_event_dfs:
    raise RuntimeError(
        "No CDAEVENTS*.TAB tables were loaded. Check paths or volume names."
    )

# Concatenate all counter and all event data frames into one frame each
counter_df = pd.concat(all_counter_dfs, ignore_index=True)
event_df = pd.concat(all_event_dfs, ignore_index=True)

# Drop rows without a valid time and sort by time (merge_asof requires sorted
# keys without NaT).  A stable sort keeps rows with identical time stamps in
# load order.  The event time column is renamed so both tables share the
# merge key "TIME".
counter_df = counter_df.dropna(subset=["TIME"]).sort_values("TIME", kind="mergesort")
event_df = (
    event_df.dropna(subset=["EVENT_TIME"])
    .sort_values("EVENT_TIME", kind="mergesort")
    .rename(columns={"EVENT_TIME": "TIME"})
)

# Build the master timeline: every distinct counter or event time, ascending
all_times = pd.Index(
    pd.concat([counter_df["TIME"], event_df["TIME"]], ignore_index=True)
    .drop_duplicates()
    .sort_values()
)
master = pd.DataFrame({'TIME': all_times})

# Merge counter features into master: each time point takes the counter row
# nearest in time, provided it lies within 3 minutes.
master = pd.merge_asof(master, counter_df, on="TIME", direction="nearest",
                       tolerance=pd.Timedelta("3min"), suffixes=('', '_COUNTER'))

# Merge event features into master in the same way.
# NOTE(review): with direction="nearest" one event row can be attached to
# several neighbouring time points - confirm that this is intended.
master = pd.merge_asof(master, event_df, on="TIME", direction="nearest",
                       tolerance=pd.Timedelta("3min"), suffixes=('', '_EVENT'))

# Both tables carry SOURCE_VOLUME / SOURCE_FILE; keep the counter value and
# fall back to the event value where no counter row was matched.
for source_col in ("SOURCE_VOLUME", "SOURCE_FILE"):
    event_col = f"{source_col}_EVENT"
    if event_col in master.columns:
        master[source_col] = master[source_col].combine_first(master[event_col])
        master.drop(event_col, axis=1, inplace=True)

print(f"Built merged master table with {len(master):,} time points from {len(all_files)} files.")

Built merged master table with 450,913 time points from 28 files.


In [24]:
# Abbreviate the long SPACECRAFT_* column labels to the SC_* form.
rename_map = dict(
    SPACECRAFT_SUN_DISTANCE="SC_SUN_DIST",
    SPACECRAFT_SIII_LONG="SC_SIII_LONG",
    SPACECRAFT_SIII_LAT="SC_SIII_LAT",
    SPACECRAFT_SAT_DIST="SC_SAT_DIST",
)
# DataFrame.rename skips mapping keys that are not present (errors="ignore"
# is the default), so the mapping can be passed as it is.
master.rename(columns=rename_map, inplace=True)

In [25]:
# Preview the first rows of the merged table to check that the data looks correct
master.head()

Unnamed: 0,TIME,COUNTER_0,COUNTER_1,COUNTER_2,COUNTER_3,COUNTER_4,COUNTER_5,COUNTER_6,COUNTER_7,COUNTER_8,...,DETECTOR_DEC,COUNTER_NUMBER,EVENT_QUALITY,PARTICLE_SPEED,PARTICLE_SPEED_FACTOR,PARTICLE_MASS,PARTICLE_MASS_FACTOR,PARTICLE_CHARGE,PARTICLE_CHARGE_ERROR,SPECTRUM_FLAG
0,2005-01-01 00:00:34,2.0,16.0,12.0,71.0,246.0,186.0,25.0,14.0,0.0,...,59.25,37,,,,,,,,0.0
1,2005-01-01 00:01:38,2.0,16.0,12.0,71.0,246.0,189.0,25.0,14.0,0.0,...,59.25,37,,,,,,,,0.0
2,2005-01-01 00:02:42,2.0,16.0,12.0,71.0,246.0,191.0,25.0,14.0,0.0,...,59.25,37,,,,,,,,0.0
3,2005-01-01 00:03:46,2.0,16.0,12.0,71.0,246.0,192.0,25.0,14.0,0.0,...,59.25,37,,,,,,,,0.0
4,2005-01-01 00:04:50,2.0,16.0,12.0,71.0,246.0,193.0,25.0,14.0,0.0,...,59.23,19,,,,,,,,0.0


In [26]:
# Write the merged table to disk.  DataFrame.to_csv does not create missing
# parent folders, so make sure the output folder exists first.
output_path = Path("CompiledDataSet/Cassini_CDA_Count+Event_V0.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
# index=True stores the row index as the first (unnamed) CSV column.
master.to_csv(output_path, index=True)