# Level 1: Processing raw data

In [6]:
import os
from pathlib import Path
import subprocess
import sys
import re
import numpy as np
import pandas as pd
from typing import List, Tuple, Dict, Any

In [7]:
# Load environment variables from the ../.env file (relative to the working
# directory). Presumably this is where CDF_LIB, checked further down, is set
# — TODO confirm.
from dotenv import load_dotenv
load_dotenv("../.env")

True

Spacepy `CDF_LIB` check:

In [None]:
# Warn when the CDF_LIB environment variable is missing or empty; the
# spacepy import that follows is still attempted either way.
if not os.environ.get('CDF_LIB'):
    print('No CDF_LIB environment variable found for CDF file processing.')
from spacepy import pycdf

## Globals

In [None]:
# Data directories, relative to the notebook's working directory.
# DATA_EXTRACTED is where the extracted batch directories are read from and
# DATA_CSV is where CSV exports are written; DATA_RAW presumably holds the
# downloaded archives (it is not used in the code shown here).
_PARENT_DIR = Path("..")
DATA_RAW = _PARENT_DIR / "data_raw"
DATA_EXTRACTED = _PARENT_DIR / "data_extracted"
DATA_CSV = _PARENT_DIR / "data_csv"

## Tools

###  Fetching data
TBI (to be implemented).

### Extracting data

In [None]:
def extract_data(source_dir: Path, target_dir: Path) -> bool:
    """
    Extract every ``*.tar.gz`` archive found directly in ``source_dir`` into ``target_dir``.

    Each archive is unpacked by the system ``tar`` command, invoked with an
    argument list (no shell). Paths containing spaces or shell metacharacters
    are therefore passed through verbatim, and a failure of *any* archive is
    reported, not only a failure of the last one.

    Args:
        source_dir: directory containing the ``*.tar.gz`` archives.
        target_dir: existing directory the archives are extracted into.

    Returns:
        True when at least one archive was found and all archives were
        extracted successfully, False otherwise (an error is printed to stderr).
    """
    archives = sorted(Path(source_dir).glob("*.tar.gz"))
    if not archives:
        # Nothing to extract is treated as an error, as before.
        print("Error extracting tar files", file=sys.stderr)
        return False

    is_ok = True
    for archive in archives:
        try:
            status = subprocess.call(
                ["tar", "-xvf", str(archive), "-C", str(target_dir)]
            )
        except OSError:
            # e.g. the ``tar`` executable is not available
            status = -1

        if status != 0:
            print(f"Error extracting tar files: {archive}", file=sys.stderr)
            is_ok = False

    return is_ok

### Extracted data check

In [None]:
def check_batch_dir(batch_dir: Path) -> bool:
    """
    Report whether an extracted batch directory holds every expected file.

    The last 15 characters of the directory name are read as a
    ``<8-char date>T<6-char time>`` stamp, e.g. for
    ``juicepsa-pds4-PI-01-juice_rad-20240417T191059`` the date part is
    ``20240417`` and the time part is ``191059``. Every expected file that
    does not exist is reported on stderr.

    Returns:
        True if all expected files exist under ``batch_dir``, False otherwise.
    """
    dir_name = batch_dir.name
    date_part, time_part = dir_name[-15:-7], dir_name[-6:]

    # Files expected relative to the batch directory: three manifest/label
    # files stamped with date and time, and the science data pair stamped
    # with the date only.
    expected = (
        Path(f"juicepsa-pds4-PI-01-juice_rad-{date_part}T{time_part}-checksum_manifest.tab"),
        Path(f"juicepsa-pds4-PI-01-juice_rad-{date_part}T{time_part}-transfer_manifest.tab"),
        Path(f"juicepsa-pds4-PI-01-juice_rad-{date_part}T{time_part}.xml"),
        Path(f"juice_rad/data_raw/rad_raw_sc_{date_part}.cdf"),
        Path(f"juice_rad/data_raw/rad_raw_sc_{date_part}.lblx"),
    )

    missing = [rel for rel in expected if not batch_dir.joinpath(rel).exists()]
    for rel in missing:
        print(f"Missing {rel}", file=sys.stderr)

    return not missing

### Reading data

In [None]:
def is_path_science_cdf(path: Path) -> bool:
    """Tell whether the file name of ``path`` looks like ``rad_raw_sc_*.cdf``."""
    filename = path.name
    if not filename.endswith(".cdf"):
        return False
    return filename.startswith("rad_raw_sc_")

def is_path_housekeeping_cdf(path: Path) -> bool:
    """Tell whether the file name of ``path`` looks like ``rad_raw_hk_*.cdf``."""
    filename = path.name
    if not filename.endswith(".cdf"):
        return False
    return filename.startswith("rad_raw_hk_")

In [None]:
def read_cdf(cdf_path: Path) -> "pycdf.CDF":
    """
    Open the CDF file at ``cdf_path`` and return the ``pycdf.CDF`` object.

    Args:
        cdf_path: path to the ``.cdf`` file; converted to ``str`` for pycdf.

    Returns:
        The ``pycdf.CDF`` object created for the file (not a DataFrame).
    """
    # NOTE: the CDF object is returned as-is and is never closed here; callers
    # index variables from it later (e.g. ``cdf[key][...]``).
    # NOTE(review): an earlier draft opened the file in a ``with`` block and
    # returned ``cdf.copy()`` instead — consider that if open handles become
    # a problem.
    return pycdf.CDF(str(cdf_path))

def read_science_cdfs(data_dir: Path) -> List[pycdf.CDF]:
    """
    Open every science CDF found in the batch directories of ``data_dir``.

    Entries of ``data_dir`` are visited in sorted order; inside each one the
    ``juice_rad/data_raw`` sub-directory is searched for ``*.cdf`` files and
    those accepted by ``is_path_science_cdf`` are opened with ``read_cdf``.

    Returns:
        The opened CDF objects, in visiting order.
    """
    return [
        read_cdf(candidate)
        for batch in sorted(data_dir.iterdir())
        for candidate in batch.joinpath("juice_rad/data_raw").glob("*.cdf")
        if is_path_science_cdf(candidate)
    ]

def read_housekeeping_cdfs(data_dir: Path) -> List[pycdf.CDF]:
    """
    Open every housekeeping CDF found in the batch directories of ``data_dir``.

    Entries of ``data_dir`` are visited in sorted order; inside each one the
    ``juice_rad/data_raw`` sub-directory is searched for ``*.cdf`` files and
    those accepted by ``is_path_housekeeping_cdf`` are opened with ``read_cdf``.

    Returns:
        The opened CDF objects, in visiting order.
    """
    return [
        read_cdf(candidate)
        for batch in sorted(data_dir.iterdir())
        for candidate in batch.joinpath("juice_rad/data_raw").glob("*.cdf")
        if is_path_housekeeping_cdf(candidate)
    ]

### Validating data

In [None]:
def check_science_cdfs(science_cdfs: List["pycdf.CDF"],
                       expected_key_count: int = 19) -> bool:
    """
    Check that every science CDF exposes the expected number of keys.

    Args:
        science_cdfs: opened science CDFs (anything with a ``keys()`` method).
        expected_key_count: number of keys each CDF must have; defaults to 19,
            the value previously hard-coded here.

    Returns:
        True if all CDFs have exactly ``expected_key_count`` keys
        (vacuously True for an empty list), False otherwise.
    """
    return all(len(cdf.keys()) == expected_key_count for cdf in science_cdfs)

def check_housekeeping_cdfs(housekeeping_cdfs: List["pycdf.CDF"],
                            expected_key_count: int = 55) -> bool:
    """
    Check that every housekeeping CDF exposes the expected number of keys.

    Args:
        housekeeping_cdfs: opened housekeeping CDFs (anything with a
            ``keys()`` method).
        expected_key_count: number of keys each CDF must have; defaults to 55,
            the value previously hard-coded here.

    Returns:
        True if all CDFs have exactly ``expected_key_count`` keys
        (vacuously True for an empty list), False otherwise.
    """
    # Compare the *number* of keys; comparing the keys collection itself with
    # an integer can never be equal.
    return all(len(cdf.keys()) == expected_key_count for cdf in housekeeping_cdfs)



### Exploring data

In [None]:
def print_cdf_report(cdf: pycdf.CDF):
    """
    Print a textual overview of a CDF.

    Prints, in order: the CDF object itself under a "Keys:" heading, its
    ``meta`` attribute, and then for each ``(key, value)`` item the value
    followed by the value's own ``meta``.
    """
    # Whole-file overview.
    print('Keys:')
    print(cdf)

    # File-level metadata.
    print('\nCDF meta:')
    print(cdf.meta)

    # One section per variable.
    for name, variable in cdf.items():
        print(f'\n{name} -> {variable}')
        print(variable.meta)

def hk_cdf_to_raw_df(cdf: pycdf.CDF) -> pd.DataFrame:
    """
    Build a DataFrame with one column per CDF key, holding the raw values.

    Every key's full contents (``cdf[key][...]``) become one row of an
    intermediate frame, which is then transposed so that keys end up as columns.
    """
    names = cdf.keys()
    per_key_rows = [cdf[name][...] for name in names]
    frame = pd.DataFrame(per_key_rows).T
    frame.columns = names
    return frame

### Data conversions
According to the *RADEM User Manual*.

In [None]:
def convert_hk_temp(adc_out: int | np.ndarray) -> int:
    """
    Convert housekeeping temperature (ADC output) to Celsius.

    Notes:
    - Uses Equation 6 for RADEM EQM/PFM HK from RADEM User Manual.
    - Applicable for temperature sensors 1-5.
    - 1 Celsius degree precision.
    - RADEM operating range: -40 to +85 Celsius degrees.
    """
    return np.round(adc_out * (3.3 / 4096) * (1000000 / 2210) - 273.16)

## Usage & examples

In [None]:
# Example: open one housekeeping CDF from a single extracted batch and print
# its report (the CDF itself, its meta, and every variable with its meta).
path = DATA_EXTRACTED.joinpath("juicepsa-pds4-PI-01-juice_rad-20240417T192313/juice_rad/data_raw/rad_raw_hk_20240417.cdf")
cdf = read_cdf(path)
print_cdf_report(cdf)

In [None]:
# Open all housekeeping and all science CDFs found under DATA_EXTRACTED.
hk_cdfs = read_housekeeping_cdfs(DATA_EXTRACTED)
science_cdfs = read_science_cdfs(DATA_EXTRACTED)

In [None]:
# Run the housekeeping validation on the loaded CDFs; as the last expression
# of the cell, the returned bool is displayed by the notebook.
check_housekeeping_cdfs(hk_cdfs)

In [None]:
# Run the science validation on the loaded CDFs; as the last expression of
# the cell, the returned bool is displayed by the notebook.
check_science_cdfs(science_cdfs)

In [None]:
# IPython shell escape: list the extracted data directory as a tree
# (requires the external `tree` command to be installed).
!tree ../data_extracted

## Temperatures

In [None]:
def fix_df_time(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert the time column to datetime and floor it to seconds, in place.

    Args:
        df: dataframe with a "time" column parseable by ``pd.to_datetime``.

    Returns:
        The same dataframe object, with its "time" column replaced.
    """
    # Use the lowercase 's' alias: the uppercase 'S' spelling is deprecated
    # in recent pandas releases.
    df["time"] = pd.to_datetime(df["time"]).dt.floor("s")

    return df

def fix_df_time_start(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop, in place, every row whose "time" is earlier than 2023-09-01.

    Returns:
        The same dataframe object, now holding only rows with
        ``time >= '2023-09-01'``.
    """
    keep_condition = "time >= '2023-09-01'"
    df.query(keep_condition, inplace=True)
    return df

def fix_df_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Remove fully duplicated rows from the dataframe, in place.

    Of each group of identical rows, only the first occurrence is kept.

    Returns:
        The same dataframe object, without the duplicated rows.
    """
    df.drop_duplicates(keep="first", inplace=True)

    return df

def fix_sorting_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Order the dataframe rows by their "time" value, in place.

    Returns:
        The same dataframe object, sorted by "time" (ascending).
    """
    df.sort_values(by="time", inplace=True)

    return df

def fix_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply all dataframe fixes, in place.

    The steps run in this order: time conversion, start-date filtering,
    duplicate removal, sorting by time.

    Returns:
        The same dataframe object after all fixes.
    """
    cleanup_steps = (
        fix_df_time,
        fix_df_time_start,
        fix_df_duplicates,
        fix_sorting_df,
    )
    for step in cleanup_steps:
        step(df)

    return df

In [None]:
def verify_df_sorted(df: pd.DataFrame) -> None:
    """
    Verify that the dataframe is sorted by "time" (non-decreasing).

    An empty dataframe is considered sorted.

    Args:
        df: dataframe with a "time" column.

    Raises:
        ValueError: if "time" is not monotonically increasing; the message
            lists the rows whose time is smaller than the previous row's.
    """
    times = df['time']

    # Cheap check first: nothing else to compute for sorted (or empty) data.
    if times.is_monotonic_increasing:
        return

    # Rows where "time" decreased relative to the previous row. The first row
    # is compared with a missing value, which is never "less than", so it can
    # never be flagged.
    not_sorted_mask = times < times.shift(1)
    not_sorted_rows = df[not_sorted_mask]

    raise ValueError(f"Dataframe is not sorted by time:\n{not_sorted_rows}")

def _describe_adjacent_times(times: np.ndarray, passed: np.ndarray) -> List[str]:
    """Format every pair of neighbouring times whose difference failed a check."""
    return [
        f"{pd.Timestamp(times[i])} and {pd.Timestamp(times[i + 1])}"
        for i in np.flatnonzero(~passed)
    ]


def verify_df_time_diffs(df: pd.DataFrame, 
                         max_diff_tolerance: np.timedelta64 = np.timedelta64(90, 's'), 
                         min_diff_tolerance: np.timedelta64 = np.timedelta64(500, 'ms')) -> None:
    """
    Verify that the time differences between events are within tolerance.
    If time diff >= max_diff_tolerance, just prints a warning (data holes are permitted).
    If time diff <= min_diff_tolerance, raises an exception (possible floating point errors).

    Assumes that the dataframe is non-decreasingly sorted by "time".

    There may be multiple groups of events with the same time; only the
    distinct time values are compared with each other.

    Args:
        df (pd.DataFrame): input dataframe with "time" column
        max_diff_tolerance (np.timedelta64, optional): time difference at or above
            which a hole is reported (warning only); defaults to 90 seconds
        min_diff_tolerance (np.timedelta64, optional): time difference at or below
            which an exception is raised; defaults to 500 milliseconds

    Raises:
        ValueError: when a time difference <= min_diff_tolerance is found
            (possible floating point errors)
    """

    # distinct "time" values, in order of appearance, and the gaps between them
    times = df['time'].unique()
    time_diffs = np.diff(times)

    # gaps that are too large are only reported
    below_max = np.asarray(max_diff_tolerance > time_diffs, dtype=bool)
    if not below_max.all():
        holes = _describe_adjacent_times(times, below_max)
        print("Found time holes out of tolerance at times:", *holes, sep='\n\t')

    # gaps that are too small are treated as errors
    # (possible floating point errors)
    above_min = np.asarray(min_diff_tolerance < time_diffs, dtype=bool)
    if not above_min.all():
        too_close = _describe_adjacent_times(times, above_min)
        raise ValueError(
            "Found time values too close to each other at times " +
            "(possible floating point errors):\n\t" +
            "\n\t".join(too_close))

In [None]:
# Example on a single housekeeping CDF: build a temperature table from it.
cdf = read_cdf(DATA_EXTRACTED.joinpath("juicepsa-pds4-PI-01-juice_rad-20240202T182054/juice_rad/data_raw/rad_raw_hk_20240202.cdf"))

# CDF variable names of the five housekeeping temperature readings.
temp_keys = [
    "HK_Temp1_CEU",
    "HK_PandI_Stack_Temp2",
    "HK_E_Stack_Temp3",
    "HK_DD_Temp4",
    "HK_Temp5_CPU",
]

# Output column names: "time" first, then one label per entry of temp_keys,
# in the same order.
# NOTE(review): the fifth label says "PCU" while the CDF key says "CPU" —
# confirm which one is intended.
alt_keys = [
    "time",
    "CEU Temperature (1)",
    "P&IDH Temperature (2)",
    "EDH Temperature (3)",
    "DDH Temperature (4)",
    "PCU Temperature (5)",
]

# Stack the TIME_UTC values and the five converted temperature series as
# rows, then transpose so that each row of the frame is one sample.
# NOTE(review): mixing time values and floats in one array presumably yields
# object dtype columns — verify if dtypes matter downstream.
df = pd.DataFrame(
    np.vstack([cdf["TIME_UTC"][...], *[convert_hk_temp(cdf[k][...]) for k in temp_keys]]).T, 
    columns=alt_keys)
# Last expression of the cell: displayed by the notebook.
df

In [None]:
# Build one temperature table from all housekeeping CDFs under DATA_EXTRACTED.
# Relies on temp_keys / alt_keys defined in the previous cell.
cdfs = read_housekeeping_cdfs(DATA_EXTRACTED)

dfs = []

# One frame per CDF: TIME_UTC plus the five converted temperature series,
# stacked as rows and transposed so that each row is one sample.
for cdf in cdfs:
    df = pd.DataFrame(
        np.vstack([
            cdf["TIME_UTC"][...], 
            *[convert_hk_temp(cdf[k][...]) for k in temp_keys]]).T, 
        columns=alt_keys
    )
    dfs.append(df)

# del cdfs

# Concatenate the per-file frames (each keeps its own index, so index labels
# repeat) and clean the result in place with fix_df.
df = pd.concat(dfs)
print("DF length before fixing:", len(df))
fix_df(df)
print("DF length after fixing:", len(df))

# for row in df.to_dict(orient="records"):
#     print(row)

# Last expression of the cell: displayed by the notebook.
df

In [None]:
# Sanity checks on the cleaned frame: both raise ValueError on failure
# (unsorted times, or distinct times too close to each other); large gaps
# are only printed as a warning.
verify_df_sorted(df)
verify_df_time_diffs(df)

In [None]:
# Export the cleaned temperature table as CSV, without the index column.
# NOTE(review): assumes the DATA_CSV directory already exists — confirm.
df.to_csv(DATA_CSV.joinpath("hk_temp.csv"), index=False)