# Basic data processing


In [3]:
import os
import pathlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import plotly.offline as pyo
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
from enum import Enum
from dataclasses import dataclass
from spacepy import pycdf

## Loading data from ftp server

Uses `FTP_PASSWORD` and `FTP_USER` environment variables, then downloads data from the ftp server. If the password contains special characters remember to escape them with `\`.


In [2]:
!mkdir ../data

mkdir: cannot create directory ‘../data’: File exists


In [1]:
!wget -r --user=$FTP_USER --password=$FTP_PASSWORD ftp://ftptrans.psi.ch/to_radem/ -nd -np -P ../data/

--2024-04-20 00:43:19--  ftp://ftptrans.psi.ch/to_radem/
           => ‘../data/.listing’
Resolving ftptrans.psi.ch (ftptrans.psi.ch)... 192.33.120.71
Connecting to ftptrans.psi.ch (ftptrans.psi.ch)|192.33.120.71|:21... connected.
Logging in as  ... 
The server refuses login.
Retrying.

--2024-04-20 00:43:21--  ftp://ftptrans.psi.ch/to_radem/
  (try: 2) => ‘../data/.listing’
Connecting to ftptrans.psi.ch (ftptrans.psi.ch)|192.33.120.71|:21... connected.
Logging in as  ... 
The server refuses login.
Retrying.

--2024-04-20 00:43:24--  ftp://ftptrans.psi.ch/to_radem/
  (try: 3) => ‘../data/.listing’
Connecting to ftptrans.psi.ch (ftptrans.psi.ch)|192.33.120.71|:21... connected.
Logging in as  ... 
The server refuses login.
Retrying.

--2024-04-20 00:43:27--  ftp://ftptrans.psi.ch/to_radem/
  (try: 4) => ‘../data/.listing’
Connecting to ftptrans.psi.ch (ftptrans.psi.ch)|192.33.120.71|:21... connected.
Logging in as  ... 
The server refuses login.
Retrying.

--2024-04-20 00:43:31--  ftp://

## Extracting `.tar.gz` files


In [4]:
# Extracts all tar files from data/ directory
!for f in ../data/*.tar.gz; do tar -xvf "$f" -C ../data/; done;

juicepsa-pds4-PI-01-juice_rad-20230416T180019/juice_rad/data_raw/rad_raw_sc_20230416.lblx
juicepsa-pds4-PI-01-juice_rad-20230416T180019/juice_rad/data_raw/rad_raw_sc_20230416.cdf
juicepsa-pds4-PI-01-juice_rad-20230416T180019/juicepsa-pds4-PI-01-juice_rad-20230416T180019-checksum_manifest.tab
juicepsa-pds4-PI-01-juice_rad-20230416T180019/juicepsa-pds4-PI-01-juice_rad-20230416T180019-transfer_manifest.tab
juicepsa-pds4-PI-01-juice_rad-20230416T180019/juicepsa-pds4-PI-01-juice_rad-20230416T180019.xml
juicepsa-pds4-PI-01-juice_rad-20230419T213312/juice_rad/data_raw/rad_raw_sc_20230418.lblx
juicepsa-pds4-PI-01-juice_rad-20230419T213312/juice_rad/data_raw/rad_raw_sc_20230418.cdf
juicepsa-pds4-PI-01-juice_rad-20230419T213312/juicepsa-pds4-PI-01-juice_rad-20230419T213312-checksum_manifest.tab
juicepsa-pds4-PI-01-juice_rad-20230419T213312/juicepsa-pds4-PI-01-juice_rad-20230419T213312-transfer_manifest.tab
juicepsa-pds4-PI-01-juice_rad-20230419T213312/juicepsa-pds4-PI-01-juice_rad-20230419T21331

In [4]:
# Remove tar.gz files and all non-raw data
!find ../data -maxdepth 1 -type f -delete

In [5]:
for file in os.listdir("../data/"):
    print(os.path.join("../data/", file))

../data/juicepsa-pds4-PI-01-juice_rad-20231019T183131
../data/juicepsa-pds4-PI-01-juice_rad-20231116T151401
../data/juicepsa-pds4-PI-01-juice_rad-20231113T182112
../data/juicepsa-pds4-PI-01-juice_rad-20240228T183603
../data/juicepsa-pds4-PI-01-juice_rad-20231127T184518
../data/juicepsa-pds4-PI-01-juice_rad-20231116T141438
../data/juicepsa-pds4-PI-01-juice_rad-20240117T181317
../data/juicepsa-pds4-PI-01-juice_rad-20231116T140451
../data/juicepsa-pds4-PI-01-juice_rad-20231210T093850
../data/juicepsa-pds4-PI-01-juice_rad-20231116T154523
../data/juicepsa-pds4-PI-01-juice_rad-20230426T084435
../data/juicepsa-pds4-PI-01-juice_rad-20231110T122049
../data/juicepsa-pds4-PI-01-juice_rad-20231026T184446
../data/juicepsa-pds4-PI-01-juice_rad-20231110T123723
../data/juicepsa-pds4-PI-01-juice_rad-20240312T183241
../data/juicepsa-pds4-PI-01-juice_rad-20240116T182711
../data/juicepsa-pds4-PI-01-juice_rad-20230921T182553
../data/juicepsa-pds4-PI-01-juice_rad-20231116T145047
../data/juicepsa-pds4-PI-01-

# Reading raw CDF data


In [6]:
@dataclass
class RawCDF:
    name: str
    date: datetime
    tpe: str  # type
    data: pycdf.CDF

    def count_events(self) -> int:
        total_channels = 31 + 9 + 9  # TODO: Rewrite in terms of cdf Vars
        return total_channels * len(self.data["TIME_UTC"])

In [7]:
def parse_date(filename: str) -> datetime:
    date_string = filename[-12:-4]
    format = '%Y%m%d'
    return datetime.strptime(date_string, format).date()

In [8]:
def parse_type(filename: str) -> str:
    # FixMe: Non exhaustive match
    return 'science' if filename[8:10] == 'sc' else 'housekeeping'

In [9]:
cdfs = [
    RawCDF(name=path.name,
           date=parse_date(path.name),
           tpe=parse_type(path.name),
           data=pycdf.CDF(str(path)))
    for path in pathlib.Path('../data').rglob('*.cdf')
]
science_cdfs = [cdf for cdf in cdfs if cdf.tpe == 'science']

# Experimentation


In [11]:
cdf = science_cdfs[0].data

In [12]:
print(cdf)

DD: CDF_INT8 [1440, 31]
DD_BINS: CDF_BYTE [31] NRV
ELECTRONS: CDF_INT8 [1440, 9]
ELECTRON_BINS: CDF_BYTE [9] NRV
FLUX: CDF_INT8 [1440, 3]
HI_IONS: CDF_INT8 [1440, 8]
HI_ION_BINS: CDF_BYTE [8] NRV
LABEL_DD: CDF_CHAR*9 [31] NRV
LABEL_ELECTRONS: CDF_CHAR*18 [9] NRV
LABEL_FLUX: CDF_CHAR*4 [3] NRV
LABEL_HI_IONS: CDF_CHAR*18 [8] NRV
LABEL_PROTONS: CDF_CHAR*18 [9] NRV
PROTONS: CDF_INT8 [1440, 9]
PROTON_BINS: CDF_BYTE [9] NRV
RADEM_STATUS: CDF_INT2 [1440]
TID: CDF_INT8 [1440]
TIME_OBT: CDF_CHAR*18 [1440]
TIME_RESOLUTION: CDF_UINT2 [1440]
TIME_UTC: CDF_TIME_TT2000 [1440]


In [13]:
def to_dataframe(cdf: pycdf.CDF) -> pd.DataFrame:
    # Read electron channels
    electron_df = pd.concat([
        pd.DataFrame({
            "time": pd.to_datetime(str(time)),
            "event_type": 'e',
            "channel": list(cdf["ELECTRON_BINS"]),
            "value": electrons
        }) for electrons, time in zip(cdf["ELECTRONS"], cdf["TIME_UTC"])
    ])

    # Read proton channels
    proton_df = pd.concat([
        pd.DataFrame({
            "time": pd.to_datetime(str(time)),
            "event_type": 'p',
            "channel": list(cdf["PROTON_BINS"]),
            "value": protons
        }) for protons, time in zip(cdf["PROTONS"], cdf["TIME_UTC"])
    ])

    # Read DD channels
    dd_df = pd.concat([
        pd.DataFrame({
            "time": pd.to_datetime(str(time)),
            "event_type": 'd',
            "channel": list(cdf["DD_BINS"]),
            "value": dd
        }) for dd, time in zip(cdf["DD"], cdf["TIME_UTC"])
    ])

    df = pd.concat([electron_df, proton_df, dd_df])
    df['channel'] = df['channel'].astype("category")
    df['event_type'] = df['event_type'].astype("category")

    return df

## Combining data into a single `DataFrame`


In [17]:
total_events = sum([cdf.count_events() for cdf in science_cdfs])
print(
    f"There are {total_events:,} expected events on all chanells for proton, electron and dd detectors")

There are 21,748,111 expected events on all chanells for proton, electron and dd detectors


In [18]:
df = pd.concat([to_dataframe(cdf.data)
               for i, cdf in enumerate(science_cdfs[:1])])

In [20]:
# Can not be format="fixed" because of cathegorical dtype
df.to_hdf('../data/preprocessed.h5', key='time', format="table")

_With the hdf file already computed you can read it with:_


In [21]:
df = pd.read_hdf('../data/preprocessed.h5')

In [22]:
print(df)

                         time event_type channel  value
0  2023-10-11 00:00:32.787951          e       1      6
1  2023-10-11 00:00:32.787951          e       2     12
2  2023-10-11 00:00:32.787951          e       3     14
3  2023-10-11 00:00:32.787951          e       4     16
4  2023-10-11 00:00:32.787951          e       5     15
..                        ...        ...     ...    ...
26 2023-10-11 23:59:35.181192          d      27      2
27 2023-10-11 23:59:35.181192          d      28      1
28 2023-10-11 23:59:35.181192          d      29      2
29 2023-10-11 23:59:35.181192          d      30      2
30 2023-10-11 23:59:35.181192          d      31      1

[70560 rows x 4 columns]


## Rendering combined data to interactive html


In [None]:
# Separate bins
for channel in range(1, 9):
    for event_type in ['e', 'p']:
        _df = df[df['channel'] == channel]
        _df = _df[_df['event_type'] == event_type]
        fig = px.scatter(_df, x="time", y="value", render_mode='webgl')
        fig.write_html(f"../plots/scatter/{type}_bin{bin}.html")

In [None]:
for type in ['e', 'p']:
    _df = df[df['event_type'] == event_type]
    fig = go.Figure()
    for channel, channel_data in _df.groupby('channel'):
        fig.add_scatter(
            x=channel_data['time'], y=channel_data['value'], name=channel, mode='markers')
    fig.write_html(
        f"../plots/scatter/{type}_combined.html", render_mode='webgl')