# Basic data processing

In [2]:
import os
import pathlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import plotly.offline as pyo
import plotly.express as px 
import plotly.graph_objects as go
from datetime import datetime
from enum import Enum
from dataclasses import dataclass
from spacepy import pycdf

This unreleased version of SpacePy is not supported by the SpacePy team.


## Loading data from ftp server
Place your `FTP_PASSWORD` and download data from the ftp server. If the password contains special characters remember to escape them with `\`.

In [8]:
!mkdir ../data

In [9]:
!wget -r --user=ifjagh --password=1\$Welcome1\$ ftp://ftptrans.psi.ch/to_radem/ -nd -np -P ../data/

--2024-02-21 03:17:36--  ftp://ftptrans.psi.ch/to_radem/
           => ‘../data/.listing’
Resolving ftptrans.psi.ch (ftptrans.psi.ch)... 192.33.120.71
Connecting to ftptrans.psi.ch (ftptrans.psi.ch)|192.33.120.71|:21... connected.
Logging in as ifjagh ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /to_radem ... done.
==> PASV ... done.    ==> LIST ... done.

.listing                [ <=>                ]  34.48K  --.-KB/s    in 0.05s   

2024-02-21 03:17:37 (694 KB/s) - ‘../data/.listing’ saved [35310]

Removed ‘../data/.listing’.
--2024-02-21 03:17:37--  ftp://ftptrans.psi.ch/to_radem/Bard%20timeline-review-Oct-2022%20(1).xlsx
           => ‘../data/Bard timeline-review-Oct-2022 (1).xlsx’
==> CWD not required.
==> PASV ... done.    ==> RETR Bard timeline-review-Oct-2022 (1).xlsx ... done.
Length: 21543 (21K)


2024-02-21 03:17:37 (473 KB/s) - ‘../data/Bard timeline-review-Oct-2022 (1).xlsx’ saved [21543]

--2024-02-21 03:17:37--  ftp://ftptran

## Extracting `.tar.gz` files

In [10]:
# Extracts all tar files from data/ directory
!for f in ../data/*.tar.gz; do tar -xvf "$f" -C ../data/; done;

juicepsa-pds4-PI-01-juice_rad-20230416T180019/juice_rad/data_raw/rad_raw_sc_20230416.lblx
juicepsa-pds4-PI-01-juice_rad-20230416T180019/juice_rad/data_raw/rad_raw_sc_20230416.cdf
juicepsa-pds4-PI-01-juice_rad-20230416T180019/juicepsa-pds4-PI-01-juice_rad-20230416T180019-checksum_manifest.tab
juicepsa-pds4-PI-01-juice_rad-20230416T180019/juicepsa-pds4-PI-01-juice_rad-20230416T180019-transfer_manifest.tab
juicepsa-pds4-PI-01-juice_rad-20230416T180019/juicepsa-pds4-PI-01-juice_rad-20230416T180019.xml
juicepsa-pds4-PI-01-juice_rad-20230419T213312/juice_rad/data_raw/rad_raw_sc_20230418.lblx
juicepsa-pds4-PI-01-juice_rad-20230419T213312/juice_rad/data_raw/rad_raw_sc_20230418.cdf
juicepsa-pds4-PI-01-juice_rad-20230419T213312/juicepsa-pds4-PI-01-juice_rad-20230419T213312-checksum_manifest.tab
juicepsa-pds4-PI-01-juice_rad-20230419T213312/juicepsa-pds4-PI-01-juice_rad-20230419T213312-transfer_manifest.tab
juicepsa-pds4-PI-01-juice_rad-20230419T213312/juicepsa-pds4-PI-01-juice_rad-20230419T21331

In [11]:
# Remove tar.gz files and all non-raw data
!find ../data -maxdepth 1 -type f -delete

In [2]:
for file in os.listdir("../data/"):
    print(os.path.join("../data/", file))

../data/juicepsa-pds4-PI-01-juice_rad-20230416T180019
../data/juicepsa-pds4-PI-01-juice_rad-20230419T213312
../data/juicepsa-pds4-PI-01-juice_rad-20230419T213444
../data/juicepsa-pds4-PI-01-juice_rad-20230421T000051
../data/juicepsa-pds4-PI-01-juice_rad-20230421T000226
../data/juicepsa-pds4-PI-01-juice_rad-20230425T092621
../data/juicepsa-pds4-PI-01-juice_rad-20230426T084435
../data/juicepsa-pds4-PI-01-juice_rad-20230426T084607
../data/juicepsa-pds4-PI-01-juice_rad-20230707T185430
../data/juicepsa-pds4-PI-01-juice_rad-20230709T185431
../data/juicepsa-pds4-PI-01-juice_rad-20230720T153618
../data/juicepsa-pds4-PI-01-juice_rad-20230901T105022
../data/juicepsa-pds4-PI-01-juice_rad-20230912T144122
../data/juicepsa-pds4-PI-01-juice_rad-20230912T144248
../data/juicepsa-pds4-PI-01-juice_rad-20230912T144414
../data/juicepsa-pds4-PI-01-juice_rad-20230912T144541
../data/juicepsa-pds4-PI-01-juice_rad-20230912T144708
../data/juicepsa-pds4-PI-01-juice_rad-20230912T144834
../data/juicepsa-pds4-PI-01-

# Reading raw CDF data 

In [3]:
@dataclass
class RawCDF:
    name: str
    date: datetime
    tpe: str #type
    data: pycdf.CDF

    def count_events(self) -> int:
        total_channels = 31 + 9 + 9 # TODO: Rewrite in terms of cdf Vars
        return total_channels * len(self.data["TIME_UTC"])

In [4]:
def parse_date(filename: str) -> datetime:
    date_string = filename[-12:-4]
    format = '%Y%m%d'
    return datetime.strptime(date_string, format).date()

In [5]:
def parse_type(filename: str) -> str:
    return 'science' if filename[8:10] == 'sc' else 'housekeeping' # FixMe: Non exhaustive match

In [6]:
cdfs = [
    RawCDF(name=path.name,
           date=parse_date(path.name),
           tpe=parse_type(path.name),
           data=pycdf.CDF(str(path))) 
    for path in pathlib.Path('../data').rglob('*.cdf')
]
science_cdfs = [cdf for cdf in cdfs if cdf.tpe == 'science']

# Experimentation

In [7]:
cdf = science_cdfs[0].data

In [26]:
print(cdf)

DD: CDF_INT8 [540, 31]
DD_BINS: CDF_BYTE [31] NRV
ELECTRONS: CDF_INT8 [540, 9]
ELECTRON_BINS: CDF_BYTE [9] NRV
FLUX: CDF_INT8 [540, 3]
HI_IONS: CDF_INT8 [540, 8]
HI_ION_BINS: CDF_BYTE [8] NRV
LABEL_DD: CDF_CHAR*9 [31] NRV
LABEL_ELECTRONS: CDF_CHAR*18 [9] NRV
LABEL_FLUX: CDF_CHAR*4 [3] NRV
LABEL_HI_IONS: CDF_CHAR*18 [8] NRV
LABEL_PROTONS: CDF_CHAR*18 [9] NRV
PROTONS: CDF_INT8 [540, 9]
PROTON_BINS: CDF_BYTE [9] NRV
RADEM_STATUS: CDF_INT2 [540]
TID: CDF_INT8 [540]
TIME_OBT: CDF_CHAR*18 [540]
TIME_RESOLUTION: CDF_UINT2 [540]
TIME_UTC: CDF_EPOCH [540]


In [11]:
import time

def to_dataframe(cdf: pycdf.CDF) -> pd.DataFrame:
    start = time.time()

    # Read electron channels
    electron_df = pd.concat([
        pd.DataFrame({
            "time": time,
            "event_type": 'e',
            "channel": list(cdf["ELECTRON_BINS"]),
            "value": electrons
        }) for electrons, time in zip(cdf["ELECTRONS"], cdf["TIME_UTC"])
    ])

    # Read proton channels
    proton_df = pd.concat([
        pd.DataFrame({
            "time": time,
            "event_type": 'p',
            "channel": list(cdf["PROTON_BINS"]),
            "value": protons
        }) for protons, time in zip(cdf["PROTONS"], cdf["TIME_UTC"])
    ])
    
    # Read DD channels
    dd_df = pd.concat([
        pd.DataFrame({
            "time": time,
            "event_type": 'd',
            "channel": list(cdf["DD_BINS"]),
            "value": dd
        }) for dd, time in zip(cdf["DD"], cdf["TIME_UTC"])
    ])

    df = pd.concat([electron_df, proton_df, dd_df])
    df['channel'] = df['channel'].astype("category")
    df['event_type'] = df['event_type'].astype("category")

    return df

In [12]:
df = to_dataframe(cdf)

In [13]:
df.dtypes

time          datetime64[us]
event_type          category
channel             category
value                  int64
dtype: object

In [14]:
print(df)

                      time event_type channel  value
0  2023-04-16 14:05:35.501          e       1      1
1  2023-04-16 14:05:35.501          e       2      1
2  2023-04-16 14:05:35.501          e       3      0
3  2023-04-16 14:05:35.501          e       4      0
4  2023-04-16 14:05:35.501          e       5      0
..                     ...        ...     ...    ...
26 2023-04-16 17:40:52.383          d      27      0
27 2023-04-16 17:40:52.383          d      28      0
28 2023-04-16 17:40:52.383          d      29      0
29 2023-04-16 17:40:52.383          d      30      0
30 2023-04-16 17:40:52.383          d      31      0

[26460 rows x 4 columns]


## Combining data into a single `DataFrame`

In [15]:
total_events = sum([cdf.count_events() for cdf in science_cdfs])
print(f"There are {total_events:,} expected events on all chanells for proton, electron and dd detectors")

There are 19,693,737 expected events on all chanells for proton, electron and dd detectors


In [16]:
df = pd.concat([to_dataframe(cdf.data) for i, cdf in enumerate(science_cdfs)])

In [17]:
print(df)

                         time event_type channel  value
0  2023-04-16 14:05:35.501000          e       1      1
1  2023-04-16 14:05:35.501000          e       2      1
2  2023-04-16 14:05:35.501000          e       3      0
3  2023-04-16 14:05:35.501000          e       4      0
4  2023-04-16 14:05:35.501000          e       5      0
..                        ...        ...     ...    ...
26 2024-02-20 17:45:13.852341          d      27      1
27 2024-02-20 17:45:13.852341          d      28      0
28 2024-02-20 17:45:13.852341          d      29      3
29 2024-02-20 17:45:13.852341          d      30      0
30 2024-02-20 17:45:13.852341          d      31      0

[19693737 rows x 4 columns]


In [18]:
df.to_hdf('../data/preprocessed.h5', key='time', format="table") # Can not be format="fixed" because of cathegorical dtype

*With the hdf file already computed you can read it with:*

In [4]:
df = pd.read_hdf('../data/preprocessed.h5')

In [5]:
print(df)

                            time event_type channel  value
0  1970-01-20 11:07:33.935501000          e       1      1
1  1970-01-20 11:07:33.935501000          e       2      1
2  1970-01-20 11:07:33.935501000          e       3      0
3  1970-01-20 11:07:33.935501000          e       4      0
4  1970-01-20 11:07:33.935501000          e       5      0
..                           ...        ...     ...    ...
26 1970-01-20 18:34:11.113852341          d      27      1
27 1970-01-20 18:34:11.113852341          d      28      0
28 1970-01-20 18:34:11.113852341          d      29      3
29 1970-01-20 18:34:11.113852341          d      30      0
30 1970-01-20 18:34:11.113852341          d      31      0

[19693737 rows x 4 columns]


## Rendering combined data to interactive html

In [6]:
# Separate bins
for channel in range(1, 9):
    for event_type in ['e', 'p']:
        _df = df[df['channel']==channel]
        _df = _df[_df['event_type']==event_type]
        fig = px.scatter(_df, x="time", y="value", render_mode='webgl') 
        fig.write_html(f"../plots/scatter/{type}_bin{bin}.html")


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version thi

KeyboardInterrupt: 

In [10]:
for type in ['e', 'p']:
    _df = df[df['event_type']==event_type]
    fig = go.Figure()
    for channel, channel_data in _df.groupby('channel'):
        fig.add_scatter(x=channel_data['time'], y=channel_data['value'], name=channel, mode='markers')
    fig.write_html(f"../plots/scatter/{type}_combined.html", render_mode='webgl')





KeyboardInterrupt: 