# IREM Data Conversion with the `radem` library


## 1. Download IREM data (optional)

In this section, we will guide you through the process of downloading IREM data (if you haven't downloaded it already).

- This cell supports Linux-based operating systems, for other OSes you can download the data manually.
- We will use the `wget` command to fetch the data from the official [IREM data repository](http://srem.psi.ch/datarepo/V0/irem/).
- The data will be organized into directories for original raw data, and extracted CDF data. You can modify the `DATA_DIR` variable to change the location of the data.
- Additionally, we will ensure that existing files are not downloaded again to save time and bandwidth.
- Running this cell the first time can take a while, grab a coffee.

In [1]:
%%sh
# Data directory to store the data
DATA_DIR="../data/irem"

# Creating data directories
mkdir -p ${DATA_DIR}
mkdir -p ${DATA_DIR}/extracted
mkdir -p ${DATA_DIR}/hdf5

# Create a symlink to the raw directory
DATA_RAW_DIR=${DATA_DIR}/raw
if [ ! -L "$DATA_RAW_DIR" ]; then
    ABS_DATA_RAW_DIR=$(readlink -f ${DATA_RAW_DIR})
    ABS_DATA_DIR=$(readlink -f ${DATA_DIR})
    ln -s ${ABS_DATA_DIR}/srem.psi.ch/datarepo/V0/irem ${ABS_DATA_RAW_DIR}
fi

# Get data recursively, don't download existing files
wget \
    --recursive \
    --no-parent \
    --continue \
    --no-clobber \
    --no-verbose \
    -A gz \
    http://srem.psi.ch/datarepo/V0/irem/ \
    -P ${DATA_DIR} \
    2> ${DATA_DIR}/wget.log # Redirect wget output to a log file to avoid cluttering the notebook

# Remove summary plots dir which we don't care about
rm -rf ${DATA_DIR}/irem/raw/summaryplots

## 2. Notebook setup

In [1]:
import radem
import os
from pathlib import Path
from typing import List
import gzip
from datetime import date

DATA_DIR = Path("../data/irem")
DATA_RAW_DIR = DATA_DIR / "raw"
DATA_EXTRACTED_DIR = DATA_DIR / "extracted"
DATA_HDF5_DIR = DATA_DIR / "hdf5"

## 2. Extract CDF data (optional)

In [11]:
def get_data_raw_filenames(data_raw_dir: Path) -> List[Path]:
    filenames = [data_raw_dir / dirname / filename
                    for dirname in os.listdir(data_raw_dir)
                    for filename in os.listdir(data_raw_dir / dirname)
                    if filename.endswith(".cdf.gz")]
    filenames_sorted = sorted(filenames)
    return filenames_sorted

def extract_data_raw_file(input_filename: Path, output_filename: Path) -> None:
    with open(input_filename, 'rb') as f_in:
        with gzip.open(f_in) as f_decompressed, open(output_filename, 'wb') as f_out:
            f_out.write(f_decompressed.read())

def extract_data_raw_files(data_raw_filenames: List[Path], data_extracted_dir: Path) -> None:
    for filename in data_raw_filenames:
        output_filename = data_extracted_dir / filename.stem
        print(f"Extracting {filename} to {output_filename}")
        if output_filename.exists():
            print(f"Overriding {filename} - already exists.")
        extract_data_raw_file(filename, output_filename)

extract_data_raw_files(
    get_data_raw_filenames(DATA_RAW_DIR),
    DATA_EXTRACTED_DIR)

Extracting ../data/irem/raw/2002/IREM_PACC_20021017.cdf.gz to ../data/irem/extracted/IREM_PACC_20021017.cdf
Overriding ../data/irem/raw/2002/IREM_PACC_20021017.cdf.gz - already exists.
Extracting ../data/irem/raw/2002/IREM_PACC_20021018.cdf.gz to ../data/irem/extracted/IREM_PACC_20021018.cdf
Overriding ../data/irem/raw/2002/IREM_PACC_20021018.cdf.gz - already exists.
Extracting ../data/irem/raw/2002/IREM_PACC_20021019.cdf.gz to ../data/irem/extracted/IREM_PACC_20021019.cdf
Overriding ../data/irem/raw/2002/IREM_PACC_20021019.cdf.gz - already exists.
Extracting ../data/irem/raw/2002/IREM_PACC_20021020.cdf.gz to ../data/irem/extracted/IREM_PACC_20021020.cdf
Overriding ../data/irem/raw/2002/IREM_PACC_20021020.cdf.gz - already exists.
Extracting ../data/irem/raw/2002/IREM_PACC_20021021.cdf.gz to ../data/irem/extracted/IREM_PACC_20021021.cdf
Overriding ../data/irem/raw/2002/IREM_PACC_20021021.cdf.gz - already exists.
Extracting ../data/irem/raw/2002/IREM_PACC_20021022.cdf.gz to ../data/irem/

## 3. Loading CDFs

In [2]:
science_cdfs = radem.loaders.irem.read_irem_cdfs(DATA_EXTRACTED_DIR)

In [76]:
import pandas as pd
for i, cdf in enumerate(science_cdfs):
    times = cdf["EPOCH"][...]
    df = pd.DataFrame({
        "time": cdf["EPOCH"][...],
        "d1_channel1":  cdf["COUNTRATE"][..., 0],
        "d1_channel2":  cdf["COUNTRATE"][..., 1],
        "d1_channel3":  cdf["COUNTRATE"][..., 2],
        "d1_channel4":  cdf["COUNTRATE"][..., 3],
        "d1_channel5":  cdf["COUNTRATE"][..., 4],
        "d2_channel1":  cdf["COUNTRATE"][..., 5],
        "d2_channel2":  cdf["COUNTRATE"][..., 6],
        "coincidence_channel1":   cdf["COUNTRATE"][..., 7],
        "coincidence_channel2":   cdf["COUNTRATE"][..., 8],
        "coincidence_channel3":   cdf["COUNTRATE"][..., 9],
        "coincidence_channel4":   cdf["COUNTRATE"][..., 10],
        "d3_channel1":  cdf["COUNTRATE"][..., 11],
        "d3_channel2":  cdf["COUNTRATE"][..., 12],
        "d3_channel3":  cdf["COUNTRATE"][..., 13],
        "d3_channel4":  cdf["COUNTRATE"][..., 14],
    })
    df.drop_duplicates(inplace=True)
    df.sort_values('time', inplace=True)
    print(df)
    
    break

                       time  d1_channel1  d1_channel2  d1_channel3  \
0   2002-10-17 19:28:16.205     1.785268     1.235955     0.574282   
1   2002-10-17 19:29:36.334     1.550000     1.050000     0.550000   
2   2002-10-17 19:30:56.563     1.569116     1.145704     0.485679   
3   2002-10-17 19:32:16.791     1.300016     0.975012     0.425005   
4   2002-10-17 19:33:36.970     1.608499     1.197022     0.523697   
..                      ...          ...          ...          ...   
135 2002-10-17 23:50:57.330     1.679135     1.263508     0.623441   
136 2002-10-17 23:52:57.483     1.548709     1.015820     0.541216   
137 2002-10-17 23:54:57.583     1.273938     0.965862     0.424646   
138 2002-10-17 23:56:57.736     1.364393     1.006656     0.432612   
139 2002-10-17 23:58:57.885     1.573702     1.174031     0.566200   

     d1_channel4  d1_channel5  d2_channel1  d2_channel2  coincidence_channel1  \
0       0.149813     0.099875     2.896380     0.024969              0.062422 

In [77]:
import pandas as pd
import os

def dupa():
    # Define the HDF5 file path
    hdf5_file_path = DATA_HDF5_DIR / "dupa.h5"
    
    # Check if the file exists and remove it
    if hdf5_file_path.exists():
        os.remove(hdf5_file_path)
    
    science_cdfs = radem.loaders.irem.read_irem_cdfs(
        DATA_EXTRACTED_DIR, 
        date(2000, 1, 1), 
        date(2025, 12, 31))

    print(len(science_cdfs))
    for i, cdf in enumerate(science_cdfs):
        print(i)
        df = pd.DataFrame({
            "time": cdf["EPOCH"][...],
            "d1_channel1":  cdf["COUNTRATE"][..., 0],
            "d1_channel2":  cdf["COUNTRATE"][..., 1],
            "d1_channel3":  cdf["COUNTRATE"][..., 2],
            "d1_channel4":  cdf["COUNTRATE"][..., 3],
            "d1_channel5":  cdf["COUNTRATE"][..., 4],
            "d2_channel1":  cdf["COUNTRATE"][..., 5],
            "d2_channel2":  cdf["COUNTRATE"][..., 6],
            "coincidence_channel1":   cdf["COUNTRATE"][..., 7],
            "coincidence_channel2":   cdf["COUNTRATE"][..., 8],
            "coincidence_channel3":   cdf["COUNTRATE"][..., 9],
            "coincidence_channel4":   cdf["COUNTRATE"][..., 10],
            "d3_channel1":  cdf["COUNTRATE"][..., 11],
            "d3_channel2":  cdf["COUNTRATE"][..., 12],
            "d3_channel3":  cdf["COUNTRATE"][..., 13],
            "d3_channel4":  cdf["COUNTRATE"][..., 14],
        })
        df.drop_duplicates(inplace=True)
        df.sort_values('time', inplace=True)
        df.set_index('time', inplace=True)
        df.to_hdf(hdf5_file_path, 
                  key='data',
                  append=True, 
                  mode='a', 
                  format='table')

    return df

dupa()

7863
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
2

Unnamed: 0_level_0,d1_channel1,d1_channel2,d1_channel3,d1_channel4,d1_channel5,d2_channel1,d2_channel2,coincidence_channel1,coincidence_channel2,coincidence_channel3,coincidence_channel4,d3_channel1,d3_channel2,d3_channel3,d3_channel4
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2024-12-09 00:00:55.047,1.564060,0.915141,0.565724,0.149750,0.083195,2.029950,0.000000,0.049917,0.016639,0.066556,0.382696,1.331115,0.948419,0.332779,0.099834
2024-12-09 00:01:55.193,1.262458,0.897010,0.531561,0.182724,0.132890,2.009967,0.000000,0.016611,0.066445,0.033223,0.265781,1.328926,0.963471,0.348843,0.066446
2024-12-09 00:02:55.319,1.048253,0.549085,0.216306,0.016639,0.016639,1.863561,0.016639,0.049917,0.000000,0.000000,0.266223,1.331115,1.031614,0.366057,0.149750
2024-12-09 00:03:55.393,1.181364,0.815308,0.432612,0.066556,0.016639,1.930116,0.049917,0.016639,0.049917,0.049917,0.366057,1.364393,0.931780,0.332779,0.216306
2024-12-09 00:04:55.534,1.314476,0.881864,0.449251,0.166389,0.133111,1.930149,0.016639,0.033278,0.049918,0.049918,0.199671,1.397671,1.014975,0.482529,0.149750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-09 17:14:03.163,0.965058,0.648918,0.266223,0.066556,0.033278,1.663894,0.000000,0.016639,0.000000,0.066556,0.332779,1.414309,0.948419,0.316140,0.166389
2024-12-09 17:15:03.297,1.129568,0.797342,0.382060,0.083056,0.033223,1.561462,0.000000,0.016611,0.033223,0.000000,0.149502,1.362126,0.830565,0.332226,0.149502
2024-12-09 17:16:03.450,1.362126,0.913621,0.365449,0.049834,0.016611,1.710963,0.000000,0.000000,0.033223,0.066445,0.315615,1.046512,0.830565,0.348837,0.066445
2024-12-09 17:17:03.582,1.050018,0.733346,0.366673,0.033334,0.016667,1.900000,0.016667,0.083335,0.033334,0.033334,0.216670,1.100000,0.800000,0.233333,0.033333


In [17]:
import pandas as pd
import os

aa = pd.read_hdf(DATA_HDF5_DIR / f"dupa.h5", key='data')

In [18]:
aa

Unnamed: 0_level_0,d1_channel1,d1_channel2,d1_channel3,d1_channel4,d1_channel5,d2_channel1,d2_channel2,coincidence_channel1,coincidence_channel2,coincidence_channel3,coincidence_channel4,d3_channel1,d3_channel2,d3_channel3,d3_channel4
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2002-10-17 19:28:16.205,1.785268,1.235955,0.574282,0.149813,0.099875,2.896380,0.024969,0.062422,0.074906,0.024969,0.499376,1.747815,1.223471,0.599251,0.187266
2002-10-17 19:29:36.334,1.550000,1.050000,0.550000,0.125000,0.100000,2.112500,0.000000,0.050000,0.025000,0.025000,0.337500,1.500000,1.075000,0.512500,0.225000
2002-10-17 19:30:56.563,1.569116,1.145704,0.485679,0.161893,0.099626,2.166901,0.012453,0.024907,0.037360,0.074721,0.311336,1.544209,1.120797,0.535492,0.124533
2002-10-17 19:32:16.791,1.300016,0.975012,0.425005,0.075001,0.037500,2.400060,0.000000,0.025001,0.062502,0.062502,0.337513,1.712500,1.225000,0.462500,0.175000
2002-10-17 19:33:36.970,1.608499,1.197022,0.523697,0.149628,0.149628,2.493766,0.074813,0.062345,0.062345,0.074814,0.374070,1.471322,1.109726,0.299252,0.074813
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-09 17:14:03.163,0.965058,0.648918,0.266223,0.066556,0.033278,1.663894,0.000000,0.016639,0.000000,0.066556,0.332779,1.414309,0.948419,0.316140,0.166389
2024-12-09 17:15:03.297,1.129568,0.797342,0.382060,0.083056,0.033223,1.561462,0.000000,0.016611,0.033223,0.000000,0.149502,1.362126,0.830565,0.332226,0.149502
2024-12-09 17:16:03.450,1.362126,0.913621,0.365449,0.049834,0.016611,1.710963,0.000000,0.000000,0.033223,0.066445,0.315615,1.046512,0.830565,0.348837,0.066445
2024-12-09 17:17:03.582,1.050018,0.733346,0.366673,0.033334,0.016667,1.900000,0.016667,0.083335,0.033334,0.033334,0.216670,1.100000,0.800000,0.233333,0.033333


In [5]:
aa.index.value_counts()

time
2024-12-09 17:18:03.695    1
2002-10-17 19:28:16.205    1
2002-10-17 19:29:36.334    1
2002-10-17 19:30:56.563    1
2002-10-17 19:32:16.791    1
                          ..
2002-10-17 19:50:59.044    1
2002-10-17 19:49:38.892    1
2002-10-17 19:48:18.747    1
2002-10-17 19:46:58.639    1
2002-10-17 19:45:38.551    1
Name: count, Length: 10885485, dtype: int64

In [7]:
from_date = date(2024, 1, 1)
to_date = date(2024, 12, 31)

for from_year, to_year in zip(range(2002, 2024), range(2003, 2025)):
    from_date = date(from_year, 1, 1)
    to_date = date(to_year, 1, 1)
    
    df_d1, df_d2, df_d3, df_coincidence = radem.loaders.irem.read_science_cdfs(
        DATA_EXTRACTED_DIR, 
        from_date, 
        to_date)
    
    # radem.loaders.irem.save_hdf5(
    #     DATA_HDF5_DIR / f"irem_d1_d2_d3_coincidence_{from_year}_{to_year}.h5",
    #     df_d1, df_d2, df_d3, df_coincidence
    # )
    
    
    print(f"Loaded data for the period from {from_date} to {to_date}")
    break

Loaded data for the period from 2002-01-01 to 2003-01-01
