# Notebook 1: Raw data export into HDF5

### Load important modules

In [1]:
# Standard imports
import sys
import numpy as np
import pandas as pd
from numba import jit
import os
import shutil
import tables

# Move to root directory for easier module import
# NOTE: the target is relative, so running this cell a second time without
# restarting the kernel climbs two more directories and invalidates the
# relative paths used further down in the notebook.
os.chdir("../../")
from notebooks.server.modules.maldi_conversion import process_raw_data
from notebooks.server.modules.lookup_tables import process_lookup_tables

# multithreading/multiprocessing
from multiprocessing import Pool
from threadpoolctl import threadpool_limits

# set thread limit
# (this call is the last expression of the cell, so the object it returns is
# echoed as the cell output)
threadpool_limits(16)


<threadpoolctl.threadpool_limits at 0x7f15ba6b2ac0>

### Create a list of raw data filenames

In [2]:
# Load filenames
# Every entry is a mutable list [slice index, raw_data_folder/<name>/<name>], so
# that a "bis" tag can be appended to duplicated slices below.
raw_data_folder = "/data/lipidatlas/data/data_raw/"
l_t_names = []
for name in os.listdir(raw_data_folder):
    if "MouseBrain" not in name:
        continue
    try:
        # The slice index is the integer that directly follows "MouseBrainCMC_S"
        # (e.g. "S1AA1" -> 1, "S8_duplicate" -> 8, "S10(brain2_20)" -> 10)
        slice_index = int(
            name.split("MouseBrainCMC_S")[1].split("_")[0].split("A")[0].split("(")[0]
        )
    except (IndexError, ValueError):
        # A folder matching the loose "MouseBrain" filter but lacking a readable
        # index used to abort the whole cell; report it and carry on instead.
        print("WARNING: cannot read a slice index from " + name + ", skipping it")
        continue
    l_t_names.append([slice_index, raw_data_folder + name + "/" + name])

# Order by slice index (ties are broken by the path string)
l_t_names.sort()

# Correct for duplicates
# Neighbouring entries are compared after sorting; the later one of a pair
# sharing the same index gets a third element, "bis".
for t_names_1, t_names_2 in zip(l_t_names[:-1], l_t_names[1:]):
    if t_names_2[0] == t_names_1[0]:
        t_names_2.append("bis")
        print("WARNING: duplicate for slice " + str(t_names_1[0]))

# Remove slices that have already been processed
path = "notebooks/server/data/temp/"
os.makedirs(path, exist_ok=True)
remove_already_loaded = False
if remove_already_loaded:
    # The slice index is read from the second "_"-separated field of each file name
    existing_names = [int(name.split("_")[1]) for name in os.listdir(path) if "slice" in name]
    l_t_names = [x for x in l_t_names if x[0] not in existing_names]

# Print the final list of names
for t_names in l_t_names:
    print(t_names[0], t_names[1].split("/")[-1])


1 20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25um
2 20210211_MouseBrainCMC_S2AB5_2Dpixelmode_370x214_Att25_25um
3 20210213_MouseBrainCMC_S3AC4_2Dpixelmode_371x195_Att25_25um
4 20210214_MouseBrainCMC_S4AD3_2Dpixelmode_354x228_Att25_25um
5 20210218_MouseBrainCMC_S5AE3_2Dpixelmode_396x272_Att25_25um
6 20210219_MouseBrainCMC_S6AE3_2Dpixelmode_423x282_Att25_25um
7 20210220_MouseBrainCMC_S7AF5_2Dpixelmode_427x263_Att25_25um
8 20210531_MouseBrainCMC_S8_duplicate_2Dpixelmode_430x285_Att30_25um
9 20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25um
10 20210210_MouseBrainCMC_S10(brain2_20)_394x282_Att30_25um
11 20210301_MouseBrainCMC_S11AK5_2Dpixelmode_448x277_Att25_25um
12 20210303_MouseBrainCMC_S12AL1_2Dpixelmode_393x266_Att25_25um
13 20210304_MouseBrainCMC_S13AM1_2Dpixelmode_413x310_Att25_25um
14 20210305_MouseBrainCMC_S14AN1_2Dpixelmode_409x285_Att25_25um
15 20210313_MouseBrainCMC_S15AO2_2Dpixelmode_451x292_Att25_25um
16 20210530_MouseBrainCMC_S16_duplicate_2Dpixelmode_454

### Process raw data into numpy arrays with multiprocessing

In [3]:
# Switch between the parallel run over every slice and a serial run
multiprocessing = True
if not multiprocessing:
    # Serial path: only the entry at position 3 of the list is processed
    list(map(process_raw_data, l_t_names[3:4]))
else:
    # Parallel path: 12 worker processes, results consumed as they complete.
    # The results are discarded; draining the iterator is what waits for the workers.
    with Pool(processes=12) as pool:
        list(pool.imap_unordered(process_raw_data, l_t_names))


Loading files : /data/lipidatlas/data/data_raw/20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25um/20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25umLoading files : /data/lipidatlas/data/data_raw/20210220_MouseBrainCMC_S7AF5_2Dpixelmode_427x263_Att25_25um/20210220_MouseBrainCMC_S7AF5_2Dpixelmode_427x263_Att25_25umLoading files : /data/lipidatlas/data/data_raw/20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25um/20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25umLoading files : /data/lipidatlas/data/data_raw/20210210_MouseBrainCMC_S10(brain2_20)_394x282_Att30_25um/20210210_MouseBrainCMC_S10(brain2_20)_394x282_Att30_25umLoading files : /data/lipidatlas/data/data_raw/20210531_MouseBrainCMC_S8_duplicate_2Dpixelmode_430x285_Att30_25um/20210531_MouseBrainCMC_S8_duplicate_2Dpixelmode_430x285_Att30_25umLoading files : /data/lipidatlas/data/data_raw/20210211_MouseBrainCMC_S2AB5_2Dpixelmode_370x214_Att25_25um/20210211_MouseBrainCMC_S2AB5_2Dpixelmode_370x214_Att25_





Loading Sprectra at resolution 1e-05:   0%|          | 0/107712 [00:00<?, ?it/s]



Loading Sprectra at resolution 1e-05:   0%|          | 0/104538 [00:00<?, ?it/s]



Loading Sprectra at resolution 1e-05:   0%|          | 0/112301 [00:00<?, ?it/s]




Loading Sprectra at resolution 1e-05:   0%|          | 0/111108 [00:00<?, ?it/s]



Loading Sprectra at resolution 1e-05:   0%|          | 1/74382 [00:00<13:58:23,  1.48it/s]




Loading Sprectra at resolution 1e-05:   0%|          | 236/74382 [00:00<03:00, 410.97it/s]



Loading Sprectra at resolution 1e-05: 100%|██████████| 72345/72345 [00:35<00:00, 2054.78it/s]
Loading Sprectra at resolution 1e-05: 100%|██████████| 80712/80712 [00:36<00:00, 2188.78it/s]
Loading Sprectra at resolution 1e-05: 100%|██████████| 74382/74382 [00:38<00:00, 1951.70it/s]
Loading Sprectra at resolution 1e-05: 100%|██████████| 79180/79180 [00:40<00:00, 1933.44it/s]
Loading Sprectra at resolution 1e-05: 100%|██████████| 107712/107712 [00:46<00:00, 2304.97it/s]
Loading Sprectra at resolution 1e-05: 100%|██████████| 104538/104538 [00:51<00:00, 2031.85it/s]
Loading Sprectra at resolution 1e-05: 100%|██████████| 122550/122550 [00:54<00:00, 2263.47it/s]
Loading Sprectra at resolution 1e-05: 100%|██████████| 111108/111108 [00:55<00:00, 2015.24it/s]
Loading Sprectra at resolution 1e-05: 100%|██████████| 119286/119286 [01:02<00:00, 1894.87it/s]
Loading Sprectra at resolution 1e-05: 100%|██████████| 112301/112301 [01:05<00:00, 1723.29it/s]
Loading Sprectra at resolution 1e-05: 100%|█████

Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Getting spectrums array averaged accross pixels
Build the low-resolution averaged array from the high resolution averaged array
Double sorting high-res array
Creating and sorting dataframes
Getting corresponding spectra arrays
Saving : /data/lipidatlas/data/data_raw/20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25um/20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25um
Creating and sorting dataframes
Loading files : /data/lipidatlas/data/data_raw/20210304_MouseBrainCMC_S13AM1_2Dpixelmode_413x310_Att25_25um/20210304_MouseBrainCMC_S13AM1_2Dpixelmode_413x310_Att25_25um


Loading Sprectra at resolution 1e-05:  75%|███████▌  | 96442/128030 [00:51<00:15, 2030.39it/s]

Creating and sorting dataframes


Loading Sprectra at resolution 1e-05:  79%|███████▉  | 101257/128030 [00:53<00:14, 1891.04it/s]

### Build lookup tables

In [3]:
# Switch between a parallel run over every slice and a serial run
multiprocessing = False
if multiprocessing:
    # Multiprocessing
    with Pool(processes=12) as pool:
        ll_res = pool.map(process_lookup_tables, l_t_names)
else:
    # Normal (single-processed) map
    # Only the entry at position 3 of the list is processed on this path
    print(l_t_names[3:4])
    # map() returns a lazy iterator in Python 3: without list() nothing consumes
    # it and process_lookup_tables is never actually called.
    ll_res = list(map(process_lookup_tables, l_t_names[3:4]))



cocuo
[[4, '/data/lipidatlas/data/data_raw/20210214_MouseBrainCMC_S4AD3_2Dpixelmode_354x228_Att25_25um/20210214_MouseBrainCMC_S4AD3_2Dpixelmode_354x228_Att25_25um']]


### Record everything as an HDF5 file and clean the temp folder

Record everything in an HDF5 file

In [None]:
output_path = "labe/data/hdf5/slices.hdf5"
input_folder = "notebooks/server/data/temp/"

# Keep only the slice archives, using the same "slice" name filter as the cell
# that lists already-processed slices; any other file in the folder is ignored.
l_slice_names = [slice_name for slice_name in os.listdir(input_folder) if "slice" in slice_name]

# (name of the HDF5 array, key of the array inside the npz archive), in write order
l_array_names = [
    ("array_spectra", "array_spectra_high_res"),
    ("array_lookup_pixels", "array_pixel_indexes_high_res"),
    ("array_avg_intensity", "array_averaged_mz_intensity_high_res"),
    ("array_avg_intensity_downsampled", "array_averaged_mz_intensity_low_res"),
    ("image_shape", "image_shape"),
    ("divider_lookup", "divider_lookup"),
    ("array_lookup_mz", "lookup_table_spectra_high_res"),
    ("array_cumulated_lookup_mz_image", "cumulated_image_lookup_table_high_res"),
    ("array_lookup_mz_avg", "lookup_table_averaged_spectrum_high_res"),
]

# Make sure the destination folder exists before opening the file for writing
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Open the hdf5 file (mode "w": an existing file is overwritten)
with tables.open_file(output_path, mode="w") as hdf5_file:
    # Loop over slice files
    for slice_name in l_slice_names:

        # Load slice arrays; the context manager closes the archive once every
        # array has been read into memory, so no file handle is left open.
        with np.load(input_folder + slice_name) as npzfile:
            d_arrays = {node_name: npzfile[npz_key] for node_name, npz_key in l_array_names}

        # Create a new group in the hdf5 file, named after the second
        # "_"-separated field of the file name
        slice_index = slice_name.split("_")[1]
        group = hdf5_file.create_group("/", slice_index, "Slice " + slice_index)

        # Register all arrays and lookup tables in the newly created group
        for node_name, _npz_key in l_array_names:
            hdf5_file.create_array(group, node_name, d_arrays[node_name])



Clean temporary folder

In [None]:
# Safety switch: nothing is deleted unless this is set to True
clean = False

# The temporary folder is only listed when cleaning is requested
entries = os.listdir(input_folder) if clean else []
for entry in entries:
    target = os.path.join(input_folder, entry)
    try:
        if not (os.path.isfile(target) or os.path.islink(target)):
            # Neither a regular file nor a link: remove it recursively if it is a folder
            if os.path.isdir(target):
                shutil.rmtree(target)
        else:
            os.unlink(target)
    except Exception as err:
        # Best effort: report the failure and move on to the next entry
        print('Failed to delete %s. Reason: %s' % (target, err))
