# Notebook 1
Raw data export into NumPy arrays stored in an HDF5 file.

### Load important modules

In [2]:
# Standard imports
import numpy as np
import os
import shutil
import pickle

# Move to the root directory (two levels above the current working directory)
# so that the project-local imports below can be resolved.
# NOTE(review): the path is relative, so re-running this cell without restarting
# the kernel climbs two more levels each time — run it once per kernel session.
os.chdir("../../")

# Project-local helpers, imported after the chdir above
from notebooks.server.modules.maldi_conversion import process_raw_data
from notebooks.server.modules.lookup_tables import process_lookup_tables
from modules.tools.misc import delete_all_files_in_folder

# Multithreading / multiprocessing
from multiprocessing import Pool
from threadpoolctl import threadpool_limits

# Set the thread limit to 16.
# NOTE(review): presumably this caps the native thread pools (BLAS/OpenMP) used
# under NumPy — confirm against the threadpoolctl documentation.
# As the last expression of the cell, the returned limiter object is echoed as
# the cell output.
threadpool_limits(16)


<threadpoolctl.threadpool_limits at 0x7f400040d5b0>

### Create a list of raw data filenames

In [3]:
# Folder that holds one sub-folder per raw acquisition
raw_data_dir = "/data/lipidatlas/data/data_raw/"
# Text that immediately precedes the slice number in an acquisition name.
# The same marker is used both to select the folders and to parse them, so an
# entry that cannot be parsed is skipped instead of raising an IndexError.
slice_marker = "MouseBrainCMC_S"

# Load filenames as a sorted list of [slice_index, raw_file_prefix] entries.
# The slice index is the run of characters that follows the marker, cut at the
# first "_", "A" or "(" (e.g. "S1AA1_..." -> 1, "S10(brain2_20)_..." -> 10).
l_t_names = sorted(
    [
        [
            int(name.split(slice_marker)[1].split("_")[0].split("A")[0].split("(")[0]),
            raw_data_dir + name + "/" + name,
        ]
        for name in os.listdir(raw_data_dir)
        if slice_marker in name
    ]
)

# Correct for duplicates: the list is sorted, so two acquisitions of the same
# slice are adjacent; the second one is tagged with an extra "bis" element
for t_names_1, t_names_2 in zip(l_t_names[:-1], l_t_names[1:]):
    if t_names_2[0] == t_names_1[0]:
        t_names_2.append("bis")
        print("WARNING: duplicate for slice " + str(t_names_1[0]))

# Remove slices that have already been processed (disabled by default)
path = "notebooks/server/data/temp/"
os.makedirs(path, exist_ok=True)
remove_already_loaded = False
if remove_already_loaded:
    # A set gives constant-time membership tests in the filter below
    existing_names = {int(name.split("_")[1]) for name in os.listdir(path) if "slice" in name}
    l_t_names = [x for x in l_t_names if x[0] not in existing_names]

# Print the final list of names
for t_names in l_t_names:
    print(t_names[0], t_names[1].split("/")[-1])


1 20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25um
2 20210211_MouseBrainCMC_S2AB5_2Dpixelmode_370x214_Att25_25um
3 20210213_MouseBrainCMC_S3AC4_2Dpixelmode_371x195_Att25_25um
4 20210214_MouseBrainCMC_S4AD3_2Dpixelmode_354x228_Att25_25um
5 20210218_MouseBrainCMC_S5AE3_2Dpixelmode_396x272_Att25_25um
6 20210219_MouseBrainCMC_S6AE3_2Dpixelmode_423x282_Att25_25um
7 20210220_MouseBrainCMC_S7AF5_2Dpixelmode_427x263_Att25_25um
8 20210531_MouseBrainCMC_S8_duplicate_2Dpixelmode_430x285_Att30_25um
9 20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25um
10 20210210_MouseBrainCMC_S10(brain2_20)_394x282_Att30_25um
11 20210301_MouseBrainCMC_S11AK5_2Dpixelmode_448x277_Att25_25um
12 20210303_MouseBrainCMC_S12AL1_2Dpixelmode_393x266_Att25_25um
13 20210304_MouseBrainCMC_S13AM1_2Dpixelmode_413x310_Att25_25um
14 20210305_MouseBrainCMC_S14AN1_2Dpixelmode_409x285_Att25_25um
15 20210313_MouseBrainCMC_S15AO2_2Dpixelmode_451x292_Att25_25um
16 20210530_MouseBrainCMC_S16_duplicate_2Dpixelmode_454

### Process raw data into numpy arrays with multiprocessing

In [7]:
# Switch between a pool of worker processes and a plain sequential run
multiprocessing = True
if not multiprocessing:
    # Sequential fallback: handle the slices one after the other in this process
    list(map(process_raw_data, l_t_names))
else:
    # Spread the slices over 12 worker processes. Results are consumed in
    # whatever order they complete; the returned values are not used afterwards.
    with Pool(processes=12) as pool:
        list(pool.imap_unordered(process_raw_data, l_t_names))


Loading Sprectra at resolution 1e-05:   0%|          | 0/129826 [00:00<?, ?it/s]

Loading files : /data/lipidatlas/data/data_raw/20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25um/20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25um


Loading Sprectra at resolution 1e-05: 100%|██████████| 129826/129826 [01:09<00:00, 1875.42it/s]
Loading the m/z values at resolution 1e-05: 100%|██████████| 129826/129826 [01:19<00:00, 1641.06it/s]


Creating and sorting dataframes
Getting spectrums array averaged accross pixels
Build the low-resolution averaged array from the high resolution averaged array
Double sorting high-res array
Getting corresponding spectra arrays
Saving : /data/lipidatlas/data/data_raw/20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25um/20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25um


### Build lookup tables

In [3]:
# Switch between a pool of worker processes and a plain sequential run
multiprocessing = True
if not multiprocessing:
    # Sequential fallback: build the lookup tables slice by slice in this process
    list(map(process_lookup_tables, l_t_names))
else:
    # Spread the slices over 12 worker processes; pool.map blocks until every
    # slice is done and already hands back a list, which is not used afterwards
    with Pool(processes=12) as pool:
        pool.map(process_lookup_tables, l_t_names)



Size (in mb) of lookup_table_spectra_high_res:  551.95
Shape of lookup_table_spectra_high_res:  (2000, 72345)
Size (in mb) of lookup_table_spectra_high_res:  604.1
Shape of lookup_table_spectra_high_res:  (2000, 79180)
Size (in mb) of lookup_table_spectra_high_res:  615.78
Shape of lookup_table_spectra_high_res:  (2000, 80712)
Size (in mb) of lookup_table_spectra_high_res:  567.49
Shape of lookup_table_spectra_high_res:  (2000, 74382)
Size (in mb) of lookup_table_spectra_high_res:  821.78
Shape of lookup_table_spectra_high_res:  (2000, 107712)
Size (in mb) of cumulated_image_lookup_table_high_res:  551.95
Shape of cumulated_image_lookup_table_high_res:  (2000, 195, 371)
Size (in mb) of cumulated_image_lookup_table_high_res:  604.1
Shape of cumulated_image_lookup_table_high_res:  (2000, 214, 370)
Size (in mb) of lookup_table_spectra_high_res:  934.98
Shape of lookup_table_spectra_high_res:  (2000, 122550)
Size (in mb) of lookup_table_spectra_high_res:  847.69
Shape of lookup_table_spect

### Record everything and clean 

Record everything in memmap files and a pickled dictionary.

In [7]:
output_folder = "data/whole_dataset/"
input_folder = "notebooks/server/data/temp/"
os.makedirs(output_folder, exist_ok=True)


def _write_memmap(file_path, array, dtype):
    """Copy `array` into a newly created memory-mapped file.

    Args:
        file_path: Destination of the memmap file (created or overwritten, mode "w+").
        array: NumPy array to store; the memmap is created with the same shape.
        dtype: dtype of the on-disk data; values are cast to it on assignment.

    Returns:
        The shape of `array`, which is needed later to re-open the memmap.
    """
    fp = np.memmap(file_path, dtype=dtype, mode="w+", shape=array.shape)
    fp[:] = array[:]
    fp.flush()
    # fp goes out of scope on return, which releases the mapping
    return array.shape


dic_slices = {}
# Loop over slice files
for slice_name in os.listdir(input_folder):

    # Extract slice index (second "_"-separated field of the file name)
    slice_index = int(slice_name.split("_")[1])

    # Load slice arrays; the context manager closes the archive once every
    # array has been read into memory
    with np.load(input_folder + slice_name) as npzfile:
        array_pixel_indexes_high_res = npzfile["array_pixel_indexes_high_res"]
        array_spectra_high_res = npzfile["array_spectra_high_res"]
        array_averaged_mz_intensity_low_res = npzfile["array_averaged_mz_intensity_low_res"]
        array_averaged_mz_intensity_high_res = npzfile["array_averaged_mz_intensity_high_res"]
        image_shape = npzfile["image_shape"]
        divider_lookup = npzfile["divider_lookup"]
        lookup_table_spectra_high_res = npzfile["lookup_table_spectra_high_res"]
        cumulated_image_lookup_table_high_res = npzfile["cumulated_image_lookup_table_high_res"]
        lookup_table_averaged_spectrum_high_res = npzfile["lookup_table_averaged_spectrum_high_res"]

    # Print the size used by each array in MiB, one value per line, in this order
    for array in (
        array_pixel_indexes_high_res,
        array_spectra_high_res,
        array_averaged_mz_intensity_low_res,
        array_averaged_mz_intensity_high_res,
        lookup_table_spectra_high_res,
        cumulated_image_lookup_table_high_res,
        lookup_table_averaged_spectrum_high_res,
    ):
        print(round(array.nbytes / 1024 / 1024, 2))

    # Register the lightweight arrays in the dictionary that gets pickled below
    dic_slices[slice_index] = {
        "image_shape": image_shape,
        "divider_lookup": divider_lookup,
        "array_avg_spectrum_downsampled": array_averaged_mz_intensity_low_res,
        "array_lookup_pixels": array_pixel_indexes_high_res,
        "array_lookup_mz_avg": lookup_table_averaged_spectrum_high_res,
    }

    # Build a memmap for each of the heavier arrays to save RAM, and save the
    # corresponding shape in the pickled dictionary
    dic_slices[slice_index]["array_spectra_shape"] = _write_memmap(
        output_folder + "array_spectra_" + str(slice_index) + ".mmap",
        array_spectra_high_res,
        "float32",
    )
    dic_slices[slice_index]["array_avg_spectrum_shape"] = _write_memmap(
        output_folder + "array_avg_spectrum_" + str(slice_index) + ".mmap",
        array_averaged_mz_intensity_high_res,
        "float32",
    )
    dic_slices[slice_index]["array_lookup_mz_shape"] = _write_memmap(
        output_folder + "array_lookup_mz_" + str(slice_index) + ".mmap",
        lookup_table_spectra_high_res,
        "int32",
    )
    dic_slices[slice_index]["array_cumulated_lookup_mz_image_shape"] = _write_memmap(
        output_folder + "array_cumulated_lookup_mz_image_" + str(slice_index) + ".mmap",
        cumulated_image_lookup_table_high_res,
        "float32",
    )

# Pickle the dict of lightweight data
with open(output_folder + "light_arrays.pickle", "wb") as handle:
    pickle.dump(dic_slices, handle)


0.85
1153.77
0.0
14.49
847.69
847.69
0.01
0.95
1651.25
0.0
17.26
946.78
946.78
0.01
0.8
1119.94
0.0
13.69
797.56
797.56
0.01
0.98
417.77
0.0
3.56
976.79
976.79
0.01
0.89
505.57
0.0
4.47
889.32
889.32
0.01
1.0
627.08
0.0
5.61
1004.73
1004.73
0.01
1.02
1588.25
0.0
14.67
1021.8
1021.8
0.01
0.99
717.17
0.0
6.01
985.34
985.34
0.01
1.05
575.92
0.0
6.53
1052.35
1052.35
0.01
0.7
548.27
0.0
6.06
700.93
700.93
0.01
0.57
780.53
0.0
7.79
567.49
567.49
0.01
0.8
515.07
0.0
4.67
803.65
803.65
0.01
0.65
289.82
0.0
3.41
646.29
646.29
0.01
0.66
1607.22
0.0
14.74
656.98
656.98
0.01
0.71
1080.59
0.0
12.65
714.11
714.11
0.01
0.82
1119.61
0.0
12.02
818.3
818.3
0.01
0.65
919.0
0.0
10.49
650.05
650.05
0.01
0.64
1136.39
0.0
12.69
643.31
643.31
0.01
0.77
1673.98
0.0
16.13
771.97
771.97
0.01
0.73
1088.1
0.0
10.75
726.01
726.01
0.01
0.7
1463.69
0.0
12.37
697.4
697.4
0.01
0.6
879.32
0.0
10.71
604.1
604.1
0.01
0.78
2342.25
0.0
16.69
778.4
778.4
0.01
0.65
1702.48
0.0
14.47
647.32
647.32
0.01
0.52
1343.16
0.0
14.44
5

Clean temporary folder

In [5]:
# Toggle for emptying the temporary folder.
# `input_folder` is defined in the previous cell ("notebooks/server/data/temp/"),
# so that cell must have been run first.
clean = True
if clean:
    # NOTE(review): presumably removes every file in the folder — confirm against
    # modules.tools.misc.delete_all_files_in_folder
    delete_all_files_in_folder(input_folder)


In [9]:
# ! Quick fix for array_avg_low_res (to be deleted later)
# Recomputes the "array_avg_spectrum_downsampled" entry of every slice stored in
# the pickled dictionary and writes the dictionary back to the same file.
from modules.tools.spectra import reduce_resolution_sorted_array_spectra
from modules.maldi_data import MaldiData

output_folder = "data/whole_dataset/"
# The pickle is the one produced by this notebook; never load a pickle coming
# from an untrusted source
with open(output_folder + "light_arrays.pickle", "rb") as handle:
    dic_light = pickle.load(handle)

data = MaldiData()
# Iterate over the slices actually present in the dictionary (in increasing
# index order) rather than over a hard-coded 1..32 range, which raises a
# KeyError as soon as the dataset does not contain exactly those slices
for slice_index in sorted(dic_light):
    array_averaged_mz_intensity_low_res = reduce_resolution_sorted_array_spectra(
        data.get_array_avg_spectrum(slice_index=slice_index), resolution=10 ** -2
    )
    dic_light[slice_index]["array_avg_spectrum_downsampled"] = array_averaged_mz_intensity_low_res

# Pickle the dict of lightweight data
with open(output_folder + "light_arrays.pickle", "wb") as handle:
    pickle.dump(dic_light, handle)