# Notebook 1
Export of the raw data into `np.memmap` files.

### Load important modules

In [1]:
# Standard modules
import numpy as np
import os
import logging

import pickle

# Move to root directory for easier module handling.
# The move is guarded so that re-running this cell does not climb two more levels
# each time: the project root is recognised by the "notebooks" package folder that
# the imports below rely on.
if not os.path.isdir("notebooks"):
    os.chdir("../..")
print(os.listdir("."))
from notebooks.data_processing.modules.maldi_conversion import process_raw_data, extract_raw_data
from notebooks.data_processing.modules.lookup_tables import process_lookup_tables
from modules.tools.misc import delete_all_files_in_folder

# multithreading/multiprocessing
from multiprocessing import Pool
from threadpoolctl import threadpool_limits

# Set thread limit: cap the native thread pools controlled by threadpoolctl at 16 threads.
# Called as a plain function (not as a context manager), so the limit is not restored
# afterwards; the returned limiter object is what the cell displays as its output.
# NOTE(review): the Pool(processes=12) cells below may each inherit this cap per worker
# process -- confirm that the resulting total thread count is intended.
threadpool_limits(16)


['data', 'pages', 'app.py', 'assets', 'config.py', 'index.py', 'main.py', 'TODO.py', 'notebooks', 'modules', '__pycache__']


<threadpoolctl.threadpool_limits at 0x7f56902d65e0>

### Create a list of raw data filenames

In [4]:
# Load filenames
# Folder holding one sub-folder per acquisition; each entry of l_t_names points to
# "<folder>/<name>/<name>".
RAW_DATA_FOLDER = "/data/lipidatlas/data/data_raw/"
# Marker that immediately precedes the slice index in an acquisition name
SLICE_MARKER = "MouseBrainCMC_S"

l_t_names = []
for name in os.listdir(RAW_DATA_FOLDER):
    if "MouseBrain" not in name:
        continue
    if SLICE_MARKER not in name:
        # Such a name cannot be parsed below (it used to raise an IndexError), so it is
        # reported and skipped instead of aborting the whole listing.
        print("WARNING: skipping " + name + " (no " + SLICE_MARKER + " marker)")
        continue
    # The index is the leading integer after the marker, e.g. "S12AL1_..." -> 12,
    # "S10(brain2_20)_..." -> 10, "S8_duplicate_..." -> 8
    slice_index = int(name.split(SLICE_MARKER)[1].split("_")[0].split("A")[0].split("(")[0])
    l_t_names.append([slice_index, RAW_DATA_FOLDER + name + "/" + name])

# Sort by slice index (ties are broken by the path)
l_t_names.sort()

# Correct for duplicates: when two consecutive entries share a slice index, the second
# one is tagged with an extra "bis" element
for t_names_1, t_names_2 in zip(l_t_names[:-1], l_t_names[1:]):
    if t_names_2[0] == t_names_1[0]:
        t_names_2.append("bis")
        print("WARNING: duplicate for slice " + str(t_names_1[0]))

# Remove slices that have already been processed
path = "notebooks/data_processing/data/temp/"
os.makedirs(path, exist_ok=True)
remove_already_loaded = False
if remove_already_loaded:
    # The index is read from the second "_"-separated field of the file name, minus its
    # last four characters (presumably a ".npz" extension -- TODO confirm)
    existing_names = {int(name.split("_")[1][:-4]) for name in os.listdir(path) if "slice" in name}
    l_t_names = [x for x in l_t_names if x[0] not in existing_names]

# Print the final list of names
for t_names in l_t_names:
    print(t_names[0], t_names[1].split("/")[-1])


1 20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25um
2 20210211_MouseBrainCMC_S2AB5_2Dpixelmode_370x214_Att25_25um
3 20210213_MouseBrainCMC_S3AC4_2Dpixelmode_371x195_Att25_25um
4 20210214_MouseBrainCMC_S4AD3_2Dpixelmode_354x228_Att25_25um
5 20210218_MouseBrainCMC_S5AE3_2Dpixelmode_396x272_Att25_25um
6 20210219_MouseBrainCMC_S6AE3_2Dpixelmode_423x282_Att25_25um
7 20210220_MouseBrainCMC_S7AF5_2Dpixelmode_427x263_Att25_25um
8 20210531_MouseBrainCMC_S8_duplicate_2Dpixelmode_430x285_Att30_25um
9 20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25um
10 20210210_MouseBrainCMC_S10(brain2_20)_394x282_Att30_25um
11 20210301_MouseBrainCMC_S11AK5_2Dpixelmode_448x277_Att25_25um
12 20210303_MouseBrainCMC_S12AL1_2Dpixelmode_393x266_Att25_25um
13 20210304_MouseBrainCMC_S13AM1_2Dpixelmode_413x310_Att25_25um
14 20210305_MouseBrainCMC_S14AN1_2Dpixelmode_409x285_Att25_25um
15 20210313_MouseBrainCMC_S15AO2_2Dpixelmode_451x292_Att25_25um
16 20210530_MouseBrainCMC_S16_duplicate_2Dpixelmode_454

### Extract raw data into numpy arrays with multiprocessing

In [3]:
# Switch between a 12-worker process pool and a plain sequential run
multiprocessing = True
if not multiprocessing:
    # Normal (single-processed) map; list() exhausts the lazy iterator so every entry is handled
    list(map(extract_raw_data, l_t_names))
else:
    with Pool(processes=12) as pool:
        # Results are yielded in completion order; draining the iterator blocks until
        # all entries of l_t_names have been processed
        list(pool.imap_unordered(extract_raw_data, l_t_names))

Loading files : /data/lipidatlas/data/data_raw/20210414_MouseBrainCMC_S25_2_2Dpixelmode_358x238_Att30_25um/20210414_MouseBrainCMC_S25_2_2Dpixelmode_358x238_Att30_25umLoading files : /data/lipidatlas/data/data_raw/20210419_MouseBrainCMC_S26_3_2Dpixelmode_340x248_Att30_25um/20210419_MouseBrainCMC_S26_3_2Dpixelmode_340x248_Att30_25umLoading files : /data/lipidatlas/data/data_raw/20210313_MouseBrainCMC_S15AO2_2Dpixelmode_451x292_Att25_25um/20210313_MouseBrainCMC_S15AO2_2Dpixelmode_451x292_Att25_25umLoading files : /data/lipidatlas/data/data_raw/20210319_MouseBrainCMC_S17AQ2_2Dpixelmode_450x287_Att30_25um/20210319_MouseBrainCMC_S17AQ2_2Dpixelmode_450x287_Att30_25umLoading files : /data/lipidatlas/data/data_raw/20210323_MouseBrainCMC_S18AR4_2Dpixelmode_474x291_Att30_25um/20210323_MouseBrainCMC_S18AR4_2Dpixelmode_474x291_Att30_25umLoading files : /data/lipidatlas/data/data_raw/20210413_MouseBrainCMC_S24_3_2Dpixelmode_327x328_Att30_25um/20210413_MouseBrainCMC_S24_3_2Dpixelmode_327x328_Att30_25





Loading Sprectra at resolution 1e-05:   0%|          | 0/84845 [00:00<?, ?it/s]



Loading Sprectra at resolution 1e-05:   0%|          | 0/102026 [00:00<?, ?it/s]



Loading Sprectra at resolution 1e-05:   0%|          | 0/84320 [00:00<?, ?it/s]



Loading Sprectra at resolution 1e-05:   0%|          | 0/101184 [00:00<?, ?it/s]



Loading Sprectra at resolution 1e-05:   0%|          | 0/129150 [00:00<?, ?it/s]



Loading Sprectra at resolution 1e-05:   0%|          | 0/107256 [00:00<?, ?it/s]







Loading Sprectra at resolution 1e-05: 100%|██████████| 85204/85204 [01:00<00:00, 1407.31it/s]]]
Loading Sprectra at resolution 1e-05: 100%|██████████| 95160/95160 [01:03<00:00, 1490.31it/s]s]/s]
Loading Sprectra at resolution 1e-05: 100%|██████████| 84320/84320 [01:05<00:00, 1288.48it/s]]t/s]
Loading Sprectra at resolution 1e-05: 100%|██████████| 68540/68540 [01:12<00:00, 944.96it/s]s]it/s]
Loading Sprectra at resolution 1e-05: 100%|██████████| 107256/107256 [01:12<00:00, 1481.46it/s]/s]]
Loading Sprectra at resolution 1e-05: 100%|██████████| 91410/91410 [01:16<00:00, 1202.15it/s]1it/s]
Loading Sprectra at resolution 1e-05: 100%|██████████| 137934/137934 [01:18<00:00, 1765.28it/s]s] ]
Loading Sprectra at resolution 1e-05: 100%|██████████| 84845/84845 [01:24<00:00, 1004.85it/s]40it/s]
Loading Sprectra at resolution 1e-05: 100%|██████████| 131692/131692 [01:25<00:00, 1537.31it/s]s]s]]
Loading Sprectra at resolution 1e-05: 100%|██████████| 129150/129150 [01:26<00:00, 1495.29it/s]it/s]
Loa

Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes


### Process raw data into numpy arrays with multiprocessing

In [6]:
# Switch between a 12-worker process pool and a plain sequential run
multiprocessing = True
if not multiprocessing:
    # Normal (single-processed) map; list() exhausts the lazy iterator so every entry is handled
    list(map(process_raw_data, l_t_names))
else:
    with Pool(processes=12) as pool:
        # Results are yielded in completion order; draining the iterator blocks until
        # all entries of l_t_names have been processed
        list(pool.imap_unordered(process_raw_data, l_t_names))


Compute and normalize pixels values according to TIC
Filtering out noise and matrix peaks
Compute and normalize pixels values according to TIC
Compute and normalize pixels values according to TIC
Filtering out noise and matrix peaks
Compute and normalize pixels values according to TIC
Filtering out noise and matrix peaks
Filtering out noise and matrix peaks
Compute and normalize pixels values according to TIC
Compute and normalize pixels values according to TIC
Filtering out noise and matrix peaks
Filtering out noise and matrix peaks
Compute and normalize pixels values according to TIC
Compute and normalize pixels values according to TIC
Filtering out noise and matrix peaks
Filtering out noise and matrix peaks
Compute and normalize pixels values according to TIC
Compute and normalize pixels values according to TIC
Compute and normalize pixels values according to TIC
Filtering out noise and matrix peaks
Compute and normalize pixels values according to TIC
Filtering out noise and matrix 

### Build lookup tables

In [7]:
# Switch between a 12-worker process pool and a plain sequential run
multiprocessing = True
if not multiprocessing:
    # Normal (single-processed) map; list() exhausts the lazy iterator so every entry is handled
    list(map(process_lookup_tables, l_t_names))
else:
    # Multiprocessing
    with Pool(processes=12) as pool:
        # pool.map blocks until every entry of l_t_names has been processed
        list(pool.map(process_lookup_tables, l_t_names))



Size (in mb) of lookup_table_spectra_high_res:  551.95
Shape of lookup_table_spectra_high_res:  (2000, 72345)
Size (in mb) of lookup_table_spectra_high_res:  615.78
Shape of lookup_table_spectra_high_res:  (2000, 80712)
Size (in mb) of lookup_table_spectra_high_res:  567.49
Shape of lookup_table_spectra_high_res:  (2000, 74382)
Size (in mb) of lookup_table_spectra_high_res:  604.1
Shape of lookup_table_spectra_high_res:  (2000, 79180)
Size (in mb) of lookup_table_spectra_high_res:  797.56
Shape of lookup_table_spectra_high_res:  (2000, 104538)
Size (in mb) of lookup_table_spectra_high_res:  821.78
Shape of lookup_table_spectra_high_res:  (2000, 107712)
Size (in mb) of lookup_table_spectra_high_res:  847.69
Shape of lookup_table_spectra_high_res:  (2000, 111108)
Size (in mb) of lookup_table_spectra_high_res:  934.98
Shape of lookup_table_spectra_high_res:  (2000, 122550)
Size (in mb) of lookup_table_spectra_high_res:  910.08
Shape of lookup_table_spectra_high_res:  (2000, 119286)
Size (

### Record everything and clean 

Record everything in memmap files and a pickled dictionary.

In [9]:
output_folder = "data/whole_dataset/"
input_folder = "notebooks/data_processing/data/temp/"
os.makedirs(output_folder, exist_ok=True)


def _write_memmap(path, array, dtype):
    """Dump `array` into a newly created np.memmap file at `path`, stored as `dtype`.

    Returns the shape of `array`; it has to be recorded separately because the
    memmap file itself is raw binary and does not store it.
    """
    fp = np.memmap(path, dtype=dtype, mode="w+", shape=array.shape)
    fp[:] = array[:]
    fp.flush()
    return array.shape


dic_slices = {}
# Loop over slice files
for slice_name in os.listdir(input_folder):

    # Ignore anything that is not a slice file (same "slice" filter as the one used on
    # this folder when checking for already-processed slices)
    if "slice" not in slice_name:
        continue

    # Extract slice index
    slice_index = int(slice_name.split("_")[1][:-4])

    # Load slice arrays; the context manager closes the archive once they are in memory
    with np.load(input_folder + slice_name) as npzfile:
        array_pixel_indexes_high_res = npzfile["array_pixel_indexes_high_res"]
        array_spectra_high_res = npzfile["array_spectra_high_res"]
        array_averaged_mz_intensity_low_res = npzfile["array_averaged_mz_intensity_low_res"]
        array_averaged_mz_intensity_high_res = npzfile["array_averaged_mz_intensity_high_res"]
        image_shape = npzfile["image_shape"]
        divider_lookup = npzfile["divider_lookup"]
        lookup_table_spectra_high_res = npzfile["lookup_table_spectra_high_res"]
        cumulated_image_lookup_table_high_res = npzfile["cumulated_image_lookup_table_high_res"]
        lookup_table_averaged_spectrum_high_res = npzfile["lookup_table_averaged_spectrum_high_res"]

    # Print the size used by each array in mb (one line per array, in this order)
    for array in (
        array_pixel_indexes_high_res,
        array_spectra_high_res,
        array_averaged_mz_intensity_low_res,
        array_averaged_mz_intensity_high_res,
        lookup_table_spectra_high_res,
        cumulated_image_lookup_table_high_res,
        lookup_table_averaged_spectrum_high_res,
    ):
        print(round(array.nbytes / 1024 / 1024, 2))

    # Register the lightweight arrays in a pickled dictionary
    dic_slices[slice_index] = {
        "image_shape": image_shape,
        "divider_lookup": divider_lookup,
        "array_avg_spectrum_downsampled": array_averaged_mz_intensity_low_res,
        "array_lookup_pixels": array_pixel_indexes_high_res,
        "array_lookup_mz_avg": lookup_table_averaged_spectrum_high_res,
    }

    # Build a memmap for each of the heavier arrays to save RAM, and save the
    # corresponding shape in the pickled dictionary
    dic_slices[slice_index]["array_spectra_shape"] = _write_memmap(
        output_folder + "array_spectra_" + str(slice_index) + ".mmap",
        array_spectra_high_res,
        "float32",
    )
    dic_slices[slice_index]["array_avg_spectrum_shape"] = _write_memmap(
        output_folder + "array_avg_spectrum_" + str(slice_index) + ".mmap",
        array_averaged_mz_intensity_high_res,
        "float32",
    )
    dic_slices[slice_index]["array_lookup_mz_shape"] = _write_memmap(
        output_folder + "array_lookup_mz_" + str(slice_index) + ".mmap",
        lookup_table_spectra_high_res,
        "int32",
    )
    dic_slices[slice_index]["array_cumulated_lookup_mz_image_shape"] = _write_memmap(
        output_folder + "array_cumulated_lookup_mz_image_" + str(slice_index) + ".mmap",
        cumulated_image_lookup_table_high_res,
        "float32",
    )

# Pickle the dict of lightweight data
with open(output_folder + "light_arrays.pickle", "wb") as handle:
    pickle.dump(dic_slices, handle)


0.57
148.1
0.05
0.98
567.49
567.49
0.01
0.6
158.53
0.06
1.1
604.1
604.1
0.01
0.55
71.81
0.05
0.94
551.95
551.95
0.01
0.62
61.66
0.05
1.32
615.78
615.78
0.01
0.82
136.45
0.05
1.13
821.78
821.78
0.01
0.85
273.05
0.06
1.35
847.69
847.69
0.01
0.86
218.57
0.06
1.24
856.79
856.79
0.01
0.8
226.24
0.06
1.21
797.56
797.56
0.01
0.91
210.06
0.06
1.32
910.08
910.08
0.01
0.95
251.87
0.06
1.25
946.78
946.78
0.01
0.93
162.56
0.06
1.2
934.98
934.98
0.01
0.99
270.59
0.06
1.36
990.49
990.49
0.01
0.89
223.25
0.06
1.24
889.32
889.32
0.01
0.98
232.6
0.08
1.8
976.79
976.79
0.01
0.7
144.88
0.05
1.12
700.93
700.93
0.01
0.65
143.6
0.06
1.28
646.29
646.29
0.01
0.8
206.01
0.06
1.24
803.65
803.65
0.01
0.66
133.97
0.06
1.15
656.98
656.98
0.01
1.02
200.49
0.06
1.32
1021.8
1021.8
0.01
0.71
107.58
0.06
1.13
714.11
714.11
0.01
0.52
115.96
0.06
1.12
522.92
522.92
0.01
0.7
150.96
0.06
1.09
697.4
697.4
0.01
0.65
140.57
0.06
1.14
650.05
650.05
0.01
0.65
185.1
0.07
1.33
647.32
647.32
0.01
0.73
134.68
0.05
1.09
726.01
726.0

Clean temporary folder

In [None]:
# Safety switch: left at False so that running the whole notebook never wipes the
# temporary folder; set to True to call delete_all_files_in_folder on input_folder
# (the temp directory the slice files were read from).
clean = False
if clean:
    delete_all_files_in_folder(input_folder)
