# Notebook 1
Export of the raw data into `np.memmap` files.

### Load important modules

In [1]:
# Standard modules
# NOTE(review): `logging` is not used in any of the cells visible in this notebook.
import numpy as np
import os
import logging

import pickle

# Move to root directory for easier module handling.
# NOTE(review): the move is relative, so re-running this cell without restarting the kernel
# climbs two more levels and the relative paths used later in the notebook
# (e.g. "notebooks/data_processing/data/temp/") would point to the wrong place.
os.chdir("../..")
# Print the content of the new working directory as a sanity check
print(os.listdir("."))
# Project-local helpers, imported after the working directory has been changed
from notebooks.data_processing.modules.maldi_conversion import process_raw_data, extract_raw_data
from notebooks.data_processing.modules.lookup_tables import process_lookup_tables
from modules.tools.misc import delete_all_files_in_folder

# multithreading/multiprocessing
from multiprocessing import Pool
from threadpoolctl import threadpool_limits

# set thread limit (16); presumably this caps the native thread pools handled by threadpoolctl,
# confirm against its documentation. The limiter object returned by the call is the last
# expression of the cell, which is why its repr shows up in the cell output.
threadpool_limits(16)


['data', 'pages', 'app.py', 'assets', 'config.py', 'index.py', 'main.py', 'TODO.py', 'notebooks', 'modules', '__pycache__']


<threadpoolctl.threadpool_limits at 0x7faf46836fa0>

### Create a list of raw data filenames

In [4]:
# Folder holding one sub-folder per raw acquisition; the raw file inside each sub-folder is
# addressed with the same name as the sub-folder itself
raw_data_folder = "/data/lipidatlas/data/data_raw/"
# Token that immediately precedes the slice number in an acquisition name
slice_token = "MouseBrainCMC_S"

# Load filenames: build a sorted list of [slice_index, path_to_raw_file] entries.
# The slice index is whatever follows the token, cut at the first "_", "A" or "(".
# Only names containing the full token are kept, so that the split below can never fail on a
# folder that merely contains "MouseBrain".
l_t_names = sorted(
    [
        [
            int(name.split(slice_token)[1].split("_")[0].split("A")[0].split("(")[0]),
            raw_data_folder + name + "/" + name,
        ]
        for name in os.listdir(raw_data_folder)
        if slice_token in name
    ]
)

# Correct for duplicates: the list is sorted, so two acquisitions of the same slice are
# neighbours. The second one gets an extra "bis" element appended to its entry.
for t_names_1, t_names_2 in zip(l_t_names[:-1], l_t_names[1:]):
    if t_names_2[0] == t_names_1[0]:
        t_names_2.append("bis")
        print("WARNING: duplicate for slice " + str(t_names_1[0]))

# Remove slices that have already been processed
path = "notebooks/data_processing/data/temp/"
os.makedirs(path, exist_ok=True)
remove_already_loaded = True
if remove_already_loaded:
    # The index is read from the second "_"-separated field of the file name, minus its last
    # 7 characters (presumably a "raw.npz" suffix written by extract_raw_data -- confirm).
    # NOTE(review): the comparison is on the slice index only, so a "bis" duplicate is dropped
    # as soon as any file for that index exists.
    existing_names = {int(name.split("_")[1][:-7]) for name in os.listdir(path) if "raw" in name}
    l_t_names = [x for x in l_t_names if x[0] not in existing_names]

# Print the final list of names
for t_names in l_t_names:
    print(t_names[0], t_names[1].split("/")[-1])


6 20210219_MouseBrainCMC_S6AE3_2Dpixelmode_423x282_Att25_25um
7 20210220_MouseBrainCMC_S7AF5_2Dpixelmode_427x263_Att25_25um
9 20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25um
12 20210303_MouseBrainCMC_S12AL1_2Dpixelmode_393x266_Att25_25um
17 20210319_MouseBrainCMC_S17AQ2_2Dpixelmode_450x287_Att30_25um
19 20210325_MouseBrainCMC_S19AS4_2Dpixelmode_396x232_Att30_25um
20 20210330_MouseBrainCMC_S20AT3_2Dpixelmode_396x266_Att30_25um
21 20210408_MouseBrainCMC_S21AU4_2Dpixelmode_394x215_Att30_25um
22 20210409_MouseBrainCMC_S22AV1_2Dpixelmode_416x207_Att30_25um
23 20210412_MouseBrainCMC_S23AZ1_2Dpixelmode_360x260_Att30_25um
27 20210603_MouseBrainCMC_S27_duplicate_2Dpixelmode_372x272_Att30_25um
30 20210429_MouseBrainCMC_S30_5_2Dpixelmode_367x278_Att30_25um
31 20210501_MouseBrainCMC_S31_3_2Dpixelmode_355x239_Att30_25um
32 20210504_MouseBrainCMC_S32_3_2Dpixelmode_298x230_Att30_25um


### Extract raw data into numpy arrays with multiprocessing

In [5]:
# Switch between a pool of worker processes and a plain sequential run
multiprocessing = True
if not multiprocessing:
    # Normal (single-processed) map; list() only drains the lazy iterator, results are discarded
    list(map(extract_raw_data, l_t_names))
else:
    # 14 worker processes; results are consumed in completion order and discarded
    with Pool(processes=14) as pool:
        list(pool.imap_unordered(extract_raw_data, l_t_names))

Loading files : /data/lipidatlas/data/data_raw/20210219_MouseBrainCMC_S6AE3_2Dpixelmode_423x282_Att25_25um/20210219_MouseBrainCMC_S6AE3_2Dpixelmode_423x282_Att25_25umLoading files : /data/lipidatlas/data/data_raw/20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25um/20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25umLoading files : /data/lipidatlas/data/data_raw/20210220_MouseBrainCMC_S7AF5_2Dpixelmode_427x263_Att25_25um/20210220_MouseBrainCMC_S7AF5_2Dpixelmode_427x263_Att25_25umLoading files : /data/lipidatlas/data/data_raw/20210303_MouseBrainCMC_S12AL1_2Dpixelmode_393x266_Att25_25um/20210303_MouseBrainCMC_S12AL1_2Dpixelmode_393x266_Att25_25umLoading files : /data/lipidatlas/data/data_raw/20210319_MouseBrainCMC_S17AQ2_2Dpixelmode_450x287_Att30_25um/20210319_MouseBrainCMC_S17AQ2_2Dpixelmode_450x287_Att30_25umLoading files : /data/lipidatlas/data/data_raw/20210325_MouseBrainCMC_S19AS4_2Dpixelmode_396x232_Att30_25um/20210325_MouseBrainCMC_S19AS4_2Dpixelmode_396x232_Att30_25

Loading Sprectra at resolution 1e-05:   0%|          | 0/68540 [00:00<?, ?it/s]



Loading Sprectra at resolution 1e-05:   0%|          | 0/102026 [00:00<?, ?it/s]



Loading Sprectra at resolution 1e-05:   0%|          | 0/112301 [00:00<?, ?it/s]



Loading Sprectra at resolution 1e-05:   0%|          | 0/101184 [00:00<?, ?it/s]







Loading Sprectra at resolution 1e-05:   0%|          | 0/93600 [00:00<?, ?it/s]]



Loading Sprectra at resolution 1e-05: 100%|██████████| 84710/84710 [00:51<00:00, 1642.00it/s]]
Loading Sprectra at resolution 1e-05:  43%|████▎     | 43881/101184 [00:55<00:51, 1108.32it/s]t/s]
Loading Sprectra at resolution 1e-05: 100%|██████████| 93600/93600 [01:03<00:00, 1485.53it/s]]it/s]
Loading Sprectra at resolution 1e-05: 100%|██████████| 105336/105336 [01:06<00:00, 1572.43it/s]t/s]
Loading Sprectra at resolution 1e-05: 100%|██████████| 104538/104538 [01:07<00:00, 1559.92it/s]
Loading Sprectra at resolution 1e-05: 100%|██████████| 68540/68540 [01:13<00:00, 934.26it/s] ]it/s]]
Loading Sprectra at resolution 1e-05: 100%|██████████| 86112/86112 [01:17<00:00, 1108.77it/s]]]s]s]]
Loading Sprectra at resolution 1e-05: 100%|██████████| 119286/119286 [01:18<00:00, 1528.63it/s]s]s]]
Loading Sprectra at resolution 1e-05: 100%|██████████| 112301/112301 [01:21<00:00, 1375.31it/s]t/s]]
Loading Sprectra at resolution 1e-05: 100%|██████████| 84845/84845 [01:22<00:00, 1025.79it/s]s]t/s]]
Loadi

Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes
Creating and sorting dataframes


### Process raw data into numpy arrays with multiprocessing

In [None]:
# Switch between a pool of worker processes and a plain sequential run
multiprocessing = False
if multiprocessing:
    # 12 worker processes; results are consumed in completion order and discarded
    with Pool(processes=12) as pool:
        list(pool.imap_unordered(process_raw_data, l_t_names))
else:
    # Normal (single-processed) map over ALL slices, like the parallel branch, so that the
    # following cells find an output for every entry of l_t_names.
    # For a quick debugging run, temporarily pass l_t_names[:1] instead.
    list(map(process_raw_data, l_t_names))


### Build lookup tables

In [None]:
# Switch between a pool of worker processes and a plain sequential run
multiprocessing = True
if not multiprocessing:
    # Normal (single-processed) map; list() only drains the lazy iterator, results are discarded
    list(map(process_lookup_tables, l_t_names))
else:
    # Multiprocessing: Pool.map blocks until every slice is done and already returns a list,
    # which is simply discarded here
    with Pool(processes=12) as pool:
        pool.map(process_lookup_tables, l_t_names)



### Record everything and clean 

Record everything in memmap files and a pickled dictionary.

In [None]:
output_folder = "data/whole_dataset/"
input_folder = "notebooks/data_processing/data/temp/"
os.makedirs(output_folder, exist_ok=True)


def _write_memmap(array, filename, dtype):
    """Copy `array` into a newly created np.memmap file and return the shape that was written.

    Parameters:
        array: numpy array to store; its values are cast to `dtype` on assignment.
        filename: path of the memmap file, created or overwritten (mode "w+").
        dtype: dtype of the memmap on disk.

    Returns:
        The shape of `array`, which is needed later to re-open the memmap.
    """
    fp = np.memmap(filename, dtype=dtype, mode="w+", shape=array.shape)
    fp[:] = array[:]
    # Make sure the data reaches the disk; the mapping is released when `fp` goes out of scope
    fp.flush()
    return array.shape


dic_slices = {}
# Loop over slice files
for slice_name in os.listdir(input_folder):

    # Extract slice index: second "_"-separated field of the file name, minus its last 4
    # characters. Files that do not follow this pattern are reported and skipped instead of
    # aborting the whole export.
    try:
        slice_index = int(slice_name.split("_")[1][:-4])
    except (IndexError, ValueError):
        print("Skipping " + slice_name + ": no slice index could be parsed from its name")
        continue

    # Load slice arrays; the context manager closes the underlying .npz file once every array
    # has been read into memory
    with np.load(input_folder + slice_name) as npzfile:
        array_pixel_indexes_high_res = npzfile["array_pixel_indexes_high_res"]
        array_spectra_high_res = npzfile["array_spectra_high_res"]
        array_averaged_mz_intensity_low_res = npzfile["array_averaged_mz_intensity_low_res"]
        array_averaged_mz_intensity_high_res = npzfile["array_averaged_mz_intensity_high_res"]
        image_shape = npzfile["image_shape"]
        divider_lookup = npzfile["divider_lookup"]
        lookup_table_spectra_high_res = npzfile["lookup_table_spectra_high_res"]
        cumulated_image_lookup_table_high_res = npzfile["cumulated_image_lookup_table_high_res"]
        lookup_table_averaged_spectrum_high_res = npzfile[
            "lookup_table_averaged_spectrum_high_res"
        ]
        array_peaks_corrected = npzfile["array_peaks_corrected"]
        array_corrective_factors = npzfile["array_corrective_factors"]

    # Print the size used by each of the larger arrays, in MB (one line per array, in the
    # order listed below)
    for array in (
        array_pixel_indexes_high_res,
        array_spectra_high_res,
        array_averaged_mz_intensity_low_res,
        array_averaged_mz_intensity_high_res,
        lookup_table_spectra_high_res,
        cumulated_image_lookup_table_high_res,
        lookup_table_averaged_spectrum_high_res,
    ):
        print(round(array.nbytes / 1024 / 1024, 2))

    # Register the lightweight arrays in the dictionary that gets pickled at the end
    dic_slices[slice_index] = {
        "image_shape": image_shape,
        "divider_lookup": divider_lookup,
        "array_avg_spectrum_downsampled": array_averaged_mz_intensity_low_res,
        "array_lookup_pixels": array_pixel_indexes_high_res,
        "array_lookup_mz_avg": lookup_table_averaged_spectrum_high_res,
        "array_peaks_transformed_lipids": array_peaks_corrected,
        "array_corrective_factors": array_corrective_factors,
    }

    # Build a memmap for each of the heavier arrays to save RAM, and save the corresponding
    # shape in the pickled dictionary (a raw memmap file does not store its own shape)
    dic_slices[slice_index]["array_spectra_shape"] = _write_memmap(
        array_spectra_high_res,
        output_folder + "array_spectra_" + str(slice_index) + ".mmap",
        "float32",
    )
    dic_slices[slice_index]["array_avg_spectrum_shape"] = _write_memmap(
        array_averaged_mz_intensity_high_res,
        output_folder + "array_avg_spectrum_" + str(slice_index) + ".mmap",
        "float32",
    )
    dic_slices[slice_index]["array_lookup_mz_shape"] = _write_memmap(
        lookup_table_spectra_high_res,
        output_folder + "array_lookup_mz_" + str(slice_index) + ".mmap",
        "int32",
    )
    dic_slices[slice_index]["array_cumulated_lookup_mz_image_shape"] = _write_memmap(
        cumulated_image_lookup_table_high_res,
        output_folder + "array_cumulated_lookup_mz_image_" + str(slice_index) + ".mmap",
        "float32",
    )

# Pickle the dict of lightweight data
with open(output_folder + "light_arrays.pickle", "wb") as handle:
    pickle.dump(dic_slices, handle)


Clean temporary folder

In [None]:
# Safety switch, off by default: set to True to call delete_all_files_in_folder on
# `input_folder` (the temporary folder the slice files were read from).
# Presumably this removes every intermediate file, so only enable it once the export above
# has been checked.
clean = False
if clean:
    delete_all_files_in_folder(input_folder)
