# Notebook 1
Raw data export into np.memaps.

### Load important modules

In [1]:
# Standard modules
import numpy as np
import os
import logging

import pickle

# Move to root directory for easier module handling
os.chdir("../..")
print(os.listdir("."))
from notebooks.data_processing.modules.maldi_conversion import process_raw_data, extract_raw_data
from notebooks.data_processing.modules.lookup_tables import process_lookup_tables
from modules.tools.misc import delete_all_files_in_folder

# multithreading/multiprocessing
from multiprocessing import Pool
from threadpoolctl import threadpool_limits

# set thread limit
threadpool_limits(16)


['data', 'pages', 'app.py', 'assets', 'config.py', 'index.py', 'main.py', 'TODO.py', 'notebooks', 'modules', '__pycache__', 'nohup.out']


<threadpoolctl.threadpool_limits at 0x7f22ac4a8c10>

### Create a list of raw data filenames

In [2]:
path_brain_1 =  "/data/lipidatlas/data/data_raw/BRAIN1"
path_brain_2 =  "/data/lipidatlas/data/data_raw/BRAIN2/"
path_brain_1_temp = "/data/lipidatlas/data/app/data/temp/brain_1"#"notebooks/data_processing/data/temp/brain_1"
path_brain_2_temp = "/data/lipidatlas/data/app/data/temp/brain_2"#"notebooks/data_processing/data/temp/brain_2"
split_value_1 = "MouseBrainCMC_S"
split_value_2 = "MouseBrain2_S"
ll_t_names = []
for path_brain, path_brain_temp, split_value in zip([path_brain_1, path_brain_2], [path_brain_1_temp, path_brain_2_temp], [split_value_1,split_value_2]):
    # Load filenames
    l_t_names = sorted(
        [
            [
                int(name.split(split_value)[1].split("_")[0].split("A")[0].split("(")[0]),
                path_brain + name + "/" + name,
            ]
            for name in os.listdir(path_brain)
            if "MouseBrain" in name
        ]
    )

    # Correct for duplicates
    for t_names_1, t_names_2 in zip(l_t_names[:-1], l_t_names[1:]):
        if t_names_2[0] == t_names_1[0]:
            t_names_2.append("bis")
            print("WARNING: duplicate for slice " + str(t_names_1[0]))

    # Remove slices that have already been processed
    os.makedirs(path_brain_temp, exist_ok=True)
    remove_already_loaded = True
    if remove_already_loaded:
        existing_names = [int(name.split("_")[1][:-7]) for name in os.listdir(path_brain_temp) if "raw" in name]
        l_t_names = [x for x in l_t_names if x[0] not in existing_names]

    # Print the final list of names
    for t_names in l_t_names:
        print(t_names[0], t_names[1].split("/")[-1])

    ll_t_names.append(l_t_names)


32 20220130_MouseBrain2_S32_370x325_Att30_25um


In [3]:
brain_1 = False
if brain_1:
    l_t_names = ll_t_names[0]
else:
    l_t_names = ll_t_names[1]

# Print the final list of names
for t_names in l_t_names:
    print(t_names[0], t_names[1].split("/")[-1])

32 20220130_MouseBrain2_S32_370x325_Att30_25um



### Extract raw data into numpy arrays with multiprocessing

In [4]:
if True:
    multiprocessing = True
    if multiprocessing:
        with Pool(processes=7) as pool:
            [x for x in pool.imap_unordered(extract_raw_data, l_t_names)]
    else:
        # Normal (single-processed) map
        [x for x in map(extract_raw_data, l_t_names)]

Loading files : /data/lipidatlas/data/data_raw/BRAIN2/20220130_MouseBrain2_S32_370x325_Att30_25um/20220130_MouseBrain2_S32_370x325_Att30_25um


Loading Sprectra at resolution 1e-05: 100%|██████████| 120250/120250 [01:29<00:00, 1339.80it/s]
Loading the m/z values at resolution 1e-05: 100%|██████████| 120250/120250 [01:26<00:00, 1388.73it/s]
100%|██████████| 120250/120250 [23:07<00:00, 86.64it/s]


Creating and sorting dataframes


### Remove slices already processed

In [5]:
path_brain_1_temp = "/data/lipidatlas/data/app/data/temp/brain_1" if brain_1 else "/data/lipidatlas/data/app/data/temp/brain_2"
existing_names = [int(name.split("_")[1][:-4]) for name in os.listdir(path_brain_temp) if 'raw' not in name]
l_t_names = [x for x in l_t_names if x[0] not in existing_names]
print(l_t_names)

[[32, '/data/lipidatlas/data/data_raw/BRAIN2/20220130_MouseBrain2_S32_370x325_Att30_25um/20220130_MouseBrain2_S32_370x325_Att30_25um'], [33, '/data/lipidatlas/data/data_raw/BRAIN2/20220201_MouseBrain2_S33_359x314_Att30_25um/20220201_MouseBrain2_S33_359x314_Att30_25um'], [34, '/data/lipidatlas/data/data_raw/BRAIN2/20220203_MouseBrain2_S34_377x322_Att30_25um/20220203_MouseBrain2_S34_377x322_Att30_25um'], [35, '/data/lipidatlas/data/data_raw/BRAIN2/20220207_MouseBrain2_S35_375x344_Att30_25um/20220207_MouseBrain2_S35_375x344_Att30_25um'], [36, '/data/lipidatlas/data/data_raw/BRAIN2/20220210_MouseBrain2_S36_363x307_Att30_25um/20220210_MouseBrain2_S36_363x307_Att30_25um'], [37, '/data/lipidatlas/data/data_raw/BRAIN2/20220213_MouseBrain2_S37_354x308_Att30_25um/20220213_MouseBrain2_S37_354x308_Att30_25um'], [38, '/data/lipidatlas/data/data_raw/BRAIN2/20220216_MouseBrain2_S38_363x304_Att30_25um/20220216_MouseBrain2_S38_363x304_Att30_25um'], [39, '/data/lipidatlas/data/data_raw/BRAIN2/20220225_M

### Process raw data into numpy arrays with multiprocessing

In [8]:
multiprocessing = False
if multiprocessing:
    with Pool(processes=16) as pool:
        [x for x in pool.imap_unordered(process_raw_data, l_t_names)]
else:
    # Normal (single-processed) map
    [x for x in map(process_raw_data, l_t_names[1:])]


Compute and normalize pixels values according to TIC
Filtering out noise and matrix peaks
Prepare data for standardization
Standardize data
Sorting by m/z value for averaging
Getting spectrums array averaged accross pixels
Build the low-resolution averaged array from the high resolution averaged array
Sorting by m/z value for averaging after standardization
Getting spectrums array averaged accross pixels
Double sorting according to pixel and mz high-res array
Getting corresponding spectra arrays
Saving : /data/lipidatlas/data/data_raw/BRAIN2/20220201_MouseBrain2_S33_359x314_Att30_25um/20220201_MouseBrain2_S33_359x314_Att30_25um
Compute and normalize pixels values according to TIC
Filtering out noise and matrix peaks
Prepare data for standardization
Standardize data
Sorting by m/z value for averaging
Getting spectrums array averaged accross pixels
Build the low-resolution averaged array from the high resolution averaged array
Sorting by m/z value for averaging after standardization
Gett

### Build lookup tables

In [5]:
multiprocessing = True
if multiprocessing:
    # Multiprocessing
    with Pool(processes=16) as pool:
        [x for x in pool.map(process_lookup_tables, l_t_names)]
else:
    # Normal (single-processed) map
    [x for x in map(process_lookup_tables, l_t_names)]



Size (in mb) of lookup_table_spectra_high_res:  578.98
Shape of lookup_table_spectra_high_res:  (2000, 75888)
Size (in mb) of lookup_table_spectra_high_res:  623.11
Shape of lookup_table_spectra_high_res:  (2000, 81672)
Size (in mb) of lookup_table_spectra_high_res:  637.65
Shape of lookup_table_spectra_high_res:  (2000, 83578)
Size (in mb) of lookup_table_spectra_high_res:  769.65
Shape of lookup_table_spectra_high_res:  (2000, 100879)
Size (in mb) of lookup_table_spectra_high_res:  807.54
Shape of lookup_table_spectra_high_res:  (2000, 105846)
Size (in mb) of lookup_table_spectra_high_res:  886.0
Shape of lookup_table_spectra_high_res:  (2000, 116130)
Size (in mb) of lookup_table_spectra_high_res:  926.49
Shape of lookup_table_spectra_high_res:  (2000, 121437)
Size (in mb) of lookup_table_spectra_high_res:  920.06
Shape of lookup_table_spectra_high_res:  (2000, 120594)
Size (in mb) of lookup_table_spectra_high_res:  986.25
Shape of lookup_table_spectra_high_res:  (2000, 129270)
Size 

FileNotFoundError: [Errno 2] No such file or directory: '/data/lipidatlas/data/app/data/temp/brain_2/slice_32.npz'

### Record everything and clean 

Record everything in memap files and a pickled dictonnary

In [None]:
# ! Change all slices indexes of brain 2 such that they can be in the same output folder as brain 1 

In [None]:
output_folder = "data/whole_dataset/"
if brain_1:
    input_folder = "/data/lipidatlas/data/app/data/temp/brain_1"#"notebooks/data_processing/data/temp/brain_1/"
else:
    input_folder = "/data/lipidatlas/data/app/data/temp/brain_2"#"notebooks/data_processing/data/temp/brain_2/"
        
os.makedirs(output_folder, exist_ok=True)

dic_slices = {}
# Loop over slice files
for slice_name in os.listdir(input_folder):
    if 'raw' in slice_name or 'checkpoints' in slice_name:
        continue

    print(slice_name)
    # Extract slice index
    slice_index = int(slice_name.split("_")[1][:-4])

    # Load slice arrays
    npzfile = np.load(input_folder + slice_name)
    array_pixel_indexes_high_res = npzfile["array_pixel_indexes_high_res"]
    array_spectra_high_res = npzfile["array_spectra_high_res"]
    array_averaged_mz_intensity_low_res = npzfile["array_averaged_mz_intensity_low_res"]
    array_averaged_mz_intensity_high_res = npzfile["array_averaged_mz_intensity_high_res"]
    array_averaged_mz_intensity_high_res_after_standardization = npzfile["array_averaged_mz_intensity_high_res_after_standardization"]
    image_shape = npzfile["image_shape"]
    divider_lookup = npzfile["divider_lookup"]
    lookup_table_spectra_high_res = npzfile["lookup_table_spectra_high_res"]
    cumulated_image_lookup_table_high_res = npzfile["cumulated_image_lookup_table_high_res"]
    lookup_table_averaged_spectrum_high_res = npzfile["lookup_table_averaged_spectrum_high_res"]
    array_peaks_corrected = npzfile["array_peaks_corrected"]
    array_corrective_factors = npzfile["array_corrective_factors"]

    # Print array size
    # print size used by each array in mb
    print(round(array_pixel_indexes_high_res.nbytes / 1024 / 1024, 2))
    print(round(array_spectra_high_res.nbytes / 1024 / 1024, 2))
    print(round(array_averaged_mz_intensity_low_res.nbytes / 1024 / 1024, 2))
    print(round(array_averaged_mz_intensity_high_res.nbytes / 1024 / 1024, 2))
    print(round(lookup_table_spectra_high_res.nbytes / 1024 / 1024, 2))
    print(round(cumulated_image_lookup_table_high_res.nbytes / 1024 / 1024, 2))
    print(round(lookup_table_averaged_spectrum_high_res.nbytes / 1024 / 1024, 2))

    # Register the lightweights files in a pickled dictionnary
    dic_slices[slice_index] = {
        "image_shape": image_shape,
        "divider_lookup": divider_lookup,
        "array_avg_spectrum_downsampled": array_averaged_mz_intensity_low_res,
        "array_lookup_pixels": array_pixel_indexes_high_res,
        "array_lookup_mz_avg": lookup_table_averaged_spectrum_high_res,
        "array_peaks_transformed_lipids": array_peaks_corrected,
        "array_corrective_factors": array_corrective_factors,
    }

    # Build a memap for each of the heavier files to save RAM, save the corresponding shape in the
    # pickled dictionnary
    fp = np.memmap(
        output_folder + "array_spectra_" + str(slice_index) + ".mmap",
        dtype="float32",
        mode="w+",
        shape=array_spectra_high_res.shape,
    )
    fp[:] = array_spectra_high_res[:]
    fp.flush()
    dic_slices[slice_index]["array_spectra_shape"] = array_spectra_high_res.shape

    fp = np.memmap(
        output_folder + "array_avg_spectrum_" + str(slice_index) + ".mmap",
        dtype="float32",
        mode="w+",
        shape=array_averaged_mz_intensity_high_res.shape,
    )
    fp[:] = array_averaged_mz_intensity_high_res[:]
    fp.flush()
    dic_slices[slice_index]["array_avg_spectrum_shape"] = array_averaged_mz_intensity_high_res.shape

    fp = np.memmap(
        output_folder + "array_avg_spectrum_after_standardization_" + str(slice_index) + ".mmap",
        dtype="float32",
        mode="w+",
        shape=array_averaged_mz_intensity_high_res_after_standardization.shape,
    )
    fp[:] = array_averaged_mz_intensity_high_res_after_standardization[:]
    fp.flush()
    dic_slices[slice_index]["array_avg_spectrum_after_standardization_shape"] = array_averaged_mz_intensity_high_res_after_standardization.shape


    fp = np.memmap(
        output_folder + "array_lookup_mz_" + str(slice_index) + ".mmap",
        dtype="int32",
        mode="w+",
        shape=lookup_table_spectra_high_res.shape,
    )
    fp[:] = lookup_table_spectra_high_res[:]
    fp.flush()
    dic_slices[slice_index]["array_lookup_mz_shape"] = lookup_table_spectra_high_res.shape

    fp = np.memmap(
        output_folder + "array_cumulated_lookup_mz_image_" + str(slice_index) + ".mmap",
        dtype="float32",
        mode="w+",
        shape=cumulated_image_lookup_table_high_res.shape,
    )
    fp[:] = cumulated_image_lookup_table_high_res[:]
    fp.flush()
    dic_slices[slice_index][
        "array_cumulated_lookup_mz_image_shape"
    ] = cumulated_image_lookup_table_high_res.shape

# Pickle the dict of lightweight data
with open(output_folder + "light_arrays.pickle", "wb") as handle:
    pickle.dump(dic_slices, handle)


Clean temporary folder

In [None]:
clean = False
if clean:
    delete_all_files_in_folder(input_folder)
