# Notebook 1
Export the raw data into `np.memmap` files.

### Load important modules

In [1]:
# Standard modules
import numpy as np
import os
import logging

import pickle

# Move to root directory for easier module handling
# NOTE: the path is relative, so re-running this cell without restarting the kernel
# moves two directories further up and breaks the project imports below.
os.chdir("../..")
# Sanity check: list the content of the new working directory
print(os.listdir("."))
# Project-local modules, resolved relative to the working directory set above
from notebooks.data_processing.modules.maldi_conversion import process_raw_data, extract_raw_data
from notebooks.data_processing.modules.lookup_tables import process_lookup_tables
from modules.tools.misc import delete_all_files_in_folder

# multithreading/multiprocessing
from multiprocessing import Pool
from threadpoolctl import threadpool_limits

# set thread limit
# Called as a plain function (not as a context manager), so the limit of 16 threads
# is meant to stay in effect for the rest of the session.
threadpool_limits(16)


['data', 'pages', 'app.py', 'assets', 'config.py', 'index.py', 'main.py', 'TODO.py', 'notebooks', 'modules', '__pycache__', 'nohup.out']


<threadpoolctl.threadpool_limits at 0x7f8d785763a0>

### Create a list of raw data filenames

In [2]:
# Raw acquisition folders (one sub-folder per slice) and temporary output folders, per brain
path_brain_1 = "/data/lipidatlas/data/data_raw/BRAIN1/"
path_brain_2 = "/data/lipidatlas/data/data_raw/BRAIN2/"
path_brain_1_temp = "/data/lipidatlas/data/app/data/temp/brain_1"
path_brain_2_temp = "/data/lipidatlas/data/app/data/temp/brain_2"

# Prefix that immediately precedes the slice number in the acquisition folder names
split_value_1 = "MouseBrainCMC_S"
split_value_2 = "MouseBrain2_S"

# One list of [slice_index, path_to_acquisition] entries per brain, brain 1 first
ll_t_names = []
for path_brain, path_brain_temp, split_value in zip(
    [path_brain_1, path_brain_2],
    [path_brain_1_temp, path_brain_2_temp],
    [split_value_1, split_value_2],
):
    # Load filenames, sorted by slice index.
    # The slice number is the leading digits found right after `split_value`
    # (e.g. "S1AA1_..." -> 1, "S8_duplicate..." -> 8, "S10(brain2_20)_..." -> 10).
    # Filtering on `split_value` itself (rather than on the looser "MouseBrain") guarantees
    # that the split below yields a second element, so a folder that does not follow the
    # naming scheme of the current brain is skipped instead of raising an IndexError.
    l_t_names = sorted(
        [
            int(name.split(split_value)[1].split("_")[0].split("A")[0].split("(")[0]),
            path_brain + name + "/" + name,
        ]
        for name in os.listdir(path_brain)
        if split_value in name
    )

    # Correct for duplicates: when two consecutive entries share the same slice index,
    # tag the second one with "bis"
    for t_names_1, t_names_2 in zip(l_t_names[:-1], l_t_names[1:]):
        if t_names_2[0] == t_names_1[0]:
            t_names_2.append("bis")
            print("WARNING: duplicate for slice " + str(t_names_1[0]))

    # Remove slices that have already been processed (disabled by default)
    os.makedirs(path_brain_temp, exist_ok=True)
    remove_already_loaded = False
    if remove_already_loaded:
        # File names containing "raw" are parsed as "<prefix>_<index>" followed by a 7-character suffix
        existing_names = [int(name.split("_")[1][:-7]) for name in os.listdir(path_brain_temp) if "raw" in name]
        l_t_names = [x for x in l_t_names if x[0] not in existing_names]

    # Print the final list of names
    for t_names in l_t_names:
        print(t_names[0], t_names[1].split("/")[-1])

    ll_t_names.append(l_t_names)


1 20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25um
2 20210211_MouseBrainCMC_S2AB5_2Dpixelmode_370x214_Att25_25um
3 20210213_MouseBrainCMC_S3AC4_2Dpixelmode_371x195_Att25_25um
4 20210214_MouseBrainCMC_S4AD3_2Dpixelmode_354x228_Att25_25um
5 20210218_MouseBrainCMC_S5AE3_2Dpixelmode_396x272_Att25_25um
6 20210219_MouseBrainCMC_S6AE3_2Dpixelmode_423x282_Att25_25um
7 20210220_MouseBrainCMC_S7AF5_2Dpixelmode_427x263_Att25_25um
8 20210531_MouseBrainCMC_S8_duplicate_2Dpixelmode_430x285_Att30_25um
9 20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25um
10 20210210_MouseBrainCMC_S10(brain2_20)_394x282_Att30_25um
11 20210301_MouseBrainCMC_S11AK5_2Dpixelmode_448x277_Att25_25um
12 20210303_MouseBrainCMC_S12AL1_2Dpixelmode_393x266_Att25_25um
13 20210304_MouseBrainCMC_S13AM1_2Dpixelmode_413x310_Att25_25um
14 20210305_MouseBrainCMC_S14AN1_2Dpixelmode_409x285_Att25_25um
15 20210313_MouseBrainCMC_S15AO2_2Dpixelmode_451x292_Att25_25um
16 20210530_MouseBrainCMC_S16_duplicate_2Dpixelmode_454

In [5]:
# Choose which brain the following cells operate on
brain_1 = True

# ll_t_names stores the list of brain 1 at position 0 and the list of brain 2 at position 1
l_t_names = ll_t_names[0 if brain_1 else 1]

# Show the slice index and the acquisition name of every selected slice
for t_names in l_t_names:
    print(t_names[0], os.path.basename(t_names[1]))

1 20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25um
2 20210211_MouseBrainCMC_S2AB5_2Dpixelmode_370x214_Att25_25um
3 20210213_MouseBrainCMC_S3AC4_2Dpixelmode_371x195_Att25_25um
4 20210214_MouseBrainCMC_S4AD3_2Dpixelmode_354x228_Att25_25um
5 20210218_MouseBrainCMC_S5AE3_2Dpixelmode_396x272_Att25_25um
6 20210219_MouseBrainCMC_S6AE3_2Dpixelmode_423x282_Att25_25um
7 20210220_MouseBrainCMC_S7AF5_2Dpixelmode_427x263_Att25_25um
8 20210531_MouseBrainCMC_S8_duplicate_2Dpixelmode_430x285_Att30_25um
9 20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25um
10 20210210_MouseBrainCMC_S10(brain2_20)_394x282_Att30_25um
11 20210301_MouseBrainCMC_S11AK5_2Dpixelmode_448x277_Att25_25um
12 20210303_MouseBrainCMC_S12AL1_2Dpixelmode_393x266_Att25_25um
13 20210304_MouseBrainCMC_S13AM1_2Dpixelmode_413x310_Att25_25um
14 20210305_MouseBrainCMC_S14AN1_2Dpixelmode_409x285_Att25_25um
15 20210313_MouseBrainCMC_S15AO2_2Dpixelmode_451x292_Att25_25um
16 20210530_MouseBrainCMC_S16_duplicate_2Dpixelmode_454

In [6]:
# Keep only slice 20
# The slice is selected by its index (first element of each entry) rather than by its
# position in the list: this stays correct when slices are missing or duplicated, and
# re-running the cell returns the same list instead of an empty one.
slice_to_keep = 20
l_t_names = [t_names for t_names in l_t_names if t_names[0] == slice_to_keep]
l_t_names

[[20,
  '/data/lipidatlas/data/data_raw/BRAIN1/20210330_MouseBrainCMC_S20AT3_2Dpixelmode_396x266_Att30_25um/20210330_MouseBrainCMC_S20AT3_2Dpixelmode_396x266_Att30_25um']]


### Extract raw data into numpy arrays with multiprocessing

In [7]:
# Switch between a pool of worker processes and a plain sequential run
multiprocessing = False
if not multiprocessing:
    # Normal (single-processed) run, one slice after the other
    list(map(extract_raw_data, l_t_names))
else:
    with Pool(processes=7) as pool:
        # Exhaust the iterator so that every slice gets extracted
        list(pool.imap_unordered(extract_raw_data, l_t_names))

Loading files : /data/lipidatlas/data/data_raw/BRAIN1/20210330_MouseBrainCMC_S20AT3_2Dpixelmode_396x266_Att30_25um/20210330_MouseBrainCMC_S20AT3_2Dpixelmode_396x266_Att30_25um






Loading Sprectra at resolution 1e-05: 100%|██████████| 105336/105336 [01:00<00:00, 1745.71it/s]
Loading the m/z values at resolution 1e-05: 100%|██████████| 105336/105336 [00:53<00:00, 1970.79it/s]
100%|██████████| 105336/105336 [20:14<00:00, 86.77it/s]


Creating and sorting dataframes


### Remove slices already processed

In [8]:
# Temporary folder of the brain currently selected through `brain_1`
path_brain_temp = "/data/lipidatlas/data/app/data/temp/brain_1" if brain_1 else "/data/lipidatlas/data/app/data/temp/brain_2"

# Indices of the slices that already have a processed file ("<prefix>_<index>.npz").
# Raw exports and checkpoint entries are skipped, consistently with the export cell
# further down: a checkpoint entry does not follow the naming scheme and would make
# the integer parsing fail.
existing_names = {
    int(name.split("_")[1][:-4])
    for name in os.listdir(path_brain_temp)
    if "raw" not in name and "checkpoints" not in name
}
l_t_names = [x for x in l_t_names if x[0] not in existing_names]

# Print the final list of names
for t_names in l_t_names:
    print(t_names[0], t_names[1].split("/")[-1])

20 20210330_MouseBrainCMC_S20AT3_2Dpixelmode_396x266_Att30_25um


### Process raw data into numpy arrays with multiprocessing

In [9]:
# Switch between a pool of worker processes and a plain sequential run
multiprocessing = False
if not multiprocessing:
    # Normal (single-processed) run, one slice after the other
    list(map(process_raw_data, l_t_names))
else:
    with Pool(processes=16) as pool:
        # Exhaust the iterator so that every slice gets processed
        list(pool.imap_unordered(process_raw_data, l_t_names))


Compute and normalize pixels values according to TIC
Filtering out noise and matrix peaks
Prepare data for standardization
Standardize data
105204 have been transformed, with an average of  0.0018915630584388427 peaks transformed
Sorting by m/z value for averaging


  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)
  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


Getting spectrums array averaged accross pixels
Build the low-resolution averaged array from the high resolution averaged array
Sorting by m/z value for averaging after standardization
Getting spectrums array averaged accross pixels
Double sorting according to pixel and mz high-res array
Getting corresponding spectra arrays
Saving : /data/lipidatlas/data/data_raw/BRAIN1/20210330_MouseBrainCMC_S20AT3_2Dpixelmode_396x266_Att30_25um/20210330_MouseBrainCMC_S20AT3_2Dpixelmode_396x266_Att30_25um


### Build lookup tables

In [10]:
# Switch between a pool of worker processes and a plain sequential run
multiprocessing = False
if not multiprocessing:
    # Normal (single-processed) run, one slice after the other
    list(map(process_lookup_tables, l_t_names))
else:
    # Multiprocessing: pool.map blocks until the lookup tables of all slices are built
    with Pool(processes=16) as pool:
        pool.map(process_lookup_tables, l_t_names)



Size (in mb) of lookup_table_spectra_high_res:  803.65
Shape of lookup_table_spectra_high_res:  (2000, 105336)
Size (in mb) of cumulated_image_lookup_table_high_res:  803.65
Shape of cumulated_image_lookup_table_high_res:  (2000, 266, 396)
Size (in mb) of lookup_table_averaged_spectrum_high_res:  0.01
Shape of lookup_table_averaged_spectrum_high_res:  (2000,)
Saving...


### Record everything and clean 

Record everything in memmap files and a pickled dictionary.

In [11]:
output_folder = "data/whole_dataset/"
os.makedirs(output_folder, exist_ok=True)

# Offset added to the slice indices of brain 2 so that they do not collide with brain 1
# NOTE(review): 22 is presumably the number of slice slots reserved for brain 1 -- confirm
brain_2_index_offset = 22


def _export_memmap(array, path, dtype):
    """Write `array` into a new memory-mapped file and return the shape that was written.

    Args:
        array (np.ndarray): Array to dump on disk.
        path (str): Path of the memmap file; an existing file is overwritten (mode "w+").
        dtype (str): dtype of the memmap; the values of `array` are cast to it on assignment.

    Returns:
        tuple: Shape of `array`, needed later on to re-open the memmap.
    """
    fp = np.memmap(path, dtype=dtype, mode="w+", shape=array.shape)
    fp[:] = array[:]
    fp.flush()
    # Drop the reference so that the mapping is released right away
    del fp
    return array.shape


dic_slices = {}
# Loop over input folders. The loop flag is deliberately not named `brain_1`, so that the
# brain selected earlier in the notebook is not silently overwritten by this cell.
for is_brain_1, input_folder in zip(
    [True, False],
    ["/data/lipidatlas/data/app/data/temp/brain_1/", "/data/lipidatlas/data/app/data/temp/brain_2/"],
):

    # Loop over slice files
    for slice_name in os.listdir(input_folder):
        # Only the processed slice files are exported
        if "raw" in slice_name or "checkpoints" in slice_name:
            continue

        print(slice_name)
        # Extract slice index from "<prefix>_<index>.npz"
        slice_index = int(slice_name.split("_")[1][:-4])

        # Load slice arrays; the context manager closes the .npz archive (and its file
        # handle) as soon as all the arrays have been read into memory
        with np.load(input_folder + slice_name) as npzfile:
            array_pixel_indexes_high_res = npzfile["array_pixel_indexes_high_res"]
            array_spectra_high_res = npzfile["array_spectra_high_res"]
            array_averaged_mz_intensity_low_res = npzfile["array_averaged_mz_intensity_low_res"]
            array_averaged_mz_intensity_high_res = npzfile["array_averaged_mz_intensity_high_res"]
            array_averaged_mz_intensity_high_res_after_standardization = npzfile[
                "array_averaged_mz_intensity_high_res_after_standardization"
            ]
            image_shape = npzfile["image_shape"]
            divider_lookup = npzfile["divider_lookup"]
            lookup_table_spectra_high_res = npzfile["lookup_table_spectra_high_res"]
            cumulated_image_lookup_table_high_res = npzfile["cumulated_image_lookup_table_high_res"]
            lookup_table_averaged_spectrum_high_res = npzfile["lookup_table_averaged_spectrum_high_res"]
            array_peaks_corrected = npzfile["array_peaks_corrected"]
            array_corrective_factors = npzfile["array_corrective_factors"]

        # Print the size used by each array in mb
        for array in (
            array_pixel_indexes_high_res,
            array_spectra_high_res,
            array_averaged_mz_intensity_low_res,
            array_averaged_mz_intensity_high_res,
            lookup_table_spectra_high_res,
            cumulated_image_lookup_table_high_res,
            lookup_table_averaged_spectrum_high_res,
        ):
            print(round(array.nbytes / 1024 / 1024, 2))

        # Update slice index for brain 2
        if not is_brain_1:
            slice_index += brain_2_index_offset

        # Register the lightweight arrays in the dictionary that gets pickled below
        dic_slices[slice_index] = {
            "image_shape": image_shape,
            "divider_lookup": divider_lookup,
            "array_avg_spectrum_downsampled": array_averaged_mz_intensity_low_res,
            "array_lookup_pixels": array_pixel_indexes_high_res,
            "array_lookup_mz_avg": lookup_table_averaged_spectrum_high_res,
            "array_peaks_transformed_lipids": array_peaks_corrected,
            "array_corrective_factors": array_corrective_factors,
        }

        # Heavier arrays: (file/key stem, array, dtype of the memmap). Each one is written to
        # "<output_folder><stem>_<slice_index>.mmap" to save RAM, and its shape is stored in
        # the dictionary under "<stem>_shape".
        heavy_arrays = [
            ("array_spectra", array_spectra_high_res, "float32"),
            ("array_avg_spectrum", array_averaged_mz_intensity_high_res, "float32"),
            (
                "array_avg_spectrum_after_standardization",
                array_averaged_mz_intensity_high_res_after_standardization,
                "float32",
            ),
            ("array_lookup_mz", lookup_table_spectra_high_res, "int32"),
            ("array_cumulated_lookup_mz_image", cumulated_image_lookup_table_high_res, "float32"),
        ]
        try:
            for stem, array, dtype in heavy_arrays:
                dic_slices[slice_index][stem + "_shape"] = _export_memmap(
                    array, output_folder + stem + "_" + str(slice_index) + ".mmap", dtype
                )
        except Exception as e:
            # Best effort: keep exporting the other slices, but say which slice failed.
            # The shapes registered before the failure are kept, the remaining ones are missing.
            print("ERROR while exporting " + slice_name + ": " + repr(e))


# Pickle the dict of lightweight data
with open(output_folder + "light_arrays.pickle", "wb") as handle:
    pickle.dump(dic_slices, handle)


slice_1.npz
0.57
148.1
0.05
0.98
567.49
567.49
0.01
slice_3.npz
0.55
71.81
0.05
0.94
551.95
551.95
0.01
slice_4.npz
0.62
61.66
0.05
1.32
615.78
615.78
0.01
slice_5.npz
0.82
136.45
0.05
1.13
821.78
821.78
0.01
slice_2.npz
0.6
158.53
0.06
1.1
604.1
604.1
0.01
slice_8.npz
0.93
162.56
0.06
1.2
934.98
934.98
0.01
slice_12.npz
0.8
226.24
0.06
1.21
797.56
797.56
0.01
slice_6.npz
0.91
210.06
0.06
1.32
910.08
910.08
0.01
slice_10.npz
0.85
273.05
0.06
1.35
847.69
847.69
0.01
slice_13.npz
0.98
232.6
0.08
1.8
976.79
976.79
0.01
slice_11.npz
0.95
251.87
0.06
1.25
946.78
946.78
0.01
slice_7.npz
0.86
218.57
0.06
1.24
856.79
856.79
0.01
slice_14.npz
0.89
223.25
0.06
1.24
889.32
889.32
0.01
slice_21.npz
0.65
143.6
0.06
1.28
646.29
646.29
0.01
slice_19.npz
0.7
144.88
0.05
1.12
700.93
700.93
0.01
slice_16.npz
1.02
200.49
0.06
1.32
1021.8
1021.8
0.01
slice_17.npz
0.99
178.74
0.06
1.3
985.34
985.34
0.01
slice_23.npz
0.71
107.58
0.06
1.13
714.11
714.11
0.01
slice_15.npz
1.0
228.24
0.06
1.27
1004.73
1004.73


Clean temporary folder

In [None]:
# Set to True to delete the temporary files once they have been exported
clean = False
if clean:
    # The temporary folders are listed explicitly instead of relying on a loop variable left
    # over from the export cell, which only ever pointed at the last folder (brain 2).
    for temp_folder in [
        "/data/lipidatlas/data/app/data/temp/brain_1/",
        "/data/lipidatlas/data/app/data/temp/brain_2/",
    ]:
        delete_all_files_in_folder(temp_folder)
