# Notebook 1
Export of the raw data into `np.memmap` files.

### Load important modules

In [1]:
# Standard modules
import numpy as np
import os
import shutil

import pickle

# Move to root directory for easier module handling.
# NOTE: the path is relative, so re-running this cell without restarting the kernel moves up two
# more directories and the project-local imports below stop resolving.
os.chdir("../..")
# Sanity check: the listing should show the content of the repository root
print(os.listdir('.'))
from notebooks.data_processing.modules.maldi_conversion import process_raw_data
from notebooks.data_processing.modules.lookup_tables import process_lookup_tables
from modules.tools.misc import delete_all_files_in_folder

# multithreading/multiprocessing
from multiprocessing import Pool
from threadpoolctl import threadpool_limits

# set thread limit (called as a plain function, not as a context manager; being the last
# expression of the cell, the returned object is echoed in the cell output)
threadpool_limits(16)


['data', 'pages', 'app.py', 'assets', 'config.py', 'index.py', 'main.py', 'TODO.py', 'notebooks', 'modules', '__pycache__']


<threadpoolctl.threadpool_limits at 0x7f3cbe1d85e0>

### Create a list of raw data filenames

In [26]:
# Build one [slice number, path prefix] entry per raw acquisition folder. The path prefix is
# "<folder>/<folder>", i.e. the common stem of the files stored inside the acquisition folder.
raw_data_root = "/data/lipidatlas/data/data_raw/"
l_t_names = []
for name in os.listdir(raw_data_root):
    if "MouseBrain" not in name:
        continue
    # The slice number is the run of digits right after "MouseBrainCMC_S"; it may be followed by
    # "_", by a label starting with "A" or by an opening parenthesis
    slice_tag = name.split("MouseBrainCMC_S")[1].split("_")[0]
    slice_number = int(slice_tag.split("A")[0].split("(")[0])
    l_t_names.append([slice_number, raw_data_root + name + "/" + name])
l_t_names.sort()

# Flag the second entry of every pair of consecutive entries sharing the same slice number
for previous_entry, current_entry in zip(l_t_names, l_t_names[1:]):
    if current_entry[0] == previous_entry[0]:
        current_entry.append("bis")
        print("WARNING: duplicate for slice " + str(previous_entry[0]))

# Optionally skip the slices that already have a file in the temporary folder
path = "notebooks/data_processing/data/temp/"
os.makedirs(path, exist_ok=True)
remove_already_loaded = False
if remove_already_loaded:
    existing_names = []
    for temp_file in os.listdir(path):
        if "slice" in temp_file:
            existing_names.append(int(temp_file.split("_")[1]))
    l_t_names = [entry for entry in l_t_names if entry[0] not in existing_names]

# Display the slice number and the acquisition name of every remaining entry
for t_names in l_t_names:
    acquisition_name = t_names[1].split("/")[-1]
    print(t_names[0], acquisition_name)


1 20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25um
2 20210211_MouseBrainCMC_S2AB5_2Dpixelmode_370x214_Att25_25um
3 20210213_MouseBrainCMC_S3AC4_2Dpixelmode_371x195_Att25_25um
4 20210214_MouseBrainCMC_S4AD3_2Dpixelmode_354x228_Att25_25um
5 20210218_MouseBrainCMC_S5AE3_2Dpixelmode_396x272_Att25_25um
6 20210219_MouseBrainCMC_S6AE3_2Dpixelmode_423x282_Att25_25um
7 20210220_MouseBrainCMC_S7AF5_2Dpixelmode_427x263_Att25_25um
8 20210531_MouseBrainCMC_S8_duplicate_2Dpixelmode_430x285_Att30_25um
9 20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25um
10 20210210_MouseBrainCMC_S10(brain2_20)_394x282_Att30_25um
11 20210301_MouseBrainCMC_S11AK5_2Dpixelmode_448x277_Att25_25um
12 20210303_MouseBrainCMC_S12AL1_2Dpixelmode_393x266_Att25_25um
13 20210304_MouseBrainCMC_S13AM1_2Dpixelmode_413x310_Att25_25um
14 20210305_MouseBrainCMC_S14AN1_2Dpixelmode_409x285_Att25_25um
15 20210313_MouseBrainCMC_S15AO2_2Dpixelmode_451x292_Att25_25um
16 20210530_MouseBrainCMC_S16_duplicate_2Dpixelmode_454

### Process raw data into numpy arrays with multiprocessing

In [3]:
########################################################################################

In [122]:
# Development shortcut: the step-by-step cells below only work on the first entry of l_t_names
# (requires the filename-listing cell to have been run first)
t_index_path =  l_t_names[0]
from modules.tools.mspec import SmzMLobj
import pandas as pd
from numba import njit

def load_file(path, resolution=1e-5):
    """This function loads the specified MALDI file from the raw data format (.mzML and .UDP) 
    with the given resolution, and returns the loaded SmzMLobj.

    Args:
        path (string): The path of the file to load, without extension (".mzML" and ".UDP" are 
            appended to it).
        resolution (float, optional): The m/z resolution passed to SmzMLobj. Defaults to 1e-5.

    Returns:
        SmzMLobj: The loaded object (not a bare sparse matrix). The spectra matrix is available 
            through its 'S' attribute.
    """
    # Load object from SmzMLobj
    smz = SmzMLobj(path + ".mzML", path + ".UDP", mz_resolution=resolution)
    smz.load(load_unique_mz=True)

    # Access the shape of the spectra matrix; the value itself is discarded.
    # NOTE(review): presumably smz.S is built lazily and this access forces its construction
    # before the object is returned -- confirm against the SmzMLobj implementation.
    smz.S.shape
    return smz


def process_sparse_matrix(smz, sort=["Pixel", "m/z"], sample=False):
    """This function converts the sparse spectra matrix of 'smz' into a dataframe sorted according 
    to the 'sort' parameter. It is possible to work only on a tiny subset of the matrix with the 
    'sample' parameter for debugging purposes. 

    The conversion is done in one shot from the coordinate (COO) representation of the matrix 
    instead of slicing it pixel by pixel, which yields the same (pixel, m/z, intensity) triplets in 
    the same order before sorting, without a Python-level loop over the pixels.

    Args:
        smz (SmzMLobj): The object obtained from the MALDI imaging. Its attributes 'S' (sparse 
            matrix, one row per pixel and one column per m/z value), 'mz_vals' (m/z value of each 
            column) and 'img_shape' are used.
        sort (str or list, optional): A column name or a list of column names according to which 
            the final dataframe should be sorted. Defaults to ["Pixel", "m/z"].
        sample (bool, optional): If True, only the first 11 pixels (rows 0 to 10) are converted. 
            Defaults to False.

    Returns:
        pandas.Dataframe: A sorted dataframe with three columns: pixels index ("Pixel", int64), m/z 
            ("m/z", same dtype as smz.mz_vals), and intensity value ("Intensity", same dtype as 
            smz.S). The image shape is stored in df.attrs["image_shape"].
    """
    # CSR stores the entries row by row, i.e. pixel by pixel
    S_row = smz.S.tocsr()

    # Debugging mode: restrict the matrix to its first 11 rows (pixels 0 to 10 included)
    if sample:
        S_row = S_row[:11, :]

    # The COO representation of a CSR matrix lists the stored entries in row-major order
    S_coo = S_row.tocoo()

    # Explicitly stored zeros are not part of the spectra
    is_non_zero = S_coo.data != 0
    column_indices = S_coo.col[is_non_zero]

    # Build the dataframe directly from the coordinate arrays
    df = pd.DataFrame(
        {
            "Pixel": S_coo.row[is_non_zero].astype(np.int64),
            "m/z": np.asarray(smz.mz_vals)[column_indices],
            "Intensity": S_coo.data[is_non_zero],
        }
    )

    # Sort
    df = df.sort_values(by=sort, axis=0)

    # Store image size as metadata
    df.attrs["image_shape"] = smz.img_shape
    return df

@njit
def compute_TIC_per_pixel(array_spectra, n_pixels):
    """This function sums, for each pixel, the intensities of all the rows of 'array_spectra' 
    belonging to that pixel, i.e. it computes the Total Ion Content (TIC) per pixel.

    Args:
        array_spectra (np.ndarray): A two-dimensional numpy array with one row per spectrum entry 
            and three columns: pixel index, m/z and intensity.
        n_pixels (int): Number of pixels in the acquisition.

    Returns:
        np.ndarray: A float32 numpy array of length n_pixels containing the TIC of each pixel 
            (0 for the pixels that have no row in 'array_spectra').
    """
    tic_per_pixel = np.zeros((n_pixels,), dtype = np.float32)
    n_rows = array_spectra.shape[0]
    for row in range(n_rows):
        # The pixel index is stored as a float in the first column
        pixel = int(array_spectra[row, 0])
        tic_per_pixel[pixel] += array_spectra[row, 2]
    return tic_per_pixel

@njit
def normalize_per_TIC_per_pixel(array_spectra, array_TIC):
    """This function divides each intensity value by the Total Ion Content (TIC) of the pixel it 
    belongs to. The normalization is done in place: the input array is modified and returned.

    Args:
        array_spectra (np.ndarray): A two-dimensional numpy array with one row per spectrum entry 
            and three columns: pixel index, m/z and intensity.
        array_TIC (np.ndarray): A numpy array of len n_pixels containing the TIC for each pixel.

    Returns:
        np.ndarray: The same array as 'array_spectra', whose third column now contains the 
            TIC-normalized intensities.
    """
    n_rows = array_spectra.shape[0]
    for row in range(n_rows):
        # The pixel index is stored as a float in the first column
        pixel = int(array_spectra[row, 0])
        array_spectra[row, 2] /= array_TIC[pixel]
    return array_spectra


def load_peak_file(path):
    """This function loads the peaks annotations from the file "ranges.csv" located in the same 
    folder as the file designated by 'path', drops the columns that are not needed afterwards, 
    and returns the remaining annotations as a numpy array sorted by increasing "min" column.

    Args:
        path (string): The path of a file of the acquisition. Only its parent folder is used 
            (everything before the last "/").

    Returns:
        np.ndarray: A two-dimensional array with one row per annotated peak, sorted by the "min" 
            column, containing every column of the csv file except the dropped ones.
    """
    # The annotation file sits next to the raw data files and is always named "ranges.csv"
    parent_folder = path.rpartition("/")[0]
    annotation_path = parent_folder + "/ranges"
    annotations = pd.read_csv(annotation_path + ".csv", sep=",")

    # Columns of the csv file that are not used afterwards
    unused_columns = [
        "Unnamed: 0",
        "pixel_max_hits",
        "percent_1_hit",
        "concentration",
        "median_intensity",
        "difference",
    ]
    annotations = annotations.drop(columns=unused_columns)

    # Order the peaks by increasing lower m/z bound before converting to numpy
    return annotations.sort_values(by="min", axis=0).to_numpy()

def load_lipid_file(section_index, path):
    """This function reads the lipid annotations stored in the csv file located at 'path', keeps 
    the rows belonging to the requested section, and returns their estimated m/z values sorted 
    by increasing value.

    Args:
        section_index (int): The index of the current acquisition (first slice having index 1). 
            It is compared to the "section_ix" column of the file after subtracting 1.
        path (string): The path of the csv file containing the lipids annotations.

    Returns:
        np.ndarray: A unidimensional float32 array containing the sorted values of the 
            "mz_estimated" column for the requested section.
    """
    annotations = pd.read_csv(path, sep=",")

    # These columns are not used afterwards
    annotations = annotations.drop(
        columns=["molecule_ID", "concentration", "mz_estimated_total"]
    )

    # The file numbers the sections from 0, while section_index starts at 1
    is_current_section = annotations["section_ix"] == section_index - 1
    mz_current_section = annotations[is_current_section]["mz_estimated"]

    # Return a sorted numpy array of mz values
    return np.sort(np.array(mz_current_section, dtype=np.float32))


@njit
def filter_peaks(array_spectra, array_peaks, array_mz_lipids):
    """This function is used to filter out all the spectrum data in 'array_spectra' that 
    has not been annotated as peak in 'array_peaks' and that do not belong to 'array_mz_lipids'.

    The three inputs are walked through simultaneously with one cursor each, which is why they 
    must all be sorted by increasing m/z.

    Args:
        array_spectra (np.ndarray): A numpy array containing spectrum data (pixel index, m/z and 
            intensity), sorted by mz (but not necessarily by pixel index).
        array_peaks (np.ndarray): A numpy array containing the peak annotations (min peak, max peak, 
            number of pixels containing the peak, average value of the peak), sorted by min_mz.
        array_mz_lipids (np.ndarray): A 1-D numpy array containing the mz values of the lipids we 
            want to visualize, sorted by increasing value.

    Returns:
        list: Row indices of 'array_spectra' corresponding to peaks that have been annotated and 
            belong to lipids we want to visualize.
        list: m/z values of the lipids we want to visualize that have been kept.
    """
    # Define initial values
    l_to_keep = []
    idx_peak = 0
    idx_curr_mz = 0
    idx_lipid = 0
    l_n_pix = []
    l_mz_lipids_kept = []
    # numba cannot infer the item type of an empty set, so the set is created with an int inside
    # which is removed right away
    set_pix = {0}
    set_pix.remove(0)

    # Without any lipid to look for, nothing can be kept (and reading the first lipid below would
    # be an out-of-bounds access)
    n_lipids = array_mz_lipids.shape[0]
    if n_lipids == 0:
        return l_to_keep, l_mz_lipids_kept
    mz_lipid = array_mz_lipids[idx_lipid]

    while (idx_curr_mz < array_spectra.shape[0] and idx_peak < array_peaks.shape[0]):
        idx_pix, mz, intensity = array_spectra[idx_curr_mz]
        min_mz, max_mz, n_pix, mz_estimated = array_peaks[idx_peak]

        # Either we are before the current window (the lower bound itself counts as 'before')
        if mz <= min_mz:
            idx_curr_mz += 1

        # Either current mz is in the current window
        elif mz >= min_mz and mz <= max_mz:
            # Adapt the index of the current lipid. The bound is checked before reading the next
            # lipid, as out-of-bounds reads are not detected in compiled code
            while mz_lipid < min_mz and idx_lipid < n_lipids:
                idx_lipid += 1
                if idx_lipid < n_lipids:
                    mz_lipid = array_mz_lipids[idx_lipid]

            # If we've explored all lipids already, exit the loop
            if idx_lipid == n_lipids:
                break

            # If mz lipid is not in the current peak, move on to the next
            if mz_lipid > max_mz or np.abs(mz_estimated - mz_lipid)>2*10**-4:
                #idx_curr_mz += 1
                idx_peak += 1
                l_n_pix.append(len(set_pix))
                set_pix.clear()
            else:
                # mz belong to a lipid we want to visualize
                l_to_keep.append(idx_curr_mz)
                set_pix.add(idx_pix)
                idx_curr_mz += 1
                # Record each kept lipid only once
                if len(l_mz_lipids_kept)==0:
                    l_mz_lipids_kept.append(mz_lipid)
                elif mz_lipid!=l_mz_lipids_kept[-1]:
                    l_mz_lipids_kept.append(mz_lipid)

        # Either we're beyond, in which case we move the window, and record the number of unique
        # pixels in the window for later check
        else:
            idx_peak += 1
            l_n_pix.append(len(set_pix))
            set_pix.clear()


    # * This piece of code is commented because it is not usable as such since the introduction of
    # * array_mz_lipids as argument in the function, but it should still work if molecules not 
    # * belonging to array_mz_lipids are not excluded
    # if verbose:
    #     # Check that the pixel recorded are identical to the expected number of pixels recorded
    #     print(
    #         "Difference between number of recorded pixels",
    #         np.sum(np.array(l_n_pix) - array_peaks[:, 2]),
    #     )
    #     print(np.array(l_n_pix)[-10:])
    #     print(array_peaks[-10:, 2])

    return l_to_keep, l_mz_lipids_kept



In [5]:
# Get slice index and path prefix of the acquisition selected in t_index_path
slice_index = t_index_path[0]
name = t_index_path[1]

# Load the file (only the high-resolution version, 1e-5, is loaded here) and keep the image shape
print("Loading files : " + name)
smz_high_res = load_file(name, resolution=1e-5)
image_shape = smz_high_res.img_shape



Loading files : /data/lipidatlas/data/data_raw/20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25um/20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25um


Loading Sprectra at resolution 1e-05: 100%|██████████| 74382/74382 [00:36<00:00, 2020.96it/s]
Loading the m/z values at resolution 1e-05: 100%|██████████| 74382/74382 [00:40<00:00, 1846.35it/s]


In [6]:
# Convert the spectra matrix into a dataframe sorted by m/z only (filter_peaks expects the spectra
# to be sorted by m/z, not by pixel)
print("Creating and sorting dataframes")
df_high_res = process_sparse_matrix(smz_high_res, sort="m/z")
# Convert df into arrays for easier manipulation with numba
# (columns: pixel index, m/z, intensity)
array_high_res = df_high_res.to_numpy()

print(array_high_res)

Creating and sorting dataframes
[[16053.           399.99847      154.28901672]
 [25948.           399.99859       97.53426361]
 [26059.           399.99865      114.94727325]
 ...
 [  945.          1599.98657      109.41999817]
 [33881.          1599.98718      130.5670166 ]
 [46157.          1599.98791      129.86520386]]


In [8]:
# Get the TIC per pixel for normalization (must be done before filtering out peaks).
# The number of pixels is the product of the two image dimensions.
array_TIC = compute_TIC_per_pixel(array_high_res, image_shape[0]*image_shape[1])
# Quick look at the first and last three values
print(array_TIC[:3], array_TIC[-3:])


[ 2428.39  76434.164 62198.562] [608459.44 564191.56 609122.94]


In [9]:
# Development helper: keep an untouched copy of the raw spectra, since the normalization below
# modifies array_high_res in place
array_high_res_temp = np.copy(array_high_res)

In [123]:
# Development helper: restore the raw spectra from the backup so that the cells below can be
# re-run without normalizing (or filtering) twice
array_high_res = np.copy(array_high_res_temp)

In [124]:
# Normalize per TIC (in place: running this cell twice divides the intensities twice, restore
# array_high_res from array_high_res_temp first)
array_high_res = normalize_per_TIC_per_pixel(array_high_res, array_TIC)

In [125]:
# Check the result: only the third column (intensity) should have changed
print(array_high_res)

[[1.60530000e+04 3.99998470e+02 7.90472953e-05]
 [2.59480000e+04 3.99998590e+02 8.21656337e-05]
 [2.60590000e+04 3.99998650e+02 7.39860517e-05]
 ...
 [9.45000000e+02 1.59998657e+03 6.14320742e-04]
 [3.38810000e+04 1.59998718e+03 1.36067671e-04]
 [4.61570000e+04 1.59998791e+03 1.24361924e-04]]


In [126]:
# Keep only the annotated peaks
# NOTE(review): 'appendix' is not used anywhere in the visible cells -- confirm it is still needed
appendix = "_filtered"

# Get the peak annotation file ("ranges.csv" located in the folder of the acquisition)
array_peaks = load_peak_file(name)

# Get the list of m/z values to keep for visualization
# NOTE(review): the section index is hard-coded to 1, which matches t_index_path only as long as
# it points to the first slice -- presumably slice_index should be passed instead, to be confirmed
array_mz_lipids = load_lipid_file(1, path = 'data/annotations/df_match.csv')

# Filter out all the undesired values (l_to_keep_high_res contains row indices of array_high_res)
l_to_keep_high_res, l_mz_lipids_kept = filter_peaks(array_high_res, array_peaks, array_mz_lipids)
array_high_res = array_high_res[l_to_keep_high_res]

In [2]:
import scipy.stats as sps
from scipy.interpolate import interp1d

def load_parameters_files(path = "/data/lipidatlas/data/processed/normalization_parameters/"):
    """This function loads the normalization parameters stored as header-less csv files in the 
    folder 'path'.

    Columns are selected by position with .iloc, since plain indexing of a dataframe with 
    [:, 1] is not supported by pandas and raises an exception.

    Args:
        path (str, optional): The folder containing the files 'sigma_s1.csv', 'sigma_s2.csv', 
            'sigma_v1.csv', 'sigma_v2.csv', 'u1.csv' and 'u2.csv'. The filenames are appended to 
            it as is, so it must end with a path separator.

    Returns:
        np.ndarray: array_s1, second column of 'sigma_s1.csv' (one parameter per slice, ordered by 
            increasing slice index).
        np.ndarray: array_s2, second column of 'sigma_s2.csv' (same layout as array_s1).
        np.ndarray: array_v1, second column of 'sigma_v1.csv' (one parameter per lipid).
        np.ndarray: array_v2, second column of 'sigma_v2.csv' (same layout as array_v1).
        np.ndarray: array_mz, first column of 'sigma_v1.csv' (average m/z value of each lipid).
        np.ndarray: array_u1, all the columns of 'u1.csv' but the first one (one row per lipid, 
            refer to array_mz for the corresponding m/z values).
        np.ndarray: array_u2, all the columns of 'u2.csv' but the first one (same layout as 
            array_u1).
    """

    def _read_headerless_csv(filename):
        # None of the parameter files has a header row
        return pd.read_csv(path + filename, sep=",", header=None, skiprows=0)

    # array_s1 and array_s2 contain n_slices parameters (ordered by increasing slice index)
    array_s1 = _read_headerless_csv("sigma_s1.csv").iloc[:, 1].to_numpy()
    array_s2 = _read_headerless_csv("sigma_s2.csv").iloc[:, 1].to_numpy()

    # array_v1 and array_v2 contain n_lipids parameters (1st column is average mz of the lipid).
    # sigma_v1.csv is parsed once and used for both the parameters and the m/z values.
    df_v1 = _read_headerless_csv("sigma_v1.csv")
    array_v1 = df_v1.iloc[:, 1].to_numpy()
    array_v2 = _read_headerless_csv("sigma_v2.csv").iloc[:, 1].to_numpy()

    # array_mz contains the m/z values of the n_lipids parameters
    array_mz = df_v1.iloc[:, 0].to_numpy()

    # array_u1 and array_u2 contain n_slices * n_lipids parameters (n_lipids rows). Refer
    # to array_mz for the corresponding m/z values.
    array_u1 = _read_headerless_csv("u1.csv").iloc[:, 1:].to_numpy()
    array_u2 = _read_headerless_csv("u2.csv").iloc[:, 1:].to_numpy()

    return array_s1, array_s2, array_v1, array_v2, array_mz, array_u1, array_u2

def make_inv_cdf(mu1, mu2, sigma1, sigma2, resolution=2000, rel_range=6):
    """This function builds a numerical inverse of the cumulative distribution function of an 
    equal-weight mixture of two normal distributions, by tabulating the cdf on a regular grid and 
    interpolating the grid as a function of the tabulated values.

    Args:
        mu1 (float): Mean of the first normal distribution.
        mu2 (float): Mean of the second normal distribution.
        sigma1 (float): Standard deviation of the first normal distribution.
        sigma2 (float): Standard deviation of the second normal distribution.
        resolution (int, optional): Number of points of the grid. Defaults to 2000.
        rel_range (float, optional): Extent of the grid beyond the smallest and the largest mean, 
            expressed in number of times the largest standard deviation. Defaults to 6.

    Returns:
        scipy.interpolate.interp1d: A callable mapping a probability to the corresponding value 
            (linear interpolation, extrapolating outside of the tabulated range).
    """
    # The grid extends on both sides by rel_range times the widest standard deviation
    margin = rel_range * max(sigma1, sigma2)
    lower_bound = min(mu1, mu2) - margin
    upper_bound = max(mu1, mu2) + margin
    domain = np.linspace(lower_bound, upper_bound, resolution)

    # Tabulate the cdf of the mixture (both components have the same weight)
    cdf_first = sps.norm.cdf(domain, mu1, sigma1)
    cdf_second = sps.norm.cdf(domain, mu2, sigma2)
    cdf_vals = (cdf_first + cdf_second) / 2.

    # Swapping the roles of x and y turns the tabulated cdf into its inverse
    return interp1d(cdf_vals, domain, bounds_error=False, fill_value="extrapolate", copy=True)

def cdf_mixture(x, mu1, mu2, sigma1, sigma2):
    """This function evaluates the cumulative distribution function of an equal-weight mixture 
    of two normal distributions.

    Args:
        x (float or np.ndarray): The value(s) at which the cdf is evaluated.
        mu1 (float): Mean of the first normal distribution.
        mu2 (float): Mean of the second normal distribution.
        sigma1 (float): Standard deviation of the first normal distribution.
        sigma2 (float): Standard deviation of the second normal distribution.

    Returns:
        float or np.ndarray: The average of the cdf of the two normal distributions at x.
    """
    first_component = sps.norm.cdf(x, mu1, sigma1)
    second_component = sps.norm.cdf(x, mu2, sigma2)
    return (first_component + second_component) / 2.

def transform_byicdf(x, mu1x, mu2x, sigma1x, sigma2x, icdfy):
    """This function maps the values x, assumed to follow an equal-weight mixture of two normal 
    distributions, onto another distribution described by its inverse cdf: the values are first 
    converted into probabilities with cdf_mixture, which are then passed to 'icdfy'.

    Usage: pass as icdfy the function returned by make_inv_cdf
    
    For example:
    xnew = np.linspace(0, 10, 1000)
    icdf = make_inv_cdf(-4, 0, 2, 2)
    newx = transform_byicdf(xnew, 4, 8, 1, 1, icdf)

    Args:
        x (float or np.ndarray): The value(s) to transform.
        mu1x (float): Mean of the first normal distribution of the mixture describing x.
        mu2x (float): Mean of the second normal distribution of the mixture describing x.
        sigma1x (float): Standard deviation of the first normal distribution.
        sigma2x (float): Standard deviation of the second normal distribution.
        icdfy (callable): The inverse cdf of the target distribution.

    Returns:
        The output of 'icdfy' evaluated at the probabilities associated with x.
    """
    probabilities = cdf_mixture(x, mu1x, mu2x, sigma1x, sigma2x)
    return icdfy(probabilities)

def standardize_values(array_high_res, array_peaks):
    # TODO implement the rescaling: the function is currently a placeholder
    """This function is meant to rescale the intensity values of the lipids annotated with a 
    Combat-like method as part of the MAIA pipeline, using pre-established parameters.

    In its current state, it only prints a message and returns its first argument untouched.

    Args:
        array_high_res (np.ndarray): The spectrum data to rescale (returned as is for now).
        array_peaks (np.ndarray): The peak annotations (not used yet).

    Returns:
        np.ndarray: The very same object as 'array_high_res'.
    """
    # First normalize array_high_res to its TIC (not implemented yet)

    
    print("SO FAR SO GOOD")
    return array_high_res

In [3]:
# Requires array_high_res and array_peaks from the step-by-step cells above: on a fresh kernel
# this cell fails with a NameError if they have not been run first
array_high_res = standardize_values(array_high_res, array_peaks)

NameError: name 'array_high_res' is not defined

In [None]:
########################################################################################

In [3]:
# Convert every acquisition listed in l_t_names; process_raw_data is only called for its side
# effects, so the returned values are discarded
multiprocessing = False
if not multiprocessing:
    # Normal (single-processed) map
    list(map(process_raw_data, l_t_names))
else:
    # Multiprocessing: the slices are dispatched to 12 worker processes, in no particular order
    with Pool(processes=12) as pool:
        list(pool.imap_unordered(process_raw_data, l_t_names))


Loading files : /data/lipidatlas/data/data_raw/20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25um/20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25um






Loading Sprectra at resolution 1e-05: 100%|██████████| 74382/74382 [00:36<00:00, 2049.72it/s]
Loading the m/z values at resolution 1e-05: 100%|██████████| 74382/74382 [00:32<00:00, 2312.01it/s]


SystemError: CPUDispatcher(<function search2sorted at 0x7f2146a77e50>) returned a result with an error set

### Build lookup tables

In [3]:
# Build the lookup tables of every acquisition listed in l_t_names; process_lookup_tables is only
# called for its side effects, so the returned values are discarded
multiprocessing = True
if not multiprocessing:
    # Normal (single-processed) map
    list(map(process_lookup_tables, l_t_names))
else:
    # Multiprocessing: the slices are dispatched to 12 worker processes
    with Pool(processes=12) as pool:
        list(pool.map(process_lookup_tables, l_t_names))



Size (in mb) of lookup_table_spectra_high_res:  551.95
Shape of lookup_table_spectra_high_res:  (2000, 72345)
Size (in mb) of lookup_table_spectra_high_res:  604.1
Shape of lookup_table_spectra_high_res:  (2000, 79180)
Size (in mb) of lookup_table_spectra_high_res:  615.78
Shape of lookup_table_spectra_high_res:  (2000, 80712)
Size (in mb) of lookup_table_spectra_high_res:  567.49
Shape of lookup_table_spectra_high_res:  (2000, 74382)
Size (in mb) of lookup_table_spectra_high_res:  821.78
Shape of lookup_table_spectra_high_res:  (2000, 107712)
Size (in mb) of cumulated_image_lookup_table_high_res:  551.95
Shape of cumulated_image_lookup_table_high_res:  (2000, 195, 371)
Size (in mb) of cumulated_image_lookup_table_high_res:  604.1
Shape of cumulated_image_lookup_table_high_res:  (2000, 214, 370)
Size (in mb) of lookup_table_spectra_high_res:  934.98
Shape of lookup_table_spectra_high_res:  (2000, 122550)
Size (in mb) of lookup_table_spectra_high_res:  847.69
Shape of lookup_table_spect

### Record everything and clean 

Record everything in memmap files and a pickled dictionary.

In [7]:
output_folder = "data/whole_dataset/"
input_folder = "notebooks/server/data/temp/"
os.makedirs(output_folder, exist_ok=True)


def _save_as_memmap(filename_prefix, slice_index, array, dtype):
    """Write 'array' in 'output_folder' as a raw memmap file named 
    '<filename_prefix><slice_index>.mmap', converted to 'dtype', and return the shape of the array 
    (which is needed to map the file again later on).
    """
    fp = np.memmap(
        output_folder + filename_prefix + str(slice_index) + ".mmap",
        dtype=dtype,
        mode="w+",
        shape=array.shape,
    )
    fp[:] = array[:]
    fp.flush()
    # The memmap object is local to this function, so it is released as soon as we return
    return array.shape


dic_slices = {}
# Loop over slice files
for slice_name in os.listdir(input_folder):

    # Extract slice index (second "_"-separated token of the filename)
    slice_index = int(slice_name.split("_")[1])

    # Load slice arrays. The archive is opened in a context manager so that its file handle is
    # closed once the arrays have been read, instead of leaving one open handle per slice.
    with np.load(input_folder + slice_name) as npzfile:
        array_pixel_indexes_high_res = npzfile["array_pixel_indexes_high_res"]
        array_spectra_high_res = npzfile["array_spectra_high_res"]
        array_averaged_mz_intensity_low_res = npzfile["array_averaged_mz_intensity_low_res"]
        array_averaged_mz_intensity_high_res = npzfile["array_averaged_mz_intensity_high_res"]
        image_shape = npzfile["image_shape"]
        divider_lookup = npzfile["divider_lookup"]
        lookup_table_spectra_high_res = npzfile["lookup_table_spectra_high_res"]
        cumulated_image_lookup_table_high_res = npzfile["cumulated_image_lookup_table_high_res"]
        lookup_table_averaged_spectrum_high_res = npzfile[
            "lookup_table_averaged_spectrum_high_res"
        ]

    # Print the size used by each array in mb (same order as before: pixel indexes, spectra,
    # low-res average spectrum, high-res average spectrum, and the three lookup tables)
    for array in (
        array_pixel_indexes_high_res,
        array_spectra_high_res,
        array_averaged_mz_intensity_low_res,
        array_averaged_mz_intensity_high_res,
        lookup_table_spectra_high_res,
        cumulated_image_lookup_table_high_res,
        lookup_table_averaged_spectrum_high_res,
    ):
        print(round(array.nbytes / 1024 / 1024, 2))

    # Register the lightweights files in a pickled dictionary
    dic_slices[slice_index] = {
        "image_shape": image_shape,
        "divider_lookup": divider_lookup,
        "array_avg_spectrum_downsampled": array_averaged_mz_intensity_low_res,
        "array_lookup_pixels": array_pixel_indexes_high_res,
        "array_lookup_mz_avg": lookup_table_averaged_spectrum_high_res,
    }

    # Build a memmap for each of the heavier files to save RAM, and save the corresponding shape
    # in the pickled dictionary
    dic_slices[slice_index]["array_spectra_shape"] = _save_as_memmap(
        "array_spectra_", slice_index, array_spectra_high_res, "float32"
    )
    dic_slices[slice_index]["array_avg_spectrum_shape"] = _save_as_memmap(
        "array_avg_spectrum_", slice_index, array_averaged_mz_intensity_high_res, "float32"
    )
    dic_slices[slice_index]["array_lookup_mz_shape"] = _save_as_memmap(
        "array_lookup_mz_", slice_index, lookup_table_spectra_high_res, "int32"
    )
    dic_slices[slice_index]["array_cumulated_lookup_mz_image_shape"] = _save_as_memmap(
        "array_cumulated_lookup_mz_image_",
        slice_index,
        cumulated_image_lookup_table_high_res,
        "float32",
    )

# Pickle the dict of lightweight data
with open(output_folder + "light_arrays.pickle", "wb") as handle:
    pickle.dump(dic_slices, handle)


0.85
1153.77
0.0
14.49
847.69
847.69
0.01
0.95
1651.25
0.0
17.26
946.78
946.78
0.01
0.8
1119.94
0.0
13.69
797.56
797.56
0.01
0.98
417.77
0.0
3.56
976.79
976.79
0.01
0.89
505.57
0.0
4.47
889.32
889.32
0.01
1.0
627.08
0.0
5.61
1004.73
1004.73
0.01
1.02
1588.25
0.0
14.67
1021.8
1021.8
0.01
0.99
717.17
0.0
6.01
985.34
985.34
0.01
1.05
575.92
0.0
6.53
1052.35
1052.35
0.01
0.7
548.27
0.0
6.06
700.93
700.93
0.01
0.57
780.53
0.0
7.79
567.49
567.49
0.01
0.8
515.07
0.0
4.67
803.65
803.65
0.01
0.65
289.82
0.0
3.41
646.29
646.29
0.01
0.66
1607.22
0.0
14.74
656.98
656.98
0.01
0.71
1080.59
0.0
12.65
714.11
714.11
0.01
0.82
1119.61
0.0
12.02
818.3
818.3
0.01
0.65
919.0
0.0
10.49
650.05
650.05
0.01
0.64
1136.39
0.0
12.69
643.31
643.31
0.01
0.77
1673.98
0.0
16.13
771.97
771.97
0.01
0.73
1088.1
0.0
10.75
726.01
726.01
0.01
0.7
1463.69
0.0
12.37
697.4
697.4
0.01
0.6
879.32
0.0
10.71
604.1
604.1
0.01
0.78
2342.25
0.0
16.69
778.4
778.4
0.01
0.65
1702.48
0.0
14.47
647.32
647.32
0.01
0.52
1343.16
0.0
14.44
5

Clean temporary folder

In [5]:
# Destructive step: empties the temporary folder from which the export cell above reads its
# input, so only run it once the export has completed. Set 'clean' to False to keep the files.
clean = True
if clean:
    delete_all_files_in_folder(input_folder)


In [9]:
# ! Quick fix for array_avg_low_res (to be deleted later)
# Recompute the downsampled average spectrum of each slice from the exported data and overwrite
# the corresponding entry of the pickled dictionary
from modules.tools.spectra import reduce_resolution_sorted_array_spectra
from modules.maldi_data import MaldiData

output_folder = "data/whole_dataset/"

# Load the dict of lightweight data
with open(output_folder + "light_arrays.pickle", "rb") as handle:
    dic_light = pickle.load(handle)

data = MaldiData()
# Slices are numbered from 1 to 32
for i in range(1, 33):
    array_averaged_mz_intensity_low_res = reduce_resolution_sorted_array_spectra(
        data.get_array_avg_spectrum(slice_index=i), resolution=10 ** -2
    )
    dic_light[i]["array_avg_spectrum_downsampled"] = array_averaged_mz_intensity_low_res

# Pickle the dict of lightweight data
with open(output_folder + "light_arrays.pickle", "wb") as handle:
    pickle.dump(dic_light, handle)