# Notebook 2
Compute individual peak annotation, merge and export them as a single csv file.

### Load important modules

In [1]:
# Standard modules
import numpy as np
import pandas as pd
import os

# Multithreading
from threadpoolctl import threadpool_limits

# Move to root directory for easier module import
os.chdir("../../")

# LBAE modules
from notebooks.data_processing.modules.maldi_conversion import (
    load_peak_file,
    get_standardized_values,
)

# Set thread limit
threadpool_limits(16)



<threadpoolctl.threadpool_limits at 0x7f9701040640>

### Create a list of raw data filenames

In [2]:
path_brain_1 =  "/data/lipidatlas/data/data_raw/BRAIN1/"
path_brain_2 =  "/data/lipidatlas/data/data_raw/BRAIN2/"
path_brain_1_temp = "/data/lipidatlas/data/app/data/temp/brain_1"
path_brain_2_temp = "/data/lipidatlas/data/app/data/temp/brain_2"
split_value_1 = "MouseBrainCMC_S"
split_value_2 = "MouseBrain2_S"
ll_t_names = []
remove_duplicates = True
for path_brain, path_brain_temp, split_value in zip([path_brain_1, path_brain_2], [path_brain_1_temp, path_brain_2_temp], [split_value_1,split_value_2]):
    # Load filenames
    l_t_names = sorted(
        [
            [
                int(name.split(split_value)[1].split("_")[0].split("A")[0].split("(")[0]),
                path_brain + name + "/" + name,
            ]
            for name in os.listdir(path_brain)
            if "MouseBrain" in name
        ]
    )

    # Correct for duplicates
    for idx, (t_names_1, t_names_2) in enumerate(zip(l_t_names[:-1], l_t_names[1:])):
        if t_names_2[0] == t_names_1[0]:
            t_names_2.append("bis")
            print("WARNING: duplicate for slice " + str(t_names_1[0]))
            if remove_duplicates:
                del l_t_names[idx+1]
            

    # Remove slices that have already been processed
    os.makedirs(path_brain_temp, exist_ok=True)
    remove_already_loaded = False
    if remove_already_loaded:
        existing_names = [int(name.split("_")[1][:-7]) for name in os.listdir(path_brain_temp) if "raw" in name]
        l_t_names = [x for x in l_t_names if x[0] not in existing_names]

    # Print the final list of names
    for t_names in l_t_names:
        print(t_names[0], t_names[1].split("/")[-1])

    ll_t_names.append(l_t_names)


1 20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25um
2 20210211_MouseBrainCMC_S2AB5_2Dpixelmode_370x214_Att25_25um
3 20210213_MouseBrainCMC_S3AC4_2Dpixelmode_371x195_Att25_25um
4 20210214_MouseBrainCMC_S4AD3_2Dpixelmode_354x228_Att25_25um
5 20210218_MouseBrainCMC_S5AE3_2Dpixelmode_396x272_Att25_25um
6 20210219_MouseBrainCMC_S6AE3_2Dpixelmode_423x282_Att25_25um
7 20210220_MouseBrainCMC_S7AF5_2Dpixelmode_427x263_Att25_25um
8 20210531_MouseBrainCMC_S8_duplicate_2Dpixelmode_430x285_Att30_25um
9 20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25um
10 20210210_MouseBrainCMC_S10(brain2_20)_394x282_Att30_25um
11 20210301_MouseBrainCMC_S11AK5_2Dpixelmode_448x277_Att25_25um
12 20210303_MouseBrainCMC_S12AL1_2Dpixelmode_393x266_Att25_25um
13 20210304_MouseBrainCMC_S13AM1_2Dpixelmode_413x310_Att25_25um
14 20210305_MouseBrainCMC_S14AN1_2Dpixelmode_409x285_Att25_25um
15 20210313_MouseBrainCMC_S15AO2_2Dpixelmode_451x292_Att25_25um
16 20210530_MouseBrainCMC_S16_duplicate_2Dpixelmode_454

In [3]:
# Merge lists and update slice index for brain_2
for idx in range(len(ll_t_names[1])):
    ll_t_names[1][idx][0]+=22
        
l_t_names = ll_t_names[0] + ll_t_names[1]

# Print the final list of names
for t_names in l_t_names:
    print(t_names[0], t_names[1].split("/")[-1])

1 20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25um
2 20210211_MouseBrainCMC_S2AB5_2Dpixelmode_370x214_Att25_25um
3 20210213_MouseBrainCMC_S3AC4_2Dpixelmode_371x195_Att25_25um
4 20210214_MouseBrainCMC_S4AD3_2Dpixelmode_354x228_Att25_25um
5 20210218_MouseBrainCMC_S5AE3_2Dpixelmode_396x272_Att25_25um
6 20210219_MouseBrainCMC_S6AE3_2Dpixelmode_423x282_Att25_25um
7 20210220_MouseBrainCMC_S7AF5_2Dpixelmode_427x263_Att25_25um
8 20210531_MouseBrainCMC_S8_duplicate_2Dpixelmode_430x285_Att30_25um
9 20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25um
10 20210210_MouseBrainCMC_S10(brain2_20)_394x282_Att30_25um
11 20210301_MouseBrainCMC_S11AK5_2Dpixelmode_448x277_Att25_25um
12 20210303_MouseBrainCMC_S12AL1_2Dpixelmode_393x266_Att25_25um
13 20210304_MouseBrainCMC_S13AM1_2Dpixelmode_413x310_Att25_25um
14 20210305_MouseBrainCMC_S14AN1_2Dpixelmode_409x285_Att25_25um
15 20210313_MouseBrainCMC_S15AO2_2Dpixelmode_451x292_Att25_25um
16 20210530_MouseBrainCMC_S16_duplicate_2Dpixelmode_454

### Compute and export the annotations

In [4]:
# Load and clean lcms
lipid_list = pd.read_excel(
    "/data/lipidatlas/annotations/lcms_mar2022.xlsx", header=0
).drop_duplicates()

lipid_list['Lipids'] = lipid_list['Lipids'].map(lambda x: x.replace(')', ' ').replace('(', ' ').strip())

lipid_list[['name', 'structure']] = lipid_list['Lipids'].str.split(' ', 1, expand=True,)
lipid_list['Adducts'] = lipid_list['Adducts'].map(lambda x: x.replace(' ', ''))
lipid_list['cation'] = lipid_list['Adducts'].map(lambda x: x.split('+')[1].split(']')[0] if '+' in x else x)
lipid_list["theoretical m/z"] = lipid_list['m/z']
lipid_list.drop(['Lipids', 'Adducts', 'm/z'], inplace=True, axis = 1)
lipid_list
# lipid_list.iloc[:, 2] = lipid_list.iloc[:, 2].apply(lambda x: x.split(" ")[0])
# lipid_list.drop_duplicates(inplace=True)
# lipid_list = lipid_list.drop([4], axis=1)


# lipid_list.columns = ["name", "structure", "cation", "theoretical m/z"]
# lipid_list


Unnamed: 0,name,structure,cation,theoretical m/z
0,LPC,O-16:2,H1,478.329190
1,LPC,O-16:1,H1,480.344850
2,LPC,O-17:2,H1,492.344850
3,LPC,16:0,H1,496.339750
4,LPC,O-16:2,na,500.311165
...,...,...,...,...
435,PIP,O-34:1,K,941.491700
436,PIP,O-36:5,Na,945.486500
437,SHexCer,42:1;O3,k,946.604991
438,CL,36:4,K,963.476100


In [5]:
# Set peak distance treshold for annotation
THRESH = 0.003

# Define the dataframe that will be filled with annotations
df_with_lipid_boundaries = None

# Loop overs slices
for t_names in l_t_names:
    index = t_names[0]
    name = t_names[1]

    # Handle repeated slices
    bis = False
    if len(t_names) > 2:
        print("WARNING: repeated slice: ", index)
        pass

    # Load peak interval data for each slice
    df = load_peak_file(name, array=False)

    # Build empty df first iteration
    if df_with_lipid_boundaries is None:
        df_with_lipid_boundaries = pd.DataFrame(
            columns=["slice"] + list(lipid_list.columns) + list(df.columns)
        )

    for name, row in df.iterrows():
        # For each mz_mean, find the closest value in lipid_list
        idx = np.argmin(np.abs(lipid_list.iloc[:, 3] - row.mz_estimated))
        mz_hit = lipid_list.iloc[idx]

        # Calculate if the difference is within the threshold
        if abs(row.mz_estimated - mz_hit.iloc[3]) < THRESH:
            test = pd.Series(data=[index], name="slice")
            df_row = row.append(mz_hit).to_frame().transpose().assign(slice=test)
            if bis:
                df_row["slice"] *= 1000
            df_with_lipid_boundaries = df_with_lipid_boundaries.append(df_row)


df_with_lipid_boundaries.to_csv("data/annotations/lipid_annotation.csv", index=False)
df_with_lipid_boundaries


Unnamed: 0,slice,name,structure,cation,theoretical m/z,min,max,num_pixels,mz_estimated
0,1,LPC,O-16:2,H1,478.32919,478.3264,478.3334,70779.0,478.3296
0,1,LPC,O-16:1,H1,480.34485,480.3433,480.3471,20466.0,480.3453
0,1,LPC,O-17:2,H1,492.34485,492.3434,492.3468,6047.0,492.3453
0,1,LPC,16:0,H1,496.33975,496.3363,496.3415,48454.0,496.3403
0,1,LPC,O-16:2,na,500.311165,500.3093,500.314,7598.0,500.3114
...,...,...,...,...,...,...,...,...,...
0,64,PI-Cer,d42:1,H,892.6637,892.6614,892.665,10936.0,892.6633
0,64,PC,42:7,K,898.5723,898.5698,898.5734,10747.0,898.5717
0,64,SHexCer,41:1;O3,H2,908.64911,908.6481,908.6516,11530.0,908.65
0,64,PC,42:1,k,910.666151,910.6639,910.6673,14885.0,910.6657


### Find out which lipids have been transformed

In [8]:
def build_df_transformed_lipids(brain_1 = True):
    # Slice index is irrelevant here
    _, l_lipids_float, _, _ = get_standardized_values(slice_index=1,
                path_array_data="/data/lipidatlas/data/processed/brain1/BRAIN1" if brain_1 else "/data/lipidatlas/data/processed/BRAIN2",
                path_array_transformed_data="/data/lipidatlas/data/processed/brain1/BRAIN1_normalized" if brain_1 else "/data/lipidatlas/data/processed/BRAIN2_normalized")

    # Set peak distance treshold for annotation
    THRESH = 0.003

    # Define the dataframe that will be filled with annotations
    df_transformed_lipids = pd.DataFrame(columns=list(lipid_list.columns))

    for mz_estimated in l_lipids_float:
        # For each mz_mean, find the closest value in lipid_list
        idx = np.argmin(np.abs(lipid_list.iloc[:, 3] - mz_estimated))
        mz_hit = lipid_list.iloc[idx]

        # Calculate if the difference is within the threshold
        if abs(mz_estimated - mz_hit.iloc[3]) < THRESH:
            df_transformed_lipids = df_transformed_lipids.append(mz_hit)


    df_transformed_lipids.to_csv("data/annotations/transformed_lipids_brain_1.csv" if brain_1 else "data/annotations/transformed_lipids_brain_2.csv", index=False)
    df_transformed_lipids.reset_index(drop=True, inplace=True)
    return df_transformed_lipids

build_df_transformed_lipids(brain_1 = False)
build_df_transformed_lipids(brain_1 = True)

Unnamed: 0,name,structure,cation,theoretical m/z
0,LPC,16:0,H1,496.33975
1,Cer,36:1,k,604.506541
2,Cer,40:2,H1,620.5976
3,Cer,40:1,H1,622.61328
4,Cer,38:4,k,626.490911
5,Cer,42:1,H1,650.64459
6,PE,34:2,H1,716.52246
7,PE,34:1,H1,718.53815
8,PA,36:2,na,723.49365
9,PE,O-36:4,H1,726.54321


In [None]:
df_annotations_MAIA_transformed_lipids_brain_1 = pd.read_csv(
    "data/annotations/transformed_lipids_brain_1.csv"
)
df_annotations_MAIA_transformed_lipids_brain_1["name"] = df_annotations_MAIA_transformed_lipids_brain_1["name"].map(
    lambda x: x.split("_")[1]
)

#print(df_annotations_MAIA_transformed_lipids)
for index, (name, structure, cation, mz) in df_annotations_MAIA_transformed_lipids_brain_1.iterrows():
    print(name, structure, cation, mz)
