# Notebook 2
Compute the peak annotations of each individual slice, merge them, and export the result as a single CSV file.

### Load important modules

In [2]:
# Standard modules
import numpy as np
import pandas as pd
import os

# Multithreading
from threadpoolctl import threadpool_limits

# Move to root directory for easier module import: the `notebooks.` package imported
# below is resolved relative to the working directory set here.
# NOTE(review): the path is relative, so re-running this cell without restarting the
# kernel moves two more levels up each time.
os.chdir("../../")

# LBAE modules
from notebooks.data_processing.modules.maldi_conversion import (
    load_peak_file,
    get_standardized_values,
)

# Set thread limit (16 threads).
# NOTE(review): the returned limiter object is not kept; presumably the limit stays in
# effect for the rest of the session — confirm against the threadpoolctl documentation.
threadpool_limits(16)



<threadpoolctl.threadpool_limits at 0x7f5aa49e8cd0>

### Create a list of raw data filenames

In [3]:
# Build one [slice index, acquisition path] entry per raw-data folder
raw_data_dir = "/data/lipidatlas/data/data_raw/"
l_t_names = []
for folder in os.listdir(raw_data_dir):
    if "MouseBrain" not in folder:
        continue

    # The slice number directly follows "MouseBrainCMC_S"; cut it at the first
    # "_", then "A", then "(" so that only the digits remain
    slice_tag = folder.split("MouseBrainCMC_S")[1]
    for separator in ("_", "A", "("):
        slice_tag = slice_tag.split(separator)[0]

    l_t_names.append([int(slice_tag), raw_data_dir + folder + "/" + folder])

# Order the entries by slice index (ties are broken by the path)
l_t_names.sort()

# Tag an entry with "bis" when it has the same slice index as the entry just before it
for position in range(1, len(l_t_names)):
    previous_entry = l_t_names[position - 1]
    current_entry = l_t_names[position]
    if current_entry[0] == previous_entry[0]:
        current_entry.append("bis")
        print("WARNING: duplicate for slice " + str(previous_entry[0]))

# Show the slice index next to the acquisition name (last path component)
for entry in l_t_names:
    print(entry[0], entry[1].rsplit("/", 1)[-1])


1 20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25um
2 20210211_MouseBrainCMC_S2AB5_2Dpixelmode_370x214_Att25_25um
3 20210213_MouseBrainCMC_S3AC4_2Dpixelmode_371x195_Att25_25um
4 20210214_MouseBrainCMC_S4AD3_2Dpixelmode_354x228_Att25_25um
5 20210218_MouseBrainCMC_S5AE3_2Dpixelmode_396x272_Att25_25um
6 20210219_MouseBrainCMC_S6AE3_2Dpixelmode_423x282_Att25_25um
7 20210220_MouseBrainCMC_S7AF5_2Dpixelmode_427x263_Att25_25um
8 20210531_MouseBrainCMC_S8_duplicate_2Dpixelmode_430x285_Att30_25um
9 20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25um
10 20210210_MouseBrainCMC_S10(brain2_20)_394x282_Att30_25um
11 20210301_MouseBrainCMC_S11AK5_2Dpixelmode_448x277_Att25_25um
12 20210303_MouseBrainCMC_S12AL1_2Dpixelmode_393x266_Att25_25um
13 20210304_MouseBrainCMC_S13AM1_2Dpixelmode_413x310_Att25_25um
14 20210305_MouseBrainCMC_S14AN1_2Dpixelmode_409x285_Att25_25um
15 20210313_MouseBrainCMC_S15AO2_2Dpixelmode_451x292_Att25_25um
16 20210530_MouseBrainCMC_S16_duplicate_2Dpixelmode_454

### Compute and export the annotations

In [4]:
def _first_token(text):
    """Return the part of `text` that precedes the first space."""
    return text.split(" ")[0]


# Read the LC-MS annotation table (the sheet has no header row) and discard
# rows that are exact duplicates
lcms_table = pd.read_excel(
    "/data/lipidatlas/annotations/lcm_nov2021.xlsx", header=None
).drop_duplicates()

# Keep only the first space-separated token of the third column (named "cation" below)
third_column = lcms_table.iloc[:, 2]
lcms_table.iloc[:, 2] = third_column.apply(_first_token)

# Deduplicate again (presumably because trimming can make rows identical),
# then discard the column labelled 4
lipid_list = lcms_table.drop_duplicates().drop(columns=[4])
lipid_list.columns = ["name", "structure", "cation", "theoretical m/z"]
lipid_list


Unnamed: 0,name,structure,cation,theoretical m/z
0,GP01_LysoPC,LPCOTheo_16:2,H,478.329190
1,GP01_LysoPC,1070004_16:1,H,480.344850
2,GP01_LysoPC,LPCOTheo_17:2,H,492.344850
3,GP01_LysoPC,1050018_16:0,H,496.339750
4,GP01_LysoPC,LPCOTheo_16:2,na,500.311165
...,...,...,...,...
169,SP06_Gangliosides,6020015_42:2,H,890.638550
170,SP06_Gangliosides,6020009_40:1,k,902.578801
171,SP06_Gangliosides,6020014_42:1,H,908.649110
172,SP06_Gangliosides,6020011_40:1,k,918.573681


In [5]:
# # Load and clean lcms
# lipid_list = pd.read_excel("/data/lipidatlas/annotations/lcm_july2021.xlsx", header=None).drop_duplicates()
# lipid_list.iloc[:, 2] = lipid_list.iloc[:, 2].apply(lambda x: x.split(" ")[0])
# lipid_list.drop_duplicates(inplace=True)
# lipid_list.columns = ["name", "structure", "cation", "theoretical m/z"]

# # Add additional annotations
# lipid_list_2 = pd.read_excel("/data/lipidatlas/annotations/lcm_dec2021.xlsx", header=None).drop_duplicates()
# lipid_list_2.iloc[:, 2] = lipid_list_2.iloc[:, 2].apply(lambda x: x.split(" ")[0])
# lipid_list_2.drop_duplicates(inplace=True)
# lipid_list_2.columns = ["name", "structure", "cation", "theoretical m/z", '_', '__']
# lipid_list_2 = lipid_list_2.drop(["_", "__"],axis=1)

# # Merge annotations
# lipid_list = pd.concat([lipid_list,lipid_list_2]).drop_duplicates(subset = ["name", "structure", "cation"], keep='last')

In [6]:
# Set peak distance threshold for annotation
THRESH = 0.003

# Theoretical m/z values (fourth column of lipid_list) against which each peak is matched
theoretical_mz = lipid_list.iloc[:, 3]

# Annotated peaks are collected as plain records and converted to a dataframe once,
# after the loop. Growing a dataframe row by row is quadratic and relies on
# Series.append / DataFrame.append, which no longer exist in pandas >= 2.0.
l_records = []

# Column names of the peak tables, taken from the first slice that is loaded
peak_columns = None

# Loop over slices
for t_names in l_t_names:
    index = t_names[0]
    name = t_names[1]

    # Handle repeated slices
    # NOTE(review): `bis` is never set to True, so repeated slices are only reported
    # and keep their original slice index; the x1000 re-indexing below never runs.
    # Confirm whether this is intended.
    bis = False
    if len(t_names) > 2:
        print("WARNING: repeated slice: ", index)

    # Load peak interval data for each slice
    df = load_peak_file(name, array=False)

    # Remember the peak columns on the first iteration
    if peak_columns is None:
        peak_columns = list(df.columns)

    for peak_label, row in df.iterrows():
        # For each mz_estimated, find the closest theoretical value in lipid_list
        idx = np.argmin(np.abs(theoretical_mz - row.mz_estimated))
        mz_hit = lipid_list.iloc[idx]

        # Keep the peak only if the difference is within the threshold
        if abs(row.mz_estimated - mz_hit.iloc[3]) < THRESH:
            record = {**row.to_dict(), **mz_hit.to_dict(), "slice": index}
            if bis:
                record["slice"] *= 1000
            l_records.append(record)

# Build the final dataframe in one go: slice index, lipid annotation, then peak data
df_with_lipid_boundaries = pd.DataFrame(
    l_records,
    columns=["slice"] + list(lipid_list.columns) + (peak_columns or []),
)

df_with_lipid_boundaries.to_csv("data/annotations/lipid_annotation.csv", index=False)
df_with_lipid_boundaries


Unnamed: 0,slice,name,structure,cation,theoretical m/z,min,max,num_pixels,mz_estimated
0,1,GP01_LysoPC,LPCOTheo_16:2,H,478.32919,478.3264,478.3334,70779.0,478.3296
0,1,GP01_LysoPC,1070004_16:1,H,480.34485,480.3433,480.3471,20466.0,480.3453
0,1,GP01_LysoPC,LPCOTheo_17:2,H,492.34485,492.3434,492.3468,6047.0,492.3453
0,1,GP01_LysoPC,1050018_16:0,H,496.33975,496.3363,496.3415,48454.0,496.3403
0,1,GP01_LysoPC,LPCOTheo_16:2,na,500.311165,500.3093,500.314,7598.0,500.3114
...,...,...,...,...,...,...,...,...,...
0,32,SP05_GlcCer,501AB13_32:0,k,874.565261,874.5609,874.566,23322.0,874.5635
0,32,SP06_Gangliosides,6020011_40:1,H,880.6178,880.6154,880.6203,3943.0,880.6176
0,32,GP01_PC,1010659_40:1,k,882.634841,882.6316,882.6365,9228.0,882.634
0,32,SP06_Gangliosides,6020014_42:1,H,908.64911,908.6476,908.6525,6741.0,908.65


### Find out which lipids have been transformed

In [7]:
# Slice index is irrelevant here
_, l_lipids_float, _, _ = get_standardized_values(slice_index=1)


# Set peak distance threshold for annotation
THRESH = 0.003

# Theoretical m/z values (fourth column of lipid_list) against which each value is matched
theoretical_mz = lipid_list.iloc[:, 3]

# Row positions in lipid_list of the matched lipids, in the order of l_lipids_float.
# Collecting positions and selecting them once avoids growing a dataframe row by row
# with DataFrame.append, which no longer exists in pandas >= 2.0.
l_hit_positions = []

for mz_estimated in l_lipids_float:
    # For each mz_estimated, find the closest theoretical value in lipid_list
    idx = np.argmin(np.abs(theoretical_mz - mz_estimated))

    # Keep the lipid only if the difference is within the threshold
    if abs(mz_estimated - theoretical_mz.iloc[idx]) < THRESH:
        l_hit_positions.append(idx)

# Select all matched rows at once (a lipid matched several times appears several times)
df_transformed_lipids = lipid_list.iloc[l_hit_positions].reset_index(drop=True)

df_transformed_lipids.to_csv("data/annotations/transformed_lipids.csv", index=False)
df_transformed_lipids


Unnamed: 0,name,structure,cation,theoretical m/z
0,GP01_LysoPC,1050018_16:0,H,496.33975
1,SP02_CER,EX2010006_36:1,k,604.506541
2,SP02_CER,2010029_40:2,H,620.5976
3,SP02_CER,2010008_40:1,H,622.61328
4,SP02_CER,CerTheo_38:3,k,626.490911
5,SP02_CER,EX2010012_42:1,H,650.64459
6,GP02_PE,2010042_34:2,H,716.52246
7,GP02_PE,2010009_34:1,H,718.53815
8,GP10_PA,10010036_36:2,na,723.49365
9,GP02_PE,2020019_36:4,H,726.54321


In [28]:
def _second_token(full_name):
    """Return the second underscore-separated token of `full_name`."""
    return full_name.split("_")[1]


# Reload the exported table of transformed lipids
df_annotations_MAIA_transformed_lipids = pd.read_csv(
    "data/annotations/transformed_lipids.csv"
)

# Shorten the lipid names, e.g. "GP01_LysoPC" becomes "LysoPC"
shortened_names = df_annotations_MAIA_transformed_lipids["name"].apply(_second_token)
df_annotations_MAIA_transformed_lipids["name"] = shortened_names

# Print one lipid per line: name, structure, cation and theoretical m/z
for row_values in df_annotations_MAIA_transformed_lipids.itertuples(index=False, name=None):
    name, structure, cation, mz = row_values
    print(name, structure, cation, mz)


LysoPC 1050018_16:0 H 496.33975
CER EX2010006_36:1 k 604.506541
CER 2010029_40:2 H 620.5976
CER 2010008_40:1 H 622.61328
CER CerTheo_38:3 k 626.490911
CER EX2010012_42:1 H 650.64459
PE 2010042_34:2 H 716.52246
PE 2010009_34:1 H 718.53815
PA 10010036_36:2 na 723.4936499999999
PE 2020019_36:4 H 726.54321
PxCer 3010037_33:1 k 727.515081
PE 2020049_36:3 H 728.55884
PE 2020033_36:2 H 730.57452
PC 1010490_32:1 H 732.55377
PC 1010397_32:0 H 734.5694
PC 33:5 H 738.50684
PE 2010039_36:2 H 744.55377
PG 4010027_32:0 na 745.4971
PE 2030001_38:6 H 748.52759
PC 1010399_33:0 H 748.58508
PE 2020020_38:6 H 750.54321
PC 1010397_32:0 na 756.5513749999999
PE 2020036_38:2 H 758.60583
PE 2010095_38:6 H 764.52246
PE 2020049_36:3 k 766.514721
PE 2020033_36:2 k 768.530401
PC 1020003_34:1 na 768.5878049999999
PC 1010490_32:1 k 770.509651
PE 2030005_40:6 H 776.55884
PE 2010096_36:4 k 778.478341
GlcCer 501AA67_36:2 k 780.538641
PE 2010039_36:2 k 782.509651
PC 1010400_34:0 na 784.582685
PC 1010750_36:1 H 788.61639