# Notebook 2
Compute the peak annotations of each individual slice, merge them, and export the result as a single CSV file.

### Load important modules

In [1]:
# Standard imports
import os
import re

import numpy as np
import pandas as pd

# Multithreading
from threadpoolctl import threadpool_limits

# Move to root directory for easier module import.
# The guard keeps the cell idempotent: without it, re-running the cell would
# climb two more directories and break every relative path used afterwards.
if "ROOT_DIR" not in globals():
    os.chdir("../../")
    ROOT_DIR = os.getcwd()

# Set thread limit (threadpoolctl applies it to the native thread pools it detects)
N_THREADS = 16
threadpool_limits(N_THREADS)



<threadpoolctl.threadpool_limits at 0x7f603077d6d0>

### Create a list of raw data filenames

In [2]:
# Directory holding one sub-folder per acquisition; each sub-folder contains
# files named after the folder itself.
RAW_DATA_DIR = "/data/lipidatlas/data/data_raw/"

# The slice number is the run of digits directly following "MouseBrainCMC_S",
# whatever comes next (e.g. "S1AA1", "S8_duplicate", "S10(brain2_20)").
SLICE_PATTERN = re.compile(r"MouseBrainCMC_S(\d+)")


def parse_slice_index(folder_name):
    """Return the slice number encoded in an acquisition folder name.

    Args:
        folder_name (str): Name of a folder found in RAW_DATA_DIR.

    Returns:
        int: The slice number following the "MouseBrainCMC_S" tag.

    Raises:
        ValueError: If the name does not contain "MouseBrainCMC_S<digits>".
    """
    match = SLICE_PATTERN.search(folder_name)
    if match is None:
        raise ValueError("Cannot read a slice number from folder name: " + folder_name)
    return int(match.group(1))


# Load filenames as [slice index, path prefix] pairs, ordered by slice index
l_t_names = sorted(
    [parse_slice_index(name), os.path.join(RAW_DATA_DIR, name, name)]
    for name in os.listdir(RAW_DATA_DIR)
    if "MouseBrain" in name
)

# Correct for duplicates: when two consecutive entries share a slice index,
# the later one is tagged with a third element ("bis")
for previous, current in zip(l_t_names, l_t_names[1:]):
    if current[0] == previous[0]:
        current.append("bis")
        print("WARNING: duplicate for slice " + str(previous[0]))

# Print the final list of names
for t_names in l_t_names:
    print(t_names[0], os.path.basename(t_names[1]))

1 20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25um
2 20210211_MouseBrainCMC_S2AB5_2Dpixelmode_370x214_Att25_25um
3 20210213_MouseBrainCMC_S3AC4_2Dpixelmode_371x195_Att25_25um
4 20210214_MouseBrainCMC_S4AD3_2Dpixelmode_354x228_Att25_25um
5 20210218_MouseBrainCMC_S5AE3_2Dpixelmode_396x272_Att25_25um
6 20210219_MouseBrainCMC_S6AE3_2Dpixelmode_423x282_Att25_25um
7 20210220_MouseBrainCMC_S7AF5_2Dpixelmode_427x263_Att25_25um
8 20210531_MouseBrainCMC_S8_duplicate_2Dpixelmode_430x285_Att30_25um
9 20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25um
10 20210210_MouseBrainCMC_S10(brain2_20)_394x282_Att30_25um
11 20210301_MouseBrainCMC_S11AK5_2Dpixelmode_448x277_Att25_25um
12 20210303_MouseBrainCMC_S12AL1_2Dpixelmode_393x266_Att25_25um
13 20210304_MouseBrainCMC_S13AM1_2Dpixelmode_413x310_Att25_25um
14 20210305_MouseBrainCMC_S14AN1_2Dpixelmode_409x285_Att25_25um
15 20210313_MouseBrainCMC_S15AO2_2Dpixelmode_451x292_Att25_25um
16 20210530_MouseBrainCMC_S16_duplicate_2Dpixelmode_454

### Compute and export the annotations

In [3]:
# Column names given to the lipid annotation spreadsheets
LIPID_COLUMNS = ["name", "structure", "cation", "theoretical m/z"]

# Destination of the merged annotation table (relative to the working directory)
OUTPUT_PATH = "data/annotations/lipid_annotation.csv"

# Columns removed from the peak tables; the set depends on the file version
DROPPED_COLUMNS_TAB = [
    "Unnamed: 0",
    "pixel_max_hits",
    "percent_1_hit",
    "concentration",
    "median_intensity",
    "tissue",
    "difference",
    "matrix",
]
DROPPED_COLUMNS_COMMA = [
    "Unnamed: 0",
    "pixel_max_hits",
    "percent_1_hit",
    "concentration",
    "median_intensity",
    "difference",
]


def load_lipid_list(path):
    """Read a header-less lipid annotation spreadsheet and clean it.

    The third column is reduced to its first space-separated token, exact
    duplicate rows are removed, and only the first four columns are kept and
    renamed to LIPID_COLUMNS (any additional column is discarded).

    Args:
        path (str): Path of the Excel file.

    Returns:
        pd.DataFrame: Cleaned table with the columns listed in LIPID_COLUMNS.
    """
    raw = pd.read_excel(path, header=None).drop_duplicates()
    raw.iloc[:, 2] = raw.iloc[:, 2].apply(lambda cation: cation.split(" ")[0])
    # Trimming the third column can create new duplicates
    raw = raw.drop_duplicates()
    cleaned = raw.iloc[:, : len(LIPID_COLUMNS)].copy()
    cleaned.columns = LIPID_COLUMNS
    return cleaned


def load_peak_table(path_prefix):
    """Load the peak interval table of one slice, sorted by lower m/z bound.

    Two file versions exist: a tab-separated one and a comma-separated one with
    fewer columns. The tab-separated layout is tried first; if its expected
    columns are missing (or the file cannot be parsed that way), the
    comma-separated layout is used.

    Args:
        path_prefix (str): Path of the slice file without the ".csv" extension.

    Returns:
        pd.DataFrame: Peak table without the unused columns, sorted by "min".
    """
    csv_path = path_prefix + ".csv"
    try:
        peaks = pd.read_csv(csv_path, sep="\t").drop(DROPPED_COLUMNS_TAB, axis=1)
    except (KeyError, pd.errors.ParserError):
        # Only the "wrong file version" failures trigger the fallback, so that
        # unrelated errors (or a keyboard interrupt) are no longer swallowed
        peaks = pd.read_csv(csv_path, sep=",").drop(DROPPED_COLUMNS_COMMA, axis=1)
    return peaks.sort_values(by="min", axis=0)


# Load and clean lcms
lipid_list = load_lipid_list("/data/lipidatlas/annotations/lcm_july2021.xlsx")

# Add additional annotations (this file carries two extra columns, which are discarded)
lipid_list_2 = load_lipid_list("/data/lipidatlas/annotations/lcm_dec2021.xlsx")

# Merge annotations; on conflict, the most recent file wins
lipid_list = pd.concat([lipid_list, lipid_list_2]).drop_duplicates(
    subset=["name", "structure", "cation"], keep="last"
)

# Set peak distance threshold for annotation
THRESH = 0.003

# Loop-invariant views of the lipid list, computed once
lipid_mz = lipid_list["theoretical m/z"].to_numpy(dtype=float)
lipid_records = lipid_list.to_dict("records")
has_reference = bool(np.isfinite(lipid_mz).any())

# Annotated peaks are collected here and turned into a dataframe once at the
# end (growing a dataframe row by row is quadratic, and DataFrame.append /
# Series.append no longer exist in pandas >= 2.0)
records = []
output_columns = None

# Loop over slices
for t_names in l_t_names:
    index = t_names[0]
    name = t_names[1]

    # Handle repeated slices: they carry a third element and are exported
    # with their slice index multiplied by 1000 to keep them distinguishable
    bis = len(t_names) > 2
    if bis:
        print("WARNING: repeated slice: ", index)
    slice_id = index * 1000 if bis else index

    # Load peak interval data for each slice (file columns depend on the version)
    df = load_peak_table(name)

    # The column order of the output is fixed by the first slice
    if output_columns is None:
        output_columns = ["slice"] + list(lipid_list.columns) + list(df.columns)

    if not has_reference:
        continue

    for _, peak in df.iterrows():
        mz_estimated = peak["mz_estimated"]
        if pd.isna(mz_estimated):
            continue

        # Find the lipid whose theoretical m/z is closest to the peak
        distances = np.abs(lipid_mz - mz_estimated)
        idx = int(np.nanargmin(distances))

        # Keep the annotation only if the difference is within the threshold
        if distances[idx] < THRESH:
            record = {"slice": slice_id}
            record.update(lipid_records[idx])
            record.update(peak.to_dict())
            records.append(record)

if output_columns is None:
    raise ValueError("No slice found in l_t_names: nothing to annotate")

# Build the final dataframe: expected columns first, any extra column last
df_with_lipid_boundaries = pd.DataFrame(records)
extra_columns = [col for col in df_with_lipid_boundaries.columns if col not in output_columns]
df_with_lipid_boundaries = df_with_lipid_boundaries.reindex(columns=output_columns + extra_columns)

df_with_lipid_boundaries.to_csv(OUTPUT_PATH, index=False)



In [4]:
# Display the merged annotation table built in the previous cell
df_with_lipid_boundaries

Unnamed: 0,slice,name,structure,cation,theoretical m/z,min,max,num_pixels,mz_estimated
0,1,GP01_PC,PCOTheo_8:1,k,420.154791,420.1517,420.1554,2087.0,420.1535
0,1,GP01_PC,PCdOTheo_14:2,H,450.29788,450.2964,450.3,725.0,450.298
0,1,GP01_PC,PCOTheo_13:0,H,454.29282,454.2918,454.2955,846.0,454.2934
0,1,GP01_LysoPC,LPCOTheo_15:2,H,464.31354,464.3087,464.3116,542.0,464.3106
0,1,GP01_LysoPC,LPCOTheo_15:2,H,464.31354,464.3121,464.3161,1016.0,464.3139
...,...,...,...,...,...,...,...,...,...
0,32,GL03_TAG,TGOTheo_76:22,NH4,1158.88477,1158.8813,1158.8864,6438.0,1158.8839
0,32,GL03_TAG,TGTheo_72:2,na,1162.07,1162.0648,1162.0764,26261.0,1162.072
0,32,GL03_TAG,TGTheo_72:2,k,1178.043832,1178.0438,1178.0446,222.0,1178.0445
0,32,GL03_TAG,TGOTheo_74:1,k,1194.111462,1194.1071,1194.1127,8094.0,1194.1105
