# Notebook 2
Compute the peak annotations of each individual slice, merge them, and export the result as a single CSV file.

### Load important modules

In [None]:
# Standard modules
import numpy as np
import pandas as pd
import os

# Multithreading
from threadpoolctl import threadpool_limits

# Move to root directory for easier module import.
# NOTE: the path is relative to the current working directory, so running this
# cell a second time moves two more levels up.
os.chdir("../../")

# LBAE modules (resolved relative to the working directory set just above)
from modules.tools.maldi_conversion import (
    load_peak_file,
    get_standardized_values,
)

# Set thread limit: cap the thread pools controlled by threadpoolctl at 16 threads
threadpool_limits(16)


### Create a list of raw data filenames

In [None]:
# Raw data folder of each brain, and the matching folder for temporary app data
path_brain_1 = "/data/lipidatlas/data/data_raw/BRAIN1/"
path_brain_2 = "/data/lipidatlas/data/data_raw/BRAIN2/"
path_brain_1_temp = "/data/lipidatlas/data/app/data/temp/brain_1"
path_brain_2_temp = "/data/lipidatlas/data/app/data/temp/brain_2"

# Token that immediately precedes the slice number in the acquisition names
split_value_1 = "MouseBrainCMC_S"
split_value_2 = "MouseBrain2_S"

# Filled with one list of [slice_index, path] entries per brain
ll_t_names = []

# If True, only the first acquisition of a repeated slice index is kept
remove_duplicates = True

for path_brain, path_brain_temp, split_value in zip(
    [path_brain_1, path_brain_2],
    [path_brain_1_temp, path_brain_2_temp],
    [split_value_1, split_value_2],
):
    # Load filenames. Each entry is [slice_index, path]: the slice index is the
    # integer that follows split_value in the name (cut at the first "_", "A"
    # or "("), and the path is <path_brain><name>/<name>. Sorting orders the
    # entries by slice index, then by path.
    l_t_names = sorted(
        [
            [
                int(name.split(split_value)[1].split("_")[0].split("A")[0].split("(")[0]),
                path_brain + name + "/" + name,
            ]
            for name in os.listdir(path_brain)
            if "MouseBrain" in name
        ]
    )

    # Correct for duplicates: an entry is a duplicate when it carries the same
    # slice index as the entry right before it in the sorted list. Duplicates
    # are tagged with "bis" and, if requested, dropped. The kept entries are
    # collected in a new list rather than deleted in place, because deleting
    # by position while scanning shifts the positions of all later entries.
    l_t_names_checked = []
    previous_index = None
    for t_names in l_t_names:
        is_duplicate = t_names[0] == previous_index
        previous_index = t_names[0]
        if is_duplicate:
            t_names.append("bis")
            print("WARNING: duplicate for slice " + str(t_names[0]))
            if remove_duplicates:
                continue
        l_t_names_checked.append(t_names)
    l_t_names = l_t_names_checked

    # Remove slices that have already been processed (disabled by default)
    os.makedirs(path_brain_temp, exist_ok=True)
    remove_already_loaded = False
    if remove_already_loaded:
        # The slice index is read from the second "_"-separated token of each
        # file whose name contains "raw", minus its last 7 characters
        existing_names = {
            int(name.split("_")[1][:-7]) for name in os.listdir(path_brain_temp) if "raw" in name
        }
        l_t_names = [x for x in l_t_names if x[0] not in existing_names]

    # Print the final list of names
    for t_names in l_t_names:
        print(t_names[0], t_names[1].split("/")[-1])

    ll_t_names.append(l_t_names)


In [None]:
# Shift the slice indices of brain_2 by 22 (in place), then chain both brains
# into a single list
# NOTE(review): 22 is presumably the number of slices in brain_1 - confirm
for t_names in ll_t_names[1]:
    t_names[0] += 22

l_t_names = [*ll_t_names[0], *ll_t_names[1]]

# Show the merged list: slice index followed by the last component of the path
for t_names in l_t_names:
    print(t_names[0], t_names[1].rsplit("/", 1)[-1])


### Compute and export the annotations

In [None]:
# Load the LCMS annotation table and drop exact duplicate rows
lipid_list = pd.read_excel(
    "/data/lipidatlas/annotations/lcms_mar2022.xlsx", header=0
).drop_duplicates()

# Replace parentheses by spaces in the lipid label and trim the result
lipid_list["Lipids"] = lipid_list["Lipids"].map(
    lambda x: x.replace(")", " ").replace("(", " ").strip()
)

# Split the label at its first space: the left part is the lipid name, the rest
# is its structure. `n` is passed by keyword because recent pandas versions
# only accept `pat` positionally in `str.split`.
lipid_list[["name", "structure"]] = lipid_list["Lipids"].str.split(
    " ",
    n=1,
    expand=True,
)

# Strip the spaces from the adduct, then extract the cation: the text between
# the first "+" and the following "]" when the adduct contains a "+", otherwise
# the whole adduct string
lipid_list["Adducts"] = lipid_list["Adducts"].map(lambda x: x.replace(" ", ""))
lipid_list["cation"] = lipid_list["Adducts"].map(
    lambda x: x.split("+")[1].split("]")[0] if "+" in x else x
)

# Keep the m/z under an explicit name and drop the raw columns
lipid_list["theoretical m/z"] = lipid_list["m/z"]
lipid_list = lipid_list.drop(columns=["Lipids", "Adducts", "m/z"])
lipid_list


In [None]:
# Set peak distance threshold for annotation: a peak is annotated only if its
# estimated m/z lies strictly within THRESH of the closest theoretical m/z
THRESH = 0.003

# Fail early with an explicit message instead of exporting nothing
if not l_t_names:
    raise ValueError("No slice to annotate: l_t_names is empty")

# Theoretical m/z of every reference lipid, looked up once for all peaks
s_theoretical_mz = lipid_list["theoretical m/z"]
l_lipid_columns = list(lipid_list.columns)

# Annotated peaks are accumulated as plain dicts and turned into a dataframe
# once at the end: growing a dataframe row by row copies it at every step, and
# DataFrame.append / Series.append no longer exist in pandas >= 2.0
l_rows = []
l_peak_columns = None

# Loop over slices
for t_names in l_t_names:
    index = t_names[0]
    name = t_names[1]

    # Handle repeated slices
    # NOTE(review): `bis` is never set to True, so the x1000 re-indexing of
    # repeated slices below is currently inactive - confirm this is intended
    bis = False
    if len(t_names) > 2:
        print("WARNING: repeated slice: ", index)

    # Load peak interval data for each slice
    df = load_peak_file(name, array=False)

    # Remember the peak columns of the first slice for the output header
    if l_peak_columns is None:
        l_peak_columns = list(df.columns)

    for peak_label, row in df.iterrows():
        # For each estimated m/z, find the position of the closest value in lipid_list
        idx = np.argmin(np.abs(s_theoretical_mz - row.mz_estimated))
        mz_hit = lipid_list.iloc[idx]

        # Keep the peak only if the difference is within the threshold
        if abs(row.mz_estimated - mz_hit["theoretical m/z"]) < THRESH:
            # The slice index is stored as a plain scalar, so it cannot be
            # lost through index alignment
            l_rows.append(
                {
                    "slice": index * 1000 if bis else index,
                    **mz_hit.to_dict(),
                    **row.to_dict(),
                }
            )

# Column order: slice index, lipid annotation columns, then peak columns
if l_rows:
    df_with_lipid_boundaries = pd.DataFrame(l_rows)
else:
    # No peak matched: still export the expected header
    df_with_lipid_boundaries = pd.DataFrame(
        columns=["slice"] + l_lipid_columns + l_peak_columns
    )

df_with_lipid_boundaries.to_csv("data/annotations/lipid_annotation.csv", index=False)
df_with_lipid_boundaries


### Find out which lipids have been transformed

In [None]:
def build_df_transformed_lipids(brain_1=True):
    """Export and return the annotations of the lipids listed as transformed.

    The m/z values returned by `get_standardized_values` are each matched to
    the closest "theoretical m/z" of the module-level `lipid_list`. A match is
    kept when the two values differ by strictly less than 0.003. The matched
    rows of `lipid_list` are written to
    "data/annotations/transformed_lipids_brain_1.csv" (or "..._brain_2.csv")
    without the index, then returned.

    Args:
        brain_1: If True, read the brain 1 data and write the brain 1 file;
            otherwise use the brain 2 paths.

    Returns:
        A dataframe with the columns of `lipid_list`, one row per matched m/z
        value (in the order of the matched values), indexed from 0.
    """
    # NOTE(review): the brain 1 data sits under "processed/brain1/" whereas the
    # brain 2 data sits directly under "processed/" - confirm both paths
    if brain_1:
        path_array_data = "/data/lipidatlas/data/processed/brain1/BRAIN1"
        path_array_transformed_data = "/data/lipidatlas/data/processed/brain1/BRAIN1_normalized"
        path_output = "data/annotations/transformed_lipids_brain_1.csv"
    else:
        path_array_data = "/data/lipidatlas/data/processed/BRAIN2"
        path_array_transformed_data = "/data/lipidatlas/data/processed/BRAIN2_normalized"
        path_output = "data/annotations/transformed_lipids_brain_2.csv"

    # Slice index is irrelevant here
    _, l_lipids_float, _, _ = get_standardized_values(
        slice_index=1,
        path_array_data=path_array_data,
        path_array_transformed_data=path_array_transformed_data,
    )

    # Set peak distance threshold for annotation
    THRESH = 0.003

    # Theoretical m/z of every reference lipid, looked up once for all values
    s_theoretical_mz = lipid_list["theoretical m/z"]

    # Positions (in lipid_list) of the matched lipids. They are collected first
    # and selected in one go: appending to a dataframe row by row copies it at
    # every step, and DataFrame.append no longer exists in pandas >= 2.0
    l_hits = []
    for mz_estimated in l_lipids_float:
        # Find the position of the closest theoretical value in lipid_list
        idx = np.argmin(np.abs(s_theoretical_mz - mz_estimated))

        # Keep it only if the difference is within the threshold
        if abs(mz_estimated - s_theoretical_mz.iloc[idx]) < THRESH:
            l_hits.append(idx)

    df_transformed_lipids = lipid_list.iloc[l_hits].reset_index(drop=True)
    df_transformed_lipids.to_csv(path_output, index=False)
    return df_transformed_lipids


# Export the transformed-lipid tables of both brains: brain 2 first, then brain 1.
# Each call writes its own csv file under data/annotations/.
build_df_transformed_lipids(brain_1=False)
build_df_transformed_lipids(brain_1=True)


In [None]:
# Reload the table of transformed lipids exported for brain 1
df_annotations_MAIA_transformed_lipids_brain_1 = pd.read_csv(
    "data/annotations/transformed_lipids_brain_1.csv"
)

# Keep only the second "_"-separated token of each lipid name
s_short_names = df_annotations_MAIA_transformed_lipids_brain_1["name"].map(
    lambda full_name: full_name.split("_")[1]
)
df_annotations_MAIA_transformed_lipids_brain_1["name"] = s_short_names

# Print one line per lipid; every row must unpack into exactly four values
for index, row in df_annotations_MAIA_transformed_lipids_brain_1.iterrows():
    name, structure, cation, mz = row
    print(name, structure, cation, mz)
