# In [2]:
import pandas as pd
import numpy as np

# ---------------------------------------------------------------------------
# Common parameters
# ---------------------------------------------------------------------------
# Fragment peaks whose intensity relative to the spectrum's strongest peak is
# below this value are ignored during matching.
min_intensity = 0.01
ppm = 1e-6  # Global variable

# Value compared against the 'Mode' column of the MS2 list when selecting
# product ions.
Mode = "Pos"

# MS2 filter tolerances.
# NOTE(review): the trailing numbers look like alternative settings - confirm.
mass_tolerance_NL = 30 * ppm  # 10
mass_tolerance_MS2 = 25 * ppm  # 15

# Modification types to screen for; each flag set to 1 enables one type.
Fentanyl = 1
modification_types = []
if Fentanyl == 1:
    modification_types.append("Fentanyl")

# ---------------------------------------------------------------------------
# Inputs
# ---------------------------------------------------------------------------
# Read MS1 peak table
peak_table = pd.read_csv("Fentanyl_lib.csv", encoding='latin1')

# Read Biomarker MS2 list
characteristic_MS2_path = "MS2 list.xlsx"
biomarker_MS2DB = pd.read_excel(characteristic_MS2_path)
characteristic_MS2DB = biomarker_MS2DB

# ---------------------------------------------------------------------------
# Determine the NL and MS2 lists to filter
# ---------------------------------------------------------------------------
# Keep only the MS2 list rows of the enabled modification types, grouped in
# the order the types were enabled.
_subsets_by_type = [
    biomarker_MS2DB[biomarker_MS2DB["Modification_type"] == modification_type]
    for modification_type in modification_types
]
if _subsets_by_type:
    MS2DB_subset = pd.concat(_subsets_by_type)
else:
    # No type enabled: keep an empty frame that still has the MS2 list's
    # columns, so the lookups below yield empty lists instead of a KeyError.
    MS2DB_subset = biomarker_MS2DB.iloc[0:0]

# Neutral losses are taken regardless of 'Mode'; product ions only for the
# configured Mode. A mask without matches simply yields an empty list.
_is_nl = MS2DB_subset['MS2_characteristics'] == "NL"
_is_proion = (
    (MS2DB_subset['MS2_characteristics'] == "ProIon")
    & (MS2DB_subset['Mode'] == Mode)
)
NL_candidates = MS2DB_subset.loc[_is_nl, 'Mass'].tolist()
MS2_peaks = MS2DB_subset.loc[_is_proion, 'Mass'].tolist()

# ---------------------------------------------------------------------------
# Result containers
# ---------------------------------------------------------------------------
result = pd.DataFrame()
# One (initially empty) flag column per row of the selected MS2 list subset.
result_peak = pd.DataFrame(columns=range(len(MS2DB_subset)))

# Iterate over all peaks and record, for every row of the peak table, one flag
# per neutral-loss candidate followed by one flag per product ion:
#   1       -> a fragment above the intensity cut-off matches
#   0       -> no fragment matches
#   "noMS2" -> the row has no MS/MS spectrum
#
# NOTE(review): mass_tolerance_NL / mass_tolerance_MS2 are defined as
# `<n> * ppm` but are compared directly against an absolute m/z difference
# below. Confirm whether the threshold was meant to be scaled by the mass
# (e.g. tolerance * precursor); the comparison is left unchanged here.
peak_rows = []
for precursor, spectrum in zip(peak_table["Precursor m/z"], peak_table["MSMS spectrum"]):
    # A spectrum is a whitespace-separated list of "mz:intensity" tokens;
    # a missing cell stringifies to "nan".
    tokens = str(spectrum).split()

    if tokens and tokens != ['nan']:
        # Clean spectra: parse every token into an (mz, intensity) pair.
        # Unpacking raises ValueError if a token is not exactly "mz:intensity".
        mz_values = []
        intensity_values = []
        for token in tokens:
            mz_text, intensity_text = token.split(":")
            mz_values.append(float(mz_text))
            intensity_values.append(float(intensity_text))
        fragment_mz = np.array(mz_values, dtype=float)
        fragment_intensity = np.array(intensity_values, dtype=float)

        # Keep only fragments at or above the relative-intensity cut-off.
        # An all-zero spectrum yields NaN ratios, which never pass the test.
        with np.errstate(divide="ignore", invalid="ignore"):
            relative_intensity = fragment_intensity / fragment_intensity.max()
            intense_mz = fragment_mz[relative_intensity >= min_intensity]

        # Neutral loss: precursor - fragment should equal the candidate mass.
        nl_results = [
            int(np.any(np.abs(precursor - intense_mz - nl_mass) < mass_tolerance_NL))
            for nl_mass in NL_candidates
        ]
        # Product ion: a fragment should sit at the candidate mass itself.
        ms2_results = [
            int(np.any(np.abs(ion_mass - intense_mz) < mass_tolerance_MS2))
            for ion_mass in MS2_peaks
        ]
    else:
        nl_results = ["noMS2"] * len(NL_candidates)
        ms2_results = ["noMS2"] * len(MS2_peaks)

    peak_rows.append(nl_results + ms2_results)

# Build the flag table in one go (DataFrame.append was removed in pandas 2.0
# and re-copied the whole frame on every call). The column layout prepared
# for result_peak earlier in the script is kept; columns without data are NaN.
result_peak = pd.DataFrame(peak_rows, index=peak_table.index).reindex(
    columns=result_peak.columns
)

result = pd.concat([peak_table, result_peak], axis=1)

# Name the flag columns after the MS2 list rows they were computed from.
# The flags were generated for every "NL" row and for the "ProIon" rows of the
# configured Mode only, so the names must be selected with the same filters or
# they no longer line up with the flag columns.
_is_nl = MS2DB_subset["MS2_characteristics"] == "NL"
_is_proion = (
    (MS2DB_subset["MS2_characteristics"] == "ProIon")
    & (MS2DB_subset["Mode"] == Mode)
)

# Neutral-loss columns: "NL<modification type>_<formula>"
nl_modification_types = MS2DB_subset.loc[_is_nl, "Modification_type"]
nl_formulas = MS2DB_subset.loc[_is_nl, "Formula"]
nl_names = ["NL" + str(x) + "_" + str(y) for x, y in zip(nl_modification_types, nl_formulas)]

# Product-ion columns: "ProIon<modification type>_<formula>"
proion_modification_types = MS2DB_subset.loc[_is_proion, "Modification_type"]
proion_formulas = MS2DB_subset.loc[_is_proion, "Formula"]
proion_names = ["ProIon" + str(x) + "_" + str(y) for x, y in zip(proion_modification_types, proion_formulas)]

combined_vector = nl_names + proion_names

# `result` is the peak table followed by the flag columns. The flags occupy
# the first len(combined_vector) positions after the peak-table columns; any
# further columns are unused padding and are dropped. Slicing by position
# (rather than with a negative offset) also works when there are no flags.
_n_peak_columns = len(peak_table.columns)
_n_flag_columns = len(combined_vector)
result = result.iloc[:, :_n_peak_columns + _n_flag_columns].copy()
result.columns = list(result.columns[:_n_peak_columns]) + combined_vector

# Enhance readability of results: count the matched features per row.
# Only the flag columns are summed, selected by position so duplicate names
# are harmless. "noMS2" markers are coerced to NaN, and min_count=1 makes a
# row without any numeric flag come out as NaN instead of 0.
_flag_values = result.iloc[:, _n_peak_columns:].apply(pd.to_numeric, errors="coerce")
sum_column = _flag_values.sum(axis=1, min_count=1)

# Append the count as the last column.
result.insert(len(result.columns), 'MS2 feature', sum_column)

# Custom function to replace elements containing 'noMS2' with NaN
def replace_noMS2(value):
    """Return NaN if the text form of *value* contains 'noMS2', else *value*.

    The check is a substring test on ``str(value)``, so repeated or embedded
    markers (e.g. 'noMS2noMS2') are replaced as well. Any other value is
    returned unchanged.
    """
    return np.nan if 'noMS2' in str(value) else value

# Blank out (NaN) every 'MS2 feature' entry that still carries the 'noMS2'
# marker; all other entries are left as they are.
result['MS2 feature'] = result['MS2 feature'].map(replace_noMS2)

# Output results: write the annotated peak table without the index column.
result.to_csv(path_or_buf='result-lib.csv', index=False)
