# In [9]:
import pandas as pd

# Clean an exported peak table: keep only "M + 0" peaks that have an MS/MS
# spectrum, collapse linked adducts and near-duplicate peaks onto their
# largest-Area member, and save the survivors to an Excel sheet.

# Input file and clean-up settings (kept at module level so they stay easy
# to tweak from a notebook).
file_path = 'Met-fentanyl.txt'
sn_threshold = 3  # S/N threshold
area_threshold = 10000  # Area threshold
mz_threshold = 0.005  # max 'Precursor m/z' difference between duplicates
rt_threshold = 0.2  # max 'RT (min)' difference between duplicates

_ADDUCT_MARKER = "adduct linked to"
_UNWANTED_COMMENT = "found in higher mz's MsMs"


def _prepare_columns(table: pd.DataFrame) -> pd.DataFrame:
    """Drop rows without an MS/MS spectrum and add the derived columns.

    Returns a new frame; *table* itself is left untouched.
    """
    # Work on an explicit copy so the column assignments below never write
    # into a view of the caller's frame (avoids SettingWithCopyWarning).
    table = table.dropna(subset=['MSMS spectrum']).copy()

    # 'Spectra' is 'MSMS spectrum' with every space turned into a line break
    # and every colon into a space (presumably "mz:intensity mz:intensity"
    # becomes one "mz intensity" pair per line -- confirm against the export
    # format).  regex=False: both patterns are literal text.
    table['Spectra'] = (
        table['MSMS spectrum']
        .astype(str)
        .str.replace(' ', '\n', regex=False)
        .str.replace(':', ' ', regex=False)
    )

    # Ensure 'Comment' is a string.  Missing comments become '' rather than
    # the literal text "nan", which would otherwise end up in the output.
    comments = table['Comment'].astype(object)
    table['Comment'] = comments.where(comments.notna(), '').astype(str)
    return table


def _filter_isotope_and_comments(table: pd.DataFrame) -> pd.DataFrame:
    """Keep "M + 0" rows whose comment lacks the higher-m/z MS/MS note."""
    is_monoisotopic = table['Isotope'] == "M + 0"
    is_unwanted = table['Comment'].str.contains(
        _UNWANTED_COMMENT, regex=False, na=False
    )
    return table[is_monoisotopic & ~is_unwanted]


def _linked_peak_ids(comment: str) -> list:
    """Return the peak IDs named by "adduct linked to <id>_..." entries.

    The comment is split on ';'; for every part containing the marker, the ID
    is the text between the marker and the next underscore, stripped.
    """
    ids = []
    for part in comment.split(';'):
        _, found, tail = part.partition(_ADDUCT_MARKER)
        if not found:
            continue
        peak_id = tail.split('_')[0].strip()
        if peak_id:
            ids.append(peak_id)
    return ids


def _drop_weaker_adducts(table: pd.DataFrame) -> pd.DataFrame:
    """Remove adduct rows that are out-ranked by a linked peak's Area.

    For each row whose comment links it to other peaks: if one of the linked
    rows still present has a larger 'Area', the row is removed together with
    every linked row below that largest Area.  Rows already removed are
    skipped, so a peak is never dropped twice.  'PeakID' is converted to
    strings, and the result has a fresh 0..n-1 index.
    """
    work = table.reset_index(drop=True)
    work['PeakID'] = work['PeakID'].astype(str)
    peak_ids = work['PeakID']
    areas = work['Area']
    keep = pd.Series(True, index=work.index)

    for pos, comment in enumerate(work['Comment'].astype(str).tolist()):
        if not keep.iat[pos] or _ADDUCT_MARKER not in comment:
            continue
        partners = keep & peak_ids.isin(_linked_peak_ids(comment))
        if not partners.any():
            continue
        best_partner_area = areas[partners].max()
        # False when either side is NaN, in which case nothing is removed.
        if areas.iat[pos] < best_partner_area:
            keep.iat[pos] = False
            keep &= ~(partners & (areas < best_partner_area))

    return work[keep].reset_index(drop=True)


def _apply_ms1_thresholds(
    table: pd.DataFrame, sn_min: float, area_min: float
) -> pd.DataFrame:
    """Keep rows with 'S/N' >= *sn_min* and 'Area' >= *area_min*."""
    return table[(table['S/N'] >= sn_min) & (table['Area'] >= area_min)]


def _merge_duplicate_peaks(
    table: pd.DataFrame, mz_tol: float, rt_tol: float
) -> pd.DataFrame:
    """Collapse near-identical peaks, keeping the largest 'Area' of each group.

    Rows are visited in ascending 'Precursor m/z'.  The first remaining row
    anchors a group made of every remaining row within *mz_tol* of its m/z
    and within *rt_tol* of its RT; the group's largest-Area row is kept and
    the whole group is removed from further consideration.

    Expects *table* to have unique index labels.
    """
    remaining = table.sort_values('Precursor m/z')
    kept_labels = []
    while not remaining.empty:
        anchor_mz = remaining['Precursor m/z'].iloc[0]
        anchor_rt = remaining['RT (min)'].iloc[0]
        same_peak = (
            ((remaining['Precursor m/z'] - anchor_mz).abs() <= mz_tol)
            & ((remaining['RT (min)'] - anchor_rt).abs() <= rt_tol)
        )
        # A missing m/z or RT compares False even with itself; force the
        # anchor into its own group so every pass consumes at least one row.
        same_peak.iloc[0] = True

        group_areas = remaining.loc[same_peak, 'Area']
        if group_areas.notna().any():
            kept_labels.append(group_areas.idxmax())
        else:
            kept_labels.append(group_areas.index[0])
        remaining = remaining[~same_peak]

    # Selecting by label keeps column dtypes, and keeps the columns themselves
    # when no row survives.
    return table.loc[kept_labels]


def clean_peak_table(
    table: pd.DataFrame,
    *,
    sn_min: float = sn_threshold,
    area_min: float = area_threshold,
    mz_tol: float = mz_threshold,
    rt_tol: float = rt_threshold,
) -> pd.DataFrame:
    """Run the full clean-up on a raw peak table and return the result.

    Steps, in order: drop rows without 'MSMS spectrum' and add 'Spectra';
    keep "M + 0" rows without the unwanted comment; resolve linked adducts;
    apply the S/N and Area thresholds; merge duplicate peaks.

    Reads the columns 'MSMS spectrum', 'Comment', 'Isotope', 'PeakID',
    'Area', 'S/N', 'Precursor m/z' and 'RT (min)'.  The input frame is not
    modified.
    """
    table = _prepare_columns(table)
    table = _filter_isotope_and_comments(table)
    table = _drop_weaker_adducts(table)
    table = _apply_ms1_thresholds(table, sn_min, area_min)
    return _merge_duplicate_peaks(table, mz_tol, rt_tol)


if __name__ == "__main__":
    # Read the Met-fentanyl.txt file into a DataFrame
    data = pd.read_csv(file_path, sep="\t")  # Adjust separator if necessary

    # Result DataFrame
    peak_table = clean_peak_table(data)

    # Save the results to XLSX
    peak_table.to_excel("peak_clean.xlsx", index=False)
