10.1021/jo400708u 


https://massive.ucsd.edu/ProteoSAFe/dataset.jsp?task=d1e9bdcc3c15491ca1fc77bda589f702


https://massive.ucsd.edu/ProteoSAFe/DownloadResultFile?file=f.MSV000079618%2Fccms_peak%2FSF.mzXML
https://massive.ucsd.edu/ProteoSAFe/DownloadResultFile?file=f.MSV000079618%2Fraw%2FSF.mzXML

https://metabolomics-usi.gnps2.org/json/?usi1=mzspec%3AGNPS%3AGNPS-LIBRARY%3Aaccession%3ACCMSLIB00000579622

# Mass Spectrometry: GNPS & Visualization 

In [None]:
import requests
from tqdm import tqdm
import pandas as pd
import json
from rich import inspect
from IPython.display import Image, display
import plotly.graph_objects as go
from pathlib import Path

First let's download two GNPS library spectra for Surugamide A.
We will use CCMSLIB00000579271 and CCMSLIB00012471599.


https://library.gnps2.org

![](images/gnps_library.png)

We can use GNPS APIs to visualize as well

In [5]:
def plot(data):
    display(
    Image(
        data=data,
        width=800
        )
    )

def fetch_single_spectrum_image(spectrum):
    try:
        url = f"https://metabolomics-usi.gnps2.org/png/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:{spectrum}"
        response = requests.get(url)
        response.raise_for_status() 
        if response.headers.get('Content-Type') != 'image/png':
            raise ValueError("Response is not a PNG image")
        return plot(response.content)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

In [None]:
fetch_single_spectrum_image("CCMSLIB00000579271")

In [None]:
fetch_single_spectrum_image("CCMSLIB00012471599")


In [6]:
def fetch_modifinder_alignment(spectrum1, spectrum2, matches='default', ratio_to_base_peak=0.01, bar_width=1):
    try:
        url = f"https://modifinder.gnps2.org/api/visualizer/draw_alignment?spectrums=[%22{spectrum1}%22,%22{spectrum2}%22]&matches={matches}.png"
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad responses
        return response.content
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

In [None]:
display(
    Image(
        data=fetch_modifinder_alignment("CCMSLIB00000579271", "CCMSLIB00012471599", matches='default', ratio_to_base_peak=0.01, bar_width=1),
        width=500
        )
)


Let's write a better plot function. 

In [None]:
# https://ccms-ucsd.github.io/GNPSDocumentation/usi/
def get_peaks_from_gnps(ccmslib_accession=None, usi=None):
    try:
        if ccmslib_accession:
            url = f"https://metabolomics-usi.gnps2.org/json/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:{ccmslib_accession}"
        elif usi:
            url = f"https://metabolomics-usi.gnps2.org/json/?usi1={usi}"
        else:
            raise ValueError("Either ccmslib_accession or usi must be provided.")
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad responses
        data = response.json()
        return data['peaks']
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

In [58]:
get_peaks_from_gnps("CCMSLIB00000579271")

[[79.20450592041016, 0.0],
 [79.20516967773438, 0.0],
 [79.20584106445312, 0.0],
 [84.07227325439453, 0.0],
 [84.07300567626953, 0.0],
 [84.07373809814453, 0.0],
 [84.074462890625, 0.0],
 [84.0751953125, 0.0],
 [84.075927734375, 0.0],
 [84.08100891113281, 61064960.0],
 [84.08689880371094, 1202383.625],
 [84.08829498291016, 0.0],
 [84.08902740478516, 0.0],
 [84.08975982666016, 0.0],
 [84.09049224853516, 0.0],
 [86.08783721923828, 0.0],
 [86.08859252929688, 0.0],
 [86.08935546875, 0.0],
 [86.0901107788086, 0.0],
 [86.09693145751953, 240203584.0],
 [86.10376739501953, 0.0],
 [86.10452270507812, 0.0],
 [86.10527801513672, 0.0],
 [86.10604095458984, 0.0],
 [91.96652221679688, 0.0],
 [91.96735382080078, 0.0],
 [91.96819305419922, 0.0],
 [91.96903228759766, 0.0],
 [91.96986389160156, 0.0],
 [91.970703125, 0.0],
 [91.97154235839844, 0.0],
 [91.97237396240234, 0.0],
 [91.97321319580078, 0.0],
 [91.97399139404297, 0.0],
 [91.9748306274414, 0.0],
 [91.97566986083984, 0.0],
 [91.97650146484375, 0.

In [63]:
import plotly.graph_objects as go

def plot_ms2_spectrum(peaks1, peaks2=None, width=2, label_top_n=10, max_y=float('inf')):
    """
    Plot an MS2 spectrum or mirror plot using Plotly.
    
    Parameters:
    - peaks1: List of (m/z, intensity) tuples for the first spectrum
    - peaks2: Optional second spectrum for mirror plot
    - width: Line width for the peaks
    - label_top_n: Number of top-intensity peaks in peaks1 to label
    - max_y: Maximum y-axis limit (for positive and negative)
    """
    fig = go.Figure()

    # Filter peaks with positive intensity
    filtered1 = [(x, y) for x, y in peaks1 if y > 0]
    if peaks2:
        filtered2 = [(x, y) for x, y in peaks2 if y > 0]

        # Normalize both spectra to 100 if mirror plot
        max1 = max((y for _, y in filtered1), default=1)
        max2 = max((y for _, y in filtered2), default=1)
        filtered1 = [(x, y / max1 * 100) for x, y in filtered1]
        filtered2 = [(x, y / max2 * 100) for x, y in filtered2]

    # Plot first spectrum (positive axis)
    for x, y in filtered1:
        fig.add_trace(go.Scatter(
            x=[x, x], y=[0, y],
            mode="lines",
            line=dict(color="blue", width=width),
            hoverinfo="text",
            text=[f"m/z: {x:.4f}<br>Intensity: {y:.2f}"] * 2,
            showlegend=False
        ))

    # Label top N peaks in first spectrum (no hover)
    top_peaks = sorted(filtered1, key=lambda p: p[1], reverse=True)[:label_top_n]
    for x, y in top_peaks:
        fig.add_trace(go.Scatter(
            x=[x], y=[y],
            mode="text",
            text=[f"{x:.2f}"],
            textposition="top center",
            hoverinfo="skip",
            showlegend=False
        ))

    # Plot second spectrum as mirror (negative axis)
    if peaks2:
        for x, y in filtered2:
            fig.add_trace(go.Scatter(
                x=[x, x], y=[0, -y],
                mode="lines",
                line=dict(color="red", width=width),
                hoverinfo="text",
                text=[f"m/z: {x:.4f}<br>Intensity: {y:.2f}"] * 2,
                showlegend=False
            ))

    # Determine y-axis range
    max1 = max([y for _, y in filtered1], default=0)
    max2 = max([y for _, y in filtered2], default=0) if peaks2 else 0
    y_max = min(max_y, max(max1, max2) * 1.1)

    # Dynamic y-axis label
    yaxis_label = "Intensity" if not peaks2 else "Intensity (Top) / -Intensity (Bottom)"

    # Layout
    fig.update_layout(
        title="MS2 Spectrum" if not peaks2 else "MS2 Mirror Plot (Normalized)",
        xaxis_title="m/z",
        yaxis_title=yaxis_label,
        yaxis=dict(range=[-y_max, y_max] if peaks2 else [0, y_max]),
        plot_bgcolor="white",
        height=500
    )

    return fig


In [65]:
plot_ms2_spectrum(get_peaks_from_gnps("CCMSLIB00000579271"), width=5, label_top_n=10)

In [64]:
plot_ms2_spectrum(get_peaks_from_gnps("CCMSLIB00000579271"),get_peaks_from_gnps("CCMSLIB00000579271"), width=5, label_top_n=10)

Search for similar spectra in the GNPS library using fasst

In [46]:
# https://wang-bioinformatics-lab.github.io/GNPS2_Documentation/api/#fast-search

r = requests.get("https://fasst.gnps2.org/search?library=gnpsdata_index&usi=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000579271")

In [47]:
r.json().keys()


dict_keys(['results', 'timing', 'grouped_by_unit_delta_mass', 'grouped_by_dataset', 'grouped_by_dataset_unit_delta', 'metadata_groupings', 'log'])

In [48]:
len(r.json()['results'])

341

In [49]:
r.json()['results'][0]

{'Delta Mass': 0.0,
 'USI': 'mzspec:MSV000082944:library/GNPS-LIBRARY.mgf:scan:3082',
 'Charge': 1,
 'Cosine': 1.0,
 'Matching Peaks': 18,
 'Unit Delta Mass': 0,
 'Dataset': 'MSV000082944',
 'Status': 'NoID',
 'Query Filename': 'temp/queries/9c/9c4f5495d6124d57a37aefd2e96dc33b/9c4f5495d6124d57a37aefd2e96dc33b.mgf',
 'Query Scan': 1,
 'Index UnitPM': 912,
 'Index IdxInUnitPM': 125500,
 'Filtered Input Spectrum Path': 'temp/queries/9c/9c4f5495d6124d57a37aefd2e96dc33b/0.json'}

In [70]:
r.json()['results'][0]['USI']

'mzspec:MSV000082944:library/GNPS-LIBRARY.mgf:scan:3082'

In [72]:
plot_ms2_spectrum(get_peaks_from_gnps(usi=r.json()['results'][0]['USI']),get_peaks_from_gnps("CCMSLIB00000579271"), width=5, label_top_n=10)

In [None]:
r.json()['results'][20]

{'Delta Mass': 0.0,
 'USI': 'mzspec:MSV000086314:ccms_peak/Microbes_LCMSMS/P309_LCMSMS.mzML:scan:2432',
 'Charge': 1,
 'Cosine': 0.95,
 'Matching Peaks': 16,
 'Unit Delta Mass': 0,
 'Dataset': 'MSV000086314',
 'Status': 'NoID',
 'Query Filename': 'temp/queries/9c/9c4f5495d6124d57a37aefd2e96dc33b/9c4f5495d6124d57a37aefd2e96dc33b.mgf',
 'Query Scan': 1,
 'Index UnitPM': 912,
 'Index IdxInUnitPM': 121566,
 'Filtered Input Spectrum Path': 'temp/queries/9c/9c4f5495d6124d57a37aefd2e96dc33b/0.json'}

In [74]:
plot_ms2_spectrum(get_peaks_from_gnps(usi=r.json()['results'][20]['USI']),get_peaks_from_gnps("CCMSLIB00000579271"), width=5, label_top_n=10)

Get full file from GNPS



In [89]:
def download_file(url, local_filename):
    if Path(local_filename).exists():
        raise FileExistsError(f"{local_filename} already exists. Please choose a different name or delete the existing file.")
    # Stream the download to avoid loading the entire file into memory
    with requests.get(url, stream=True) as r:
        r.raise_for_status()  # Check for HTTP errors
        total_size = int(r.headers.get('content-length', 0))
        with open(local_filename, 'wb') as f:
            for chunk in tqdm(r.iter_content(chunk_size=81920), total=total_size // 81920, unit='KB'):
                f.write(chunk)
    return local_filename


def resolve_file_from_gnps(usi):
    """
    Resolve a file from GNPS using its USI.
    
    Parameters:
    - usi: The USI of the file to resolve.
    - filename: The name to save the file as.
    Returns:
    - a string url
    
    """
    url = f"https://dashboard.gnps2.org/downloadlink?usi={usi}"
    with requests.get(url) as r:
        r.raise_for_status()
        return r.text

In [82]:
resolve_file_from_gnps(r.json()['results'][20]['USI'])

'https://massiveproxy.gnps2.org/massiveproxy/MSV000086314/ccms_peak/Microbes_LCMSMS/P309_LCMSMS.mzML'

In [91]:
url = resolve_file_from_gnps(r.json()['results'][20]['USI'])
filename = Path(url).name
filename = Path("../data", filename)

download_file(url, filename)

1384KB [00:40, 34.46KB/s]                          


PosixPath('../data/P309_LCMSMS.mzML')

In [101]:
!head -n20 '../data/P309_LCMSMS.mzML'

<?xml version="1.0" encoding="utf-8"?>
<indexedmzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.2_idx.xsd">
  <mzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.0.xsd" id="spec-00045" version="1.1.0">
    <cvList count="2">
      <cv id="MS" fullName="Proteomics Standards Initiative Mass Spectrometry Ontology" version="4.1.41" URI="https://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo"/>
      <cv id="UO" fullName="Unit Ontology" version="09:04:2014" URI="https://raw.githubusercontent.com/bio-ontology-research-group/unit-ontology/master/unit.obo"/>
    </cvList>
    <fileDescription>
      <fileContent>
        <cvParam cvRef="MS" accession="MS:1000579" name="MS1 spectrum" value=""/>
        <cvPa

For info on mzML see: https://code4np.github.io/posts/intro_to_mass_spec/2_mzml

In [None]:
https://gnps2.org/dashboards/embedded/resolver/?usi1=CCMSLIB00000579271&usi2=CCMSLIB00012471599&tolerance=0.5

In [None]:
df =msql_fileloading.load_data("../data/SF.mzXML")
https://fasst.gnps2.org/search?library=gnpslibrary&usi=\
https://fasst.gnps2.org/search?library=gnpslibrary&usi=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000579271


mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000579271
mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00012471599


https://dashboard.gnps2.org/downloadlink?usi=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000579271&amp
ftp://ccms-ftp.ucsd.edu/GNPS_Library_Provenance/2590d89de473433c9bf13a5e47ee789b/Pure_Surugamide_Diluted.mzXML;

In [None]:
df[0]


In [None]:
# get precmz greater and lesser + than  912.628636
df[1][(df[1]['precmz'] > 912.628636 - 1) & (df[1]['precmz'] < 912.628636 + 1)]

In [None]:
# get unique precmz values
df[1]['precmz'].unique()

In [None]:

def download_file(url, local_filename):
    # Stream the download to avoid loading the entire file into memory
    with requests.get(url, stream=True) as r:
        r.raise_for_status()  # Check for HTTP errors
        total_size = int(r.headers.get('content-length', 0))
        with open(local_filename, 'wb') as f:
            for chunk in tqdm(r.iter_content(chunk_size=8192), total=total_size // 8192, unit='KB'):
                f.write(chunk)
    return local_filename

download_file("https://massive.ucsd.edu/ProteoSAFe/DownloadResultFile?file=f.MSV000079618%2Fccms_peak%2FSF.mzXML", "../data/SF.mzXML")

In [None]:
msql_fileloading.load_data("data/SF.mzXML")

In [None]:

def calc_ppm(mass, ppm_tolerance):
    return mass * ppm_tolerance / 1e6


def process_file(file, metabolites, ion_mass_map, ppm_tolerance, basedir, level, output_dir):
    ms1_df, ms2_df = msql_fileloading.load_data(str(file))
    relative_filename = file.relative_to(basedir)
    result_dfs = []
    for _, row in metabolites.iterrows():
        for species, (species_mass, species_mult) in ion_mass_map.items():
            expected_mass = calc_expected_mass(row['monoisotopic_mass'], species_mass, species_mult)
            lower = expected_mass - calc_ppm(expected_mass, ppm_tolerance)
            upper = expected_mass + calc_ppm(expected_mass, ppm_tolerance)
            if level == 1:
                results_df = ms1_df[(ms1_df['mz'] >= lower) & (ms1_df['mz'] <= upper)]
            elif level == 2:
                results_df= ms2_df[(ms2_df['precmz'] >= lower) & (ms2_df['precmz'] <= upper)]
            results_df = results_df.copy()
            if not results_df.empty:
                results_df["species"] = species
                results_df["expected_mass"] = expected_mass
                results_df["filename"] = str(relative_filename)
                results_df = results_df.round({'i': 4, 'i_norm': 4, 'i_tic_norm': 4, 'mz': 4, 'rt': 4, 'expected_mass': 4})
                file_path = Path(output_dir, f"{str(row['pubchem_cid'])}_{str(row['metabolite_of_interest'])}_mslevel-{str(level)}.tsv")
                results_df.to_csv(file_path, sep="\t", index=False, mode='a', header=not file_path.exists())
    