In [1]:
import pandas as pd

# Read the CSV file
data = pd.read_csv('PA_observations.csv', names=['Protein', 'Paref', 'Sequence'])

# Display the first few rows of the dataframe
data.head()

Unnamed: 0,Protein,Paref,Sequence
0,Protein,Paref,Sequence
1,A0A075B6H7,PAp10045845,LLIYGGSTR
2,A0A075B6H7,PAp10440458,ASQSVSSNYLTWYQQKPGQAPR
3,A0A075B6H7,PAp10107447,EIVMTQSPSTLSLSPGER
4,A0A075B6H7,PAp10438654,ASQSVSSTYLTWYQQKPGQAPR


In [2]:
import os
import sys
from multiprocessing import Pool, cpu_count

import requests
from bs4 import BeautifulSoup


# Function to create the correct PeptideAtlas URL for a given peptide
def create_peptide_url(peptide_seq):
    base_url = "https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetPeptide"
    url = f"{base_url}?_tab=3&atlas_build_id=572&searchWithinThis=Peptide+Sequence&searchForThis={peptide_seq}&action=QUERY"
    return url

In [3]:
# Generate URLs for all sequences in the data
data['URL'] = data['Sequence'].apply(create_peptide_url)

# Display the first few rows with the new URL column
data.head()

Unnamed: 0,Protein,Paref,Sequence,URL
0,Protein,Paref,Sequence,https://db.systemsbiology.net/sbeams/cgi/Pepti...
1,A0A075B6H7,PAp10045845,LLIYGGSTR,https://db.systemsbiology.net/sbeams/cgi/Pepti...
2,A0A075B6H7,PAp10440458,ASQSVSSNYLTWYQQKPGQAPR,https://db.systemsbiology.net/sbeams/cgi/Pepti...
3,A0A075B6H7,PAp10107447,EIVMTQSPSTLSLSPGER,https://db.systemsbiology.net/sbeams/cgi/Pepti...
4,A0A075B6H7,PAp10438654,ASQSVSSTYLTWYQQKPGQAPR,https://db.systemsbiology.net/sbeams/cgi/Pepti...


In [4]:
# Function to fetch and parse the table from a given URL
def fetch_table_from_url(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        spectra_table = soup.find('table', id='individual_spectra')  # Find the table with id "individual_spectra"
        dataset_table = soup.find_all('table')[-1]  # Select the last table on the site
        if spectra_table:
            # Extract table headers
            # headers = [header.text.strip() for header in spectra_table.find_all('th')]
            # headers = ['Modified Sequence','Charge','Expt','Spectrum Name']
            # Extract table rows
            rows = []
            for row in spectra_table.find_all('tr')[1:]:  # Skip the header row
                if 'Table truncated' in row.text:
                    continue
                cells = [cell.text.strip() for cell in row.find_all('td')]
                if len(cells) > 2:
                    expt = cells[2]
                else:
                    expt = None
                    print(f"Expt is None for row: {row}")
                    print(f"URL: {url}")
                expt = expt.replace(',', '')  # Remove all commas from expt
                dataset = None
                if dataset_table:
                    dataset_rows = dataset_table.find_all('tr')[1:]  # Skip the header row
                    for dataset_row in dataset_rows:
                        dataset_cells = [cell.text.strip() for cell in dataset_row.find_all('td')]
                        if dataset_cells and dataset_cells[0] == expt:
                            dataset = dataset_cells[1]

                            if dataset:
                                if ',' in dataset:
                                    dataset = next((d for d in dataset.split(',') if d.startswith('MSV') or d.startswith('PXD')), None)
                                elif not (dataset.startswith('MSV') or dataset.startswith('PXD')):
                                    dataset = None
                            break
                if hasattr(cells[0], 'contents'):
                    modified_seq = ''.join([str(content) if content.name != 'span' else content.text for content in cells[0].contents])
                else:
                    modified_seq = cells[0]  # Fallback to the cell's text if 'contents' attribute is not present
                rows.append([modified_seq, cells[1], expt, cells[5],dataset])
            return rows
    return None, None


In [5]:
# Example: Fetch and display the table for the first URL in the data
all_data = []
for url in data['URL'][11851:]:  # Start from the second URL (index 1)
    rows = fetch_table_from_url(url)
    print(f"Processing URL: {url} ({data['URL'].tolist().index(url) + 1}/{len(data['URL'])})")
    if rows:
        for row in rows:
            print(row)
            modified_seq, charge, expt, spectrum_name, dataset = row
            if spectrum_name and dataset:
                file_name = spectrum_name.split('.')[0]
                scan_number = spectrum_name.split('.')[1]
                usi = f"mzspec:{dataset}:{file_name}:scan:{scan_number}:{modified_seq}/{charge}"
                all_data.append(usi)

    # Append all generated USIs to the file
    with open('all_usi.txt', 'a') as f:
        for usi in all_data:
            f.write(usi + '\n')
    all_data.clear()  # Clear the list to avoid duplicate entries in the file





Processing URL: https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetPeptide?_tab=3&atlas_build_id=572&searchWithinThis=Peptide+Sequence&searchForThis=MTPDQALKH&action=QUERY (11852/12615)
['MTPDQALKH', '2', '10000', 'YE_20180516_SK_HLA_A3402_3Ips_a50mio_R2_01.10022.10022.2', 'MSV000084172']
['MTPDQALKH', '2', '10000', 'YE_20180516_SK_HLA_A3402_3Ips_a50mio_R2_02.10164.10164.2', 'MSV000084172']
['MTPDQALKH', '2', '10000', 'YE_20180516_SK_HLA_A3402_3Ips_a50mio_R2_01.10082.10082.2', 'MSV000084172']
['MTPDQALKH', '2', '10000', 'YE_20180516_SK_HLA_A3402_3Ips_a50mio_R2_02.10108.10108.2', 'MSV000084172']
['MTPDQALKH', '2', '10000', 'YE_20180516_SK_HLA_A3402_3IPs_a50mio_R1_01.09311.09311.2', 'MSV000084172']
['MTPDQALKH', '2', '10000', 'YE_20180516_SK_HLA_A3402_3Ips_a50mio_R1_02.09992.09992.2', 'MSV000084172']
['MTPDQALKH', '3', '10000', 'YE_20180516_SK_HLA_A3402_3IPs_a50mio_R1_01.09329.09329.3', 'MSV000084172']
['MTPDQALKH', '3', '10000', 'YE_20180516_SK_HLA_A3402_3Ips_a50mio_R2_02.10161.10