# MSGF VS PeptideAtlas Reanalysis

#### reqired files:
###### 1 all_usi.txt : all peptide atlas usi found
###### 2 MassIVE-KB_HPP_proteins.tsv: all MassIVE proteins
###### 3 all ambituity files on same directory eg. MSV000096271
###### 4 fasta_file = 'uniprotkb_human_proteome_UP000005640_with_isoforms_2024-10-08.fasta'
###### 5 PA_observations.csv peptide atlas all peptides



#### Merge ambiguity files --> ambiguity_merged.tsv

In [None]:
import os
import shutil
import pandas as pd

# Directory containing the TSV files
directory = 'c:/Users/xuech/Desktop/UCSD/grad/ccmc/proteinDataRefine/Evaluate_reanalysis/MSV000086793/' #TODO

# Output file path
output_file = os.path.join(directory, 'ambiguity_merged.tsv')

# Archive directory
archive_directory = os.path.join(directory, 'ambiguity_archive')

# Create the archive directory if it doesn't exist
if not os.path.exists(archive_directory):
    os.makedirs(archive_directory)

# Remove the output file if it already exists
# if os.path.exists(output_file):
#     os.remove(output_file)

# Iterate over files in the directory
for filename in os.listdir(directory):
    if filename.startswith('MSGF-PLUS-AMBIGUITY') and filename.endswith('.tsv'):
        print(f'Processing file: {filename}')
        filepath = os.path.join(directory, filename)
        try:
            df = pd.read_csv(filepath, sep='\t', low_memory=False)
        except pd.errors.EmptyDataError:
            print(f'Skipping empty file: {filename}')
            continue
        # Append to the output file
        df.to_csv(output_file, sep='\t', index=False, mode='a', header=not os.path.exists(output_file))
        # Move the processed file to the archive directory
        shutil.move(filepath, os.path.join(archive_directory, filename))

#### ambiguity_merged.tsv --> filtered.tsv

In [1]:
import pandas as pd

# Load the datasets in chunks
chunksize = 10000
proteins_to_skip_file = 'MassIVE-KB_HPP_proteins.tsv'
with open(proteins_to_skip_file, 'r') as file:
    massive_kb_proteins = set(line.strip() for line in file)

filtered_chunks = []
for chunk in pd.read_csv('ambiguity_merged.tsv', sep='\t', low_memory=False, error_bad_lines=False, chunksize=chunksize):
    # Process the 'opt_global_TopCanonicalProtein' column to extract the protein ID

    chunk['opt_global_TopCanonicalProtein'] = chunk['opt_global_TopCanonicalProtein'].apply(
        lambda x: x.split('|')[1] if isinstance(x, str) and '|' in x else x
    )
    # Filter the chunk
    filtered_chunk = chunk[~chunk['opt_global_TopCanonicalProtein'].isin(massive_kb_proteins)]
    filtered_chunks.append(filtered_chunk)

# Concatenate all filtered chunks
filtered_reanalysis_df = pd.concat(filtered_chunks)

# Save the filtered dataframe to a new file
filtered_reanalysis_df.to_csv('filtered.tsv', sep='\t', index=False)



  for chunk in pd.read_csv('ambiguity_merged.tsv', sep='\t', low_memory=False, error_bad_lines=False, chunksize=chunksize):
Skipping line 2042216: expected 183 fields, saw 197
Skipping line 2042217: expected 183 fields, saw 197
Skipping line 2042218: expected 183 fields, saw 197
Skipping line 2042219: expected 183 fields, saw 197
Skipping line 2042220: expected 183 fields, saw 197
Skipping line 2042221: expected 183 fields, saw 197
Skipping line 2042222: expected 183 fields, saw 197
Skipping line 2042223: expected 183 fields, saw 197
Skipping line 2042224: expected 183 fields, saw 197
Skipping line 2042225: expected 183 fields, saw 197
Skipping line 2042226: expected 183 fields, saw 197
Skipping line 2042227: expected 183 fields, saw 197
Skipping line 2042228: expected 183 fields, saw 197
Skipping line 2042229: expected 183 fields, saw 197
Skipping line 2042230: expected 183 fields, saw 197
Skipping line 2042231: expected 183 fields, saw 197
Skipping line 2042232: expected 183 fields

## spectrum level

#### all_usi.txt + filtered.tsv --> example_reanalysis_spectrum.tsv

In [None]:
import pandas as pd
import re

# Read the PeptideAtlas USIs from merged.txt
usi_file = 'all_usi.txt'
usi_data = []
with open(usi_file, 'r') as file:
    for line in file:
        parts = line.strip().split(':')
        if len(parts) == 6:
            dataset, spectrum_file, scan, peptide_identification, peptide_charge = parts[1], parts[2], parts[4], parts[5].split('/')[0], parts[5].split('/')[1]
            usi_data.append([line.strip(), dataset, spectrum_file, scan, peptide_identification, peptide_charge])

usi_df = pd.DataFrame(usi_data, columns=['USI', 'Dataset', 'Spectrum_File', 'Scan_Number', 'Peptide_Identification', 'Peptide_Charge'])
usi_df.to_csv('df_print.tsv', sep='\t', index=False)

# Create a dictionary for quick lookup
usi_dict = {}
for _, row in usi_df.iterrows():
    key = (row['Spectrum_File'], row['Scan_Number'])
    usi_dict[key] = row

# Read the reanalysis results from MSGF-PLUS-AMBIGUITY-81a33a88-group_by_spectrum-main.tsv
reanalysis_file = 'filtered.tsv'
reanalysis_df = pd.read_csv(reanalysis_file, sep='\t', low_memory=False)

# Initialize columns for the output DataFrame
reanalysis_df.insert(0, 'PeptideAtlas_USI', '')
reanalysis_df.insert(1, 'PeptideAtlas_peptide', '')
reanalysis_df.insert(2, 'PeptideAtlas_peptide_demod', '')
reanalysis_df.insert(3, 'Peptide_match', '')
reanalysis_df.insert(4, 'PeptideAtlas_charge', '')


# Match spectra from the PeptideAtlas USIs lists to the reanalysis results
for index, row in reanalysis_df.iterrows():
    original_filepath = row['opt_global_OriginalFilepath']
    scan_number = str(row['opt_global_scan'])
    
    # Extract the spectrum file name from the original file path and remove the .mzML extension if present
    spectrum_file = original_filepath.split('/')[-1].replace('.mzML', '')
    
    key = (spectrum_file, scan_number)
    if key in usi_dict:
        usi_row = usi_dict[key]
        reanalysis_df.at[index, 'PeptideAtlas_USI'] = usi_row['USI']
        reanalysis_df.at[index, 'PeptideAtlas_peptide'] = usi_row['Peptide_Identification']
        # Remove all substrings like "[*]" from PeptideAtlas_peptide
        peptide_demod = re.sub(r'\[.*?\]', '', usi_row['Peptide_Identification']).replace('-', '')
        reanalysis_df.at[index, 'PeptideAtlas_peptide_demod'] = peptide_demod
        
        # Set Peptide_match to 1 if PeptideAtlas_peptide_demod matches opt_global_UnmodPep, otherwise 0
        reanalysis_df.at[index, 'Peptide_match'] = 1 if peptide_demod == row['opt_global_UnmodPep'] else 0
        reanalysis_df.at[index, 'PeptideAtlas_charge'] = usi_row['Peptide_Charge']
        #print(f'Matched file: {usi_row["Spectrum_File"]}, Scan number: {usi_row["Scan_Number"]}')
    
    if index % 1000 == 0:
        print(f'Processed {index} rows.')

    # Identify the datasets and spectrum files that have at least one match
    # matched_datasets = set()
matched_spectrum_files = set()
matched_scans = set()

for index, row in reanalysis_df.iterrows():
    if row['PeptideAtlas_USI']:
        usi_parts = row['PeptideAtlas_USI'].split(':')
        if len(usi_parts) == 6:
            dataset = usi_parts[1].replace('.mzML', '')
            spectrum_file = usi_parts[2]
            matched_spectrum_files.add(spectrum_file)
            matched_scans.add((spectrum_file, usi_parts[4]))
print(f'Matched {len(matched_spectrum_files)} spectrum files and {len(matched_scans)} scans.')

# Add empty USIs for unmatched spectra all at once
new_rows = []
for key, usi_row in usi_dict.items():
    spectrum_file, scan_number = key
    if usi_row['Dataset'] == 'MSV000086793' and spectrum_file in matched_spectrum_files and key not in matched_scans: #TODO
        new_rows.append({
            'PeptideAtlas_USI': usi_row['USI'],
            'PeptideAtlas_peptide': usi_row['Peptide_Identification'],
            'PeptideAtlas_peptide_demod': re.sub(r'\[.*?\]', '', usi_row['Peptide_Identification']).replace('-', ''),
            'Peptide_match': 0,
            'PeptideAtlas_charge': usi_row['Peptide_Charge'],
            'opt_global_OriginalFilepath': '',
            'opt_global_scan': '',
            'opt_global_UnmodPep': ''
        })

# Append all new rows to the DataFrame at once
if new_rows:
    reanalysis_df = pd.concat([reanalysis_df, pd.DataFrame(new_rows)], ignore_index=True)


output_file = 'example_reanalysis_spectrum.tsv'
reanalysis_df.to_csv(output_file, sep='\t', index=False)

Processed 0 rows.
Processed 1000 rows.
Processed 2000 rows.
Processed 3000 rows.
Processed 4000 rows.
Processed 5000 rows.
Processed 6000 rows.
Processed 7000 rows.
Processed 8000 rows.
Processed 9000 rows.
Processed 10000 rows.
Processed 11000 rows.
Processed 12000 rows.
Processed 13000 rows.
Processed 14000 rows.
Processed 15000 rows.
Processed 16000 rows.
Processed 17000 rows.
Processed 18000 rows.
Processed 19000 rows.
Processed 20000 rows.
Processed 21000 rows.
Processed 22000 rows.
Processed 23000 rows.
Processed 24000 rows.
Processed 25000 rows.
Processed 26000 rows.
Processed 27000 rows.
Processed 28000 rows.
Processed 29000 rows.
Processed 30000 rows.
Processed 31000 rows.
Processed 32000 rows.
Processed 33000 rows.
Processed 34000 rows.
Processed 35000 rows.
Processed 36000 rows.
Processed 37000 rows.
Processed 38000 rows.
Processed 39000 rows.
Processed 40000 rows.
Processed 41000 rows.
Processed 42000 rows.
Processed 43000 rows.
Processed 44000 rows.
Processed 45000 rows.
P

: 

## Peptide Level

#### example_reanalysis_spectrum.tsv -> example_reanalysis_peptide.tsv

In [3]:
#multiprocessing doeson't work here, please directly call compare_reanalyze_peptide.py in terminal or copy and run code below

# import pandas as pd
# from Bio import SeqIO
# from multiprocessing import Pool, cpu_count
# import os

# # Load the data
# chunk_size = 10000
# fasta_file = 'uniprotkb_human_proteome_UP000005640_with_isoforms_2024-10-08.fasta'

# # Define a function to get the peptide sequence
# def get_peptide_sequence(row):
#     return row['PeptideAtlas_peptide_demod'] if pd.notna(row['PeptideAtlas_peptide_demod']) else row['opt_global_UnmodPep']

# # Define a function to get the peptide charge
# def get_peptide_charge(row):
#     return row['PeptideAtlas_charge'] if pd.notna(row['PeptideAtlas_charge']) else row['charge']

# # Define a function to get the protein ID from the sequence column "tr|D9J307|D9J307_HUMAN"
# def get_protein_id_from_msgf(row):
#     accession = row['accession']
#     if isinstance(accession, str) and '|' in accession:
#         parts = accession.split('|')
#         if len(parts) > 1:
#             return parts[1]
#     return None

# # Load protein sequences from fasta file
# # Pre-compute and store sequences in a dictionary for faster lookups
# protein_sequences = {}
# for record in SeqIO.parse(fasta_file, "fasta"):
#     gene_id = next((part.split('=')[1] for part in record.description.split() if part.startswith('GN=')), 'UNKNOWN')
#     protein_sequences[record.id] = {
#         'gene_id': gene_id,
#         'sequence': str(record.seq).replace('I', 'L')
#     }

# # Function to count matches in protein sequences
# def count_matches(peptide, allow_mutation=False):
#     peptide = peptide.replace('I', 'L')
#     protein_ids = set()
#     gene_ids = set()
#     protein_list = []
#     gene_list = []
    
#     for header, data in protein_sequences.items():
#         gene_id = data['gene_id']
#         sequence = data['sequence']
#         protein_id = header.split('|')[1] if '|' in header else header
        
#         if allow_mutation:
#             # Check for near-matches (SAAP)
#             for i in range(len(sequence) - len(peptide) + 1):
#                 window = sequence[i:i+len(peptide)]
#                 if sum(1 for a, b in zip(peptide, window) if a != b) <= 1:
#                     protein_ids.add(protein_id)
#                     gene_ids.add(gene_id)
#                     break
#         else:
#             # Check for exact matches
#             if peptide in sequence:
#                 protein_ids.add(protein_id)
#                 gene_ids.add(gene_id)
    
#     # Return counts and lists
#     return (
#         len(protein_ids), 
#         len(gene_ids), 
#         ';'.join(protein_ids) if protein_ids else 'None', 
#         ';'.join(gene_ids) if gene_ids else 'UNKNOWN'
#     )

# # Function to process a peptide
# def process_peptide(to_parallel_process):
#     peptide, peptide_data, peptideatlas_df = to_parallel_process
    
#     # Get exact matches and SAAP matches
#     num_proteins, num_genes, list_proteins, list_genes = count_matches(peptide)
#     num_proteins_saap, num_genes_saap, list_proteins_saap, list_genes_saap = count_matches(peptide, allow_mutation=True)
    
#     # Build the output row
#     peptide_row = {
#         'Peptide sequence': peptide,
#         'Peptide charge': peptide_data.apply(get_peptide_charge, axis=1).iloc[0],
#         'Protein identifier': peptide_data.apply(get_protein_id_from_msgf, axis=1).iloc[0],
#         'Num_specs_both': len(peptide_data[(pd.notna(peptide_data['PeptideAtlas_USI'])) & (pd.notna(peptide_data['sequence']))]),
#         'Num_specs_MSGF': len(peptide_data[(pd.isna(peptide_data['PeptideAtlas_USI'])) & (pd.notna(peptide_data['sequence']))]),
#         'Num_specs_PA': len(peptide_data[(pd.notna(peptide_data['PeptideAtlas_USI'])) & (pd.isna(peptide_data['sequence']))]),
#         'PA_peptide': 1 if not peptideatlas_df[peptideatlas_df.iloc[:, 5] == peptide].empty else 0,
#         'PA_psms': len(peptideatlas_df[peptideatlas_df.iloc[:, 5] == peptide]),
#         'Num_proteins': num_proteins,
#         'List_proteins': list_proteins,
#         'Num_genes': num_genes,
#         'List_genes': list_genes,
#         'Num_proteins_saap': num_proteins_saap,
#         'List_proteins_saap': list_proteins_saap,
#         'Num_genes_saap': num_genes_saap,
#         'List_genes_saap': list_genes_saap
#     }
#     #print(peptide_row)
#     return peptide_row
#     # Function to process an existing peptide
# def process_existing_peptide(peptide, peptide_data, peptideatlas_df):
#     # Build the output row without counting matches
#     peptide_row = {
#         'Peptide sequence': peptide,
#         'Peptide charge': peptide_data.apply(get_peptide_charge, axis=1).iloc[0],
#         'Protein identifier': peptide_data.apply(get_protein_id_from_msgf, axis=1).iloc[0],
#         'Num_specs_both': len(peptide_data[(pd.notna(peptide_data['PeptideAtlas_USI'])) & (pd.notna(peptide_data['sequence']))]),
#         'Num_specs_MSGF': len(peptide_data[(pd.isna(peptide_data['PeptideAtlas_USI'])) & (pd.notna(peptide_data['sequence']))]),
#         'Num_specs_PA': len(peptide_data[(pd.notna(peptide_data['PeptideAtlas_USI'])) & (pd.isna(peptide_data['sequence']))]),
#         'PA_peptide': 1 if not peptideatlas_df[peptideatlas_df.iloc[:, 5] == peptide].empty else 0,
#         'PA_psms': len(peptideatlas_df[peptideatlas_df.iloc[:, 5] == peptide]),
#         'Num_proteins': None,
#         'List_proteins': None,
#         'Num_genes': None,
#         'List_genes': None,
#         'Num_proteins_saap': None,
#         'List_proteins_saap': None,
#         'Num_genes_saap': None,
#         'List_genes_saap': None
#     }
#     print(f"Existing Peptide ID: {peptide_row['Peptide sequence']}")
#     return peptide_row
# # Load the PeptideAtlas data
# peptideatlas_df = pd.read_csv('PeptideAtlas_peptides.tsv', sep='\t')

# # Use a set to keep track of unique peptides
# unique_peptides_set = set()

# if __name__ == "__main__":
#     # Open the output file
#     peptide_file_exists = os.path.exists('example_reanalysis_peptide.tsv')
#     with open('example_reanalysis_peptide.tsv', 'a+') as output_file:
#         # Write the header
#         if not peptide_file_exists:
#             output_file.write('\t'.join([
#                 'Peptide sequence', 'Peptide charge', 'Protein identifier', 'Num_specs_both', 'Num_specs_MSGF', 'Num_specs_PA',
#                 'PA_peptide', 'PA_psms', 'Num_proteins', 'List_proteins', 'Num_genes', 'List_genes', 'Num_proteins_saap',
#                 'List_proteins_saap', 'Num_genes_saap', 'List_genes_saap'
#             ]) + '\n')
        
#         # Read the spectrum file in chunks
#         # Check if the file exists
#         if peptide_file_exists:
#             # Read the existing peptides from the file
#             with open('example_reanalysis_peptide.tsv', 'r') as existing_file:
#                 # Skip the header
#                 next(existing_file)
#                 for line in existing_file:
#                     peptide = line.split('\t')[0]  # Extract the peptide sequence (first column)
#                     unique_peptides_set.add(peptide)
#         print(f"Number of unique peptides added: {len(unique_peptides_set)}")
#         for chunk in pd.read_csv('example_reanalysis_spectrum.tsv', sep='\t', chunksize=chunk_size):
#             chunk['Peptide sequence'] = chunk.apply(get_peptide_sequence, axis=1)
#             counter = 0
#             total_peptides = len(chunk['Peptide sequence'].unique())

#             def update_counter(result):
#                 global counter
#                 counter += 1
#                 # print(f"Processed {counter}/{total_peptides} peptides")

#             with Pool(cpu_count()) as pool:
#                 peptides_to_process = [
#                     (peptide, chunk[chunk['Peptide sequence'] == peptide], peptideatlas_df)
#                     for peptide in chunk['Peptide sequence'].unique()
#                 ]
                
#                 results = []
#                 to_parallel_process = []
#                 not_parallel_process = []

#                 for peptide, peptide_data, pa_df in peptides_to_process:
#                     if peptide not in unique_peptides_set:
#                         #print(f"Appending New Peptide: {peptide}")
#                         unique_peptides_set.add(peptide)
#                         to_parallel_process.append((peptide, peptide_data, pa_df))
#                     else:
#                         #print(f"Processing Existing Peptide: {peptide}")
#                         not_parallel_process.append((peptide, peptide_data, pa_df))

#                 # Parallel process the new peptides
#                 if to_parallel_process:
#                     for result in pool.imap_unordered(process_peptide, to_parallel_process):
#                         results.append(result)
#                         update_counter(result)
#                         print(f"Processed {counter}/{total_peptides} peptides in chunk {chunk.index[0] // chunk_size + 1}")

#                 # Process the existing peptides
#                 for peptide, peptide_data, pa_df in not_parallel_process:
#                     results.append(process_existing_peptide(peptide, peptide_data, pa_df))
#                     update_counter(None)
#                     print(f"Processed {counter}/{total_peptides} peptides in chunk {chunk.index[0] // chunk_size + 1}")
                    
#                 for peptide_row in results:
#                     peptide_sequence = peptide_row['Peptide sequence']
                    
#                     # Read the current output file content
#                     output_file.seek(0)
#                     lines = output_file.readlines()
#                     print()
#                     # Check if the peptide is already in the file
#                     found = False
#                     for i, line in enumerate(lines):
#                         if line.startswith(peptide_sequence):
#                             found = True
#                             existing_data = line.strip().split('\t')
                            
#                             # Update the existing line with new data
#                             existing_data[3] = str(int(existing_data[3]) + peptide_row['Num_specs_both'])
#                             existing_data[4] = str(int(existing_data[4]) + peptide_row['Num_specs_MSGF'])
#                             existing_data[5] = str(int(existing_data[5]) + peptide_row['Num_specs_PA'])
#                             existing_data[6] = str(int(existing_data[6]) + peptide_row['PA_peptide'])
#                             existing_data[7] = str(int(existing_data[7]) + peptide_row['PA_psms'])
                            
#                             # Write the updated line back to the file
#                             lines[i] = '\t'.join(existing_data) + '\n'
#                             break
                    
#                     if not found:
#                         # Append the new peptide row to the file
#                         lines.append('\t'.join(map(str, peptide_row.values())) + '\n')
                    
#                     # Write the updated content back to the file
#                     output_file.seek(0)
#                     output_file.truncate()
#                     output_file.writelines(lines)

#                 pool.close()
#                 pool.join()

#             print(f'Processed chunk {chunk.index[0] // chunk_size + 1}')

#### fix peptide number

In [3]:
import pandas as pd
import re

# Load the peptide and spectrum data
peptide_data_path = "example_reanalysis_peptide.tsv"
spectrum_data_path = "example_reanalysis_spectrum.tsv"
peptideatlas_df = pd.read_csv('PeptideAtlas_peptides.tsv', sep="\t")
usi_file = 'all_usi.txt'

with open(usi_file, 'r') as file:
    peptides_pa = [re.sub(r'\[.*?\]', '', line.strip().split(':')[-1].split('/')[0].replace('-', '')) for line in file if len(line.strip().split(':')) == 6]

peptide_df = pd.read_csv(peptide_data_path, sep="\t")
spectrum_df = pd.read_csv(spectrum_data_path, sep="\t")


def get_protein_id_from_pa(row):
    peptide_sequence = row['Peptide sequence']
    matching_rows = peptideatlas_df[peptideatlas_df.iloc[:, 5] == peptide_sequence]
    if not matching_rows.empty:
        return matching_rows.iloc[0, 0]  # Return the protein ID from column 0
    return None
output_df = pd.DataFrame()

# Function to update the specified columns
def update_peptide_data(peptide_df, spectrum_df):
    # Iterate through each peptide in the peptide dataframe
    pa_observations_df = pd.read_csv('PA_observations.csv')
    for index, row in peptide_df.iterrows():
        # Print progress update
        if index % 100 == 0:
            print(f"Processing peptide {index+1}/{len(peptide_df)} ({(index+1)/len(peptide_df)*100:.1f}%)")
        peptide_sequence = row['Peptide sequence']
        
        # Filter spectrum data for the current peptide
        spectrum_subset = spectrum_df[(spectrum_df['sequence'] == peptide_sequence) | (spectrum_df['PeptideAtlas_peptide_demod'] == peptide_sequence)]
        
        # Update the columns
        num_specs_both = len(spectrum_subset[(pd.notna(spectrum_subset['PeptideAtlas_USI'])) & (pd.notna(spectrum_subset['sequence']))])
        num_specs_msgf = len(spectrum_subset[(pd.isna(spectrum_subset['PeptideAtlas_USI'])) & (pd.notna(spectrum_subset['sequence']))])
        num_specs_pa = len(spectrum_subset[(pd.notna(spectrum_subset['PeptideAtlas_USI'])) & (pd.isna(spectrum_subset['sequence']))])
        pa_peptide = 0

        if peptide_sequence in pa_observations_df['Sequence'].values:
            pa_peptide = 1
        # Count PSMs from all_usi.txt
        pa_psms = peptides_pa.count(peptide_sequence)
        # try:
        #     with open('all_usii.txt', 'r') as usi_file:
        #         for line in usi_file:
        #             usi = line.strip()
        #             # Extract and demodify the peptide sequence from the USI
        #             if ":" not in usi or "/" not in usi:
        #                 continue
                    
        #             # Extract sequence part between last colon and slash
        #             parts = usi.split(':')
        #             if len(parts) < 2:
        #                 continue
                    
        #             seq_part = parts[-1].split('/')[0]
                    
        #             # Remove modifications (text in square brackets)
        #             demod_seq = re.sub(r'\[.*?\]', '', seq_part)
                    
        #             # If peptide matches the current peptide sequence
        #             if demod_seq == peptide_sequence:
        #                 pa_psms += 1
        # except FileNotFoundError:
        # pa_psms = len(spectrum_subset[spectrum_subset['sequence'] == peptide_sequence])
        
        # Update the peptide dataframe
        
        protein_identifier = row['Protein identifier']
        if isinstance(protein_identifier, str):
            peptide_df.at[index, 'Protein identifier MSGF'] = protein_identifier.split('-')[0]
        else:
            peptide_df.at[index, 'Protein identifier MSGF'] = None
        peptide_df.at[index, 'Protein identifier PA'] = ";".join(peptideatlas_df[peptideatlas_df.iloc[:, 5] == peptide_sequence].iloc[:, 0].dropna().unique())
        peptide_df.at[index, 'Num_specs_both'] = num_specs_both
        peptide_df.at[index, 'Num_specs_MSGF'] = num_specs_msgf
        peptide_df.at[index, 'Num_specs_PA'] = num_specs_pa
        peptide_df.at[index, 'PA_peptide'] = pa_peptide
        peptide_df.at[index, 'PA_psms'] = pa_psms

    return peptide_df

# Update the peptide data
updated_peptide_df = update_peptide_data(peptide_df, spectrum_df)

# Save the updated dataframe
updated_peptide_df.to_csv("example_peptide_reanalysis_updated.tsv", sep="\t", index=False)

  spectrum_df = pd.read_csv(spectrum_data_path, sep="\t")


Processing peptide 1/12060 (0.0%)
Processing peptide 101/12060 (0.8%)
Processing peptide 201/12060 (1.7%)
Processing peptide 301/12060 (2.5%)
Processing peptide 401/12060 (3.3%)
Processing peptide 501/12060 (4.2%)
Processing peptide 601/12060 (5.0%)
Processing peptide 701/12060 (5.8%)
Processing peptide 801/12060 (6.6%)
Processing peptide 901/12060 (7.5%)
Processing peptide 1001/12060 (8.3%)
Processing peptide 1101/12060 (9.1%)
Processing peptide 1201/12060 (10.0%)
Processing peptide 1301/12060 (10.8%)
Processing peptide 1401/12060 (11.6%)
Processing peptide 1501/12060 (12.4%)
Processing peptide 1601/12060 (13.3%)
Processing peptide 1701/12060 (14.1%)
Processing peptide 1801/12060 (14.9%)
Processing peptide 1901/12060 (15.8%)
Processing peptide 2001/12060 (16.6%)
Processing peptide 2101/12060 (17.4%)
Processing peptide 2201/12060 (18.3%)
Processing peptide 2301/12060 (19.1%)
Processing peptide 2401/12060 (19.9%)
Processing peptide 2501/12060 (20.7%)
Processing peptide 2601/12060 (21.6%

## Protein level

In [4]:
import pandas as pd

# Load the data
peptide_file = 'example_peptide_reanalysis_updated.tsv'
protein_file = 'PeptideAtlas_proteins_not_in_MassIVE.tsv'

peptide_df = pd.read_csv(peptide_file, sep='\t')
protein_df = pd.read_csv(protein_file, sep='\t')

# Initialize the result dataframe
result_columns = [
    'Protein', 'Num_peptides_both', 'Num_peptides_MSGF', 'Num_peptides_PA',
    'Num_unique_both', 'Num_unique_MSGF', 'Num_unique_PA',
    'Num_specs_both', 'Num_specs_MSGF', 'Num_specs_PA'
]
result_df = pd.DataFrame(columns=result_columns)

# Iterate over each protein
result_list = []
for protein in protein_df['nextprot_accession']:
    # Filter peptides that match the current protein
    matched_peptides = peptide_df[
        (peptide_df['Protein identifier MSGF'] == protein) | 
        (peptide_df['Protein identifier PA'] == protein)
    ]
    
    # Calculate the required counts and sums
    num_peptides_both = len(matched_peptides[matched_peptides['Num_specs_both'] > 0])
    num_peptides_MSGF = len(matched_peptides[matched_peptides['Num_specs_MSGF'] > 0])
    num_peptides_PA = len(matched_peptides[matched_peptides['Num_specs_PA'] > 0])
    
    num_unique_both = len(matched_peptides[(matched_peptides['Num_specs_both'] > 0) & (matched_peptides['Num_genes_saap'] == 1)])
    num_unique_MSGF = len(matched_peptides[(matched_peptides['Num_specs_MSGF'] > 0) & (matched_peptides['Num_genes_saap'] == 1)])
    num_unique_PA = len(matched_peptides[(matched_peptides['Num_specs_PA'] > 0) & (matched_peptides['Num_genes_saap'] == 1)])
    
    num_specs_both = matched_peptides['Num_specs_both'].sum()
    num_specs_MSGF = matched_peptides['Num_specs_MSGF'].sum()
    num_specs_PA = matched_peptides['Num_specs_PA'].sum()
    
    # Append the results to the result list
    result_list.append({
        'Protein': protein,
        'Num_peptides_both': num_peptides_both,
        'Num_peptides_MSGF': num_peptides_MSGF,
        'Num_peptides_PA': num_peptides_PA,
        'Num_unique_both': num_unique_both,
        'Num_unique_MSGF': num_unique_MSGF,
        'Num_unique_PA': num_unique_PA,
        'Num_specs_both': num_specs_both,
        'Num_specs_MSGF': num_specs_MSGF,
        'Num_specs_PA': num_specs_PA
    })

# Convert the result list to a dataframe
result_df = pd.concat([pd.DataFrame([result]) for result in result_list], ignore_index=True)

# Save the result to a new file in TSV format
result_df.to_csv('example_protein_reanalysis.tsv', sep='\t', index=False)