# 2 | Get Peptide Counts per Protein

This notebook calculates the number of peptides recorded for each protein, both within each MS run (sample-wise) and across the whole cohort (cohort-wise). These counts are used to filter the final protein matrix. The code is written in Python.

##### Import packages

In [1]:
import os

import numpy as np
import pandas as pd

##### Read in master dataframe

In [2]:
# Load the tab-separated master quantification table into memory.
# NOTE(review): every column is loaded, while the cells in this notebook only
# read Run, Protein.Ids, Modified.Sequence, Stripped.Sequence, Global.Q.Value
# and Proteotypic. If nothing else needs the remaining columns, passing
# `usecols=` here would cut memory use considerably -- confirm before changing.
master_tsv_path = '/mnt/scratch/dia/Sanger28Jul21-search-srl-quant.tsv'
master_df = pd.read_csv(master_tsv_path, sep='\t')
master_df.shape

(210893182, 58)

##### Keep proteotypic entries with a global Q value ≤ 0.01

In [3]:
# Keep only rows that pass both criteria:
#   * global Q value at or below the 1% threshold
#   * flagged as proteotypic (Proteotypic == 1)
passes_global_q = master_df['Global.Q.Value'] <= 0.01
is_proteotypic = master_df['Proteotypic'] == 1
filter_df = master_df.loc[passes_global_q & is_proteotypic]
filter_df.shape

(188579050, 58)

##### Create dataframe for peptide counts

In [4]:
# Reduce the filtered table to one row per unique
# (Run, Protein.Ids, Modified.Sequence) combination, so that later counts
# are counts of distinct modified peptides.
peptide_key_cols = ['Run', 'Protein.Ids', 'Modified.Sequence']
df = filter_df.loc[:, peptide_key_cols].drop_duplicates()

##### Get sample-wise peptide counts

In [5]:
# Count the peptides recorded for each (Run, Protein.Ids) pair.
# `df` is expected to hold de-duplicated (Run, Protein.Ids, Modified.Sequence)
# rows, so the non-null count of the remaining column is the number of
# distinct modified peptides. The count column keeps the name
# 'Modified.Sequence' in the written file.
df_count = df.groupby(['Run','Protein.Ids']).count().reset_index()

# Create the output directory if needed: to_csv does not create missing parent
# directories, and failing here would waste the expensive load/filter above.
samplewise_path = 'matrix_diann_normalised_051021/e0022_diann_peptide_counts_samplewise.txt'
os.makedirs(os.path.dirname(samplewise_path), exist_ok=True)
df_count.to_csv(samplewise_path, sep='\t', index=False)

##### Get cohort-wise peptide counts

In [6]:
# Count the distinct modified peptides recorded for each protein across all
# runs: drop the Run column, de-duplicate (Protein.Ids, Modified.Sequence)
# pairs, then count per protein. The result is indexed by Protein.Ids with a
# single 'Peptides' column.
protein_peptide_pairs = df[['Protein.Ids','Modified.Sequence']].drop_duplicates()
df_hits = (
    protein_peptide_pairs
    .groupby('Protein.Ids')
    .count()
    .rename(columns={'Modified.Sequence':'Peptides'})
)

# Create the output directory if needed: to_csv does not create missing parent
# directories, and failing here would waste the expensive load/filter above.
cohortwise_path = 'matrix_diann_normalised_051021/e0022_diann_peptide_counts_cohortwise.txt'
os.makedirs(os.path.dirname(cohortwise_path), exist_ok=True)
df_hits.to_csv(cohortwise_path, sep='\t')

##### Get peptide and sample counts for paper

In [7]:
# Headline totals over the de-duplicated peptide table.
# dropna=False keeps the count identical to len(Series.unique()).
n_modified_peptides = df['Modified.Sequence'].nunique(dropna=False)
n_proteins = df['Protein.Ids'].nunique(dropna=False)
print('Total number of modified peptides:', n_modified_peptides)
print('Total number of proteins:', n_proteins)

Total number of modified peptides: 80150
Total number of proteins: 8501


In [8]:
# Split the totals into iRT and non-iRT entries. The iRT entries are the rows
# whose Protein.Ids is one of the three identifiers below; the membership
# mask is computed once and reused for all three counts.
irt_protein_ids = ['ProCal','RMISv2','RT-Kit-WR']
is_irt = df['Protein.Ids'].isin(irt_protein_ids)

# dropna=False keeps each count identical to len(Series.unique()).
n_non_irt_peptides = df.loc[~is_irt, 'Modified.Sequence'].nunique(dropna=False)
n_irt_peptides = df.loc[is_irt, 'Modified.Sequence'].nunique(dropna=False)
n_irt_proteins = df.loc[is_irt, 'Protein.Ids'].nunique(dropna=False)

print('Total number of non-iRT peptides:', n_non_irt_peptides)
print('Total number of iRT peptides:', n_irt_peptides)
print('Total number of iRT proteins:', n_irt_proteins)

Total number of non-iRT peptides: 80085
Total number of iRT peptides: 65
Total number of iRT proteins: 3


In [9]:
# Number of distinct MS runs present in the de-duplicated peptide table.
# dropna=False keeps the count identical to len(Series.unique()).
n_samples = df['Run'].nunique(dropna=False)
print('Total number of samples:', n_samples)

Total number of samples: 6980


In [10]:
# Number of distinct stripped sequences among the filtered rows. This is
# taken from filter_df because df does not carry the Stripped.Sequence column.
# dropna=False keeps the count identical to len(Series.unique()).
n_stripped_peptides = filter_df['Stripped.Sequence'].nunique(dropna=False)
print('Total number of stripped peptides:', n_stripped_peptides)

Total number of stripped peptides: 80150
