In [1]:
# import numpy as np
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt
# data = pd.read_excel('Malaria_Research_Data.xlsx', header=0)
# total_spectra = data.groupby('Experiment name')['Protein percentage of total spectra'].sum()
# data['Normalized protein percentage'] = (data['Protein percentage of total spectra']/data['Experiment name'].map(total_spectra))*100
# data.info()

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Display option: do not truncate long cell values (e.g. multi-modification
# strings) when DataFrames are rendered.
pd.set_option('display.max_colwidth', None)

# Load the dataset; the first spreadsheet row holds the column names.
data = pd.read_excel('Malaria_Research_Data.xlsx', header=0)

# Sum of 'Protein percentage of total spectra' per biological sample category
# (a Series indexed by category).
total_spectra = data.groupby('Biological sample category')['Protein percentage of total spectra'].sum()

# Normalize the protein percentages within each group: every row is divided by
# its own category total and scaled to percent. Mapping the per-category totals
# back onto the rows keeps the arithmetic vectorized instead of calling a
# Python lambda once per group.
data['Normalized protein percentage'] = (
    data['Protein percentage of total spectra']
    / data['Biological sample category'].map(total_spectra)
    * 100
)

# Calculate the corrected normalized protein percentage.
# NOTE(review): the normalized column already sums to 100 within each category,
# so this second pass over the same grouping only changes values by
# floating-point rounding. The column is kept because later cells may read it;
# confirm whether a different grouping was intended.
normalized_totals = data.groupby('Biological sample category')['Normalized protein percentage'].transform('sum')
data['Corrected normalized protein percentage'] = (
    data['Normalized protein percentage'] / normalized_totals * 100
)

# Summarize columns, non-null counts and dtypes of the enriched frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19264 entries, 0 to 19263
Data columns (total 20 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Experiment name                          19264 non-null  object 
 1   Biological sample category               19264 non-null  object 
 2   Protein group                            19264 non-null  object 
 3   Protein accession number                 19264 non-null  object 
 4   Protein name                             19264 non-null  object 
 5   Protein identification probability       19264 non-null  float64
 6   Protein percentage of total spectra      19264 non-null  float64
 7   Number of unique peptides                19264 non-null  int64  
 8   Number of unique spectra                 19264 non-null  int64  
 9   Number of total spectra                  19264 non-null  int64  
 10  Peptide sequence                         19264

In [8]:
# Collect the protein-modification columns from the base dataset.
protein_mods_data = data[['Biological sample category', 'Protein accession number', 'Peptide sequence', 'Peptide identification probability', 'Modifications identified by spectrum']]
print("protein_mods_data dataset info:\n")
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() -- doing so emitted a stray "None" line after the summary.
protein_mods_data.info()

# Subset of the modification columns for the SPP sample category.
# (Call protein_mods_spp.info() to inspect it.)
protein_mods_spp = protein_mods_data[protein_mods_data['Biological sample category'] == 'SPP']

# Subset of the modification columns for the NAT sample category.
# (Call protein_mods_nat.info() to inspect it.)
protein_mods_nat = protein_mods_data[protein_mods_data['Biological sample category'] == 'NAT']

protein_mods_data dataset info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19264 entries, 0 to 19263
Data columns (total 5 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Biological sample category            19264 non-null  object 
 1   Protein accession number              19264 non-null  object 
 2   Peptide sequence                      19264 non-null  object 
 3   Peptide identification probability    19264 non-null  float64
 4   Modifications identified by spectrum  6672 non-null   object 
dtypes: float64(1), object(4)
memory usage: 752.6+ KB
None


In [3]:
# Tally each distinct `Modifications identified by spectrum` value in the SPP subset.
spp_modifications = protein_mods_spp.loc[:, 'Modifications identified by spectrum']

# Series.count() ignores missing values, so this is the number of SPP rows
# that carry a modification annotation.
print("The total number of modificaitons for SPP:", spp_modifications.count())

# A single print with sep="\n" emits the heading followed by the table, each
# terminated by a newline. sort=False leaves the counts unsorted by frequency.
print(
    "\nThe number of SPP modfications identified by spectrum:\n",
    spp_modifications.value_counts(sort=False),
    sep="\n",
)

The total number of modificaitons for SPP: 3341

The number of SPP modfications identified by spectrum:

Modifications identified by spectrum
Carbamidomethyl (+57)                                                                   1039
Deamidated (+1)                                                                          922
Acetyl (+42)                                                                              77
Deamidated (+1), Carbamidomethyl (+57)                                                    39
Oxidation (+16)                                                                          549
Oxidation (+16), Acetyl (+42)                                                             29
Carbamidomethyl (+57), Carbamidomethyl (+57)                                             172
Deamidated (+1), Deamidated (+1)                                                          94
Carbamidomethyl (+57), Deamidated (+1)                                                    60
Acetyl (+42), Deamida

In [4]:
# Tally each distinct `Modifications identified by spectrum` value in the NAT subset.
nat_modifications = protein_mods_nat.loc[:, 'Modifications identified by spectrum']

# Series.count() ignores missing values, so this is the number of NAT rows
# that carry a modification annotation.
print("The total number of modificaitons for NAT:", nat_modifications.count())

# A single print with sep="\n" emits the heading followed by the table, each
# terminated by a newline. sort=False leaves the counts unsorted by frequency.
print(
    "\nThe number of NAT modfications identified by spectrum:\n",
    nat_modifications.value_counts(sort=False),
    sep="\n",
)

The total number of modificaitons for NAT: 3331

The number of NAT modfications identified by spectrum:

Modifications identified by spectrum
Carbamidomethyl (+57), Carbamidomethyl (+57)                                                                   147
Carbamidomethyl (+57)                                                                                          971
Acetyl (+42)                                                                                                    90
Oxidation (+16), Acetyl (+42)                                                                                   19
Deamidated (+1)                                                                                                979
Deamidated (+1), Carbamidomethyl (+57)                                                                          44
Deamidated (+1), Deamidated (+1)                                                                               103
Oxidation (+16), Carbamidomethyl (+57), Deamidated (+

In [5]:
# Keep the SPP rows whose modification string never occurs in the NAT subset.
# Matching is on the exact, full string value, so the same modifications listed
# in a different order are treated as different entries.
spp_mods_not_in_nat = protein_mods_spp.loc[
    lambda spp_rows: ~spp_rows['Modifications identified by spectrum'].isin(
        protein_mods_nat['Modifications identified by spectrum']
    )
]

print("\nModifcations found in SPP but not NAT\n")
# Rich display of the matching rows (all columns of the SPP subset).
display(spp_mods_not_in_nat)


Modifcations found in SPP but not NAT



Unnamed: 0,Biological sample category,Protein accession number,Peptide sequence,Peptide identification probability,Modifications identified by spectrum
4956,SPP,Q8I3A3,KNDNIIQNNK,0.911,"Deamidated (+1), Deamidated (+1), Deamidated (+1), Deamidated (+1)"
12899,SPP,Q8I3T8,EMLGTCNSIGCTVDGK,0.997,"Oxidation (+16), Carbamidomethyl (+57), Deamidated (+1), Carbamidomethyl (+57)"
13271,SPP,Q8II57,QINCKNYLNEKK,0.929,"Deamidated (+1), Deamidated (+1), Carbamidomethyl (+57), Deamidated (+1)"
13536,SPP,Q8IKW5,YTEQVQDVPCGNTCCLVGVDQYIVK,0.954,"Carbamidomethyl (+57), Deamidated (+1), Carbamidomethyl (+57), Carbamidomethyl (+57)"
13546,SPP,Q8IDN6,GTLMELGISPIVTSGMVMQLLAGSK,0.997,"Oxidation (+16), Oxidation (+16), Oxidation (+16)"
16901,SPP,Q8I5J4,KVNKNDEDLNNNSK,0.975,"Deamidated (+1), Deamidated (+1), Deamidated (+1), Deamidated (+1)"
18003,SPP,Q7KQL9,AHCTEYMNAPK,0.997,"Acetyl (+42), Carbamidomethyl (+57), Oxidation (+16)"
18004,SPP,Q7KQL9,AHCTEYMNAPK,0.997,"Acetyl (+42), Carbamidomethyl (+57), Oxidation (+16)"
18264,SPP,Q8IJT9,YITEYVTCQMCK,0.979,"Carbamidomethyl (+57), Oxidation (+16), Carbamidomethyl (+57)"
18265,SPP,Q8IJT9,YITEYVTCQMCK,0.989,"Carbamidomethyl (+57), Oxidation (+16), Carbamidomethyl (+57)"


In [6]:
# Keep the NAT rows whose modification string never occurs in the SPP subset.
# Matching is on the exact, full string value, so the same modifications listed
# in a different order are treated as different entries.
nat_mods_not_in_spp = protein_mods_nat.loc[
    lambda nat_rows: ~nat_rows['Modifications identified by spectrum'].isin(
        protein_mods_spp['Modifications identified by spectrum']
    )
]

print("\nModifcations found in NAT but not SPP\n")
# Rich display of the matching rows (all columns of the NAT subset).
display(nat_mods_not_in_spp)


Modifcations found in NAT but not SPP



Unnamed: 0,Biological sample category,Protein accession number,Peptide sequence,Peptide identification probability,Modifications identified by spectrum
1240,NAT,Q8I5Y5,FCVCCYNGPTFNR,0.934,"Carbamidomethyl (+57), Carbamidomethyl (+57), Carbamidomethyl (+57), Deamidated (+1)"
1241,NAT,Q8I5Y5,FCVCCYNGPTFNR,0.938,"Carbamidomethyl (+57), Carbamidomethyl (+57), Carbamidomethyl (+57), Deamidated (+1)"
1242,NAT,Q8I5Y5,FCVCCYNGPTFNR,0.997,"Carbamidomethyl (+57), Carbamidomethyl (+57), Carbamidomethyl (+57), Deamidated (+1)"
1243,NAT,Q8I5Y5,FCVCCYNGPTFNR,0.997,"Carbamidomethyl (+57), Carbamidomethyl (+57), Carbamidomethyl (+57), Deamidated (+1)"
1282,NAT,Q8IIC8,NKMIDYTNMLQRSK,0.945,"Oxidation (+16), Deamidated (+1), Oxidation (+16), Deamidated (+1)"
2624,NAT,Q8IBQ6,EQNVMREIKVNK,0.924,"Deamidated (+1), Oxidation (+16), Deamidated (+1)"
3010,NAT,Q8IL96,NNNDTCNEQNKDNNNNNNNNNNNNNNQLSK,0.937,"Carbamidomethyl (+57), Deamidated (+1), Deamidated (+1), Deamidated (+1)"
3275,NAT,Q8ID94|YPF12_PLAF7-DECOY,IDDPINMSSMVGPVLNNDMNTINNNVTSNKK,0.923,"Deamidated (+1), Deamidated (+1), Deamidated (+1), Deamidated (+1), Deamidated (+1)"
5719,NAT,Q8I3X4,FLCVSHGVGSAGCAVCFEELCQNGAK,0.997,"Carbamidomethyl (+57), Carbamidomethyl (+57), Carbamidomethyl (+57), Carbamidomethyl (+57), Deamidated (+1)"
5720,NAT,Q8I3X4,FLCVSHGVGSAGCAVCFEELCQNGAK,0.997,"Carbamidomethyl (+57), Carbamidomethyl (+57), Carbamidomethyl (+57), Carbamidomethyl (+57), Deamidated (+1)"
