In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

data = pd.read_excel('Malaria_Research_Data.xlsx', header=0)
total_spectra = data.groupby('Biological sample category')['Protein percentage of total spectra'].sum()

# Normalize the protein percentages within each group
data['Normalized protein percentage'] = data.groupby('Biological sample category')['Protein percentage of total spectra'].transform(lambda x: (x / x.sum()) * 100)

# Calculate the corrected normalized protein percentage
data['Corrected normalized protein percentage'] = data.groupby('Biological sample category')['Normalized protein percentage'].transform(lambda x: (x / x.sum()) * 100)

print(data.columns)

Index(['Experiment name', 'Biological sample category', 'Protein group',
       'Protein accession number', 'Protein name',
       'Protein identification probability',
       'Protein percentage of total spectra', 'Number of unique peptides',
       'Number of unique spectra', 'Number of total spectra',
       'Peptide sequence', 'Previous amino acid', 'Next amino acid',
       'Peptide identification probability',
       'Modifications identified by spectrum', 'Peptide start index',
       'Peptide stop index', 'Unnamed: 17', 'Normalized protein percentage',
       'Corrected normalized protein percentage'],
      dtype='object')


In [8]:
# Filter proteins for SPP and NAT
spp_proteins = data[data['Biological sample category'] == 'SPP']
nat_proteins = data[data['Biological sample category'] == 'NAT']

# Calculate the mean normalized protein percentage for SPP and NAT
spp_mean_percentage = spp_proteins['Normalized protein percentage'].mean()
nat_mean_percentage = nat_proteins['Normalized protein percentage'].mean()

# Identify proteins that show up more in SPP than in NAT
enriched_proteins = spp_proteins[spp_proteins['Normalized protein percentage'] > nat_mean_percentage]

# Group the filtered data by protein name and calculate counts
protein_counts = enriched_proteins.groupby('Protein name').size().reset_index(name='Count')

# Sort the proteins based on counts in descending order
sorted_proteins = protein_counts.sort_values('Count', ascending=False)

# Create a list to store the unique GN names
unique_gn_list = []

# Print the count and names of the enriched proteins
print("Number of proteins showing up more in SPP than in NAT: {}".format(len(sorted_proteins)))
print("Protein names and their counts:")
for index, row in sorted_proteins.iterrows():
    protein_name = row['Protein name']
    count = row['Count']
    gn_index = protein_name.find('GN=')
    if gn_index != -1:
        gn_name = f"GO:{protein_name[gn_index+3:].split(' ')[0]}"
        unique_gn_list.append(gn_name)
        print("{}, Count: {}".format(gn_name, count))
    else:
        print("GN Name: Not found, Count: {}".format(count))


Number of proteins showing up more in SPP than in NAT: 52
Protein names and their counts:
GO:PF3D7_1408600, Count: 200
GO:PF3D7_0719600, Count: 167
GO:PF3D7_0818900, Count: 137
GO:PF3D7_1357000, Count: 120
GO:PF3D7_0929400, Count: 117
GO:PF3D7_1462800, Count: 105
GO:PF3D7_1027300, Count: 94
GO:PF3D7_1338200, Count: 84
GO:PF3D7_1108700, Count: 81
GO:PF3D7_0516200, Count: 78
GO:PF3D7_0917900, Count: 73
GO:PF3D7_0507100, Count: 69
GO:PF3D7_0312800, Count: 68
GO:PF3D7_1342000, Count: 66
GO:PF3D7_0618300, Count: 65
GO:PF3D7_1126200, Count: 64
GO:PF3D7_1105400, Count: 55
GO:MAL3P7.35, Count: 54
GO:PF3D7_0519400, Count: 54
GO:PF3D7_1451900, Count: 54
GO:PF3D7_0708400, Count: 53
GO:PF3D7_1242700, Count: 53
GO:PF3D7_1142500, Count: 52
GO:PF3D7_0629200, Count: 51
GO:PF3D7_0813900, Count: 51
GO:PF3D7_1347500, Count: 49
GO:PF3D7_1445900, Count: 44
GO:PF3D7_1437900, Count: 43
GO:PF3D7_1104400, Count: 43
GO:FBPA, Count: 42
GO:PF3D7_1323400, Count: 42
GO:PF3D7_1302800, Count: 42
GO:PF3D7_0306900, Cou

In [9]:
from goatools import obo_parser

# Load the Gene Ontology (GO) database
obo_file = "go-basic.obo"  # Path to the Gene Ontology file
go = obo_parser.GODag(obo_file)

# Define a function to get the GO terms for a given gene ID
def get_go_terms(gene_id):
    try:
        gene_obj = go[gene_id]  # Fetch gene object from the GO database
        go_terms = [go_term.id for go_term in gene_obj.get_all_parents()]  # Get all parent GO terms
        return go_terms
    except KeyError:
        print(f"Gene ID {gene_id} not found in the GO database.")
        return []

# Retrieve the GO terms for each gene ID
for gene_id in unique_gn_list:
    go_terms = get_go_terms(gene_id)
    if go_terms:
        print(f"GO terms for {gene_id}: {go_terms}")


go-basic.obo: fmt(1.2) rel(2023-05-10) 46,490 Terms
Gene ID GO:PF3D7_1408600 not found in the GO database.
Gene ID GO:PF3D7_0719600 not found in the GO database.
Gene ID GO:PF3D7_0818900 not found in the GO database.
Gene ID GO:PF3D7_1357000 not found in the GO database.
Gene ID GO:PF3D7_0929400 not found in the GO database.
Gene ID GO:PF3D7_1462800 not found in the GO database.
Gene ID GO:PF3D7_1027300 not found in the GO database.
Gene ID GO:PF3D7_1338200 not found in the GO database.
Gene ID GO:PF3D7_1108700 not found in the GO database.
Gene ID GO:PF3D7_0516200 not found in the GO database.
Gene ID GO:PF3D7_0917900 not found in the GO database.
Gene ID GO:PF3D7_0507100 not found in the GO database.
Gene ID GO:PF3D7_0312800 not found in the GO database.
Gene ID GO:PF3D7_1342000 not found in the GO database.
Gene ID GO:PF3D7_0618300 not found in the GO database.
Gene ID GO:PF3D7_1126200 not found in the GO database.
Gene ID GO:PF3D7_1105400 not found in the GO database.
Gene ID GO:MA