In [2]:
import pandas as pd
import numpy as np
import os
import sys
import glob 

In [19]:
dpath = "/nfs/turbo/umms-indikar/shared/projects/human_cell_cycle/annotations/"

go_to_title_map = {
    "GO:0022403": "Cell Cycle Phase",
    "GO:0051322": "Anaphase",
    "GO:0044838": "Cell Quiescence",
    "GO:0051318": "G1 Phase",
    "GO:0051319": "G2 Phase",
    "GO:0051325": "Interphase",
    "GO:0000279": "M Phase",
    "GO:0098762": "Meiotic Cell Cycle Phase",
    "GO:0098763": "Mitotic Cell Cycle Phase",
    "GO:0051324": "Prophase",
    "GO:0051320": "S Phase",
    "GO:0051326": "Telophase",
    "GO:0000082": "G1/S Transition Checkpoint Signaling",
    "GO:0000086": "G2/M Transition Checkpoint Signaling",
    "GO:0007093": "Mitotic Spindle Checkpoint Signaling"
}

file_list = glob.glob(f"{dpath}*")

result = []
for file_path in file_list:
    
    basename = os.path.basename(file_path)
    go_id = basename.replace(".csv", "").split("_")[2]
    go_id = f"GO:{go_id}"
    go_label = go_to_title_map[go_id]    
    df = pd.read_csv(file_path)
    
    df['GO Class'] = go_label
    df['GO Class Id'] = go_id
    
    result.append(df)

    
result = pd.concat(result)
result.head()

Unnamed: 0,Source,Bioentity Internal ID,Bioentity Label,Qualifier,Annotation Class,Reference,Evidence Type,Evidence With,Aspect,Bioentity Name,Synonym,Type,Taxon,Date,Assigned By,Annotation Extension Class,Bioentity Isoform,GO Class,GO Class Id
0,PomBase,SPCC162.08c,nup211,,GO:0005643,PMID:20970342,IDA,,C,nucleoporin nup211,,protein,NCBITaxon:4896,20110804,PomBase,GO:0098763,,Mitotic Cell Cycle Phase,GO:0098763
1,PomBase,SPAC11E3.03,csm1,,GO:0000775,PMID:12689592,IDA,,C,microtubule-site clamp monopolin complex subun...,pcs1,protein,NCBITaxon:4896,20150212,PomBase,GO:0098763,,Mitotic Cell Cycle Phase,GO:0098763
2,PomBase,SPAC11E3.03,csm1,,GO:0005730,PMID:17627824,IDA,,C,microtubule-site clamp monopolin complex subun...,pcs1,protein,NCBITaxon:4896,20150212,PomBase,GO:0098762|GO:0098763,,Mitotic Cell Cycle Phase,GO:0098763
3,PomBase,SPAC11E3.03,csm1,,GO:0005634,PMID:17627824,IDA,,C,microtubule-site clamp monopolin complex subun...,pcs1,protein,NCBITaxon:4896,20150212,PomBase,GO:0098762|GO:0098763,,Mitotic Cell Cycle Phase,GO:0098763
4,PomBase,SPCC970.12,mis18,,GO:0000776,PMID:15369671,IDA,,C,kinetochore protein Mis18,,protein,NCBITaxon:4896,20140829,PomBase,GO:0098763,,Mitotic Cell Cycle Phase,GO:0098763


# Filters

In [25]:
df = result.copy()


# get only human
df = df[df['Taxon'] == 'NCBITaxon:9606']
print(f"{df.shape=}")

# get only specific pathways 
pathways = [
    "G1 Phase",
    "G2 Phase",
    "M Phase",
    "S Phase",
    "G1/S Transition Checkpoint Signaling",
    "G2/M Transition Checkpoint Signaling",
]

df = df[df['GO Class'].isin(pathways)]
df['val'] = 1
print(f"{df.shape=}")

df.head()

df.shape=(742, 19)
df.shape=(323, 20)


Unnamed: 0,Source,Bioentity Internal ID,Bioentity Label,Qualifier,Annotation Class,Reference,Evidence Type,Evidence With,Aspect,Bioentity Name,Synonym,Type,Taxon,Date,Assigned By,Annotation Extension Class,Bioentity Isoform,GO Class,GO Class Id,val
0,UniProtKB,Q9Y3I1,FBXO7,,GO:0005634,PMID:21378169,IDA,,C,F-box only protein 7,FBX7,protein,NCBITaxon:9606,20160811,ParkinsonsUK-UCL,GO:0044843|GO:0051319,,G2 Phase,GO:0051319,1
175,UniProtKB,Q9BRX2,PELO,,GO:0170011,PMID:27863242,IDA,,F,Protein pelota homolog,CGI-17,protein,NCBITaxon:9606,20230223,UniProt,GO:0070966|GO:0072344|GO:0022626|GO:0017111,,G2 Phase,GO:0051319,1
177,UniProtKB,P35462,DRD3,,GO:0001591,GO_REF:0000024,ISS,UniProtKB:P19020,F,D(3) dopamine receptor,,protein,NCBITaxon:9606,20080903,BHF-UCL,,,G2 Phase,GO:0051319,1
178,UniProtKB,P35462,DRD3,,GO:0001591,PMID:8301582,IDA,,F,D(3) dopamine receptor,,protein,NCBITaxon:9606,20080902,BHF-UCL,,,G2 Phase,GO:0051319,1
179,UniProtKB,P35462,DRD3,,GO:0001591,GO_REF:0000033,IBA,PANTHER:PTN000664816|UniProtKB:P35462|FB:FBgn0...,F,D(3) dopamine receptor,,protein,NCBITaxon:9606,20200330,GO_Central,,,G2 Phase,GO:0051319,1


# Pivot and Save

In [33]:
pdf = pd.pivot_table(
    df, 
    index='Bioentity Label',
    columns='GO Class',
    values='val',
    fill_value=False,
).astype(bool).reset_index()

pdf = pdf.rename(columns={'Bioentity Label' : 'gene_name'})

print(f"{pdf.shape=}")

outpath = "/nfs/turbo/umms-indikar/shared/projects/human_cell_cycle/human_cell_cycle_genes.csv"
pdf.to_csv(outpath, index=False)

pdf.head()

pdf.shape=(142, 7)


GO Class,gene_name,G1 Phase,G1/S Transition Checkpoint Signaling,G2 Phase,G2/M Transition Checkpoint Signaling,M Phase,S Phase
0,ABCB1,False,False,False,True,False,False
1,ACVR1B,False,True,False,False,False,False
2,AMBRA1,False,True,False,False,False,False
3,APP,False,False,False,True,False,False
4,ARPP19,False,False,False,True,False,False


In [31]:
pdf.columns

Index(['Bioentity Label', 'G1 Phase', 'G1/S Transition Checkpoint Signaling',
       'G2 Phase', 'G2/M Transition Checkpoint Signaling', 'M Phase',
       'S Phase'],
      dtype='object', name='GO Class')

In [45]:
pdf[pdf['G2 Phase']]

GO Class,gene_name,G1 Phase,G1/S Transition Checkpoint Signaling,G2 Phase,G2/M Transition Checkpoint Signaling,M Phase,S Phase
41,CNOT6,True,True,True,True,True,True
42,CSDE1,True,True,True,True,True,True
51,DRD2,True,True,True,True,True,True
52,DRD3,True,True,True,True,True,True
53,DRD4,True,True,True,True,True,True
61,FBXO7,False,False,True,False,False,False
65,HBS1L,True,True,True,True,True,True
83,PELO,True,True,True,True,True,True
