In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import swifter

from pyensembl import EnsemblRelease
from pyensembl.genome import Genome
from pyensembl.shell import collect_all_installed_ensembl_releases
collect_all_installed_ensembl_releases()

  from .autonotebook import tqdm as notebook_tqdm


[EnsemblRelease(release=75, species='homo_sapiens'),
 EnsemblRelease(release=111, species='homo_sapiens')]

# Load UKB BIM

In [34]:
data_dir = "/cluster/project/beltrao/gankin/vnn/data"

# Read the .bim file to get SNP information
bim_file = data_dir + "/ukb.bim"
bim_df = pd.read_csv(bim_file, sep='\t', header=None)
bim_df.columns = ['chrom', 'snp', 'cm', 'pos', 'a1', 'a2']
bim_df

  bim_df = pd.read_csv(bim_file, sep='\t', header=None)


Unnamed: 0,chrom,snp,cm,pos,a1,a2
0,1,rs28659788,0,723307,C,G
1,1,rs116587930,0,727841,G,A
2,1,rs116720794,0,729632,C,T
3,1,rs3131972,0,752721,A,G
4,1,rs12184325,0,754105,C,T
...,...,...,...,...,...,...
805421,MT,Affx-92047842,0,16337,C,T
805422,MT,Affx-79443531,0,16356,T,C
805423,MT,Affx-79443532,0,16362,T,C
805424,MT,Affx-89025709,0,16390,G,A


# Load Open Targets data

In [35]:
variant_index_parquet = data_dir + "/complete_variant_index.parquet"
variant_index = pd.read_parquet(variant_index_parquet) # takes a few minutes
variant_index

Unnamed: 0,chr_id,position,ref_allele,alt_allele,chr_id_b37,position_b37,rs_id,most_severe_consequence,cadd,af,gene_id_any_distance,gene_id_any,gene_id_prot_coding_distance,gene_id_prot_coding
0,1,30794,G,T,1,30794,rs1311411458,intron_variant,"{'phred': 0.177, 'raw': -0.402967}","{'gnomad_afr': 0.0, 'gnomad_amr': 0.0, 'gnomad...",34625.0,ENSG00000186092,34625.0,ENSG00000186092
1,1,55330,G,A,1,55330,rs185215913,downstream_gene_variant,"{'phred': 0.431, 'raw': -0.275244}","{'gnomad_afr': 0.016711590296495958, 'gnomad_a...",10089.0,ENSG00000186092,10089.0,ENSG00000186092
2,1,57248,C,T,1,57248,rs1249676487,downstream_gene_variant,"{'phred': 7.268, 'raw': 0.304146}","{'gnomad_afr': 0.0, 'gnomad_amr': 0.0, 'gnomad...",8171.0,ENSG00000186092,8171.0,ENSG00000186092
3,1,66381,T,A,1,66381,rs538833530,upstream_gene_variant,"{'phred': 1.389, 'raw': -0.101295}","{'gnomad_afr': 0.15321011673151752, 'gnomad_am...",962.0,ENSG00000186092,962.0,ENSG00000186092
4,1,67631,G,C,1,67631,rs533896527,upstream_gene_variant,"{'phred': 0.033, 'raw': -0.632235}","{'gnomad_afr': 0.041812315138478086, 'gnomad_a...",2212.0,ENSG00000186092,2212.0,ENSG00000186092
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72878704,X,156006466,G,C,X,155236131,rs1218662062,intron_variant,"{'phred': 1.118, 'raw': -0.133403}","{'gnomad_afr': 0.0, 'gnomad_amr': 0.0, 'gnomad...",8770.0,ENSG00000124334,8770.0,ENSG00000124334
72878705,X,156006493,C,T,X,155236158,rs1417497789,intron_variant,"{'phred': 1.776, 'raw': -0.063666}","{'gnomad_afr': 0.0, 'gnomad_amr': 0.0011848341...",8797.0,ENSG00000124334,8797.0,ENSG00000124334
72878706,X,156008488,C,T,X,155238153,rs186430584,intron_variant,"{'phred': 3.876, 'raw': 0.076619}","{'gnomad_afr': 0.030184331797235023, 'gnomad_a...",10792.0,ENSG00000124334,10792.0,ENSG00000124334
72878707,X,156009035,CTG,C,X,155238700,rs1381721936,intron_variant,"{'phred': None, 'raw': None}","{'gnomad_afr': 0.0008291873963515755, 'gnomad_...",11339.0,ENSG00000124334,11339.0,ENSG00000124334


# Merge with UKB SNP data

In [36]:
# merge bim with variant index
ot_bim = variant_index.merge(bim_df, left_on='rs_id', right_on='snp', how='inner')
ot_bim

Unnamed: 0,chr_id,position,ref_allele,alt_allele,chr_id_b37,position_b37,rs_id,most_severe_consequence,cadd,af,gene_id_any_distance,gene_id_any,gene_id_prot_coding_distance,gene_id_prot_coding,chrom,snp,cm,pos,a1,a2
0,1,1169204,T,C,1,1104584,rs112695918,upstream_gene_variant,"{'phred': 0.61, 'raw': -0.223786}","{'gnomad_afr': 0.1370131366674349, 'gnomad_amr...",4676.0,ENSG00000162571,4676.0,ENSG00000162571,1,rs112695918,0,1104584,T,C
1,1,2038313,C,T,1,1969752,rs74372554,intergenic_variant,"{'phred': 0.309, 'raw': -0.324059}","{'gnomad_afr': 0.01078971533516988, 'gnomad_am...",12098.0,ENSG00000067606,12098.0,ENSG00000067606,1,rs74372554,0,1969752,C,T
2,1,2194104,A,G,1,2125543,rs146607591,missense_variant,"{'phred': 4.241, 'raw': 0.098415}","{'gnomad_afr': 0.00034498620055197794, 'gnomad...",18616.0,ENSG00000162585,18616.0,ENSG00000162585,1,rs146607591,0,2125543,A,G
3,1,2592668,C,T,1,2524107,rs146100312,missense_variant,"{'phred': 23.9, 'raw': 3.118097}","{'gnomad_afr': 0.0, 'gnomad_amr': 0.0, 'gnomad...",6177.0,ENSG00000157870,6177.0,ENSG00000157870,1,rs146100312,0,2524107,C,T
4,1,2828773,C,A,1,2745338,rs116076619,intergenic_variant,"{'phred': 0.058, 'raw': -0.555543}","{'gnomad_afr': 0.0060821666284140465, 'gnomad_...",27080.0,ENSG00000215912,27080.0,ENSG00000215912,1,rs116076619,0,2745338,C,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
747754,X,152014563,G,A,X,151183035,rs145754054,intergenic_variant,"{'phred': 2.456, 'raw': -0.011208}","{'gnomad_afr': 0.009298393913778529, 'gnomad_a...",39883.0,ENSG00000102287,39883.0,ENSG00000102287,X,rs145754054,0,151183035,G,A
747755,X,153459591,C,T,X,152725049,rs5945389,intron_variant,"{'phred': 2.968, 'raw': 0.022059}","{'gnomad_afr': 0.0032312925170068026, 'gnomad_...",10996.0,ENSG00000183479,10996.0,ENSG00000183479,X,rs5945389,0,152725049,C,T
747756,X,153781540,A,C,X,153046995,rs199915647,missense_variant,"{'phred': 22.5, 'raw': 2.480113}","{'gnomad_afr': 0.00017018379850238256, 'gnomad...",5128.0,ENSG00000184343,5128.0,ENSG00000184343,X,rs199915647,0,153046995,A,C
747757,X,154534419,G,A,X,153762634,rs5030868,missense_variant,"{'phred': 24.7, 'raw': 3.374046}","{'gnomad_afr': 0.0001721763085399449, 'gnomad_...",6780.0,ENSG00000269335,6780.0,ENSG00000269335,X,rs5030868,0,153762634,G,A


# Gene ids to gene names

In [37]:
human_data = EnsemblRelease(species='human', release=111)
def get_gene_name(ensembl_id):
    try:
        return human_data.gene_name_of_gene_id(ensembl_id)
    except:
        return None

In [38]:
name_map = pd.DataFrame()
name_map["gene_id"] = ot_bim.gene_id_prot_coding.unique()
name_map["gene_name"]  = name_map["gene_id"].swifter.apply(lambda x: get_gene_name(x))

Pandas Apply: 100%|██████████| 18817/18817 [00:23<00:00, 811.23it/s]


In [39]:
print("Gene Id not mappable: ", name_map.gene_name.isnull().sum())
print("Unique gene names: ", name_map.gene_name.nunique())

Gene Id not mappable:  3
Unique gene names:  18810


In [40]:
# count None in gene names v75
print("Gene Id not mappable: ", name_map.gene_name.isnull().sum())
print("Unique gene names: ", name_map.gene_name.nunique())

Gene Id not mappable:  3
Unique gene names:  18810


In [41]:
ot_bim = pd.merge(ot_bim, name_map, left_on='gene_id_prot_coding', right_on='gene_id', how='left')

In [42]:
# remove all None gene names
ot_bim = ot_bim[ot_bim.gene_name.notnull()]
ot_bim

Unnamed: 0,chr_id,position,ref_allele,alt_allele,chr_id_b37,position_b37,rs_id,most_severe_consequence,cadd,af,...,gene_id_prot_coding_distance,gene_id_prot_coding,chrom,snp,cm,pos,a1,a2,gene_id,gene_name
0,1,1169204,T,C,1,1104584,rs112695918,upstream_gene_variant,"{'phred': 0.61, 'raw': -0.223786}","{'gnomad_afr': 0.1370131366674349, 'gnomad_amr...",...,4676.0,ENSG00000162571,1,rs112695918,0,1104584,T,C,ENSG00000162571,TTLL10
1,1,2038313,C,T,1,1969752,rs74372554,intergenic_variant,"{'phred': 0.309, 'raw': -0.324059}","{'gnomad_afr': 0.01078971533516988, 'gnomad_am...",...,12098.0,ENSG00000067606,1,rs74372554,0,1969752,C,T,ENSG00000067606,PRKCZ
2,1,2194104,A,G,1,2125543,rs146607591,missense_variant,"{'phred': 4.241, 'raw': 0.098415}","{'gnomad_afr': 0.00034498620055197794, 'gnomad...",...,18616.0,ENSG00000162585,1,rs146607591,0,2125543,A,G,ENSG00000162585,FAAP20
3,1,2592668,C,T,1,2524107,rs146100312,missense_variant,"{'phred': 23.9, 'raw': 3.118097}","{'gnomad_afr': 0.0, 'gnomad_amr': 0.0, 'gnomad...",...,6177.0,ENSG00000157870,1,rs146100312,0,2524107,C,T,ENSG00000157870,PRXL2B
4,1,2828773,C,A,1,2745338,rs116076619,intergenic_variant,"{'phred': 0.058, 'raw': -0.555543}","{'gnomad_afr': 0.0060821666284140465, 'gnomad_...",...,27080.0,ENSG00000215912,1,rs116076619,0,2745338,C,A,ENSG00000215912,TTC34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
747754,X,152014563,G,A,X,151183035,rs145754054,intergenic_variant,"{'phred': 2.456, 'raw': -0.011208}","{'gnomad_afr': 0.009298393913778529, 'gnomad_a...",...,39883.0,ENSG00000102287,X,rs145754054,0,151183035,G,A,ENSG00000102287,GABRE
747755,X,153459591,C,T,X,152725049,rs5945389,intron_variant,"{'phred': 2.968, 'raw': 0.022059}","{'gnomad_afr': 0.0032312925170068026, 'gnomad_...",...,10996.0,ENSG00000183479,X,rs5945389,0,152725049,C,T,ENSG00000183479,TREX2
747756,X,153781540,A,C,X,153046995,rs199915647,missense_variant,"{'phred': 22.5, 'raw': 2.480113}","{'gnomad_afr': 0.00017018379850238256, 'gnomad...",...,5128.0,ENSG00000184343,X,rs199915647,0,153046995,A,C,ENSG00000184343,SRPK3
747757,X,154534419,G,A,X,153762634,rs5030868,missense_variant,"{'phred': 24.7, 'raw': 3.374046}","{'gnomad_afr': 0.0001721763085399449, 'gnomad_...",...,6780.0,ENSG00000269335,X,rs5030868,0,153762634,G,A,ENSG00000269335,IKBKG


In [43]:
ot_bim.rs_id.nunique()

737328

In [44]:
ot_bim = ot_bim.drop_duplicates(subset=["rs_id", "gene_name"])
# rename rs_id col to snp
ot_bim = ot_bim.rename({"rs_id":"snp"})
ot_bim

Unnamed: 0,chr_id,position,ref_allele,alt_allele,chr_id_b37,position_b37,rs_id,most_severe_consequence,cadd,af,...,gene_id_prot_coding_distance,gene_id_prot_coding,chrom,snp,cm,pos,a1,a2,gene_id,gene_name
0,1,1169204,T,C,1,1104584,rs112695918,upstream_gene_variant,"{'phred': 0.61, 'raw': -0.223786}","{'gnomad_afr': 0.1370131366674349, 'gnomad_amr...",...,4676.0,ENSG00000162571,1,rs112695918,0,1104584,T,C,ENSG00000162571,TTLL10
1,1,2038313,C,T,1,1969752,rs74372554,intergenic_variant,"{'phred': 0.309, 'raw': -0.324059}","{'gnomad_afr': 0.01078971533516988, 'gnomad_am...",...,12098.0,ENSG00000067606,1,rs74372554,0,1969752,C,T,ENSG00000067606,PRKCZ
2,1,2194104,A,G,1,2125543,rs146607591,missense_variant,"{'phred': 4.241, 'raw': 0.098415}","{'gnomad_afr': 0.00034498620055197794, 'gnomad...",...,18616.0,ENSG00000162585,1,rs146607591,0,2125543,A,G,ENSG00000162585,FAAP20
3,1,2592668,C,T,1,2524107,rs146100312,missense_variant,"{'phred': 23.9, 'raw': 3.118097}","{'gnomad_afr': 0.0, 'gnomad_amr': 0.0, 'gnomad...",...,6177.0,ENSG00000157870,1,rs146100312,0,2524107,C,T,ENSG00000157870,PRXL2B
4,1,2828773,C,A,1,2745338,rs116076619,intergenic_variant,"{'phred': 0.058, 'raw': -0.555543}","{'gnomad_afr': 0.0060821666284140465, 'gnomad_...",...,27080.0,ENSG00000215912,1,rs116076619,0,2745338,C,A,ENSG00000215912,TTC34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
747753,X,151609900,A,G,X,150778372,rs58448897,intron_variant,"{'phred': 3.523, 'raw': 0.055668}","{'gnomad_afr': 0.11482740118046861, 'gnomad_am...",...,46225.0,ENSG00000166049,X,rs58448897,0,150778372,A,G,ENSG00000166049,PASD1
747754,X,152014563,G,A,X,151183035,rs145754054,intergenic_variant,"{'phred': 2.456, 'raw': -0.011208}","{'gnomad_afr': 0.009298393913778529, 'gnomad_a...",...,39883.0,ENSG00000102287,X,rs145754054,0,151183035,G,A,ENSG00000102287,GABRE
747755,X,153459591,C,T,X,152725049,rs5945389,intron_variant,"{'phred': 2.968, 'raw': 0.022059}","{'gnomad_afr': 0.0032312925170068026, 'gnomad_...",...,10996.0,ENSG00000183479,X,rs5945389,0,152725049,C,T,ENSG00000183479,TREX2
747756,X,153781540,A,C,X,153046995,rs199915647,missense_variant,"{'phred': 22.5, 'raw': 2.480113}","{'gnomad_afr': 0.00017018379850238256, 'gnomad...",...,5128.0,ENSG00000184343,X,rs199915647,0,153046995,A,C,ENSG00000184343,SRPK3


In [45]:
# get snp_bed_id from bim_df 
bim_df = bim_df.reset_index()
#bim_df.columns = ['snp_bed_id', 'chrom', 'snp', 'cm', 'pos', 'a1', 'a2']
bim_df.columns = ['snp_bed_id', 'chrom', 'snp', 'cm', 'pos', 'a1', 'a2']
bim_df

Unnamed: 0,snp_bed_id,chrom,snp,cm,pos,a1,a2
0,0,1,rs28659788,0,723307,C,G
1,1,1,rs116587930,0,727841,G,A
2,2,1,rs116720794,0,729632,C,T
3,3,1,rs3131972,0,752721,A,G
4,4,1,rs12184325,0,754105,C,T
...,...,...,...,...,...,...,...
805421,805421,MT,Affx-92047842,0,16337,C,T
805422,805422,MT,Affx-79443531,0,16356,T,C
805423,805423,MT,Affx-79443532,0,16362,T,C
805424,805424,MT,Affx-89025709,0,16390,G,A


In [46]:
ot_bim = ot_bim.merge(bim_df[["snp", "snp_bed_id"]], left_on="snp", right_on="snp")

In [47]:
ot_bim

Unnamed: 0,chr_id,position,ref_allele,alt_allele,chr_id_b37,position_b37,rs_id,most_severe_consequence,cadd,af,...,gene_id_prot_coding,chrom,snp,cm,pos,a1,a2,gene_id,gene_name,snp_bed_id
0,1,1169204,T,C,1,1104584,rs112695918,upstream_gene_variant,"{'phred': 0.61, 'raw': -0.223786}","{'gnomad_afr': 0.1370131366674349, 'gnomad_amr...",...,ENSG00000162571,1,rs112695918,0,1104584,T,C,ENSG00000162571,TTLL10,220
1,1,2038313,C,T,1,1969752,rs74372554,intergenic_variant,"{'phred': 0.309, 'raw': -0.324059}","{'gnomad_afr': 0.01078971533516988, 'gnomad_am...",...,ENSG00000067606,1,rs74372554,0,1969752,C,T,ENSG00000067606,PRKCZ,599
2,1,2194104,A,G,1,2125543,rs146607591,missense_variant,"{'phred': 4.241, 'raw': 0.098415}","{'gnomad_afr': 0.00034498620055197794, 'gnomad...",...,ENSG00000162585,1,rs146607591,0,2125543,A,G,ENSG00000162585,FAAP20,648
3,1,2592668,C,T,1,2524107,rs146100312,missense_variant,"{'phred': 23.9, 'raw': 3.118097}","{'gnomad_afr': 0.0, 'gnomad_amr': 0.0, 'gnomad...",...,ENSG00000157870,1,rs146100312,0,2524107,C,T,ENSG00000157870,PRXL2B,838
4,1,2828773,C,A,1,2745338,rs116076619,intergenic_variant,"{'phred': 0.058, 'raw': -0.555543}","{'gnomad_afr': 0.0060821666284140465, 'gnomad_...",...,ENSG00000215912,1,rs116076619,0,2745338,C,A,ENSG00000215912,TTC34,889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737323,X,151609900,A,G,X,150778372,rs58448897,intron_variant,"{'phred': 3.523, 'raw': 0.055668}","{'gnomad_afr': 0.11482740118046861, 'gnomad_am...",...,ENSG00000166049,X,rs58448897,0,150778372,A,G,ENSG00000166049,PASD1,802181
737324,X,152014563,G,A,X,151183035,rs145754054,intergenic_variant,"{'phred': 2.456, 'raw': -0.011208}","{'gnomad_afr': 0.009298393913778529, 'gnomad_a...",...,ENSG00000102287,X,rs145754054,0,151183035,G,A,ENSG00000102287,GABRE,802336
737325,X,153459591,C,T,X,152725049,rs5945389,intron_variant,"{'phred': 2.968, 'raw': 0.022059}","{'gnomad_afr': 0.0032312925170068026, 'gnomad_...",...,ENSG00000183479,X,rs5945389,0,152725049,C,T,ENSG00000183479,TREX2,802547
737326,X,153781540,A,C,X,153046995,rs199915647,missense_variant,"{'phred': 22.5, 'raw': 2.480113}","{'gnomad_afr': 0.00017018379850238256, 'gnomad...",...,ENSG00000184343,X,rs199915647,0,153046995,A,C,ENSG00000184343,SRPK3,802648


# Load GO term to gene mapping

In [48]:
import networkx as nx
import pickle 

In [None]:
graph_dir = "../../ontology_operations/data/networks/"
# CC_is_a+part_of+regulates.graphml
# BP_is_a+part_of+regulates.graphml
# BP_is_a.graphml
# BP_is_a+part_of+regulates.graphml
# BP+CC_is_a+part_of+regulates.graphml
# BP+CC_is_a.graphml

In [108]:
#go_path = "../../data/GO_biological_process.pickle"
#go_path = "../../ontology_operations/data/GO_cellular_component.pickle"
#G = pickle.load(open(go_path, 'rb'))
# networks load from graphml
G = nx.read_graphml("../../ontology_operations/data/networks/BP_is_a.graphml") 
#G = nx.read_graphml("../../ontology_operations/data/networks/CC_is_a+part_of+regulates.graphml") 

In [118]:
# Load go terms and genes
#goterms_df = pd.read_csv("../../ontology_operations/data/GO_terms_biological_processes_merged_EBI_uniprot.csv", index_col=0)
#goterms_df = pd.read_csv("../../ontology_operations/data/GO_terms_cellular_components_merged_EBI_uniprot.csv", index_col=0)
goterms_df = pd.read_csv("../../ontology_operations/data/GO_terms_bp_cc_merged_EBI_uniprot.csv", index_col=0)

In [119]:
goterms_df["gene_name"] = goterms_df["merged"].apply(lambda x: x.split(";"))
goterms_df = goterms_df.explode("gene_name")
goterms_df

Unnamed: 0,merged,len_merged,name,gene_name
GO:0000002,C20orf72;AAC1;ECGF1;SLC25A33;SLC25A4;PIF1;SSBP...,22,mitochondrial genome maintenance,C20orf72
GO:0000002,C20orf72;AAC1;ECGF1;SLC25A33;SLC25A4;PIF1;SSBP...,22,mitochondrial genome maintenance,AAC1
GO:0000002,C20orf72;AAC1;ECGF1;SLC25A33;SLC25A4;PIF1;SSBP...,22,mitochondrial genome maintenance,ECGF1
GO:0000002,C20orf72;AAC1;ECGF1;SLC25A33;SLC25A4;PIF1;SSBP...,22,mitochondrial genome maintenance,SLC25A33
GO:0000002,C20orf72;AAC1;ECGF1;SLC25A33;SLC25A4;PIF1;SSBP...,22,mitochondrial genome maintenance,SLC25A4
...,...,...,...,...
GO:2001272,RPS3;RFPL1;OK/SW-cl.26;RNF78;RFPL1L,5,positive regulation of cysteine-type endopepti...,OK/SW-cl.26
GO:2001272,RPS3;RFPL1;OK/SW-cl.26;RNF78;RFPL1L,5,positive regulation of cysteine-type endopepti...,RNF78
GO:2001272,RPS3;RFPL1;OK/SW-cl.26;RNF78;RFPL1L,5,positive regulation of cysteine-type endopepti...,RFPL1L
GO:2001304,LTB4H;CYP4F3,2,lipoxin B4 metabolic process,LTB4H


# Filter genes

In [120]:
gene_df = pd.DataFrame()
gene_df["gene_name"] = goterms_df.gene_name.unique()

In [121]:
go_bim = ot_bim.merge(gene_df, on="gene_name")

In [113]:
go_bim.gene_name.nunique() # some genes not mapped to go terms ..

17588

In [101]:
go_bim.gene_name.nunique() # some genes not mapped to go terms ..

17588

In [114]:
go_bim

Unnamed: 0,chr_id,position,ref_allele,alt_allele,chr_id_b37,position_b37,rs_id,most_severe_consequence,cadd,af,...,gene_id_prot_coding,chrom,snp,cm,pos,a1,a2,gene_id,gene_name,snp_bed_id
0,1,1169204,T,C,1,1104584,rs112695918,upstream_gene_variant,"{'phred': 0.61, 'raw': -0.223786}","{'gnomad_afr': 0.1370131366674349, 'gnomad_amr...",...,ENSG00000162571,1,rs112695918,0,1104584,T,C,ENSG00000162571,TTLL10,220
1,1,2038313,C,T,1,1969752,rs74372554,intergenic_variant,"{'phred': 0.309, 'raw': -0.324059}","{'gnomad_afr': 0.01078971533516988, 'gnomad_am...",...,ENSG00000067606,1,rs74372554,0,1969752,C,T,ENSG00000067606,PRKCZ,599
2,1,2194104,A,G,1,2125543,rs146607591,missense_variant,"{'phred': 4.241, 'raw': 0.098415}","{'gnomad_afr': 0.00034498620055197794, 'gnomad...",...,ENSG00000162585,1,rs146607591,0,2125543,A,G,ENSG00000162585,FAAP20,648
3,1,2592668,C,T,1,2524107,rs146100312,missense_variant,"{'phred': 23.9, 'raw': 3.118097}","{'gnomad_afr': 0.0, 'gnomad_amr': 0.0, 'gnomad...",...,ENSG00000157870,1,rs146100312,0,2524107,C,T,ENSG00000157870,PRXL2B,838
4,1,3041036,T,C,1,2957600,rs12409277,intergenic_variant,"{'phred': 1.823, 'raw': -0.059547}","{'gnomad_afr': 0.13893357848770396, 'gnomad_am...",...,ENSG00000169717,1,rs12409277,0,2957600,T,C,ENSG00000169717,ACTRT2,958
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695636,X,151609900,A,G,X,150778372,rs58448897,intron_variant,"{'phred': 3.523, 'raw': 0.055668}","{'gnomad_afr': 0.11482740118046861, 'gnomad_am...",...,ENSG00000166049,X,rs58448897,0,150778372,A,G,ENSG00000166049,PASD1,802181
695637,X,152014563,G,A,X,151183035,rs145754054,intergenic_variant,"{'phred': 2.456, 'raw': -0.011208}","{'gnomad_afr': 0.009298393913778529, 'gnomad_a...",...,ENSG00000102287,X,rs145754054,0,151183035,G,A,ENSG00000102287,GABRE,802336
695638,X,153459591,C,T,X,152725049,rs5945389,intron_variant,"{'phred': 2.968, 'raw': 0.022059}","{'gnomad_afr': 0.0032312925170068026, 'gnomad_...",...,ENSG00000183479,X,rs5945389,0,152725049,C,T,ENSG00000183479,TREX2,802547
695639,X,153781540,A,C,X,153046995,rs199915647,missense_variant,"{'phred': 22.5, 'raw': 2.480113}","{'gnomad_afr': 0.00017018379850238256, 'gnomad...",...,ENSG00000184343,X,rs199915647,0,153046995,A,C,ENSG00000184343,SRPK3,802648


In [122]:
gene_df = pd.DataFrame()
gene_df["gene_name"] = go_bim.gene_name.unique()
gene_df.reset_index(inplace=True)
gene_df.columns = ["gene_bed_id", "gene_name"]
gene_df

Unnamed: 0,gene_bed_id,gene_name
0,0,TTLL10
1,1,PRKCZ
2,2,FAAP20
3,3,PRXL2B
4,4,ACTRT2
...,...,...
17583,17583,HOXA3
17584,17584,TAF6
17585,17585,TPRN
17586,17586,NOMO1


In [123]:
go_bim = go_bim.merge(gene_df, on="gene_name")
go_bim

Unnamed: 0,chr_id,position,ref_allele,alt_allele,chr_id_b37,position_b37,rs_id,most_severe_consequence,cadd,af,...,chrom,snp,cm,pos,a1,a2,gene_id,gene_name,snp_bed_id,gene_bed_id
0,1,1169204,T,C,1,1104584,rs112695918,upstream_gene_variant,"{'phred': 0.61, 'raw': -0.223786}","{'gnomad_afr': 0.1370131366674349, 'gnomad_amr...",...,1,rs112695918,0,1104584,T,C,ENSG00000162571,TTLL10,220,0
1,1,2038313,C,T,1,1969752,rs74372554,intergenic_variant,"{'phred': 0.309, 'raw': -0.324059}","{'gnomad_afr': 0.01078971533516988, 'gnomad_am...",...,1,rs74372554,0,1969752,C,T,ENSG00000067606,PRKCZ,599,1
2,1,2194104,A,G,1,2125543,rs146607591,missense_variant,"{'phred': 4.241, 'raw': 0.098415}","{'gnomad_afr': 0.00034498620055197794, 'gnomad...",...,1,rs146607591,0,2125543,A,G,ENSG00000162585,FAAP20,648,2
3,1,2592668,C,T,1,2524107,rs146100312,missense_variant,"{'phred': 23.9, 'raw': 3.118097}","{'gnomad_afr': 0.0, 'gnomad_amr': 0.0, 'gnomad...",...,1,rs146100312,0,2524107,C,T,ENSG00000157870,PRXL2B,838,3
4,1,3041036,T,C,1,2957600,rs12409277,intergenic_variant,"{'phred': 1.823, 'raw': -0.059547}","{'gnomad_afr': 0.13893357848770396, 'gnomad_am...",...,1,rs12409277,0,2957600,T,C,ENSG00000169717,ACTRT2,958,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695636,X,151609900,A,G,X,150778372,rs58448897,intron_variant,"{'phred': 3.523, 'raw': 0.055668}","{'gnomad_afr': 0.11482740118046861, 'gnomad_am...",...,X,rs58448897,0,150778372,A,G,ENSG00000166049,PASD1,802181,10476
695637,X,152014563,G,A,X,151183035,rs145754054,intergenic_variant,"{'phred': 2.456, 'raw': -0.011208}","{'gnomad_afr': 0.009298393913778529, 'gnomad_a...",...,X,rs145754054,0,151183035,G,A,ENSG00000102287,GABRE,802336,8320
695638,X,153459591,C,T,X,152725049,rs5945389,intron_variant,"{'phred': 2.968, 'raw': 0.022059}","{'gnomad_afr': 0.0032312925170068026, 'gnomad_...",...,X,rs5945389,0,152725049,C,T,ENSG00000183479,TREX2,802547,2616
695639,X,153781540,A,C,X,153046995,rs199915647,missense_variant,"{'phred': 22.5, 'raw': 2.480113}","{'gnomad_afr': 0.00017018379850238256, 'gnomad...",...,X,rs199915647,0,153046995,A,C,ENSG00000184343,SRPK3,802648,14122


In [124]:
all_snp_ids = go_bim.snp_bed_id.unique() # list of all snp ids, enumerate and map to ids
snp_id_dict = dict(zip(all_snp_ids, range(len(all_snp_ids))))
# snp position in gene_level bed file
go_bim['snp_gbed_id'] = go_bim['snp_bed_id'].map(snp_id_dict)

In [106]:
go_bim[["snp", "gene_name", "snp_bed_id", "snp_gbed_id", "gene_bed_id"]]

Unnamed: 0,snp,gene_name,snp_bed_id,snp_gbed_id,gene_bed_id
0,rs112695918,TTLL10,220,0,0
1,rs74372554,PRKCZ,599,1,1
2,rs146607591,FAAP20,648,2,2
3,rs146100312,PRXL2B,838,3,3
4,rs12409277,ACTRT2,958,4,4
...,...,...,...,...,...
695636,rs58448897,PASD1,802181,695636,10476
695637,rs145754054,GABRE,802336,695637,8320
695638,rs5945389,TREX2,802547,695638,2616
695639,rs199915647,SRPK3,802648,695639,14122


In [126]:
go_bim[["snp", "gene_name", "snp_bed_id", "snp_gbed_id", "gene_bed_id"]].to_csv("../data/go_general_bim.csv", index=False)#, sep="\t")

In [None]:
#go_bim.to_csv("../data/go_biological_processes_bim_full.csv", index=False, sep="\t")

# Prune network

In [127]:
graph_dir = "../../ontology_operations/data/networks/"
# CC_is_a+part_of+regulates.graphml
# BP_is_a+part_of+regulates.graphml
# BP_is_a.graphml
# BP_is_a+part_of+regulates.graphml
# BP+CC_is_a+part_of+regulates.graphml
# BP+CC_is_a.graphml

In [183]:
import networkx as nx
import pickle

#go_path = "../../ontology_operations/data/GO_biological_process.pickle"
#go_path = "../../ontology_operations/data/GO_cellular_component_trimmed.pickle"
#G = pickle.load(open(go_path, 'rb'))
#G = nx.read_graphml("../../ontology_operations/data/networks/CC_is_a.graphml")
G = nx.read_graphml(graph_dir + "BP+CC_is_a+part_of+regulates.graphml")

In [184]:
# Load go terms and genes
#goterms_df = pd.read_csv("../../ontology_operations/data/GO_terms_biological_processes_merged_EBI_uniprot.csv", index_col=0)
#goterms_df = pd.read_csv("../../ontology_operations/data/GO_terms_cellular_components_merged_EBI_uniprot.csv", index_col=0)
goterms_df = pd.read_csv("../../ontology_operations/data/GO_terms_bp_cc_merged_EBI_uniprot.csv", index_col=0)
goterms_df["gene_name"] = goterms_df["merged"].apply(lambda x: x.split(";"))
goterms_df = goterms_df.explode("gene_name")
goterms_df

Unnamed: 0,merged,len_merged,name,gene_name
GO:0000002,C20orf72;AAC1;ECGF1;SLC25A33;SLC25A4;PIF1;SSBP...,22,mitochondrial genome maintenance,C20orf72
GO:0000002,C20orf72;AAC1;ECGF1;SLC25A33;SLC25A4;PIF1;SSBP...,22,mitochondrial genome maintenance,AAC1
GO:0000002,C20orf72;AAC1;ECGF1;SLC25A33;SLC25A4;PIF1;SSBP...,22,mitochondrial genome maintenance,ECGF1
GO:0000002,C20orf72;AAC1;ECGF1;SLC25A33;SLC25A4;PIF1;SSBP...,22,mitochondrial genome maintenance,SLC25A33
GO:0000002,C20orf72;AAC1;ECGF1;SLC25A33;SLC25A4;PIF1;SSBP...,22,mitochondrial genome maintenance,SLC25A4
...,...,...,...,...
GO:2001272,RPS3;RFPL1;OK/SW-cl.26;RNF78;RFPL1L,5,positive regulation of cysteine-type endopepti...,OK/SW-cl.26
GO:2001272,RPS3;RFPL1;OK/SW-cl.26;RNF78;RFPL1L,5,positive regulation of cysteine-type endopepti...,RNF78
GO:2001272,RPS3;RFPL1;OK/SW-cl.26;RNF78;RFPL1L,5,positive regulation of cysteine-type endopepti...,RFPL1L
GO:2001304,LTB4H;CYP4F3,2,lipoxin B4 metabolic process,LTB4H


In [7]:
go_bim = pd.read_csv("../data/go_cc_is_a_bim.csv")

In [185]:
merged_go = go_bim.merge(goterms_df.reset_index(), on="gene_name") # .... lot's of multiple connections

In [30]:
merged_go

Unnamed: 0,chr_id,position,ref_allele,alt_allele,chr_id_b37,position_b37,rs_id,most_severe_consequence,cadd,af,...,a2,gene_id,gene_name,snp_bed_id,gene_bed_id,snp_gbed_id,index,merged,len_merged,name
0,1,1169204,T,C,1,1104584,rs112695918,upstream_gene_variant,"{'phred': 0.61, 'raw': -0.223786}","{'gnomad_afr': 0.1370131366674349, 'gnomad_amr...",...,C,ENSG00000162571,TTLL10,220,0,0,GO:0005829,KRTAP4.14;BBS5;MOCS1;KIAA0992;SH3BP1;RACGAP1;G...,13108,cytosol
1,1,1169204,T,C,1,1104584,rs112695918,upstream_gene_variant,"{'phred': 0.61, 'raw': -0.223786}","{'gnomad_afr': 0.1370131366674349, 'gnomad_amr...",...,C,ENSG00000162571,TTLL10,220,0,0,GO:0018094,TTLL8;TTLL10;TTLL3,3,protein polyglycylation
2,1,2038313,C,T,1,1969752,rs74372554,intergenic_variant,"{'phred': 0.309, 'raw': -0.324059}","{'gnomad_afr': 0.01078971533516988, 'gnomad_am...",...,T,ENSG00000067606,PRKCZ,599,1,1,GO:0000226,PP2464;TBCE;EML3;KIAA1319;CCDC8;C9orf20;CDK5RA...,317,microtubule cytoskeleton organization
3,1,2038313,C,T,1,1969752,rs74372554,intergenic_variant,"{'phred': 0.309, 'raw': -0.324059}","{'gnomad_afr': 0.01078971533516988, 'gnomad_am...",...,T,ENSG00000067606,PRKCZ,599,1,1,GO:0001725,TEK;DST;KIAA0992;PRICKLE4;MLCK;SPAL3;OEBT;ABLI...,199,stress fiber
4,1,2038313,C,T,1,1969752,rs74372554,intergenic_variant,"{'phred': 0.309, 'raw': -0.324059}","{'gnomad_afr': 0.01078971533516988, 'gnomad_am...",...,T,ENSG00000067606,PRKCZ,599,1,1,GO:0001954,CD36;C20orf42;URP1;SCYA28;A-152E5.2;CCL25;SKAP...,71,positive regulation of cell-matrix adhesion
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8360599,X,154534419,G,A,X,153762634,rs5030868,missense_variant,"{'phred': 24.7, 'raw': 3.374046}","{'gnomad_afr': 0.0001721763085399449, 'gnomad_...",...,A,ENSG00000269335,IKBKG,802968,14718,695640,GO:0050862,TESPA1;KIAA0349;CCR7;TAPA1;RAB7L1;TSPAN28;NEMO...,52,positive regulation of T cell receptor signali...
8360600,X,154534419,G,A,X,153762634,rs5030868,missense_variant,"{'phred': 24.7, 'raw': 3.374046}","{'gnomad_afr': 0.0001721763085399449, 'gnomad_...",...,A,ENSG00000269335,IKBKG,802968,14718,695640,GO:0051092,FXY2;IRAK;TRIM14;NPM;TIRAP;MRP8;OATP3A1;PYCARD...,322,positive regulation of NF-kappaB transcription...
8360601,X,154534419,G,A,X,153762634,rs5030868,missense_variant,"{'phred': 24.7, 'raw': 3.374046}","{'gnomad_afr': 0.0001721763085399449, 'gnomad_...",...,A,ENSG00000269335,IKBKG,802968,14718,695640,GO:0051650,RAB7A;TESK1;ATP6N1C;TCIRG1;RAB11A;KIAA0080;RAB...,14,establishment of vesicle localization
8360602,X,154534419,G,A,X,153762634,rs5030868,missense_variant,"{'phred': 24.7, 'raw': 3.374046}","{'gnomad_afr': 0.0001721763085399449, 'gnomad_...",...,A,ENSG00000269335,IKBKG,802968,14718,695640,GO:0072686,PRKM2;TRAPPC14;KNSL2;KIF11;DCDC2;PKP4;EML3;KIA...,342,mitotic spindle


In [29]:
merged_go

Unnamed: 0,chr_id,position,ref_allele,alt_allele,chr_id_b37,position_b37,rs_id,most_severe_consequence,cadd,af,...,a2,gene_id,gene_name,snp_bed_id,gene_bed_id,snp_gbed_id,index,merged,len_merged,name
0,1,1169204,T,C,1,1104584,rs112695918,upstream_gene_variant,"{'phred': 0.61, 'raw': -0.223786}","{'gnomad_afr': 0.1370131366674349, 'gnomad_amr...",...,C,ENSG00000162571,TTLL10,220,0,0,GO:0005829,M130;NFATC1;GGCT;PKDL;TORC3;NIP2;CPGL;KNSL5;DN...,13464,cytosol
1,1,2038313,C,T,1,1969752,rs74372554,intergenic_variant,"{'phred': 0.309, 'raw': -0.324059}","{'gnomad_afr': 0.01078971533516988, 'gnomad_am...",...,T,ENSG00000067606,PRKCZ,599,1,1,GO:0001725,C20orf28;CTTNBP2NL;LPP;KIAA1296;KIAA0991;DRR1;...,211,stress fiber
2,1,2038313,C,T,1,1969752,rs74372554,intergenic_variant,"{'phred': 0.309, 'raw': -0.324059}","{'gnomad_afr': 0.01078971533516988, 'gnomad_am...",...,T,ENSG00000067606,PRKCZ,599,1,1,GO:0005634,DHX38;B4E246;NFATC1;CRAMP1L;ZNF526;CCNP;TORC3;...,17387,nucleus
3,1,2038313,C,T,1,1969752,rs74372554,intergenic_variant,"{'phred': 0.309, 'raw': -0.324059}","{'gnomad_afr': 0.01078971533516988, 'gnomad_am...",...,T,ENSG00000067606,PRKCZ,599,1,1,GO:0005737,LSLCL;NFATC1;CCNP;TORC3;NIP2;B2R7M0;ATG16L2;GO...,16823,cytoplasm
4,1,2038313,C,T,1,1969752,rs74372554,intergenic_variant,"{'phred': 0.309, 'raw': -0.324059}","{'gnomad_afr': 0.01078971533516988, 'gnomad_am...",...,T,ENSG00000067606,PRKCZ,599,1,1,GO:0005768,KIAA0319;A0A384ME16;PKN;ARL8B;B3KQD2;B4E2S7;TM...,1498,endosome
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2946977,X,154534419,G,A,X,153762634,rs5030868,missense_variant,"{'phred': 24.7, 'raw': 3.374046}","{'gnomad_afr': 0.0001721763085399449, 'gnomad_...",...,A,ENSG00000269335,IKBKG,802968,13478,644893,GO:0005634,DHX38;B4E246;NFATC1;CRAMP1L;ZNF526;CCNP;TORC3;...,17387,nucleus
2946978,X,154534419,G,A,X,153762634,rs5030868,missense_variant,"{'phred': 24.7, 'raw': 3.374046}","{'gnomad_afr': 0.0001721763085399449, 'gnomad_...",...,A,ENSG00000269335,IKBKG,802968,13478,644893,GO:0005654,DHX38;NFATC1;TTF1;TORC3;CPGL;C20orf77;PRO1146;...,10042,nucleoplasm
2946979,X,154534419,G,A,X,153762634,rs5030868,missense_variant,"{'phred': 24.7, 'raw': 3.374046}","{'gnomad_afr': 0.0001721763085399449, 'gnomad_...",...,A,ENSG00000269335,IKBKG,802968,13478,644893,GO:0005737,LSLCL;NFATC1;CCNP;TORC3;NIP2;B2R7M0;ATG16L2;GO...,16823,cytoplasm
2946980,X,154534419,G,A,X,153762634,rs5030868,missense_variant,"{'phred': 24.7, 'raw': 3.374046}","{'gnomad_afr': 0.0001721763085399449, 'gnomad_...",...,A,ENSG00000269335,IKBKG,802968,13478,644893,GO:0005829,M130;NFATC1;GGCT;PKDL;TORC3;NIP2;CPGL;KNSL5;DN...,13464,cytosol


In [31]:
merged_go["index"].nunique()

13171

In [71]:
merged_go["index"].nunique()

13171

In [186]:
go_terms = merged_go["index"].unique()

In [187]:
go_terms = {i.replace(":", "_") for i in go_terms}
go_terms

{'GO_0016116',
 'GO_0090651',
 'GO_0032976',
 'GO_0000578',
 'GO_0045793',
 'GO_0006693',
 'GO_0001916',
 'GO_1902723',
 'GO_0009450',
 'GO_0120045',
 'GO_1905456',
 'GO_0019377',
 'GO_0007266',
 'GO_1902565',
 'GO_0001520',
 'GO_0002182',
 'GO_0048714',
 'GO_1905523',
 'GO_0034021',
 'GO_2001246',
 'GO_0035176',
 'GO_0035633',
 'GO_0061753',
 'GO_0071902',
 'GO_0040029',
 'GO_0045584',
 'GO_0006121',
 'GO_0010881',
 'GO_0030167',
 'GO_0030043',
 'GO_0042472',
 'GO_2000734',
 'GO_0006032',
 'GO_0071584',
 'GO_0009188',
 'GO_0032055',
 'GO_0048621',
 'GO_0072579',
 'GO_0003289',
 'GO_0090091',
 'GO_0009240',
 'GO_0021775',
 'GO_0090176',
 'GO_0003108',
 'GO_0010157',
 'GO_0006547',
 'GO_0034163',
 'GO_1902474',
 'GO_0051260',
 'GO_1990584',
 'GO_0033263',
 'GO_0016560',
 'GO_1903331',
 'GO_0048566',
 'GO_0046327',
 'GO_1905162',
 'GO_0009386',
 'GO_0021979',
 'GO_0006809',
 'GO_1903779',
 'GO_1902340',
 'GO_0034399',
 'GO_0060710',
 'GO_2000627',
 'GO_0101031',
 'GO_1990953',
 'GO_00140

In [188]:
# only keep terms that are also in the graph
graph_terms = {i.replace("_",":") for i in G.nodes}
merged_go = merged_go[merged_go["index"].isin(graph_terms)]
merged_go

Unnamed: 0,chr_id,position,ref_allele,alt_allele,chr_id_b37,position_b37,rs_id,most_severe_consequence,cadd,af,...,a2,gene_id,gene_name,snp_bed_id,gene_bed_id,snp_gbed_id,index,merged,len_merged,name
0,1,1169204,T,C,1,1104584,rs112695918,upstream_gene_variant,"{'phred': 0.61, 'raw': -0.223786}","{'gnomad_afr': 0.1370131366674349, 'gnomad_amr...",...,C,ENSG00000162571,TTLL10,220,0,0,GO:0005829,KRTAP4.14;BBS5;MOCS1;KIAA0992;SH3BP1;RACGAP1;G...,13108,cytosol
1,1,1169204,T,C,1,1104584,rs112695918,upstream_gene_variant,"{'phred': 0.61, 'raw': -0.223786}","{'gnomad_afr': 0.1370131366674349, 'gnomad_amr...",...,C,ENSG00000162571,TTLL10,220,0,0,GO:0018094,TTLL8;TTLL10;TTLL3,3,protein polyglycylation
2,1,2038313,C,T,1,1969752,rs74372554,intergenic_variant,"{'phred': 0.309, 'raw': -0.324059}","{'gnomad_afr': 0.01078971533516988, 'gnomad_am...",...,T,ENSG00000067606,PRKCZ,599,1,1,GO:0000226,PP2464;TBCE;EML3;KIAA1319;CCDC8;C9orf20;CDK5RA...,317,microtubule cytoskeleton organization
3,1,2038313,C,T,1,1969752,rs74372554,intergenic_variant,"{'phred': 0.309, 'raw': -0.324059}","{'gnomad_afr': 0.01078971533516988, 'gnomad_am...",...,T,ENSG00000067606,PRKCZ,599,1,1,GO:0001725,TEK;DST;KIAA0992;PRICKLE4;MLCK;SPAL3;OEBT;ABLI...,199,stress fiber
4,1,2038313,C,T,1,1969752,rs74372554,intergenic_variant,"{'phred': 0.309, 'raw': -0.324059}","{'gnomad_afr': 0.01078971533516988, 'gnomad_am...",...,T,ENSG00000067606,PRKCZ,599,1,1,GO:0001954,CD36;C20orf42;URP1;SCYA28;A-152E5.2;CCL25;SKAP...,71,positive regulation of cell-matrix adhesion
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8360599,X,154534419,G,A,X,153762634,rs5030868,missense_variant,"{'phred': 24.7, 'raw': 3.374046}","{'gnomad_afr': 0.0001721763085399449, 'gnomad_...",...,A,ENSG00000269335,IKBKG,802968,14718,695640,GO:0050862,TESPA1;KIAA0349;CCR7;TAPA1;RAB7L1;TSPAN28;NEMO...,52,positive regulation of T cell receptor signali...
8360600,X,154534419,G,A,X,153762634,rs5030868,missense_variant,"{'phred': 24.7, 'raw': 3.374046}","{'gnomad_afr': 0.0001721763085399449, 'gnomad_...",...,A,ENSG00000269335,IKBKG,802968,14718,695640,GO:0051092,FXY2;IRAK;TRIM14;NPM;TIRAP;MRP8;OATP3A1;PYCARD...,322,positive regulation of NF-kappaB transcription...
8360601,X,154534419,G,A,X,153762634,rs5030868,missense_variant,"{'phred': 24.7, 'raw': 3.374046}","{'gnomad_afr': 0.0001721763085399449, 'gnomad_...",...,A,ENSG00000269335,IKBKG,802968,14718,695640,GO:0051650,RAB7A;TESK1;ATP6N1C;TCIRG1;RAB11A;KIAA0080;RAB...,14,establishment of vesicle localization
8360602,X,154534419,G,A,X,153762634,rs5030868,missense_variant,"{'phred': 24.7, 'raw': 3.374046}","{'gnomad_afr': 0.0001721763085399449, 'gnomad_...",...,A,ENSG00000269335,IKBKG,802968,14718,695640,GO:0072686,PRKM2;TRAPPC14;KNSL2;KIF11;DCDC2;PKP4;EML3;KIA...,342,mitotic spindle


In [75]:
graph_terms

{'GO:0070014',
 'GO:0009346',
 'GO:0070112',
 'GO:0070313',
 'GO:0070826',
 'GO:0097002',
 'GO:1990658',
 'GO:0044166',
 'GO:0035354',
 'GO:0034064',
 'GO:1903349',
 'GO:0032279',
 'GO:0106003',
 'GO:0039643',
 'GO:0016605',
 'GO:0060187',
 'GO:0005846',
 'GO:0061671',
 'GO:1905506',
 'GO:0071808',
 'GO:0098652',
 'GO:0033588',
 'GO:0071740',
 'GO:0000446',
 'GO:0097560',
 'GO:0071136',
 'GO:0005722',
 'GO:0039625',
 'GO:0098539',
 'GO:0033597',
 'GO:0031313',
 'GO:0071204',
 'GO:0098031',
 'GO:0002177',
 'GO:0031934',
 'GO:0140593',
 'GO:0120280',
 'GO:1990358',
 'GO:0061176',
 'GO:0042567',
 'GO:1990233',
 'GO:0044731',
 'GO:0120260',
 'GO:0009358',
 'GO:0042566',
 'GO:0061694',
 'GO:0000323',
 'GO:0044676',
 'GO:1990060',
 'GO:0034709',
 'GO:0160064',
 'GO:1990862',
 'GO:0061175',
 'GO:0009331',
 'GO:0036007',
 'GO:0170047',
 'GO:0000936',
 'GO:1990338',
 'GO:0098570',
 'GO:0160051',
 'GO:0140738',
 'GO:0055052',
 'GO:0009324',
 'GO:0002947',
 'GO:0061645',
 'GO:0044383',
 'GO:00314

In [189]:
print("Nodes before pruning:", len(G.nodes)) # 24k

Nodes before pruning: 30575


In [190]:
# change graph nodes to _
G = nx.relabel_nodes(G, lambda x: x.replace(":", "_"))

In [191]:
list(nx.topological_sort(G))

['GO_root',
 'GO_0008150',
 'GO_0005575',
 'GO_0002376',
 'GO_0008152',
 'GO_0009987',
 'GO_0016032',
 'GO_0022414',
 'GO_0032501',
 'GO_0032502',
 'GO_0040007',
 'GO_0040011',
 'GO_0042592',
 'GO_0043473',
 'GO_0044419',
 'GO_0044848',
 'GO_0048511',
 'GO_0050896',
 'GO_0051179',
 'GO_0051703',
 'GO_0065007',
 'GO_0032991',
 'GO_0044423',
 'GO_0110165',
 'GO_0002252',
 'GO_0019882',
 'GO_0006518',
 'GO_0009056',
 'GO_0009058',
 'GO_0009308',
 'GO_0009812',
 'GO_0009820',
 'GO_0015977',
 'GO_0016999',
 'GO_0018884',
 'GO_0018930',
 'GO_0018933',
 'GO_0019330',
 'GO_0019499',
 'GO_0019748',
 'GO_0032259',
 'GO_0032963',
 'GO_0033013',
 'GO_0042430',
 'GO_0042440',
 'GO_0042537',
 'GO_0042558',
 'GO_0042620',
 'GO_0042726',
 'GO_0043170',
 'GO_0043603',
 'GO_0044238',
 'GO_0044281',
 'GO_0045730',
 'GO_0046453',
 'GO_0046484',
 'GO_0050898',
 'GO_0052803',
 'GO_0070085',
 'GO_0070988',
 'GO_0071941',
 'GO_0072521',
 'GO_0072524',
 'GO_0072527',
 'GO_0097164',
 'GO_0120246',
 'GO_0120252'

In [192]:
len(G.nodes)

30575

In [136]:
for u, v in G.edges("GO_0005575"):
    print(u,v)

In [193]:
# prune network, remove all go terms, which are leaf nodes but have no associated snps
def prune_tree(G, nodeset):
    nodes_to_check = list(nx.topological_sort(G))
    while nodes_to_check:
        node = nodes_to_check.pop()
        #print(node)
        # if it is a leaf, in this case it has no out edges, but not in go_terms, so no associations
        if  G.out_degree(node) == 0 and (not node in nodeset): 
            if G.in_degree(node) == 0:
                print("Node with no parents", node)
            parents = list(G.predecessors(node))
            G.remove_node(node)
            nodes_to_check.extend(parents)

prune_tree(G, go_terms)

print("Number of nodes left", len(G.nodes))

Number of nodes left 16934


In [194]:
# to undirected and number of components
uG = G.to_undirected()
print("Number of components", nx.number_connected_components(uG))
uG.edges

Number of components 1


EdgeView([('GO_0048311', 'GO_0048312'), ('GO_0048311', 'GO_0160040'), ('GO_0048311', 'GO_0007005'), ('GO_0048311', 'GO_0051646'), ('GO_0048308', 'GO_0048313'), ('GO_0048308', 'GO_0006996'), ('GO_0000002', 'GO_0032042'), ('GO_0000002', 'GO_0007005'), ('GO_0007005', 'GO_0000266'), ('GO_0007005', 'GO_0007006'), ('GO_0007005', 'GO_0008053'), ('GO_0007005', 'GO_0008637'), ('GO_0007005', 'GO_0010821'), ('GO_0007005', 'GO_0030382'), ('GO_0007005', 'GO_0033108'), ('GO_0007005', 'GO_0033615'), ('GO_0007005', 'GO_0035694'), ('GO_0007005', 'GO_0097250'), ('GO_0007005', 'GO_0006996'), ('GO_0007033', 'GO_0044088'), ('GO_0007033', 'GO_0080171'), ('GO_0007033', 'GO_1905037'), ('GO_0007033', 'GO_0006996'), ('GO_0000012', 'GO_1903516'), ('GO_0000012', 'GO_0006281'), ('GO_0006281', 'GO_0000725'), ('GO_0006281', 'GO_0000731'), ('GO_0006281', 'GO_0006282'), ('GO_0006281', 'GO_0006284'), ('GO_0006281', 'GO_0006289'), ('GO_0006281', 'GO_0006290'), ('GO_0006281', 'GO_0006298'), ('GO_0006281', 'GO_0006301'), 

In [20]:
"GO_0005575" in G.nodes

True

In [139]:
x = list(nx.topological_sort(G))
x

['GO_0008150',
 'GO_0002376',
 'GO_0008152',
 'GO_0009987',
 'GO_0016032',
 'GO_0022414',
 'GO_0032501',
 'GO_0032502',
 'GO_0040007',
 'GO_0040011',
 'GO_0042592',
 'GO_0043473',
 'GO_0044419',
 'GO_0048511',
 'GO_0050896',
 'GO_0051179',
 'GO_0051703',
 'GO_0065007',
 'GO_0098754',
 'GO_0002200',
 'GO_0002252',
 'GO_0002339',
 'GO_0002507',
 'GO_0019882',
 'GO_0045058',
 'GO_0090713',
 'GO_0006518',
 'GO_0009056',
 'GO_0009058',
 'GO_0009308',
 'GO_0009812',
 'GO_0009820',
 'GO_0016999',
 'GO_0019748',
 'GO_0032259',
 'GO_0032963',
 'GO_0033013',
 'GO_0042430',
 'GO_0042440',
 'GO_0042537',
 'GO_0042558',
 'GO_0042726',
 'GO_0043170',
 'GO_0043603',
 'GO_0044238',
 'GO_0044281',
 'GO_0045730',
 'GO_0051189',
 'GO_0052803',
 'GO_0070085',
 'GO_0070988',
 'GO_0071941',
 'GO_0072521',
 'GO_0072524',
 'GO_0072527',
 'GO_0097164',
 'GO_0120252',
 'GO_0120254',
 'GO_0140651',
 'GO_1900619',
 'GO_1901135',
 'GO_1901160',
 'GO_1901334',
 'GO_1901376',
 'GO_1901615',
 'GO_1990845',
 'GO_20010

In [79]:
x.pop()

'GO:0005575'

In [96]:
G.out_degree('GO_1990476')

OutDegreeView({})

In [26]:
list(nx.topological_sort(G))[-5:]

['GO_0031974', 'GO_0033643', 'GO_0043226', 'GO_0018995', 'GO_0110165']

In [98]:
# get edges of 'GO:0005575'
G.out_degree("GO_0005575")

2

In [48]:
G.nodes["GO_0005575"]

{'term_name': 'cellular_component',
 'namespace': 'cellular_component',
 'is_obsolete': False}

In [99]:
G.out_degree("GO_0005575")

2

In [141]:
list(nx.topological_sort(G))

['GO_0008150',
 'GO_0002376',
 'GO_0008152',
 'GO_0009987',
 'GO_0016032',
 'GO_0022414',
 'GO_0032501',
 'GO_0032502',
 'GO_0040007',
 'GO_0040011',
 'GO_0042592',
 'GO_0043473',
 'GO_0044419',
 'GO_0048511',
 'GO_0050896',
 'GO_0051179',
 'GO_0051703',
 'GO_0065007',
 'GO_0098754',
 'GO_0002200',
 'GO_0002252',
 'GO_0002339',
 'GO_0002507',
 'GO_0019882',
 'GO_0045058',
 'GO_0090713',
 'GO_0006518',
 'GO_0009056',
 'GO_0009058',
 'GO_0009308',
 'GO_0009812',
 'GO_0009820',
 'GO_0016999',
 'GO_0019748',
 'GO_0032259',
 'GO_0032963',
 'GO_0033013',
 'GO_0042430',
 'GO_0042440',
 'GO_0042537',
 'GO_0042558',
 'GO_0042726',
 'GO_0043170',
 'GO_0043603',
 'GO_0044238',
 'GO_0044281',
 'GO_0045730',
 'GO_0051189',
 'GO_0052803',
 'GO_0070085',
 'GO_0070988',
 'GO_0071941',
 'GO_0072521',
 'GO_0072524',
 'GO_0072527',
 'GO_0097164',
 'GO_0120252',
 'GO_0120254',
 'GO_0140651',
 'GO_1900619',
 'GO_1901135',
 'GO_1901160',
 'GO_1901334',
 'GO_1901376',
 'GO_1901615',
 'GO_1990845',
 'GO_20010

In [17]:
list(nx.topological_sort(G))[-5:]

['GO_0033643', 'GO_0043226', 'GO_0018995', 'GO_0110165', 'GO_root']

In [195]:
len([n for n in G.nodes if G.in_degree(n) == 0])

1

# Build full ontology

In [196]:
# Convert the pruned graph to a dataframe for output
pruned_edges = [(u,v) for u, v in G.edges]
ontology_df = pd.DataFrame(pruned_edges, columns=['parent', 'child'])
ontology_df["type"] = "default"
ontology_df

# append gene to go term df
tmp_df = merged_go[["index","gene_name"]].drop_duplicates()
tmp_df['index'] = tmp_df['index'].str.replace(":","_")
tmp_df.columns = ['parent', 'child']
tmp_df['type'] = 'gene'

# append snp to gene df
snp_df = go_bim[["gene_name","snp"]]
snp_df.columns = ['parent', 'child']
snp_df['type'] = 'snp'

# concatenate
ontology_df = pd.concat([ontology_df, tmp_df, snp_df])
ontology_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snp_df['type'] = 'snp'


Unnamed: 0,parent,child,type
0,GO_0048311,GO_0048312,default
1,GO_0048311,GO_0160040,default
2,GO_0048308,GO_0048313,default
3,GO_0000002,GO_0032042,default
4,GO_0007005,GO_0000002,default
...,...,...,...
695636,PASD1,rs58448897,snp
695637,GABRE,rs145754054,snp
695638,TREX2,rs5945389,snp
695639,SRPK3,rs199915647,snp


In [197]:
"GO_root" in list(ontology_df.parent) # check if root is in there

True

In [198]:
# Create a directed graph from the ontology data
OG = nx.DiGraph()

# Add edges to the graph
for index, row in ontology_df.iterrows():
    OG.add_edge(row['parent'], row['child'], type=row['type']) # only add gene if in known genes

print("Nodes in Graph:", len(OG.nodes))
print("Edges in Graph:", len(OG.edges))

#print(G.nodes)

# Step 2: Iteratively remove nodes that do not lead to a known gene
def prune_tree_(G):
    nodes_to_check = list(G.nodes)
    while nodes_to_check:
        node = nodes_to_check.pop()
        if G.out_degree(node) == 0 and G.in_degree(node) > 0 and "GO" in node:  # Check if it is a hanging node
            parents = list(G.predecessors(node))
            G.remove_node(node)
            nodes_to_check.extend(parents)

prune_tree_(OG)

print("Number of nodes left", len(OG.nodes))

# Convert the pruned graph back to a dataframe for output
pruned_edges = [(u, v, OG[u][v]['type']) for u, v in OG.edges]
pruned_df = pd.DataFrame(pruned_edges, columns=['parent', 'child', 'type'])

# Save the pruned ontology
#pruned_ontology_file = '../data/go_biological_processes.txt'
pruned_ontology_file = '../data/go_bp_cc_isa_partof_reg.txt'
pruned_df.to_csv(pruned_ontology_file, sep='\t', index=False, header=False)

pruned_df.head()

Nodes in Graph: 730163
Edges in Graph: 905851
Number of nodes left 730163


Unnamed: 0,parent,child,type
0,GO_0048311,GO_0048312,default
1,GO_0048311,GO_0160040,default
2,GO_0048311,MEF2A,gene
3,GO_0048311,HAP1,gene
4,GO_0048311,TRAK2,gene


In [38]:
pruned_df

Unnamed: 0,parent,child,type
0,GO_0031985,GO_0000137,default
1,GO_0031985,GO_0000138,default
2,GO_0031985,GO_0005797,default
3,GO_0031985,STX16,gene
4,GO_0031985,SORL1,gene
...,...,...,...
716316,RPL18,rs12608448,snp
716317,HOXA3,rs118016485,snp
716318,TAF6,rs143557149,snp
716319,TPRN,rs144769563,snp


In [75]:
# OG
list(nx.topological_sort(OG))

['GO_root',
 'GO_0005874',
 'GO_0042622',
 'GO_0005884',
 'GO_0031234',
 'GO_0042383',
 'GO_0062023',
 'GO_0005682',
 'GO_0071007',
 'GO_0071013',
 'GO_1990877',
 'GO_0071001',
 'GO_0005879',
 'GO_0036126',
 'GO_0042470',
 'GO_0002096',
 'GO_0005881',
 'GO_0034709',
 'GO_0005686',
 'GO_0071005',
 'GO_0071011',
 'GO_0001533',
 'GO_0005882',
 'GO_0008091',
 'GO_0072534',
 'GO_0000221',
 'GO_0031941',
 'GO_0005883',
 'GO_0030686',
 'GO_0032040',
 'GO_0005581',
 'GO_0005600',
 'GO_0015935',
 'GO_0044391',
 'GO_0097431',
 'GO_0005838',
 'GO_0005905',
 'GO_0030117',
 'GO_0005762',
 'GO_0005915',
 'GO_0008282',
 'GO_0097342',
 'GO_1990498',
 'GO_0097729',
 'GO_0098850',
 'GO_1990909',
 'GO_0045095',
 'GO_0097539',
 'GO_0030688',
 'GO_0099147',
 'GO_0008180',
 'GO_0098890',
 'GO_0005587',
 'GO_0000145',
 'GO_0034045',
 'GO_0005685',
 'GO_0005687',
 'GO_0071004',
 'GO_0001527',
 'GO_0016591',
 'GO_0005862',
 'GO_0097524',
 'GO_0033018',
 'GO_0030125',
 'GO_0005763',
 'GO_0036157',
 'GO_0034361'

In [None]:
OG.out_edges("")

In [60]:
OG.in_edges('GO_0033180')

InEdgeDataView([])

In [62]:
len([n for n in OG.nodes if OG.in_degree(n) == 0])

141

In [63]:
len([n for n in G.nodes if G.out_degree(n) == 0])

268

In [1]:
import pandas as pd 

In [4]:
pd.read_csv("../data/go_cellular_components.txt", sep="\t")#.to_csv("../data/go_cellular_components.txt", sep='\t', index=False, header=False)

Unnamed: 0,GO_0000137,GO_0031985,default
0,GO_0000137,GOLGA8J,gene
1,GO_0000137,GOLGA6A,gene
2,GO_0000137,XYLT1,gene
3,GO_0000137,ATL1,gene
4,GO_0000137,GOLGA8K,gene
...,...,...,...
716629,RPL18,rs12608448,snp
716630,HOXA3,rs118016485,snp
716631,TAF6,rs143557149,snp
716632,TPRN,rs144769563,snp
