In [1]:
from goatools import obo_parser
from zipfile import ZipFile

import os
import gzip
import graco
import shutil
import requests
import numpy as np
import pandas as pd
import networkx as nx
import Bio.UniProt.GOA as GOA

In [2]:
# Cap how many columns/rows pandas renders when a frame is displayed.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)

# Root of the data tree. The original local checkout stays the default, but the
# location can be overridden through the SUPPLEMENTS_DATA_DIRECTORY environment
# variable so the notebook is not tied to one machine.
DATA_DIRECTORY = os.environ.get(
    "SUPPLEMENTS_DATA_DIRECTORY",
    "/Users/markusyoussef/Desktop/git/supplements/data",
)

# Everything below is derived from DATA_DIRECTORY.
RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw_data"
HUMAN_DIRECTORY = f"{DATA_DIRECTORY}/processed_data/human"
NETWORK_DIRECTORY = f"{HUMAN_DIRECTORY}/networks"
ANNOTATION_DIRECTORY = f"{HUMAN_DIRECTORY}/annotations"

# Downloads

## BioGRID

In [5]:
# Make sure the raw-data folder exists. exist_ok=True makes the call safe to
# re-run and avoids the race of a separate os.path.exists check.
os.makedirs(RAW_DATA_DIRECTORY, exist_ok=True)

# BioGRID release to fetch, the archive's name, its download folder URL and
# the local path the archive is stored under.
VERSION = "3.5.184"
BioGRID_FILENAME = f"BIOGRID-ORGANISM-{VERSION}.tab3.zip"
BioGRID_URL = f"https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-{VERSION}"
BioGRID_FILEPATH = f"{RAW_DATA_DIRECTORY}/{BioGRID_FILENAME}"

In [23]:
# Download the BioGRID organism archive into the raw-data folder.
r = requests.get(f"{BioGRID_URL}/{BioGRID_FILENAME}", allow_redirects=True)
# Stop on an HTTP error instead of silently saving the error page as the archive.
r.raise_for_status()
with open(BioGRID_FILEPATH, 'wb') as f:
    f.write(r.content)

### Select H. Sapiens

In [6]:
# Pull the Homo sapiens table out of the multi-organism archive
with ZipFile(BioGRID_FILEPATH, 'r') as z:
    sapiens_members = [member for member in z.namelist() if 'sapiens' in member.lower()]
    # One-element unpacking: raises ValueError unless exactly one member matches
    (BioGRID_sc_FILENAME,) = sapiens_members
    z.extract(BioGRID_sc_FILENAME, RAW_DATA_DIRECTORY)

### Human annotations (EBI)

In [7]:
# Human GO annotation file (gzipped GAF): remote folder, file name, local path
EBI_URL = "http://geneontology.org/gene-associations"
EBI_FILENAME = "goa_human.gaf.gz"
EBI_FILEPATH = "/".join((RAW_DATA_DIRECTORY, EBI_FILENAME))

In [8]:
# Download the human GO annotation file into the raw-data folder.
r = requests.get(f"{EBI_URL}/{EBI_FILENAME}", allow_redirects=True)
# Stop on an HTTP error instead of silently saving the error page as the GAF file.
r.raise_for_status()
with open(EBI_FILEPATH, 'wb') as f:
    f.write(r.content)

### GO

In [27]:
# Gene Ontology definition file: remote folder, file name, local path
GO_URL = "http://purl.obolibrary.org/obo/go"
GO_FILENAME = "go.obo"
GO_FILEPATH = "/".join((RAW_DATA_DIRECTORY, GO_FILENAME))

In [28]:
# Download the Gene Ontology OBO file into the raw-data folder.
r = requests.get(f"{GO_URL}/{GO_FILENAME}", allow_redirects=True)
# Stop on an HTTP error instead of silently saving the error page as the ontology.
r.raise_for_status()
with open(GO_FILEPATH, 'wb') as f:
    f.write(r.content)

# Full PPI (BioGRID)

### Load BioGRID as dataframe

In [9]:
# BioGRID release number used to build the file names in this section.
# NOTE(review): the same value is also assigned in the download section and
# again in the following cell — presumably so this section can be run on its own.
VERSION = "3.5.184"

In [11]:
# BioGRID release whose Homo sapiens table is loaded
VERSION = "3.5.184"

organism_FILENAME = f"BIOGRID-ORGANISM-Homo_sapiens-{VERSION}.tab3.txt"
organism_FILEPATH = f"{RAW_DATA_DIRECTORY}/{organism_FILENAME}"

# Read the tab-separated table, keeping the first 24 columns except positions
# 1, 2 and 18 (entrez-id and score columns, dropped because of mixed datatypes)
skipped_columns = {1, 2, 18}
kept_columns = [position for position in range(24) if position not in skipped_columns]
BioGRID_df = pd.read_csv(organism_FILEPATH, delimiter='\t', usecols=kept_columns)

### Filter for evidence code

In [12]:
# Experimental systems accepted as (reliable) evidence of a physical interaction
EXPERIMENTAL_SYSTEM = {
    'Two-hybrid',
    'Affinity Capture-Luminescence',
    'Affinity Capture-MS',
    'Affinity Capture-RNA',
    'Affinity Capture-Western',
}
EXPERIMENTAL_SYSTEM_TYPE = {'physical'}

# Keep only the rows reported with one of the accepted systems
has_accepted_system = BioGRID_df['Experimental System'].isin(EXPERIMENTAL_SYSTEM)
physical_interaction_df = BioGRID_df[has_accepted_system]

# Sanity check: every retained row must be typed as a physical interaction
retained_system_types = physical_interaction_df['Experimental System Type']
assert retained_system_types.isin(EXPERIMENTAL_SYSTEM_TYPE).all()

### Define PPI

In [13]:
# Turn the interaction table into a graph keyed by BioGRID IDs
PPI_nx = nx.from_pandas_edgelist(
    physical_interaction_df,
    source='BioGRID ID Interactor A',
    target='BioGRID ID Interactor B',
)

# Drop self-interactions
PPI_nx.remove_edges_from(nx.selfloop_edges(PPI_nx))

# Restrict to the largest connected component
largest_component = max(nx.connected_components(PPI_nx), key=len)
PPI_nx = nx.induced_subgraph(PPI_nx, largest_component)

### Summary

In [14]:
# Node count, edge count and density of the PPI, one per line
ppi_summary = (
    f"PPI nodes  : {PPI_nx.number_of_nodes():>6d}",
    f"PPI edges  : {PPI_nx.number_of_edges():>6d}",
    f"PPI density: {nx.density(PPI_nx):.4f}",
)
print(*ppi_summary, sep="\n")

PPI nodes  :  22573
PPI edges  : 321276
PPI density: 0.0013


### Save

In [15]:
# Create the output folder if needed. exist_ok=True makes the call safe to
# re-run and avoids the race of a separate os.path.exists check.
os.makedirs(NETWORK_DIRECTORY, exist_ok=True)

# Store the network as a plain edge list (node pairs only, no edge data).
nx.write_edgelist(PPI_nx, f"{NETWORK_DIRECTORY}/full_PPI_BioGRID.txt", data=False)

# Official PPI (BioGRID)

In [17]:
# Load the BioGRID identifier table. The file name is built from VERSION (as the
# cell that reads the CSV back already does) instead of repeating the release
# number; header=20 uses row 20 of the file as the column header.
df = pd.read_csv(f"{RAW_DATA_DIRECTORY}/BIOGRID-IDENTIFIERS-{VERSION}.tab.txt",
                 header=20,
                 delimiter='\t')

# Keep only the human identifiers and cache them as CSV (index column included).
df = df[df.ORGANISM_OFFICIAL_NAME == 'Homo sapiens']
df.to_csv(f"{RAW_DATA_DIRECTORY}/BIOGRID_HS_IDENTIFIERS-{VERSION}.csv")

### Define official dictionary

In [19]:
# Read the cached human identifier table (first CSV column is the saved index)
identifier_df = pd.read_csv(
    f"{RAW_DATA_DIRECTORY}/BIOGRID_HS_IDENTIFIERS-{VERSION}.csv",
    index_col=0,
)

# Keep only the rows that carry an official symbol
is_official_symbol = identifier_df['IDENTIFIER_TYPE'] == 'OFFICIAL SYMBOL'
identifier_df = identifier_df[is_official_symbol]

# Lookup series: BioGRID ID -> official symbol
identifier_s = pd.Series(
    identifier_df['IDENTIFIER_VALUE'].values,
    index=identifier_df['BIOGRID_ID'],
)

  mask |= (ar1 == a)


### Define official PPI

In [24]:
# Reload the full PPI with integer (BioGRID ID) node labels
PPI_nx = nx.read_edgelist(f"{NETWORK_DIRECTORY}/full_PPI_BioGRID.txt", nodetype=int)

# Relabel nodes through the BioGRID ID -> official symbol lookup
off_PPI_nx = nx.relabel_nodes(PPI_nx, identifier_s)

# Keep only nodes whose label is now a string, i.e. those that were relabelled
nodes = (node for node in off_PPI_nx.nodes() if isinstance(node, str))
off_PPI_nx = nx.induced_subgraph(off_PPI_nx, nodes)

# Restrict to the largest connected component
largest_component = max(nx.connected_components(off_PPI_nx), key=len)
off_PPI_nx = nx.induced_subgraph(off_PPI_nx, largest_component)

### Summary

In [25]:
# Side-by-side size and density of the full and the official-symbol PPI
comparison_lines = (
    f"    full PPI nodes  : {PPI_nx.number_of_nodes():>6d}",
    f"    full PPI edges  : {PPI_nx.number_of_edges():>6d}",
    f"    full PPI density: {nx.density(PPI_nx):.4f}",
    '------------------------------',
    f"official PPI nodes  : {off_PPI_nx.number_of_nodes():>6d}",
    f"official PPI edges  : {off_PPI_nx.number_of_edges():>6d}",
    f"official PPI density: {nx.density(off_PPI_nx):.4f}",
)
print(*comparison_lines, sep="\n")

    full PPI nodes  :  22573
    full PPI edges  : 321276
    full PPI density: 0.0013
------------------------------
official PPI nodes  :  17245
official PPI edges  : 294344
official PPI density: 0.0020


### Save

In [26]:
# Create the output folder if needed. exist_ok=True makes the call safe to
# re-run and avoids the race of a separate os.path.exists check.
os.makedirs(NETWORK_DIRECTORY, exist_ok=True)

# Store the official-symbol network as a plain edge list (no edge data).
nx.write_edgelist(off_PPI_nx, f"{NETWORK_DIRECTORY}/official_PPI_BioGRID.txt", data=False)

---

## Old code

In [31]:
# The BioGRID universe: every interactor appearing in the table, on either side,
# collected once by BioGRID ID and once by official symbol
universe = set(BioGRID_df['BioGRID ID Interactor A']).union(
    BioGRID_df['BioGRID ID Interactor B']
)

symbol_universe = set(BioGRID_df['Official Symbol Interactor A']).union(
    BioGRID_df['Official Symbol Interactor B']
)

In [32]:
# Experimental systems accepted as (reliable) evidence of a physical interaction
EXPERIMENTAL_SYSTEM = {
    'Two-hybrid',
    'Affinity Capture-Luminescence',
    'Affinity Capture-MS',
    'Affinity Capture-RNA',
    'Affinity Capture-Western',
}
# Redundant with the system filter above, applied anyway as a safeguard
EXPERIMENTAL_SYSTEM_TYPE = {'physical'}

# First filter by experimental system ...
has_accepted_system = BioGRID_df['Experimental System'].isin(EXPERIMENTAL_SYSTEM)
physical_interaction_df = BioGRID_df[has_accepted_system]

# ... then by experimental system type
has_accepted_type = physical_interaction_df['Experimental System Type'].isin(
    EXPERIMENTAL_SYSTEM_TYPE
)
physical_interaction_df = physical_interaction_df[has_accepted_type]

# Interactors that survive the filter, by BioGRID ID and by official symbol
physical_universe = set(physical_interaction_df['BioGRID ID Interactor A']).union(
    physical_interaction_df['BioGRID ID Interactor B']
)

symbol_physical_universe = set(physical_interaction_df['Official Symbol Interactor A']).union(
    physical_interaction_df['Official Symbol Interactor B']
)

### Define PPI

In [33]:
# Keep interactions whose two organism columns both equal 9606
# (presumably the taxonomy ID used for Homo sapiens — confirm against BioGRID docs)
interactor_a_matches = physical_interaction_df['Organism Interactor A'] == 9606
interactor_b_matches = physical_interaction_df['Organism Interactor B'] == 9606
PPI_df = physical_interaction_df[interactor_a_matches & interactor_b_matches]

# Official symbols appearing on either side of the retained interactions
PPI_universe = set(PPI_df['Official Symbol Interactor A']).union(
    PPI_df['Official Symbol Interactor B']
)

In [34]:
# Build the graph on official symbols, then drop self-interactions
PPI_nx = nx.from_pandas_edgelist(
    PPI_df,
    source='Official Symbol Interactor A',
    target='Official Symbol Interactor B',
)
self_loops = nx.selfloop_edges(PPI_nx)
PPI_nx.remove_edges_from(self_loops)

### Summary

In [35]:
# Sizes of the four interactor universes, followed by the PPI node count
universe_report = (
    "BioGRID universe sizes:",
    "=========================",
    f"  -       -    : {len(universe)}",
    f"symbol    -    : {len(symbol_universe)}",
    f"  -    physical: {len(physical_universe)}",
    f"symbol physical: {len(symbol_physical_universe)}",
    "-------------------------",
    f"PPI population size: {PPI_nx.number_of_nodes()}",
)
print(*universe_report, sep="\n")

BioGRID universe sizes:
  -       -    : 24277
symbol    -    : 23416
  -    physical: 22332
symbol physical: 21799
-------------------------
PPI population size: 17137


### Save

In [38]:
# Create the output folder if needed. exist_ok=True makes the call safe to
# re-run and avoids the race of a separate os.path.exists check.
os.makedirs(NETWORK_DIRECTORY, exist_ok=True)

# Store the symbol-based network as a plain edge list (no edge data).
nx.write_edgelist(PPI_nx, f"{NETWORK_DIRECTORY}/PPI_BioGRID.txt", data=False)

# Annotations

## EBI (H. Sapiens)

In [3]:
EBI_FILENAME = "goa_human.gaf.gz"
EBI_FILEPATH = "/".join((RAW_DATA_DIRECTORY, EBI_FILENAME))

# Parse the gzipped human GAF file; each parsed annotation becomes one row
with gzip.open(EBI_FILEPATH, mode='rt') as gz:
    gaf_records = list(GOA.gafiterator(gz))
    EBI_df = pd.DataFrame(gaf_records)

In [4]:
# Annotations whose annotated object is a protein
is_protein = EBI_df['DB_Object_Type'] == 'protein'
lc_protein_gaf_df = EBI_df[is_protein]

# Of those, keep the ones backed by one of these evidence codes
accepted_evidence = ['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP']
has_accepted_evidence = lc_protein_gaf_df['Evidence'].isin(accepted_evidence)
protein_gaf_df = lc_protein_gaf_df[has_accepted_evidence]

# One frame per GO aspect code: P, F and C
aspect = protein_gaf_df['Aspect']
protein_BP_gaf_df = protein_gaf_df[aspect == 'P']
protein_MF_gaf_df = protein_gaf_df[aspect == 'F']
protein_CC_gaf_df = protein_gaf_df[aspect == 'C']

In [5]:
def _symbol_go_pairs(gaf_df):
    """Reduce a GAF frame to its unique, non-missing (symbol, GO ID) pairs."""
    pairs = gaf_df[['DB_Object_Symbol', 'GO_ID']]
    return pairs.dropna().drop_duplicates()


# All aspects together
high_IC_annotations_df = _symbol_go_pairs(protein_gaf_df)

# One frame per GO aspect
high_IC_BP_annotations_df = _symbol_go_pairs(protein_BP_gaf_df)
high_IC_MF_annotations_df = _symbol_go_pairs(protein_MF_gaf_df)
high_IC_CC_annotations_df = _symbol_go_pairs(protein_CC_gaf_df)

### GO DAG extension (Human2GO)

In [6]:
# Location of the ontology file
GO_FILENAME = "go.obo"
GO_FILEPATH = "/".join((RAW_DATA_DIRECTORY, GO_FILENAME))

# Parse the ontology
go_dag = obo_parser.GODag(GO_FILEPATH)

# Map every GO ID to the set made of all its parents plus the ID itself
go2parents = {}
for go_id in go_dag.keys():
    term_and_parents = set(go_dag[go_id].get_all_parents())
    term_and_parents.add(go_id)
    go2parents[go_id] = term_and_parents

/Users/markusyoussef/Desktop/git/supplements/data/raw_data/go.obo: fmt(1.2) rel(2020-04-23) 47,239 GO Terms


In [7]:
def _records_in_namespace(records, namespace):
    """Select the (symbol, GO ID, level) records whose GO term lies in `namespace`."""
    return [entry for entry in records if go_dag[entry[1]].namespace == namespace]


def _records_to_frame(records):
    """Turn (symbol, GO ID, level) records into a de-duplicated dataframe."""
    frame = pd.DataFrame(records, columns=['DB_Object_Symbol', 'GO_ID', 'Level'])
    return frame.drop_duplicates()


# Expand every (symbol, GO ID) pair to all entries of go2parents[GO ID],
# recording the level of each resulting term
symbol_go_pairs = zip(high_IC_annotations_df['DB_Object_Symbol'],
                      high_IC_annotations_df['GO_ID'])
all_annotations_list = [
    (gene_id, go_term, go_dag[go_term].level)
    for gene_id, go_id in symbol_go_pairs
    for go_term in go2parents[go_id]
]
all_annotations_df = _records_to_frame(all_annotations_list)

# Split the expanded records by GO namespace
all_BP_annotations_list = _records_in_namespace(all_annotations_list, "biological_process")
all_MF_annotations_list = _records_in_namespace(all_annotations_list, "molecular_function")
all_CC_annotations_list = _records_in_namespace(all_annotations_list, "cellular_component")

all_BP_annotations_df = _records_to_frame(all_BP_annotations_list)
all_MF_annotations_df = _records_to_frame(all_MF_annotations_list)
all_CC_annotations_df = _records_to_frame(all_CC_annotations_list)

### Summary

In [8]:
# Number of distinct genes (record field 0) and GO IDs (record field 1),
# per namespace and overall
ebi_report = (
    "EBI universe sizes:",
    "============================================",
    ("Biological Process : "
     f"{len(set(map(lambda x:x[0],all_BP_annotations_list)))} genes, "
     f"{len(set(map(lambda x:x[1],all_BP_annotations_list)))} GO-IDs "),
    ("Molecular Functions: "
     f"{len(set(map(lambda x:x[0],all_MF_annotations_list)))} genes, "
     f"{len(set(map(lambda x:x[1],all_MF_annotations_list)))} GO-IDs "),
    ("Cellular Components: "
     f"{len(set(map(lambda x:x[0],all_CC_annotations_list)))} genes, "
     f"{len(set(map(lambda x:x[1],all_CC_annotations_list)))}  GO-IDs "),
    '--------------------------------------------',
    ("All annotations    : "
     f"{len(set(map(lambda x:x[0],all_annotations_list)))} genes, "
     f"{len(set(map(lambda x:x[1],all_annotations_list)))} GO-IDs "),
)
print(*ebi_report, sep="\n")

EBI universe sizes:
Biological Process : 9172 genes, 11263 GO-IDs 
Molecular Functions: 12204 genes, 3590 GO-IDs 
Cellular Components: 10216 genes, 1439  GO-IDs 
--------------------------------------------
All annotations    : 14064 genes, 16292 GO-IDs 


### Save

In [10]:
# Create the output folder if needed. exist_ok=True makes the call safe to
# re-run and avoids the race of a separate os.path.exists check.
os.makedirs(ANNOTATION_DIRECTORY, exist_ok=True)

# Write the combined annotation table, without the dataframe index ...
all_annotations_df.to_csv(f"{ANNOTATION_DIRECTORY}/GO_all_official_EBI.csv", index=False)

# ... and one table per GO namespace.
all_BP_annotations_df.to_csv(f"{ANNOTATION_DIRECTORY}/GO_BP_official_EBI.csv", index=False)
all_MF_annotations_df.to_csv(f"{ANNOTATION_DIRECTORY}/GO_MF_official_EBI.csv", index=False)
all_CC_annotations_df.to_csv(f"{ANNOTATION_DIRECTORY}/GO_CC_official_EBI.csv", index=False)

## BioGRID $\cap$ EBI

In [13]:
# Official-symbol PPI; node labels are read with the default node type
PPI_nx = nx.read_edgelist(f"{NETWORK_DIRECTORY}/official_PPI_BioGRID.txt")

# Saved annotation tables: combined first, then one per GO namespace
(all_annotations_df,
 all_BP_annotations_df,
 all_MF_annotations_df,
 all_CC_annotations_df) = (
    pd.read_csv(csv_path)
    for csv_path in (
        f"{ANNOTATION_DIRECTORY}/GO_all_official_EBI.csv",
        f"{ANNOTATION_DIRECTORY}/GO_BP_official_EBI.csv",
        f"{ANNOTATION_DIRECTORY}/GO_MF_official_EBI.csv",
        f"{ANNOTATION_DIRECTORY}/GO_CC_official_EBI.csv",
    )
)

In [14]:
def _annotated_in_ppi(annotations):
    """Keep the annotation rows whose gene symbol is a node of PPI_nx."""
    return annotations[annotations.DB_Object_Symbol.isin(PPI_nx.nodes)]


# Combined table restricted to PPI genes
PPI_annotations_df = _annotated_in_ppi(all_annotations_df)

# Per-namespace tables restricted to PPI genes
PPI_BP_annotations_df = _annotated_in_ppi(all_BP_annotations_df)
PPI_MF_annotations_df = _annotated_in_ppi(all_MF_annotations_df)
PPI_CC_annotations_df = _annotated_in_ppi(all_CC_annotations_df)

### Summary

In [15]:
# Number of distinct genes and GO IDs left after restricting to PPI genes,
# per namespace and overall
intersection_report = (
    r"EBI ∩ BioGRID universe sizes:",
    "============================================",
    ("Biological Process : "
     f"{len(set(PPI_BP_annotations_df.DB_Object_Symbol))} genes, "
     f"{len(set(PPI_BP_annotations_df.GO_ID))} GO-IDs "),
    ("Molecular Functions: "
     f"{len(set(PPI_MF_annotations_df.DB_Object_Symbol))} genes, "
     f"{len(set(PPI_MF_annotations_df.GO_ID))} GO-IDs "),
    ("Cellular Components: "
     f"{len(set(PPI_CC_annotations_df.DB_Object_Symbol))} genes, "
     f"{len(set(PPI_CC_annotations_df.GO_ID))}  GO-IDs "),
    '--------------------------------------------',
    ("All annotations    : "
     f"{len(set(PPI_annotations_df.DB_Object_Symbol))} genes, "
     f"{len(set(PPI_annotations_df.GO_ID))} GO-IDs "),
)
print(*intersection_report, sep="\n")

EBI ∩ BioGRID universe sizes:
Biological Process : 8186 genes, 10988 GO-IDs 
Molecular Functions: 11072 genes, 3452 GO-IDs 
Cellular Components: 9159 genes, 1417  GO-IDs 
--------------------------------------------
All annotations    : 12490 genes, 15857 GO-IDs 


### Save

In [16]:
# Target file -> table, written in this order and without the dataframe index
annotation_outputs = {
    f"{ANNOTATION_DIRECTORY}/GO_all_official_BioGRID-EBI.csv": PPI_annotations_df,
    f"{ANNOTATION_DIRECTORY}/GO_BP_official_BioGRID-EBI.csv": PPI_BP_annotations_df,
    f"{ANNOTATION_DIRECTORY}/GO_MF_official_BioGRID-EBI.csv": PPI_MF_annotations_df,
    f"{ANNOTATION_DIRECTORY}/GO_CC_official_BioGRID-EBI.csv": PPI_CC_annotations_df,
}
for csv_path, annotation_table in annotation_outputs.items():
    annotation_table.to_csv(csv_path, index=False)