In [13]:
# To be used on cloud platforms (uncomment the lines below to install missing packages)

# ! python -c "import pykeen" || pip install pykeen
# ! python -c "import scprep" || pip install scprep

In [14]:
import os, pandas as pd, numpy as np, matplotlib.pyplot as plt
import networkx, scprep, magic, torch, pykeen
import rpy2, rpy2.situation
from rpy2.robjects import r, pandas2ri

In [15]:
# Show where the notebook is running; all project paths hang off this directory.
print(os.getcwd())
# Input data lives under ./data, knowledge-graph embeddings under ./KGEs.
data_dir, kge_dir = (f"{os.getcwd()}/{subdir}" for subdir in ("data", "KGEs"))

/gpfs/ysm/project/krishnaswamy_smita/fc489/yaleCollab


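Building a KG based on the NicheNet ligand-receptor, signalling and gene-regulatory networks (the `lr_network.rds`, `signaling_network.rds` and `gr_network.rds` objects hosted on their Zenodo record).

NicheNet also hosts a weighted version of these networks (`weighted_networks.rds`); with weighted edges we can use the weights if using the graph directly, or set a cutoff value if building a KG (unweighted). The objects loaded below are plain edge lists (`from`, `to`, `source`, `database`) without weights.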


# NicheNet KnowledgeGraph

In [16]:
for i in rpy2.situation.iter_info(): # Print Rpy2 info
    print(i)

pandas2ri.activate()
%load_ext rpy2.ipython

[1mrpy2 version:[0m
3.5.1
[1mPython version:[0m
3.9.13 | packaged by conda-forge | (main, May 27 2022, 16:58:50) 
[GCC 10.3.0]
[1mLooking for R's HOME:[0m
    Environment variable R_HOME: /gpfs/ysm/project/krishnaswamy_smita/fc489/conda_envs/collab/lib/R
    Calling `R RHOME`: /gpfs/ysm/project/krishnaswamy_smita/fc489/conda_envs/collab/lib/R
    Environment variable R_LIBS_USER: None
[1mR's additions to LD_LIBRARY_PATH:[0m

[1mR version:[0m
    In the PATH: R version 4.1.3 (2022-03-10) -- "One Push-Up"
    Loading R library from rpy2: OK
[1mAdditional directories to load R packages from:[0m
None
[1mC extension compilation:[0m
  include:
  ['/gpfs/ysm/project/krishnaswamy_smita/fc489/conda_envs/collab/lib/R/include']
  libraries:
  ['R', 'pcre2-8', 'lzma', 'bz2', 'z', 'rt', 'dl', 'm', 'iconv', 'icuuc', 'icui18n']
  library_dirs:
  ['/gpfs/ysm/project/krishnaswamy_smita/fc489/conda_envs/collab/lib', '/gpfs/ysm/project/krishnaswamy_smita/fc489/conda_envs/collab/lib/R/lib', 

In [17]:
%%R -o dNNlr,dNNs,dNNgr
# Read the NicheNet RDS objects straight from the Zenodo record (nothing is
# written to disk). The three data frames are exported to Python via `-o`.

# Ligand-receptor network
dNNlr <- readRDS(url("https://zenodo.org/record/3260758/files/lr_network.rds"))
# Signalling network
dNNs <- readRDS(url("https://zenodo.org/record/3260758/files/signaling_network.rds"))
# Gene-regulatory network
dNNgr <- readRDS(url("https://zenodo.org/record/3260758/files/gr_network.rds"))

In [18]:
def _as_triples(edges, relation):
    """Turn an edge table into (head, relation, tail) triples.

    Parameters
    ----------
    edges : pandas.DataFrame
        Edge list with a ``from`` and a ``to`` column.
    relation : str
        Label written into the ``relation`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``head``, ``relation`` and ``tail``; it keeps the
        index of ``edges``.
    """
    return pd.DataFrame({
        "head": edges["from"],
        "relation": np.repeat(relation, edges.shape[0]),
        "tail": edges["to"],
    })


# Summarise the raw NicheNet tables before they are replaced by triples.
print(dNNlr.describe())
print(dNNs.describe())
print(dNNgr.describe())

# NOTE: the raw frames are overwritten on purpose (they are large); re-running
# this cell requires re-running the %%R cell that loads them first.
dNNlr = _as_triples(dNNlr, "LR")
dNNs = _as_triples(dNNs, "SIG")
dNNgr = _as_triples(dNNgr, "GR")

         from     to  source        database
count   12651  12651   12651           12651
unique    688    857      14               5
top       AGT   FPR2  ppi_lr  ppi_prediction
freq      192     79    5956            6304
           from       to                   source                  database
count   3621987  3621987                  3621987                   3621987
unique    18550    18068                       23                         7
top         UBC      UBC  inweb_inbio_interaction  pathwaycommons_signaling
freq      12521    13086                   813050                   1562898
           from       to              source        database
count   3592299  3592299             3592299         3592299
unique     4486    25103                  20               8
top         YY1   CDKN1A  harmonizome_ENCODE  harmonizome_gr
freq      47472      921             1564116         2953691


Load the pathway data from the Ensembl2Reactome file.
We will use this data to add new triples (with relation `pathway`) linking Reactome pathway IDs to human gene symbols.

We're getting ensembl peptide IDs as we want to annotate human gene symbols that correspond to proteins.

In [19]:
from biomart import BiomartServer

# Load the Reactome mapping and keep the human rows only.
# Column 5 is compared against the species name; columns 0 and 1 are kept as
# the Ensembl identifier and the Reactome pathway ID used for the triples.
dPathEnsR = pd.read_csv(f"{data_dir}/DBs/Ensembl2Reactome.txt", sep="\t", header=None)
dPathEnsR = dPathEnsR.loc[dPathEnsR[5]=="Homo sapiens"][[0,1]].drop_duplicates()

# Biomart section: fetch (Ensembl peptide ID, HGNC symbol) pairs for human genes
server = BiomartServer("http://useast.ensembl.org/biomart")
server.verbose = True
mart = server.datasets["hsapiens_gene_ensembl"]
response = mart.search({"attributes":["ensembl_peptide_id","hgnc_symbol"]})

# Translation table: parse the tab-separated response into rows.
transDF = []
for line in response.raw.data.decode("utf-8").splitlines():
    line = line.split("\t")
    # Keep only complete rows: exactly two fields, both non-empty.
    # A peptide ID with an empty symbol would otherwise map to "" (not NaN),
    # slip through dropna() below and yield triples with an empty tail; a line
    # with a different number of fields would break the DataFrame constructor.
    if len(line) == 2 and all(line):
        transDF.append(line)
transDF = pd.DataFrame(transDF,
                columns=["ensembl_peptide_id","hgnc_symbol"]
                ).drop_duplicates()
# peptide ID -> gene symbol lookup (if an ID occurs more than once, the last
# symbol wins, as with the nested DataFrame.to_dict() form)
transDict = transDF.set_index("ensembl_peptide_id")["hgnc_symbol"].to_dict()

# Replace the Ensembl IDs by gene symbols; IDs absent from the lookup
# (anything that is not a known peptide ID) become NaN and are dropped.
dPathEnsR[0] = dPathEnsR[0].map(transDict)
dPathEnsR = dPathEnsR.dropna().drop_duplicates()
# Triples point from the Reactome pathway ID (head) to the gene symbol (tail).
dPathEnsR = pd.DataFrame({
            "head":dPathEnsR[1],
            "relation":np.repeat("pathway", dPathEnsR.shape[0]),
            "tail":dPathEnsR[0]
        })

[BiomartServer:'http://useast.ensembl.org/biomart/martservice'] Fetching datasets
[BiomartServer:'http://useast.ensembl.org/biomart/martservice'] Fetching databases
[BiomartDatabase:'Ensembl Genes 107'] Fetching datasets
[BiomartDatabase:'Mouse strains 107'] Fetching datasets
[BiomartDatabase:'Sequence'] Fetching datasets
[BiomartDatabase:'Ontology'] Fetching datasets
[BiomartDatabase:'Genomic features 107'] Fetching datasets
[BiomartDatabase:'Ensembl Variation 107'] Fetching datasets
[BiomartDatabase:'Ensembl Regulation 107'] Fetching datasets
[BiomartDataset:'hsapiens_gene_ensembl'] Searching using following params:
{'attributes': ['ensembl_peptide_id', 'hgnc_symbol']}
[BiomartDataset:'hsapiens_gene_ensembl'] Fetching attributes
[BiomartDataset] search query:
b'<Query virtualSchemaName="default" formatter="TSV" header="0" uniqueRows="1" datasetConfigVersion="0.6" count=""><Dataset name="hsapiens_gene_ensembl" interface="default"><Attribute name="ensembl_peptide_id" /><Attribute name=

In [39]:
# Pathway metadata: for each pathway, its name plus the Reactome physical
# entities (PE) that belong to it.
# Keep in mind that not all PEs are PTMs; some are just the plain protein.
dPathMETA = (
    pd.read_csv(f"{data_dir}/DBs/Ensembl2Reactome_PE_Pathway.txt", sep="\t", header=None)
    # human rows only, restricted to the four columns of interest
    .loc[lambda raw: raw[7] == "Homo sapiens", [1, 2, 3, 5]]
    .drop_duplicates()
    .rename(columns={1: "PE", 2: "PE_name", 3: "pathway", 5: "pathway_name"})
    # keep only pathways that appear as heads of the pathway triples
    .loc[lambda meta: meta["pathway"].isin(dPathEnsR["head"])]
    .reset_index(drop=True)
)
dPathMETA

Unnamed: 0,PE,PE_name,pathway,pathway_name
0,R-HSA-162865,DPM1 [endoplasmic reticulum membrane],R-HSA-162699,Synthesis of dolichyl-phosphate mannose
1,R-HSA-4717383,DPM1 G111Lfs*45 [endoplasmic reticulum membrane],R-HSA-4717374,Defective DPM1 causes DPM1-CDG
2,R-HSA-4717361,DPM1 Q210Rfs*4 [endoplasmic reticulum membrane],R-HSA-4717374,Defective DPM1 causes DPM1-CDG
3,R-HSA-4717370,DPM1 R92G [endoplasmic reticulum membrane],R-HSA-4717374,Defective DPM1 causes DPM1-CDG
4,R-HSA-162865,DPM1 [endoplasmic reticulum membrane],R-HSA-4719360,Defective DPM3 causes DPM3-CDG
...,...,...,...,...
71351,R-HSA-3209107,p14ARF mRNA [cytosol],R-HSA-8951936,RUNX3 regulates p14-ARF
71352,R-HSA-8848195,U4atac snRNA [nucleoplasm],R-HSA-6807505,RNA polymerase II transcribes snRNA genes
71353,R-HSA-8848195,U4atac snRNA [nucleoplasm],R-HSA-72165,mRNA Splicing - Minor Pathway
71354,R-HSA-428380,MYC mRNA [cytosol],R-HSA-428359,Insulin-like Growth Factor-2 mRNA Binding Prot...


In [61]:
# print(len(dPathMETA["PE"].unique()))
# print(len(dPathMETA["PE_name"].unique()))
# print(len(dPathMETA["pathway"].unique()))
# print(len(dPathMETA["pathway_name"].unique()))

# df = dPathMETA[["pathway","pathway_name"]].drop_duplicates()

# dPathMDdict = {}
# for i in dPathMETA.iterrows():
#     if i[1]["pathway"] not in dPathMDdict.keys():
#         dPathMDdict[i[1]["pathway"]] = i[1]["pathway_name"]
#         dPathMDdict["PEdict"] = {i[1]["PE"]:i[1]["PE_name"]}
# print(dPathMDdict)

# for name, group in dPathMETA.groupby(by="pathway"):
#     print(name)
#     print(group.to_dict("records"))

30115
30055
2065
2052
{'R-HSA-162699': 'Synthesis of dolichyl-phosphate mannose', 'PEdict': {'R-HSA-3229123': 'SLC37A4 G20D [endoplasmic reticulum membrane]'}, 'R-HSA-4717374': 'Defective DPM1 causes DPM1-CDG', 'R-HSA-4719360': 'Defective DPM3 causes DPM3-CDG', 'R-HSA-4719377': 'Defective DPM2 causes DPM2-CDG', 'R-HSA-2029481': 'FCGR activation', 'R-HSA-432142': 'Platelet sensitization by LDL', 'R-HSA-6798695': 'Neutrophil degranulation', 'R-HSA-9664323': 'FCGR3A-mediated IL10 synthesis', 'R-HSA-9664422': 'FCGR3A-mediated phagocytosis', 'R-HSA-977606': 'Regulation of Complement cascade', 'R-HSA-381426': 'Regulation of Insulin-like Growth Factor (IGF) transport and uptake by Insulin-like Growth Factor Binding Proteins (IGFBPs)', 'R-HSA-8957275': 'Post-translational protein phosphorylation', 'R-HSA-174403': 'Glutathione synthesis and recycling', 'R-HSA-5578999': 'Defective GCLC causes HAGGSD', 'R-HSA-9759194': 'Nuclear events mediated by NFE2L2', 'R-HSA-1989781': 'PPARA activates gene ex

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Create triples dataset

Build the final set of triples.
To make the PyKEEN object, use `TriplesFactory.from_labeled_triples()`: pass the head/relation/tail columns of `dKGnn` as a NumPy array, and pass the pathway metadata via the `metadata` argument.

In [None]:
# Stack the three NicheNet relation sets and the Reactome pathway triples,
# then drop repeated triples and renumber the rows from 0.
dKGnn = pd.concat([dNNlr, dNNs, dNNgr, dPathEnsR])
dKGnn = dKGnn.drop_duplicates().reset_index(drop=True)

# The per-source frames are no longer needed; release them to free memory.
del dNNlr, dNNs, dNNgr, dPathEnsR

dKGnn