In [11]:
import copy
import getpass
import rdflib

from collections import defaultdict

import numpy as np
import pandas as pd

from kgforge.core import KnowledgeGraphForge
from kgforge.specializations.resources import Dataset

from bluegraph import PandasPGFrame
from bluegraph.backends.neo4j import pgframe_to_neo4j

from bmo.ontologies import subontology_from_term
from bmo.kbs.gene_kbs import get_gene_data

In [12]:
ONTO_CLASS_LABEL = "ONTOLOGY_CLASS"

## Fetch protein interactions from Nexus

In [13]:
ENDPOINT = "https://bbp.epfl.ch/nexus/v1"
ORG = "bbp"
PROJ = "membranal-protein-interactions"

In [14]:
TOKEN = getpass.getpass()

········


In [15]:
forge = KnowledgeGraphForge("../../config/prod-forge-config.yml",
                            endpoint=ENDPOINT,
                            token=TOKEN,
                            bucket=f"{ORG}/{PROJ}")

In [16]:
interactions = forge.search({"type": "Interaction"}, limit=None)
interactions_df = forge.as_dataframe(interactions)

In [18]:
interactions_df.sample(5)

Unnamed: 0,id,type,name,participant


In [20]:
genes = forge.search({"type": "Gene"}, limit=None)

In [21]:
gene_df = forge.as_dataframe(genes)

In [22]:
proteins = forge.search({"type": "Protein"}, limit=None)
protein_df = forge.as_dataframe(proteins)

In [23]:
def get_participation_data(x, index):
    interaction_id = x.id
    participant_id = x["participant"][index]["protein"]["id"]
    data = copy.deepcopy(x["participant"][index]["motif"])
    del data["type"]
    data["fixedResidue"] = data["fixedResidue"]["count"]
    data["motif"] = data["value"]
    del data["value"]
    data["@source_id"] = participant_id
    data["@target_id"] = interaction_id
    return data

In [24]:
def generate_edge_frame(df, index):
    edges = pd.DataFrame(
        df.apply(lambda x: get_participation_data(x, index), axis=1).tolist())
    edges["@type"] = "participatesIn"
    edges = edges.set_index(["@source_id", "@target_id"])
    return edges

In [25]:
participation_edges = pd.concat([
    generate_edge_frame(interactions_df, 0),
    generate_edge_frame(interactions_df, 1)
])

In [26]:
encoded_edges = protein_df[["id", "encodedBy.id"]].rename(columns={
    "id": "@source_id",
    "encodedBy.id": "@target_id"
}).set_index(["@source_id", "@target_id"])
encoded_edges["@type"] = "encodedBy"

In [28]:
gene_df.sample(5)

Unnamed: 0,id,type,name,subject.type,subject.species.id,subject.species.label
235,https://bbp.epfl.ch/neurosciencegraph/data/gen...,"[Gene, Entity]",Avpr2,Subject,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus
456,https://bbp.epfl.ch/neurosciencegraph/data/gen...,"[Gene, Entity]",Slc22a18,Subject,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus
140,https://bbp.epfl.ch/neurosciencegraph/data/gen...,"[Gene, Entity]",SV2B,Subject,http://purl.obolibrary.org/obo/NCBITaxon_9606,Homo sapiens
323,https://bbp.epfl.ch/neurosciencegraph/data/gen...,"[Gene, Entity]",ACKR3,Subject,http://purl.obolibrary.org/obo/NCBITaxon_9606,Homo sapiens
76,https://bbp.epfl.ch/neurosciencegraph/data/gen...,"[Gene, Entity]",CHRM2,Subject,http://purl.obolibrary.org/obo/NCBITaxon_9913,Bos taurus


## Get additional gene data from Ensembl

In [18]:
records = {}
for gene in gene_df[["name", "subject.species.label"]].to_dict("records"):
    gene_name = gene["name"]
    species = gene["subject.species.label"].lower().replace(" ", "_")
    print(gene_name, species)
    if (gene_name, species) not in records:
        try:a
            records[(gene_name, species)] = get_gene_data(
                gene_name, species)
        except TypeError as e:
            print(e)
            print("Skipped: ", gene["name"])
#             break

PLPP1 homo_sapiens
Rom1 mus_musculus
Selp mus_musculus
Ntrk1 rattus_norvegicus
Aqp2 rattus_norvegicus
Chrm4 rattus_norvegicus
Mc5r rattus_norvegicus
ADRA1B homo_sapiens
Olr1571 rattus_norvegicus
SRC2 arabidopsis_thaliana
SLC1A4 homo_sapiens
Ldlr rattus_norvegicus
OPT2 arabidopsis_thaliana
Tnfrsf4 mus_musculus
Htr1f mus_musculus
AQP2 homo_sapiens
Mc5r mus_musculus
Chrm2 rattus_norvegicus
GCGR homo_sapiens
STP7 arabidopsis_thaliana
Ldlr mus_musculus
TACR2 homo_sapiens
Slc1a2 mus_musculus
Clcn5 rattus_norvegicus
Clcn3 mus_musculus
COMT homo_sapiens
AQP5 homo_sapiens
ADCY7 homo_sapiens
Slc5a2 rattus_norvegicus
Gcgr rattus_norvegicus
Epha2 mus_musculus
Coq8b mus_musculus
Abcg1 mus_musculus
AQP4 homo_sapiens
CHRM2 gallus_gallus
SLC1A2 homo_sapiens
Sdc3 mus_musculus
CLCN3 homo_sapiens
Adcy6 mus_musculus
Trpc1 mus_musculus
CCKBR homo_sapiens
Aqp5 rattus_norvegicus
Mip mus_musculus
Aqp4 mus_musculus
MC5R homo_sapiens
Asic3 rattus_norvegicus
RBOHF arabidopsis_thaliana
Slc5a10 mus_musculus
Slc39a

CLCN5 homo_sapiens
Slc1a5 mus_musculus
LPLAT1 arabidopsis_thaliana
MSL8 arabidopsis_thaliana
Slc16a13 rattus_norvegicus
Kcnk15 rattus_norvegicus
KCNH5 homo_sapiens
Plpp2 rattus_norvegicus
Slc12a9 rattus_norvegicus
Abcg2 rattus_norvegicus
Trpc1 rattus_norvegicus
KCNJ9 homo_sapiens
SLC1A5 bos_taurus
Kcnj9 rattus_norvegicus
Kcnh5 mus_musculus
Kcnk10 rattus_norvegicus
Kcnj3 mus_musculus
Kcnh5 rattus_norvegicus
KCNK2 homo_sapiens
KCNJ3 homo_sapiens
Kcnj6 mus_musculus
Kcnj3 rattus_norvegicus
Kcnj9 mus_musculus
KCNK10 homo_sapiens
Kcnk2 rattus_norvegicus
ABCG10 arabidopsis_thaliana
NHX1 arabidopsis_thaliana
RLP31 arabidopsis_thaliana
PERK6 arabidopsis_thaliana
ABCG22 arabidopsis_thaliana
At3g25290 arabidopsis_thaliana
ABCG21 arabidopsis_thaliana
SLC43A2 bos_taurus
Mst1r mus_musculus
AZG2 arabidopsis_thaliana
COR2 gallus_gallus
SLC5A10 bos_taurus
SERK1 arabidopsis_thaliana
slc2a12 danio_rerio
RLP44 arabidopsis_thaliana
DEK1 arabidopsis_thaliana
COR4 gallus_gallus
RBOHB arabidopsis_thaliana
IRT

In [19]:
for (name, species), v in records.items():
    v["name"] = name
    v["species"] = species.replace("_", " ").capitalize()

In [None]:
gene_df.sample(5)

In [31]:
gene_df.columns

Index(['id', 'type', 'name', 'subject.type', 'subject.species.id', 'species',
       'ensembl_id', 'xrefs', 'uniprot_ac', 'desctiption', 'prefLabel',
       'go_edges', 'label'],
      dtype='object')

In [32]:
gene_df

Unnamed: 0,id,type,name,subject.type,subject.species.id,species,ensembl_id,xrefs,uniprot_ac,desctiption,prefLabel,go_edges,label
0,https://bbp.epfl.ch/neurosciencegraph/data/gen...,"[Gene, Entity]",PLPP1,Subject,http://purl.obolibrary.org/obo/NCBITaxon_9606,Homo sapiens,ENSG00000067113,"{'ArrayExpress': 'ENSG00000067113', 'EntrezGen...",D6REC3,phospholipid phosphatase 1 [Source:HGNC Symbol...,PLPP1,"[(involved_in, GO:0016311), (involved_in, GO:0...",PLPP1
1,https://bbp.epfl.ch/neurosciencegraph/data/gen...,"[Gene, Entity]",Rom1,Subject,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus,ENSMUSG00000071648,"{'ArrayExpress': 'ENSMUSG00000071648', 'Entrez...",P32958,rod outer segment membrane protein 1 [Source:M...,Rom1,"[(is_active_in, GO:0005887), (located_in, GO:0...",Rom1
2,https://bbp.epfl.ch/neurosciencegraph/data/gen...,"[Gene, Entity]",Selp,Subject,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus,ENSMUSG00000026580,"{'ArrayExpress': 'ENSMUSG00000026580', 'Entrez...",Q01102,"selectin, platelet [Source:MGI Symbol;Acc:MGI:...",Selp,"[(involved_in, GO:0034097), (enables, GO:00336...",Selp
3,https://bbp.epfl.ch/neurosciencegraph/data/gen...,"[Gene, Entity]",Ntrk1,Subject,http://purl.obolibrary.org/obo/NCBITaxon_10116,Rattus norvegicus,ENSRNOG00000013953,"{'Reactome_gene': 'R-RNO-170984', 'RGD': '6201...",P35739,neurotrophic receptor tyrosine kinase 1 [Sourc...,Ntrk1,"[(enables, GO:0005515), (enables, GO:0005515),...",Ntrk1
4,https://bbp.epfl.ch/neurosciencegraph/data/gen...,"[Gene, Entity]",Aqp2,Subject,http://purl.obolibrary.org/obo/NCBITaxon_10116,Rattus norvegicus,ENSRNOG00000054378,"{'Reactome_gene': 'R-RNO-382551', 'Uniprot_gn'...",,aquaporin 2 [Source:RGD Symbol;Acc:2142],Aqp2,,Aqp2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
601,https://bbp.epfl.ch/neurosciencegraph/data/gen...,"[Gene, Entity]",TRPC5,Subject,http://purl.obolibrary.org/obo/NCBITaxon_9606,Homo sapiens,ENSG00000072315,"{'ArrayExpress': 'ENSG00000072315', 'EntrezGen...",Q9UL62,transient receptor potential cation channel su...,TRPC5,"[(enables, GO:0015279), (is_active_in, GO:0005...",TRPC5
602,https://bbp.epfl.ch/neurosciencegraph/data/gen...,"[Gene, Entity]",TRPM6,Subject,http://purl.obolibrary.org/obo/NCBITaxon_9606,Homo sapiens,ENSG00000119121,"{'ArrayExpress': 'ENSG00000119121', 'EntrezGen...",Q9BX84,transient receptor potential cation channel su...,TRPM6,"[(enables, GO:0004674), (enables, GO:0005262),...",TRPM6
603,https://bbp.epfl.ch/neurosciencegraph/data/gen...,"[Gene, Entity]",Trpc4,Subject,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus,ENSMUSG00000027748,"{'ArrayExpress': 'ENSMUSG00000027748', 'Entrez...",Q0VB97,"transient receptor potential cation channel, s...",Trpc4,"[(enables, GO:0005216), (enables, GO:0005262),...",Trpc4
604,https://bbp.epfl.ch/neurosciencegraph/data/gen...,"[Gene, Entity]",CD200R1,Subject,http://purl.obolibrary.org/obo/NCBITaxon_9606,Homo sapiens,ENSG00000163606,"{'ArrayExpress': 'ENSG00000163606', 'EntrezGen...",H9KV32,CD200 receptor 1 [Source:HGNC Symbol;Acc:HGNC:...,CD200R1,"[(enables, GO:0038023), (involved_in, GO:01500...",CD200R1


In [29]:
new_gene_df = pd.DataFrame(records.values())
new_gene_df["label"] = new_gene_df["prefLabel"]
gene_df = gene_df.rename(columns={"subject.species.label": "species"})
gene_df = gene_df.merge(
    new_gene_df, on=["name", "species"])

In [21]:
gene_nodes = gene_df.drop(
    columns=[
        "type", "subject.type",
        "subject.species.id", "go_edges"
    ])
gene_nodes["@type"] = "GENE"
gene_nodes = gene_nodes.rename(
    columns={"id": "@id"}).set_index("@id")

## Extract GO terms (only cellular compartment) and edges

In [22]:
go = rdflib.Graph()
go.parse("../../ontologies/external/go.ttl", format="turtle")

<Graph identifier=N873ef68fbe404681a6dddd9c984ac95c (<class 'rdflib.graph.Graph'>)>

In [23]:
go_terms_to_include = {}
for g, record in records.items():
    if "go_edges" in record:
        for _, t in record["go_edges"]:
            if t not in go_terms_to_include:
                for s in go.subjects(
                        rdflib.URIRef("http://www.geneontology.org/formats/oboInOwl#id"),
                        rdflib.Literal(t, datatype=rdflib.XSD.string)):
                    go_terms_to_include[t] = s
                    break

In [24]:
selected_go = rdflib.Graph()
for t in go_terms_to_include.values():
    subontology = subontology_from_term(go, t, top_down=False, closed=True)
    selected_go += subontology

## Create a PG from selected GO terms + interactions + ontology classes

In [209]:
frame = PandasPGFrame.from_ontology(rdf_graph=selected_go, remove_prop_uris=True)
frame.remove_isolated_nodes()
frame.rename_node_properties({"id": "go_id"})

Select only 'cellular component'

In [210]:
frame = frame.subgraph(
    nodes=go_frame._nodes[
        go_frame._nodes["has_obo_namespace"] == "cellular_component"].index)

In [211]:
frame._nodes["@type"] = "GENE_ONTOLOGY_CLASS"

### Add nodes

Add nodes corresponding to BMO concepts

In [212]:
frame.add_nodes([
    "Gene",
    "Protein",
    "ProteinProteinInteraction",
    "BiochemicalReaction"
])
frame._nodes.loc["Gene", "@type"] = "GENE"
frame._nodes.loc["Protein", "@type"] = "MOLECULE"
frame._nodes.loc["ProteinProteinInteraction", "@type"] = "BIOCHEMICAL_REACTION"
frame._nodes.loc["BiochemicalReaction", "@type"] = "BIOCHEMICAL_REACTION"

Add interaction / protein / gene nodes. 

In [213]:
interaction_nodes = interactions_df[["id", "name"]]
interaction_nodes["@type"] = "BIOCHEMICAL_REACTION"
interaction_nodes = interaction_nodes.rename(
    columns={"id": "@id"}).set_index("@id")

protein_nodes = protein_df[[
    "id", "identifier", "name",
    "structureAvailable",
    "subject.species.id", "subject.species.label"]]
protein_nodes["@type"] = "MOLECULE"
protein_nodes = protein_nodes.rename(columns={
    "id": "@id",
    "subject.species.id": "subject_species_id",
    "subject.species.label": "subject_species_label"}).set_index("@id")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [214]:
frame._nodes = pd.concat([
    frame._nodes,
    interaction_nodes,
    protein_nodes,
    gene_nodes])

In [215]:
for c in frame._nodes.columns:
    frame.node_prop_as_category(c)

### Add edges

Add edges between BMO concepts

In [217]:
frame.add_edges([
    ("Protein", "Gene"),
    ("Protein", "ProteinProteinInteraction"),
    ("ProteinProteinInteraction", "BiochemicalReaction")
])
frame._edges.loc[("Protein", "Gene"), "@type"] = "encodedBy"
frame._edges.loc[("Protein", "ProteinProteinInteraction"), "@type"] = "participatesIn"
frame._edges.loc[("ProteinProteinInteraction", "BiochemicalReaction"), "@type"] = "IS_SUBCLASS_OF"

Add edges between proteins/genes and interactions

In [218]:
frame._edges = pd.concat(
    [frame._edges, encoded_edges, participation_edges])

for c in frame._edges.columns:
    if c not in ["cost", "errorRate", "motif"]:
        frame.edge_prop_as_category(c)
frame.edge_prop_as_text("motif")
frame.edge_prop_as_numeric("cost")
frame.edge_prop_as_numeric("errorRate")

Add edges to ontology concepts

In [219]:
interaction_instance_edges = pd.DataFrame(
    interaction_nodes.index).rename(columns={"@id": "@source_id"})
interaction_instance_edges["@target_id"] = "ProteinProteinInteraction"
interaction_instance_edges["@type"] = "IS_INSTANCE_OF"
interaction_instance_edges = interaction_instance_edges.set_index(["@source_id", "@target_id"])

In [220]:
protein_instance_edges = pd.DataFrame(
    protein_nodes.index).rename(columns={"@id": "@source_id"})
protein_instance_edges["@target_id"] = "Protein"
protein_instance_edges["@type"] = "IS_INSTANCE_OF"
protein_instance_edges = protein_instance_edges.set_index(["@source_id", "@target_id"])

In [221]:
gene_instance_edges = pd.DataFrame(
    gene_nodes.index).rename(columns={"@id": "@source_id"})
gene_instance_edges["@target_id"] = "Gene"
gene_instance_edges["@type"] = "IS_INSTANCE_OF"
gene_instance_edges = gene_instance_edges.set_index(["@source_id", "@target_id"])

In [222]:
instance_edges = pd.concat([
    interaction_instance_edges,
    protein_instance_edges,
    gene_instance_edges
])

In [223]:
frame._edges = pd.concat(
    [frame._edges, instance_edges])

Add edges from Genes to GO terms

In [224]:
go_edges = gene_df[["id", "go_edges"]]

In [225]:
# Select only edges of type 'located_in'
go_edges["go_edges"] = go_edges["go_edges"].apply(
    lambda x:
        [el for el in x if el[0] == "located_in"]
        if isinstance(x, list) else np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [226]:
frame.add_edges(go_edges.keys())
for e, types in go_edges.items():
    frame._edges.loc[e, "@type"] = types

frame.rename_node_properties({
    p: p.replace(" ", "_") for p in frame.node_properties()
})

In [227]:
located_in_edges = defaultdict(set)
for row in go_edges.to_dict("records"):
    source  = row["id"]
    if isinstance(row["go_edges"], list):
        for rel, go_term in row["go_edges"]:
            try:
                target = frame._nodes[frame._nodes["go_id"] == go_term].index[0]
                located_in_edges[(source, target)].add(rel.replace(" ", "_"))
            except IndexError:
                pass

In [228]:
frame.add_edges(located_in_edges.keys())
for e, types in located_in_edges.items():
    frame._edges.loc[e, "@type"] = types

frame.rename_node_properties({
    p: p.replace(" ", "_") for p in frame.node_properties()
})

In [305]:
gene_compartment_data = pd.DataFrame(
    [(s, t) for s, t in located_in_edges.keys()],
    columns=["geneId", "cellularComponent.label"])

In [306]:
compartment_data = frame._nodes.loc[
    gene_compartment_data["cellularComponent.label"].unique()].dropna(
    axis=1).drop(columns=["@type", "label"]).reset_index().rename(columns={"@id": "cellularComponent.label"})

In [310]:
gene_compartment_data["cellularComponent.id"] = gene_compartment_data["go_id"].apply(
    lambda x: "http://purl.obolibrary.org/obo/GO_" + x.split(":")[1])

In [312]:
gene_compartment_data = gene_compartment_data.rename(columns={
    "go_id": "cellularComponent.goId",
    "definition":  "cellularComponent.definition",
    "has_obo_namespace":  "cellularComponent.hasOboNamespace",
})

In [314]:
gene_compartment_data.to_csv("../data/gene_compartments.csv", index=None)

In [317]:
gene_nodes.reset_index().drop(columns=["@type"]).to_csv(
    "../data/gene_ensembl_meta_data.csv", index=None)

In [318]:
gene_nodes

Unnamed: 0_level_0,name,species,ensembl_id,xrefs,uniprot_ac,desctiption,prefLabel,label,@type
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
https://bbp.epfl.ch/neurosciencegraph/data/genes/d6dec71b-4347-4664-b694-885dfd565c52,MYMK,Homo sapiens,ENSG00000187616,"{'ArrayExpress': 'ENSG00000187616', 'EntrezGen...",A6NI61,"myomaker, myoblast fusion factor [Source:HGNC ...",MYMK,MYMK,GENE
https://bbp.epfl.ch/neurosciencegraph/data/genes/2729cbcb-5569-418b-91de-c5224c3192be,Htr1f,Rattus norvegicus,ENSRNOG00000000716,"{'Uniprot_gn': 'G3V626', 'RGD': '71083', 'Reac...",G3V626,5-hydroxytryptamine receptor 1F [Source:RGD Sy...,Htr1f,Htr1f,GENE
https://bbp.epfl.ch/neurosciencegraph/data/genes/65feb3f2-80fa-4ec4-b99b-ee1ed3b88930,HTR1F,Homo sapiens,ENSG00000179097,"{'ArrayExpress': 'ENSG00000179097', 'EntrezGen...",P30939,5-hydroxytryptamine receptor 1F [Source:HGNC S...,HTR1F,HTR1F,GENE
https://bbp.epfl.ch/neurosciencegraph/data/genes/db0e0fb7-a2fe-4f21-b0de-73d2d448200c,MIP,Homo sapiens,ENSG00000135517,"{'ArrayExpress': 'ENSG00000135517', 'EntrezGen...",P30301,major intrinsic protein of lens fiber [Source:...,MIP,MIP,GENE
https://bbp.epfl.ch/neurosciencegraph/data/genes/3fb46982-b3c3-4998-bc6d-ca22e09e9bea,ASIC1,Homo sapiens,ENSG00000110881,"{'ArrayExpress': 'ENSG00000110881', 'EntrezGen...",F8VSK4,acid sensing ion channel subunit 1 [Source:HGN...,ASIC1,ASIC1,GENE
...,...,...,...,...,...,...,...,...,...
https://bbp.epfl.ch/neurosciencegraph/data/genes/98d04429-b003-4fc4-9e66-817546fd2f5f,TRPC5,Homo sapiens,ENSG00000072315,"{'ArrayExpress': 'ENSG00000072315', 'EntrezGen...",Q9UL62,transient receptor potential cation channel su...,TRPC5,TRPC5,GENE
https://bbp.epfl.ch/neurosciencegraph/data/genes/062bf34d-a594-4568-a887-7f966f5c6420,TRPM6,Homo sapiens,ENSG00000119121,"{'ArrayExpress': 'ENSG00000119121', 'EntrezGen...",Q9BX84,transient receptor potential cation channel su...,TRPM6,TRPM6,GENE
https://bbp.epfl.ch/neurosciencegraph/data/genes/68685b46-994d-4290-abbf-85c56b6f4b40,Trpc4,Mus musculus,ENSMUSG00000027748,"{'ArrayExpress': 'ENSMUSG00000027748', 'Entrez...",Q0VB97,"transient receptor potential cation channel, s...",Trpc4,Trpc4,GENE
https://bbp.epfl.ch/neurosciencegraph/data/genes/f7aaddd8-22a4-4b5f-865f-b6909226f249,CD200R1,Homo sapiens,ENSG00000163606,"{'ArrayExpress': 'ENSG00000163606', 'EntrezGen...",H9KV32,CD200 receptor 1 [Source:HGNC Symbol;Acc:HGNC:...,CD200R1,CD200R1,GENE


## Export created PG to Neo4j

In [243]:
for c in frame._nodes.columns:
    frame._nodes.loc[frame._nodes[c].apply(lambda x: x is None), c] = np.nan

In [244]:
pgframe_to_neo4j(
    frame,
    uri="bolt://localhost:7687",
    username="neo4j",
    password="admin",
    node_types_as_labels=True,
    edge_types_as_labels=True,
    batch_size=10)

<bluegraph.backends.neo4j.io.Neo4jGraphView at 0x7fdf247ec7d0>

In [250]:
pd.DataFrame(located_in_edges.items())

Unnamed: 0,0,1
0,(https://bbp.epfl.ch/neurosciencegraph/data/ge...,{located_in}
1,(https://bbp.epfl.ch/neurosciencegraph/data/ge...,{located_in}
2,(https://bbp.epfl.ch/neurosciencegraph/data/ge...,{located_in}
3,(https://bbp.epfl.ch/neurosciencegraph/data/ge...,{located_in}
4,(https://bbp.epfl.ch/neurosciencegraph/data/ge...,{located_in}
...,...,...
1702,(https://bbp.epfl.ch/neurosciencegraph/data/ge...,{located_in}
1703,(https://bbp.epfl.ch/neurosciencegraph/data/ge...,{located_in}
1704,(https://bbp.epfl.ch/neurosciencegraph/data/ge...,{located_in}
1705,(https://bbp.epfl.ch/neurosciencegraph/data/ge...,{located_in}


In [261]:
compartment_data = gene_nodes.reset_index().merge(
    com,
    on="@id")

Unnamed: 0,@id,cellularComponent.label,name,species,ensembl_id,xrefs,uniprot_ac,desctiption,prefLabel,label,@type
0,https://bbp.epfl.ch/neurosciencegraph/data/gen...,integral component of membrane,MYMK,Homo sapiens,ENSG00000187616,"{'ArrayExpress': 'ENSG00000187616', 'EntrezGen...",A6NI61,"myomaker, myoblast fusion factor [Source:HGNC ...",MYMK,MYMK,GENE
1,https://bbp.epfl.ch/neurosciencegraph/data/gen...,membrane,MYMK,Homo sapiens,ENSG00000187616,"{'ArrayExpress': 'ENSG00000187616', 'EntrezGen...",A6NI61,"myomaker, myoblast fusion factor [Source:HGNC ...",MYMK,MYMK,GENE
2,https://bbp.epfl.ch/neurosciencegraph/data/gen...,plasma membrane,MYMK,Homo sapiens,ENSG00000187616,"{'ArrayExpress': 'ENSG00000187616', 'EntrezGen...",A6NI61,"myomaker, myoblast fusion factor [Source:HGNC ...",MYMK,MYMK,GENE
3,https://bbp.epfl.ch/neurosciencegraph/data/gen...,Golgi apparatus,MYMK,Homo sapiens,ENSG00000187616,"{'ArrayExpress': 'ENSG00000187616', 'EntrezGen...",A6NI61,"myomaker, myoblast fusion factor [Source:HGNC ...",MYMK,MYMK,GENE
4,https://bbp.epfl.ch/neurosciencegraph/data/gen...,integral component of Golgi membrane,MYMK,Homo sapiens,ENSG00000187616,"{'ArrayExpress': 'ENSG00000187616', 'EntrezGen...",A6NI61,"myomaker, myoblast fusion factor [Source:HGNC ...",MYMK,MYMK,GENE
...,...,...,...,...,...,...,...,...,...,...,...
1702,https://bbp.epfl.ch/neurosciencegraph/data/gen...,cell surface,Trpc4,Mus musculus,ENSMUSG00000027748,"{'ArrayExpress': 'ENSMUSG00000027748', 'Entrez...",Q0VB97,"transient receptor potential cation channel, s...",Trpc4,Trpc4,GENE
1703,https://bbp.epfl.ch/neurosciencegraph/data/gen...,plasma membrane,Trpc4,Mus musculus,ENSMUSG00000027748,"{'ArrayExpress': 'ENSMUSG00000027748', 'Entrez...",Q0VB97,"transient receptor potential cation channel, s...",Trpc4,Trpc4,GENE
1704,https://bbp.epfl.ch/neurosciencegraph/data/gen...,cell-cell junction,Trpc4,Mus musculus,ENSMUSG00000027748,"{'ArrayExpress': 'ENSMUSG00000027748', 'Entrez...",Q0VB97,"transient receptor potential cation channel, s...",Trpc4,Trpc4,GENE
1705,https://bbp.epfl.ch/neurosciencegraph/data/gen...,membrane,Trpc7,Mus musculus,ENSMUSG00000021541,"{'ArrayExpress': 'ENSMUSG00000021541', 'Entrez...",G3UWT1,"transient receptor potential cation channel, s...",Trpc7,Trpc7,GENE


## Extract interactions from IntAct

In [156]:
import requests

In [157]:
URL = "https://www.ebi.ac.uk/intact/ws/interaction"

In [158]:
INTACT_FIELDS = [
    "uniqueIdA",
    "uniqueIdB",
    "speciesA",
    "speciesB",
    "featuresA",
    "featuresB",
    "confidenceValues",
    "type",
    "negative",
    "allAnnotations",
]

In [159]:
print("Unique genes: ", len(gene_df["name"].unique()))

Unique genes:  516


In [312]:
def search_intact_interactions(query, species=None):
    
    def _fetch_page(page=0):
        request_url = f"{URL}/findInteractionWithFacet"
        params={
            "query": query,
            "page": page
        }
        if species is not None:
            params["interactorSpeciesFilter"] = species
        r = requests.post(request_url, params=params)
        data = r.json()
        return data["data"]
    
    data = _fetch_page()
    n_pages = data["totalPages"]
    print(f"Fetching {n_pages} pages for '{query}' (species '{species}')...")
    if n_pages > 150:
        print("Skipping...")
        return {}
        
    interactions = {}
    for el in data["content"]:
        interactions[el["ac"]] = el
    
    for i in range(1, n_pages):
        page_data =  _fetch_page(i)["content"]
        for el in page_data:
            interactions[el["ac"]] = el

    return interactions

In [313]:
SKIP = ["App", "FER", "APP", "Tl", "Alk", "5"]

In [314]:
intact_interactions = {}
all_species = ["Homo sapiens", "Mus musculus", "Rattus norvegicus"]
for gene_name in gene_df["name"].unique():
    if gene_name not in SKIP:
        for species in all_species:
            intact_interactions.update(search_intact_interactions(gene_name, species))

Fetching 4 pages for 'PLPP1' (species 'Homo sapiens')...
Fetching 1 pages for 'PLPP1' (species 'Mus musculus')...
Fetching 0 pages for 'PLPP1' (species 'Rattus norvegicus')...
Fetching 2 pages for 'Rom1' (species 'Homo sapiens')...
Fetching 0 pages for 'Rom1' (species 'Mus musculus')...
Fetching 0 pages for 'Rom1' (species 'Rattus norvegicus')...
Fetching 5 pages for 'Selp' (species 'Homo sapiens')...
Fetching 1 pages for 'Selp' (species 'Mus musculus')...
Fetching 0 pages for 'Selp' (species 'Rattus norvegicus')...
Fetching 7 pages for 'Ntrk1' (species 'Homo sapiens')...
Fetching 2 pages for 'Ntrk1' (species 'Mus musculus')...
Fetching 0 pages for 'Ntrk1' (species 'Rattus norvegicus')...
Fetching 103 pages for 'Aqp2' (species 'Homo sapiens')...
Fetching 1 pages for 'Aqp2' (species 'Mus musculus')...
Fetching 0 pages for 'Aqp2' (species 'Rattus norvegicus')...
Fetching 9 pages for 'Chrm4' (species 'Homo sapiens')...
Fetching 1 pages for 'Chrm4' (species 'Mus musculus')...
Fetching 0 pa

Fetching 3 pages for 'SLC43A2' (species 'Homo sapiens')...
Fetching 0 pages for 'SLC43A2' (species 'Mus musculus')...
Fetching 0 pages for 'SLC43A2' (species 'Rattus norvegicus')...
Fetching 1 pages for 'UNC5A' (species 'Homo sapiens')...
Fetching 1 pages for 'UNC5A' (species 'Mus musculus')...
Fetching 0 pages for 'UNC5A' (species 'Rattus norvegicus')...
Fetching 0 pages for 'MUC3B' (species 'Homo sapiens')...
Fetching 0 pages for 'MUC3B' (species 'Mus musculus')...
Fetching 0 pages for 'MUC3B' (species 'Rattus norvegicus')...
Fetching 1 pages for 'GPR135' (species 'Homo sapiens')...
Fetching 0 pages for 'GPR135' (species 'Mus musculus')...
Fetching 0 pages for 'GPR135' (species 'Rattus norvegicus')...
Fetching 3 pages for 'CD300A' (species 'Homo sapiens')...
Fetching 0 pages for 'CD300A' (species 'Mus musculus')...
Fetching 0 pages for 'CD300A' (species 'Rattus norvegicus')...
Fetching 2 pages for 'HCST' (species 'Homo sapiens')...
Fetching 1 pages for 'HCST' (species 'Mus musculus')

Fetching 8 pages for 'Itpr3' (species 'Homo sapiens')...
Fetching 2 pages for 'Itpr3' (species 'Mus musculus')...
Fetching 0 pages for 'Itpr3' (species 'Rattus norvegicus')...
Fetching 9 pages for 'Bace2' (species 'Homo sapiens')...
Fetching 1 pages for 'Bace2' (species 'Mus musculus')...
Fetching 0 pages for 'Bace2' (species 'Rattus norvegicus')...
Fetching 1 pages for 'SLC26A9' (species 'Homo sapiens')...
Fetching 1 pages for 'SLC26A9' (species 'Mus musculus')...
Fetching 0 pages for 'SLC26A9' (species 'Rattus norvegicus')...
Fetching 2 pages for 'Avpr2' (species 'Homo sapiens')...
Fetching 0 pages for 'Avpr2' (species 'Mus musculus')...
Fetching 0 pages for 'Avpr2' (species 'Rattus norvegicus')...
Fetching 1 pages for 'Parm1' (species 'Homo sapiens')...
Fetching 0 pages for 'Parm1' (species 'Mus musculus')...
Fetching 0 pages for 'Parm1' (species 'Rattus norvegicus')...
Fetching 2 pages for 'AVPR2' (species 'Homo sapiens')...
Fetching 0 pages for 'AVPR2' (species 'Mus musculus')...


Fetching 0 pages for 'KCNK15' (species 'Mus musculus')...
Fetching 0 pages for 'KCNK15' (species 'Rattus norvegicus')...
Fetching 1 pages for 'ASIC2' (species 'Homo sapiens')...
Fetching 1 pages for 'ASIC2' (species 'Mus musculus')...
Fetching 0 pages for 'ASIC2' (species 'Rattus norvegicus')...
Fetching 2 pages for 'Ttyh2' (species 'Homo sapiens')...
Fetching 0 pages for 'Ttyh2' (species 'Mus musculus')...
Fetching 0 pages for 'Ttyh2' (species 'Rattus norvegicus')...
Fetching 64 pages for 'Erbb3' (species 'Homo sapiens')...
Fetching 2 pages for 'Erbb3' (species 'Mus musculus')...
Fetching 0 pages for 'Erbb3' (species 'Rattus norvegicus')...
Fetching 1 pages for 'Asic2' (species 'Homo sapiens')...
Fetching 1 pages for 'Asic2' (species 'Mus musculus')...
Fetching 0 pages for 'Asic2' (species 'Rattus norvegicus')...
Fetching 3 pages for 'ICAM4' (species 'Homo sapiens')...
Fetching 0 pages for 'ICAM4' (species 'Mus musculus')...
Fetching 0 pages for 'ICAM4' (species 'Rattus norvegicus')..

Fetching 4 pages for 'Plpp1' (species 'Homo sapiens')...
Fetching 1 pages for 'Plpp1' (species 'Mus musculus')...
Fetching 0 pages for 'Plpp1' (species 'Rattus norvegicus')...
Fetching 1 pages for 'Adcy1' (species 'Homo sapiens')...
Fetching 1 pages for 'Adcy1' (species 'Mus musculus')...
Fetching 0 pages for 'Adcy1' (species 'Rattus norvegicus')...
Fetching 10 pages for 'SCAMP1' (species 'Homo sapiens')...
Fetching 1 pages for 'SCAMP1' (species 'Mus musculus')...
Fetching 0 pages for 'SCAMP1' (species 'Rattus norvegicus')...
Fetching 0 pages for 'GPR31' (species 'Homo sapiens')...
Fetching 0 pages for 'GPR31' (species 'Mus musculus')...
Fetching 0 pages for 'GPR31' (species 'Rattus norvegicus')...
Fetching 3 pages for 'PLPP2' (species 'Homo sapiens')...
Fetching 1 pages for 'PLPP2' (species 'Mus musculus')...
Fetching 0 pages for 'PLPP2' (species 'Rattus norvegicus')...
Fetching 2 pages for 'Plxnb3' (species 'Homo sapiens')...
Fetching 1 pages for 'Plxnb3' (species 'Mus musculus')...


Fetching 0 pages for 'Dscam' (species 'Rattus norvegicus')...
Fetching 1 pages for 'Hrh2' (species 'Homo sapiens')...
Fetching 0 pages for 'Hrh2' (species 'Mus musculus')...
Fetching 0 pages for 'Hrh2' (species 'Rattus norvegicus')...
Fetching 6 pages for 'Hcn4' (species 'Homo sapiens')...
Fetching 1 pages for 'Hcn4' (species 'Mus musculus')...
Fetching 0 pages for 'Hcn4' (species 'Rattus norvegicus')...
Fetching 0 pages for 'Chrne' (species 'Homo sapiens')...
Fetching 0 pages for 'Chrne' (species 'Mus musculus')...
Fetching 0 pages for 'Chrne' (species 'Rattus norvegicus')...
Fetching 1 pages for 'Hcn1' (species 'Homo sapiens')...
Fetching 9 pages for 'Hcn1' (species 'Mus musculus')...
Fetching 0 pages for 'Hcn1' (species 'Rattus norvegicus')...
Fetching 1 pages for 'HCN3' (species 'Homo sapiens')...
Fetching 1 pages for 'HCN3' (species 'Mus musculus')...
Fetching 0 pages for 'HCN3' (species 'Rattus norvegicus')...
Fetching 5 pages for 'Aqp8' (species 'Homo sapiens')...
Fetching 0 pag

Fetching 0 pages for 'NHX8' (species 'Homo sapiens')...
Fetching 0 pages for 'NHX8' (species 'Mus musculus')...
Fetching 0 pages for 'NHX8' (species 'Rattus norvegicus')...
Fetching 1 pages for 'Ccr7' (species 'Homo sapiens')...
Fetching 1 pages for 'Ccr7' (species 'Mus musculus')...
Fetching 0 pages for 'Ccr7' (species 'Rattus norvegicus')...
Fetching 0 pages for 'CHRNE' (species 'Homo sapiens')...
Fetching 0 pages for 'CHRNE' (species 'Mus musculus')...
Fetching 0 pages for 'CHRNE' (species 'Rattus norvegicus')...
Fetching 0 pages for 'OPT4' (species 'Homo sapiens')...
Fetching 0 pages for 'OPT4' (species 'Mus musculus')...
Fetching 0 pages for 'OPT4' (species 'Rattus norvegicus')...
Fetching 4 pages for 'ABCG2' (species 'Homo sapiens')...
Fetching 2 pages for 'ABCG2' (species 'Mus musculus')...
Fetching 0 pages for 'ABCG2' (species 'Rattus norvegicus')...
Fetching 1 pages for 'OR4A16' (species 'Homo sapiens')...
Fetching 0 pages for 'OR4A16' (species 'Mus musculus')...
Fetching 0 pa

Fetching 0 pages for 'SERK4' (species 'Rattus norvegicus')...
Fetching 10 pages for 'SDC3' (species 'Homo sapiens')...
Fetching 1 pages for 'SDC3' (species 'Mus musculus')...
Fetching 0 pages for 'SDC3' (species 'Rattus norvegicus')...
Fetching 0 pages for 'mymk' (species 'Homo sapiens')...
Fetching 0 pages for 'mymk' (species 'Mus musculus')...
Fetching 0 pages for 'mymk' (species 'Rattus norvegicus')...
Fetching 8 pages for 'Slc22a2' (species 'Homo sapiens')...
Fetching 1 pages for 'Slc22a2' (species 'Mus musculus')...
Fetching 0 pages for 'Slc22a2' (species 'Rattus norvegicus')...
Fetching 1 pages for 'SLC43A1' (species 'Homo sapiens')...
Fetching 0 pages for 'SLC43A1' (species 'Mus musculus')...
Fetching 0 pages for 'SLC43A1' (species 'Rattus norvegicus')...
Fetching 2 pages for 'CLCN5' (species 'Homo sapiens')...
Fetching 1 pages for 'CLCN5' (species 'Mus musculus')...
Fetching 0 pages for 'CLCN5' (species 'Rattus norvegicus')...
Fetching 13 pages for 'Slc1a5' (species 'Homo sapie

Fetching 0 pages for 'COR6' (species 'Mus musculus')...
Fetching 0 pages for 'COR6' (species 'Rattus norvegicus')...
Fetching 0 pages for 'At3g13620' (species 'Homo sapiens')...
Fetching 0 pages for 'At3g13620' (species 'Mus musculus')...
Fetching 0 pages for 'At3g13620' (species 'Rattus norvegicus')...
Fetching 1 pages for 'ZIP7' (species 'Homo sapiens')...
Fetching 0 pages for 'ZIP7' (species 'Mus musculus')...
Fetching 0 pages for 'ZIP7' (species 'Rattus norvegicus')...
Fetching 0 pages for 'NHX7' (species 'Homo sapiens')...
Fetching 0 pages for 'NHX7' (species 'Mus musculus')...
Fetching 0 pages for 'NHX7' (species 'Rattus norvegicus')...
Fetching 0 pages for 'WAKL13' (species 'Homo sapiens')...
Fetching 0 pages for 'WAKL13' (species 'Mus musculus')...
Fetching 0 pages for 'WAKL13' (species 'Rattus norvegicus')...
Fetching 6 pages for 'Slc29a2' (species 'Homo sapiens')...
Fetching 0 pages for 'Slc29a2' (species 'Mus musculus')...
Fetching 0 pages for 'Slc29a2' (species 'Rattus norv

Fetching 2 pages for 'ADCY2' (species 'Homo sapiens')...
Fetching 0 pages for 'ADCY2' (species 'Mus musculus')...
Fetching 0 pages for 'ADCY2' (species 'Rattus norvegicus')...
Fetching 1 pages for 'ATP10B' (species 'Homo sapiens')...
Fetching 0 pages for 'ATP10B' (species 'Mus musculus')...
Fetching 0 pages for 'ATP10B' (species 'Rattus norvegicus')...
Fetching 12 pages for 'LRP4' (species 'Homo sapiens')...
Fetching 2 pages for 'LRP4' (species 'Mus musculus')...
Fetching 0 pages for 'LRP4' (species 'Rattus norvegicus')...
Fetching 8 pages for 'THBD' (species 'Homo sapiens')...
Fetching 1 pages for 'THBD' (species 'Mus musculus')...
Fetching 0 pages for 'THBD' (species 'Rattus norvegicus')...
Fetching 0 pages for 'Slc28a2' (species 'Homo sapiens')...
Fetching 0 pages for 'Slc28a2' (species 'Mus musculus')...
Fetching 0 pages for 'Slc28a2' (species 'Rattus norvegicus')...
Fetching 2 pages for 'TSPAN6' (species 'Homo sapiens')...
Fetching 0 pages for 'TSPAN6' (species 'Mus musculus')...


Fetching 0 pages for 'NPF2.9' (species 'Homo sapiens')...
Fetching 0 pages for 'NPF2.9' (species 'Mus musculus')...
Fetching 0 pages for 'NPF2.9' (species 'Rattus norvegicus')...
Fetching 0 pages for 'At2g23680' (species 'Homo sapiens')...
Fetching 0 pages for 'At2g23680' (species 'Mus musculus')...
Fetching 0 pages for 'At2g23680' (species 'Rattus norvegicus')...
Fetching 0 pages for 'At3g16300' (species 'Homo sapiens')...
Fetching 0 pages for 'At3g16300' (species 'Mus musculus')...
Fetching 0 pages for 'At3g16300' (species 'Rattus norvegicus')...
Fetching 0 pages for 'SLAH2' (species 'Homo sapiens')...
Fetching 0 pages for 'SLAH2' (species 'Mus musculus')...
Fetching 0 pages for 'SLAH2' (species 'Rattus norvegicus')...
Fetching 0 pages for 'PAT22' (species 'Homo sapiens')...
Fetching 0 pages for 'PAT22' (species 'Mus musculus')...
Fetching 0 pages for 'PAT22' (species 'Rattus norvegicus')...
Fetching 11 pages for 'OCT1' (species 'Homo sapiens')...
Fetching 0 pages for 'OCT1' (species

In [315]:
len(intact_interactions)

13611

In [318]:
import pickle
with open("../data/intact_interactions_all.pkl", "wb") as f:
    pickle.dump(intact_interactions, f)

In [319]:
interactions = pd.DataFrame(intact_interactions.values())

In [221]:
interactions.to_csv("../data/intact_interactions_all.csv", index=None)

In [320]:
membrane_prot_interactions = interactions[interactions.apply(
    lambda x: x.uniqueIdA in gene_df["uniprot_ac"].unique() and\
              x.uniqueIdB in gene_df["uniprot_ac"].unique(), axis=1)]

In [321]:
membrane_prot_interactions

Unnamed: 0,ac,binaryInteractionId,documentType,idA,idB,acA,acB,uniqueIdA,uniqueIdB,moleculeA,...,releaseDate,intraTaxIdStyled,taxIdAStyled,taxIdBStyled,typeMIIdentifierStyled,typeMIAStyled,typeMIBStyled,hostOrganismTaxIdStyled,affectedByMutationStyled,searchChildInteractors
1221,EBI-21284925,3897665,interaction,P35368 (uniprotkb),P61073 (uniprotkb),EBI-490017,EBI-489411,P35368,P61073,ADRA1B,...,1563812513148,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:2364__proximity__#bae4bc,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens__#335e94,none__false__#7e8389,
1222,EBI-21284945,3897697,interaction,P35368 (uniprotkb),P61073 (uniprotkb),EBI-490017,EBI-489411,P35368,P61073,ADRA1B,...,1563812513148,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:2364__proximity__#bae4bc,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens__#335e94,none__false__#7e8389,
1223,EBI-21284190,3896522,interaction,P35368 (uniprotkb),P61073 (uniprotkb),EBI-490017,EBI-489411,P35368,P61073,ADRA1B,...,1557403194233,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:2364__proximity__#bae4bc,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens__#335e94,none__false__#7e8389,
1226,EBI-21298877,3911670,interaction,P35368 (uniprotkb),P61073 (uniprotkb),EBI-490017,EBI-489411,P35368,P61073,ADRA1B,...,1563812506845,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:2364__proximity__#bae4bc,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens 293 cells transformed with ...,none__false__#7e8389,
1227,EBI-21298893,3911704,interaction,P35368 (uniprotkb),P61073 (uniprotkb),EBI-490017,EBI-489411,P35368,P61073,ADRA1B,...,1563812506845,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:2364__proximity__#bae4bc,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens 293 cells transformed with ...,none__false__#7e8389,
1228,EBI-21298937,3911718,interaction,P35368 (uniprotkb),P61073 (uniprotkb),EBI-490017,EBI-489411,P35368,P61073,ADRA1B,...,1563812506845,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:2364__proximity__#bae4bc,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens__#335e94,none__false__#7e8389,
1229,EBI-21288674,3902699,interaction,P61073 (uniprotkb),P35368 (uniprotkb),EBI-489411,EBI-490017,P61073,P35368,CXCR4,...,1563812513148,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:0914__association__#bae4bc,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens__#335e94,none__false__#7e8389,
1235,EBI-21283831,3896239,interaction,P35368 (uniprotkb),P25106 (uniprotkb),EBI-490017,EBI-1965291,P35368,P25106,ADRA1B,...,1557403194233,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:2364__proximity__#bae4bc,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens 293 cells transformed with ...,none__false__#7e8389,
1236,EBI-21283912,3896359,interaction,P35368 (uniprotkb),P25106 (uniprotkb),EBI-490017,EBI-1965291,P35368,P25106,ADRA1B,...,1557403194233,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:2364__proximity__#bae4bc,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens__#335e94,none__false__#7e8389,
1238,EBI-21282722,3894894,interaction,P35368 (uniprotkb),P35368 (uniprotkb),EBI-490017,EBI-490017,P35368,P35368,ADRA1B,...,1563812513834,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:0915__physical association__#7bccc4,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens transformed primary embryon...,none__false__#7e8389,


In [322]:
df = membrane_prot_interactions.merge(
    gene_nodes.reset_index()[["name", "uniprot_ac"]].rename(
        columns={"uniprot_ac": "uniqueIdA"}),
    on="uniqueIdA", how="left").rename(columns={"name": "nameA"}).merge(
        gene_nodes.reset_index()[["name", "uniprot_ac"]].rename(
         columns={"uniprot_ac": "uniqueIdB"}),
        on="uniqueIdB").rename(columns={"name": "nameB"})

In [325]:
df

Unnamed: 0,ac,binaryInteractionId,documentType,idA,idB,acA,acB,uniqueIdA,uniqueIdB,moleculeA,...,taxIdAStyled,taxIdBStyled,typeMIIdentifierStyled,typeMIAStyled,typeMIBStyled,hostOrganismTaxIdStyled,affectedByMutationStyled,searchChildInteractors,nameA,nameB
0,EBI-21284925,3897665,interaction,P35368 (uniprotkb),P61073 (uniprotkb),EBI-490017,EBI-489411,P35368,P61073,ADRA1B,...,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:2364__proximity__#bae4bc,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens__#335e94,none__false__#7e8389,,ADRA1B,CXCR4
1,EBI-21284945,3897697,interaction,P35368 (uniprotkb),P61073 (uniprotkb),EBI-490017,EBI-489411,P35368,P61073,ADRA1B,...,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:2364__proximity__#bae4bc,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens__#335e94,none__false__#7e8389,,ADRA1B,CXCR4
2,EBI-21284190,3896522,interaction,P35368 (uniprotkb),P61073 (uniprotkb),EBI-490017,EBI-489411,P35368,P61073,ADRA1B,...,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:2364__proximity__#bae4bc,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens__#335e94,none__false__#7e8389,,ADRA1B,CXCR4
3,EBI-21298877,3911670,interaction,P35368 (uniprotkb),P61073 (uniprotkb),EBI-490017,EBI-489411,P35368,P61073,ADRA1B,...,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:2364__proximity__#bae4bc,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens 293 cells transformed with ...,none__false__#7e8389,,ADRA1B,CXCR4
4,EBI-21298893,3911704,interaction,P35368 (uniprotkb),P61073 (uniprotkb),EBI-490017,EBI-489411,P35368,P61073,ADRA1B,...,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:2364__proximity__#bae4bc,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens 293 cells transformed with ...,none__false__#7e8389,,ADRA1B,CXCR4
5,EBI-21298937,3911718,interaction,P35368 (uniprotkb),P61073 (uniprotkb),EBI-490017,EBI-489411,P35368,P61073,ADRA1B,...,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:2364__proximity__#bae4bc,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens__#335e94,none__false__#7e8389,,ADRA1B,CXCR4
6,EBI-21292905,3903812,interaction,P25106 (uniprotkb),P61073 (uniprotkb),EBI-1965291,EBI-489411,P25106,P61073,ACKR3,...,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:2364__proximity__#bae4bc,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens 293 cells transformed with ...,none__false__#7e8389,,ACKR3,CXCR4
7,EBI-21294813,3904570,interaction,P25106 (uniprotkb),P61073 (uniprotkb),EBI-1965291,EBI-489411,P25106,P61073,ACKR3,...,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:0915__physical association__#7bccc4,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens transformed primary embryon...,none__false__#7e8389,,ACKR3,CXCR4
8,EBI-21294836,3904586,interaction,P25106 (uniprotkb),P61073 (uniprotkb),EBI-1965291,EBI-489411,P25106,P61073,ACKR3,...,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:0915__physical association__#7bccc4,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,9606__Homo sapiens transformed primary embryon...,none__false__#7e8389,,ACKR3,CXCR4
9,EBI-21294855,3904609,interaction,P25106 (uniprotkb),P61073 (uniprotkb),EBI-1965291,EBI-489411,P25106,P61073,ACKR3,...,9606__Homo sapiens__#335e94,9606__Homo sapiens__#335e94,MI:0403__colocalization__#d8d8d8,MI:0326__protein__ELLIPSE,MI:0326__protein__ELLIPSE,10090__Mus musculus albino neuroblastoma cell ...,none__false__#7e8389,,ACKR3,CXCR4


In [326]:
df.to_csv("../data/intact_membrate_ppi.csv", index=None)

In [3]:
import itables
import pandas as pd

In [4]:
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)

<IPython.core.display.Javascript object>

In [48]:
intact_df = pd.read_csv("../data/intact_membrate_ppi.csv")

In [29]:
interactions_df["pA"] = interactions_df["participant"].apply(
    lambda x: x[0]["protein"]["id"])
interactions_df["pB"] = interactions_df["participant"].apply(
    lambda x: x[1]["protein"]["id"])

In [42]:
df = interactions_df[["pA", "pB"]].merge(
    protein_df.rename(columns={"id": "pA"}),
    on="pA",
    how="left").merge(
        protein_df.rename(columns={"id": "pB"}),
        on="pB",
        how="left"
    )

In [56]:
existing = [list(el) for el in intact_df[["uniqueIdA", "uniqueIdB"]].values]

In [58]:
for k, v in df[["identifier_x", "identifier_y"]].values:
    if (k, v) in existing:
        print(k, v)
    if (v, k) in existing:
        print(v, k)

In [59]:
intact_df



ac,binaryInteractionId,documentType,idA,idB,acA,acB,uniqueIdA,uniqueIdB,moleculeA,taxIdAStyled,taxIdBStyled,typeMIIdentifierStyled,typeMIAStyled,typeMIBStyled,hostOrganismTaxIdStyled,affectedByMutationStyled,searchChildInteractors,nameA,nameB
