# **CTNNB1 Exploration**

**Exploring how specific proteins out of a statistically compiled list of 25 proteins relate to CTNNB1 through INDRA statements,
exploring Wnt/B-catenin pathway membership, determining if any of the proteins belong to the same protein family/complex as CTNNB1, and using INDRA discrete gene list analysis results**

## Finding Unique Proteins
Which proteins in the list of top 25 proteins are not mentioned in the paper?

In [1]:
# imports
from indra_cogex.client import Neo4jClient
import json
client = Neo4jClient()
from indra.assemblers.html import HtmlAssembler
import json
from indra.statements import *
import pandas as pd
from indra_cogex.client import *

INFO: [2024-07-10 02:12:13] numexpr.utils - NumExpr defaulting to 8 threads.
INFO: [2024-07-10 02:12:14] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:14] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


In [88]:
def get_unique_proteins():
    """
    Assemble the protein lists used throughout the analysis.

    Parameters
    ----------
    none

    Returns
    -------
    paper_proteins : list that contains names of proteins from the paper
    top_25 : list that contains the protein names of the top 25 proteins given
    bcat_pathway : list that contains given protein names involved in the pathway (from a Google search)
    unique : list that contains proteins that are in top_25 but not in paper_proteins,
        kept in top_25 order

    """
    # proteins involved in the pathway
    bcat_pathway = [
        "ADAM17", "AXIN1", "AXIN2", "CCND2", "CSNK1E", "CTNNB1", "CUL1",
        "DKK1", "DKK4", "DLL1", "DVL2", "FRAT1", "FZD1", "FZD8",
        "GNAI1", "HDAC11", "HDAC2", "HDAC5", "HEY1", "HEY2", "JAG1",
        "JAG2", "KAT2A", "LEF1", "MAML1", "MYC", "NCOR2", "NCSTN",
        "NKD1", "NOTCH1", "NOTCH4", "NUMB", "PPARD", "PSEN2", "PTCH1",
        "RBPJ", "SKP2", "TCF7", "TP53", "WNT1", "WNT5B", "WNT6",
    ]

    # statistically top 25 proteins
    top_25 = [
        "VWA2", "LRP4", "CTNNB1", "GLCE", "ACSL5",
        "NOTUM", "APCDD1", "DKK4", "EPHA7", "CTNNA2",
        "ADAMTSL2", "CALML3", "CEMIP2", "AMOT", "CXCL14",
        "PLA2G4A", "RCN2", "TTC9", "FABP4", "GPCPD1",
        "VSNL1", "CRYBB1", "LEF1", "PDZD8", "FNDC3A",
    ]

    # proteins listed in the paper
    paper_proteins = [
        "CTNNB1", "LEF1", "CTNNA2", "EPHA7", "LRP4", "NOTUM", "DKK4",
        "JAG1", "PSEN2", "RBPJ", "HELZ2", "KIAA0513", "LSP1", "VWA2",
        "CXCL14", "GNE", "GTF2F1", "TLK1", "ZNF638", "HDAC2", "HDAC5",
        "NCSTN", "NUMB", "AXIN1", "FZD1", "GNAI1", "TP53",
    ]

    # "unique" = in the top 25 list but not in the paper; the set is only
    # used for membership tests so the top_25 ordering is preserved
    reported_in_paper = set(paper_proteins)
    unique = [protein for protein in top_25 if protein not in reported_in_paper]

    return paper_proteins, top_25, bcat_pathway, unique
    
paper_proteins, top_25, bcat_pathway, unique = get_unique_proteins()

## Using CTNNB1 to Get INDRA Statements
Retrieving the INDRA relationships in which CTNNB1 is the source, then filtering the resulting INDRA statements down to the unique genes to see whether CTNNB1 directly interacts with each protein

In [89]:
def find_indra_relationships():
    """
    Query the graph for every outgoing ``indra_rel`` edge of CTNNB1 and keep
    the rows whose target is one of the "unique" genes.

    Reads the module-level ``client`` and ``unique``.

    Parameters
    ----------
    none

    Returns
    -------
    combined_df: dataframe that contains INDRA relationships for CTNNB1 filtered by "unique" genes,
        grouped in the order the genes appear in ``unique``; an empty dataframe with the
        same columns when none of the genes has a relationship
    protein_df: unfiltered dataframe that contains all INDRA relationships for CTNNB1

    """
    # cypher to get dataframe with all proteins that have INDRA relationship with CTNNB1
    target_protein = "CTNNB1"
    cypher = f"MATCH p=(n:BioEntity)-[r:indra_rel]->(m:BioEntity) WHERE n.name = '{target_protein}' RETURN m.name, r.stmt_json, m.type, m.id, r.stmt_type"
    proteins = client.query_tx(cypher)
    # the fifth returned value is r.stmt_type; it gets its own label so that
    # column labels are unique (two columns were previously both called "type")
    protein_df = pd.DataFrame(proteins, columns=["name", "stmt_json", "type", "id", "stmt_type"])

    # set built once so each membership test is O(1)
    known_targets = set(protein_df["name"])

    # one sub-frame per unique gene that appears among CTNNB1's targets
    df_list = [
        protein_df[protein_df["name"] == gene]
        for gene in unique
        if gene in known_targets
    ]

    if df_list:
        # combines dataframes for each gene into single dataframe
        combined_df = pd.concat(df_list, ignore_index=True)
    else:
        # pd.concat raises on an empty list; return an empty frame instead
        combined_df = protein_df.iloc[0:0].copy()
    return combined_df, protein_df
combined_df, protein_df = find_indra_relationships()
combined_df

Unnamed: 0,name,stmt_json,type,id,type.1
0,GLCE,"{""type"": ""IncreaseAmount"", ""subj"": {""name"": ""C...",human_gene_protein,hgnc:17855,IncreaseAmount
1,GLCE,"{""type"": ""IncreaseAmount"", ""subj"": {""name"": ""C...",human_gene_protein,hgnc:17855,IncreaseAmount
2,APCDD1,"{""type"": ""Complex"", ""members"": [{""name"": ""APCD...",human_gene_protein,hgnc:15718,Complex
3,CALML3,"{""type"": ""Activation"", ""subj"": {""name"": ""CTNNB...",human_gene_protein,hgnc:1452,Activation
4,AMOT,"{""type"": ""Complex"", ""members"": [{""name"": ""AMOT...",human_gene_protein,hgnc:17810,Complex
5,AMOT,"{""type"": ""Complex"", ""members"": [{""name"": ""BCAR...",human_gene_protein,hgnc:17810,Complex
6,FABP4,"{""type"": ""Complex"", ""members"": [{""name"": ""CTNN...",human_gene_protein,hgnc:3559,Complex
7,FABP4,"{""type"": ""Activation"", ""subj"": {""name"": ""CTNNB...",human_gene_protein,hgnc:3559,Activation
8,FABP4,"{""type"": ""Inhibition"", ""subj"": {""name"": ""CTNNB...",human_gene_protein,hgnc:3559,Inhibition
9,FABP4,"{""type"": ""DecreaseAmount"", ""subj"": {""name"": ""C...",human_gene_protein,hgnc:3559,DecreaseAmount


In [91]:
# method to get INDRA statements for proteins of interest 
def get_indra_statements():
    """
    Render every row of ``combined_df`` as an HTML page of INDRA statements.

    For each row, the ``stmt_json`` string is parsed, converted to INDRA
    statement objects with ``stmts_from_json`` and passed to ``HtmlAssembler``.
    The page is saved under the file name ``<name><row position>_statements.html``.

    Parameters
    ----------
    none

    Returns
    -------
    none

    """
    rows = zip(combined_df["name"].values, combined_df["stmt_json"].values)

    for index, (name, raw_json) in enumerate(rows):
        # stmts_from_json is given a one-element list holding this row's parsed statement
        stmts = stmts_from_json(json_in=[json.loads(raw_json)])

        # uses HtmlAssembler to get an html page of the INDRA statement for this row
        ha = HtmlAssembler(stmts, title='Statements for %s' % name, db_rest_url='https://db.indra.bio')
        ha.save_model('%s_statements.html' % (name+str(index)))
get_indra_statements()

## Finding Shared Pathways Between CTNNB1 and Unique Genes
Do any of the unique proteins belong to the Wnt/B-catenin pathway? If not, do any of them have shared pathways with CTNNB1? What are the implications of having shared pathways?

In [106]:
# method to get gene ids for proteins of interest
def get_gene_ids():
    """
    Look up the HGNC node id of every gene in the module-level ``unique`` list.

    One query is issued per gene name through the module-level ``client``; a
    name may yield zero, one or several rows.

    Parameters
    ----------
    none

    Returns
    -------
    id_df: dataframe that contains gene ids for unique protein list
        (columns "name" and "gene_id"); empty when nothing was found

    """
    rows = []
    # iterates through the gene names
    for gene_name in unique:
        # cypher query to get the gene ids
        cypher = f"MATCH p=(n:BioEntity) WHERE n.name = '{gene_name}' AND n.id starts with 'hgnc' RETURN n.name, n.id"
        results = client.query_tx(cypher)
        # a lookup that returned nothing contributes no rows
        rows.extend(results or [])
    # builds a single dataframe from all rows; unlike pd.concat on an empty
    # list, this also works when no rows were collected
    id_df = pd.DataFrame(rows, columns=["name", "gene_id"])
    return id_df
id_df = get_gene_ids()
id_df

Unnamed: 0,name,gene_id
0,GLCE,hgnc:17855
1,ACSL5,hgnc:16526
2,APCDD1,hgnc:15718
3,ADAMTSL2,hgnc:14631
4,CALML3,hgnc:1452
5,CEMIP2,hgnc:11869
6,AMOT,hgnc:17810
7,PLA2G4A,hgnc:9034
8,PLA2G4A,hgnc:9035
9,RCN2,hgnc:9935


In [9]:
def shared_pathway():
    """
    Print the pathways each gene in ``id_df`` shares with CTNNB1.

    Parameters
    ----------
    none

    Returns
    -------
    none

    """
    ctnnb1 = ("HGNC", "2514")

    # walks the gene ids and names of the unique genes in lockstep
    for gene_id, gene_name in zip(id_df["gene_id"].values, id_df["name"].values):
        # drops the leading "hgnc:" characters to keep the numerical part
        hgnc_number = gene_id[len("hgnc:"):]
        print("These are the shared pathways for", gene_name, "and CTNNB1:")
        gene_pair = (("HGNC", hgnc_number), ctnnb1)
        print(get_shared_pathways_for_genes(gene_pair))
shared_pathway()

INFO: [2024-07-10 02:12:24] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:24] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


These are the shared pathways for GLCE and CTNNB1:


INFO: [2024-07-10 02:12:24] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:24] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[]
These are the shared pathways for ACSL5 and CTNNB1:


INFO: [2024-07-10 02:12:25] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:25] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[]
These are the shared pathways for APCDD1 and CTNNB1:


INFO: [2024-07-10 02:12:25] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:25] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[]
These are the shared pathways for ADAMTSL2 and CTNNB1:


INFO: [2024-07-10 02:12:26] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:26] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'WIKIPATHWAYS:WP4816', name:'TGF beta receptor signaling in skeletal dysplasias', version:'20240101' }), (:BioEntity { id:'REACTOME:R-HSA-1643685', name:'Disease', version:'87' }), (:BioEntity { id:'REACTOME:R-HSA-392499', name:'Metabolism of proteins', version:'87' })]
These are the shared pathways for CALML3 and CTNNB1:


INFO: [2024-07-10 02:12:26] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:26] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'WIKIPATHWAYS:WP5124', name:'Alzheimer 39 s disease', version:'20240101' }), (:BioEntity { id:'WIKIPATHWAYS:WP2059', name:'Alzheimer 39 s disease and miRNA effects', version:'20240101' })]
These are the shared pathways for CEMIP2 and CTNNB1:


INFO: [2024-07-10 02:12:27] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:27] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[]
These are the shared pathways for AMOT and CTNNB1:


INFO: [2024-07-10 02:12:27] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:27] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'WIKIPATHWAYS:WP5087', name:'Pleural mesothelioma', version:'20240101' }), (:BioEntity { id:'WIKIPATHWAYS:WP4541', name:'Hippo Merlin signaling dysregulation', version:'20240101' }), (:BioEntity { id:'WIKIPATHWAYS:WP3888', name:'VEGFA VEGFR2 signaling', version:'20240101' }), (:BioEntity { id:'REACTOME:R-HSA-9764260', name:'Regulation of Expression and Function of Type II Classical Cadherins', version:'87' }), (:BioEntity { id:'REACTOME:R-HSA-9762292', name:'Regulation of CDH11 function', version:'87' }), (:BioEntity { id:'REACTOME:R-HSA-9759476', name:'Regulation of Homotypic Cell-Cell Adhesion', version:'87' }), (:BioEntity { id:'REACTOME:R-HSA-9759475', name:'Regulation of CDH11 Expression and Function', version:'87' }), (:BioEntity { id:'REACTOME:R-HSA-162582', name:'Signal Transduction', version:'87' }), (:BioEntity { id:'REACTOME:R-HSA-1500931', name:'Cell-Cell communication', version:'87' }), (:BioEntity { id:'REACTOME:R-HSA-446728', name:'Cell junction organiz

INFO: [2024-07-10 02:12:28] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:28] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[]
These are the shared pathways for PLA2G4A and CTNNB1:


INFO: [2024-07-10 02:12:28] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:28] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'WIKIPATHWAYS:WP3888', name:'VEGFA VEGFR2 signaling', version:'20240101' }), (:BioEntity { id:'REACTOME:R-HSA-162582', name:'Signal Transduction', version:'87' })]
These are the shared pathways for RCN2 and CTNNB1:


INFO: [2024-07-10 02:12:29] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:29] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[]
These are the shared pathways for TTC9 and CTNNB1:


INFO: [2024-07-10 02:12:29] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:29] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[]
These are the shared pathways for FABP4 and CTNNB1:


INFO: [2024-07-10 02:12:30] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:30] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'WIKIPATHWAYS:WP5087', name:'Pleural mesothelioma', version:'20240101' }), (:BioEntity { id:'REACTOME:R-HSA-1266738', name:'Developmental Biology', version:'87' })]
These are the shared pathways for GPCPD1 and CTNNB1:


INFO: [2024-07-10 02:12:30] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:30] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[]
These are the shared pathways for VSNL1 and CTNNB1:


INFO: [2024-07-10 02:12:31] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:31] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[]
These are the shared pathways for CRYBB1 and CTNNB1:


INFO: [2024-07-10 02:12:32] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:32] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[]
These are the shared pathways for PDZD8 and CTNNB1:


INFO: [2024-07-10 02:12:32] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:32] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[]
These are the shared pathways for FNDC3A and CTNNB1:
[]


## Further Pathway Exploration

Explores the proteins that belong to the specific Wnt/B-catenin pathway, and finds the intersections between that pathway's genes and each of the 4 lists given:
unique proteins (the proteins that belong to the top_25 list but do not appear in the paper), top_25, paper proteins, and Wnt pathway (proteins that belong to the pathway, compiled from a Google search)

In [6]:
# will use the first pathway as it specifies the formation of beta-catenin
get_shared_pathways_for_genes((("HGNC", "2514"),("HGNC", "2514")))

INFO: [2024-07-10 02:12:23] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:12:23] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'REACTOME:R-HSA-201722', name:'Formation of the beta-catenin:TCF transactivating complex', version:'87' }),
 (:BioEntity { id:'REACTOME:R-HSA-201681', name:'TCF dependent signaling in response to WNT', version:'87' }),
 (:BioEntity { id:'REACTOME:R-HSA-195721', name:'Signaling by WNT', version:'87' }),
 (:BioEntity { id:'REACTOME:R-HSA-9764260', name:'Regulation of Expression and Function of Type II Classical Cadherins', version:'87' }),
 (:BioEntity { id:'REACTOME:R-HSA-9762292', name:'Regulation of CDH11 function', version:'87' }),
 (:BioEntity { id:'REACTOME:R-HSA-9759476', name:'Regulation of Homotypic Cell-Cell Adhesion', version:'87' }),
 (:BioEntity { id:'REACTOME:R-HSA-9759475', name:'Regulation of CDH11 Expression and Function', version:'87' }),
 (:BioEntity { id:'REACTOME:R-HSA-162582', name:'Signal Transduction', version:'87' }),
 (:BioEntity { id:'REACTOME:R-HSA-1500931', name:'Cell-Cell communication', version:'87' }),
 (:BioEntity { id:'REACTOME:R-HSA-12

In [93]:
# this method finds shared proteins between 4 lists (unique, top_25, paper_proteins and bcat_pathway) and genes from
# the Formation of the beta-catenin:TCF transactivating complex pathway (REACTOME:R-HSA-201722)

def find_intersections():
    """
    Print which proteins of each given list are genes of the Reactome pathway
    "Formation of the beta-catenin:TCF transactivating complex" (R-HSA-201722).

    Four sets are printed, in this order: the intersection of the pathway genes
    with ``unique``, ``top_25``, ``paper_proteins`` and ``bcat_pathway``.

    Parameters
    ----------
    none

    Returns
    -------
    none

    """
    # gets all the genes part of the specific wnt-signaling pathway that forms b-catenin
    conv_genes = get_genes_for_pathway(("REACTOME", "R-HSA-201722"))

    # reads each gene name from the node's property dict rather than parsing
    # the node's string form, which breaks on names that contain whitespace
    wnt_path_genes = {
        node.data["name"] for node in conv_genes if node.data.get("name")
    }

    # now uses .intersection() with each of the 4 lists to find intersections
    print(set(unique).intersection(wnt_path_genes))
    print(set(top_25).intersection(wnt_path_genes))
    print(set(paper_proteins).intersection(wnt_path_genes))
    print(set(bcat_pathway).intersection(wnt_path_genes))

find_intersections()

INFO: [2024-07-10 08:27:46] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:27:46] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


set()
{'LEF1', 'CTNNB1'}
{'LEF1', 'CTNNB1'}
{'CTNNB1', 'AXIN2', 'LEF1', 'MYC', 'TCF7'}


## Finding Same Protein Families/Complexes

Determining if unique proteins belong to the same protein families or complexes as CTNNB1

In [84]:
# query that did not work
#for ids in id_df["gene_id"].values:
        #cypher = f"MATCH p=(n:BioEntity)-[r:haspart]->(family:BioEntity)<-[r2:haspart]-(m:BioEntity) WHERE n.id = {ids} AND m.id = 'hgnc:2514' RETURN family"
        #proteins = client.query_tx(cypher)

def finding_complexes0():
    """
    Print, for each gene in ``id_df``, the result of ``isa_or_partof`` against CTNNB1.

    NOTE(review): the gene is passed as the first argument and CTNNB1
    (HGNC:2514) as the second, so this appears to ask whether the gene is a
    child of CTNNB1 rather than whether both share a family or complex;
    confirm this is the intended check.

    Parameters
    ----------
    none

    Returns
    -------
    none

    """
    ctnnb1 = ("HGNC", "2514")

    # walks the ids and names of the unique proteins in lockstep
    for gene_id, gene_name in zip(id_df["gene_id"].values, id_df["name"].values):
        # drops the leading "hgnc:" characters to keep the numerical part only
        hgnc_number = gene_id[len("hgnc:"):]
        print("Are", gene_name, "and CTNNB1 part of the same family")
        # uses isa_or_partof() to determine if protein is a child of CTNNB1
        print(isa_or_partof(("HGNC", hgnc_number), ctnnb1))
       
finding_complexes0()

INFO: [2024-07-10 08:06:53] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:06:53] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


Are GLCE and CTNNB1 part of the same family


INFO: [2024-07-10 08:06:53] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:06:53] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


False
Are ACSL5 and CTNNB1 part of the same family


INFO: [2024-07-10 08:06:54] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:06:54] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


False
Are APCDD1 and CTNNB1 part of the same family


INFO: [2024-07-10 08:06:54] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:06:54] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


False
Are ADAMTSL2 and CTNNB1 part of the same family


INFO: [2024-07-10 08:06:55] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:06:55] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


False
Are CALML3 and CTNNB1 part of the same family


INFO: [2024-07-10 08:06:55] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:06:55] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


False
Are CEMIP2 and CTNNB1 part of the same family


INFO: [2024-07-10 08:06:56] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:06:56] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


False
Are AMOT and CTNNB1 part of the same family


INFO: [2024-07-10 08:06:56] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:06:56] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


False
Are PLA2G4A and CTNNB1 part of the same family


INFO: [2024-07-10 08:06:57] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:06:57] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


False
Are PLA2G4A and CTNNB1 part of the same family


INFO: [2024-07-10 08:06:57] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:06:57] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


False
Are RCN2 and CTNNB1 part of the same family


INFO: [2024-07-10 08:06:58] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:06:58] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


False
Are TTC9 and CTNNB1 part of the same family


INFO: [2024-07-10 08:06:58] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:06:58] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


False
Are FABP4 and CTNNB1 part of the same family


INFO: [2024-07-10 08:06:59] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:06:59] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


False
Are GPCPD1 and CTNNB1 part of the same family


INFO: [2024-07-10 08:06:59] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:06:59] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


False
Are VSNL1 and CTNNB1 part of the same family


INFO: [2024-07-10 08:07:00] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:07:00] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


False
Are CRYBB1 and CTNNB1 part of the same family


INFO: [2024-07-10 08:07:00] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:07:00] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


False
Are PDZD8 and CTNNB1 part of the same family


INFO: [2024-07-10 08:07:01] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:07:01] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


False
Are FNDC3A and CTNNB1 part of the same family
False


## Exploring Protein Families/Complexes Further

Potential go ids of interest for CTNNB1: (:BioEntity { id:'GO:GO:0030877', name:'beta-catenin destruction complex', type:'protein_family_complex' }),
(:BioEntity { id:'GO:GO:0032991', name:'protein-containing complex', type:'protein_family_complex' }),
(:BioEntity { id:'GO:GO:0032993', name:'protein-DNA complex', type:'protein_family_complex' }),
(:BioEntity { id:'GO:GO:0060070', name:'canonical Wnt signaling pathway', type:'biological_process' }),
(:BioEntity { id:'GO:GO:0070369', name:'beta-catenin-TCF7L2 complex', type:'protein_family_complex' }),
(:BioEntity { id:'GO:GO:1990907', name:'beta-catenin-TCF complex', type:'protein_family_complex' }),

In [21]:
def get_go_terms_for_target():
    """
    Fetch the GO terms associated with CTNNB1 (HGNC:2514).

    Parameters
    ----------
    none

    Returns
    -------
    ctnnb1_go: contains list of GO terms for CTNNB1, as lower-cased ids
        (e.g. "go:0030877")
    go_nodes: contains list of node objects that has information about GO terms for CTNNB1

    """
    # these are the GO terms for CTNNB1; materialized so the nodes can be both
    # iterated here and returned to the caller
    go_nodes = list(get_go_terms_for_gene(("HGNC", "2514")))

    # takes the identifier from each node's db_id attribute instead of slicing
    # the node's string form at an offset that only fits the "GO" namespace
    ctnnb1_go = [node.db_id.lower() for node in go_nodes]
    return ctnnb1_go, go_nodes
ctnnb1_go,go_nodes = get_go_terms_for_target()


INFO: [2024-07-10 02:29:32] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 02:29:32] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


In [94]:
# this method is not necessary because the INDRA gene analysis provides GO terms among genes
def go_terms_for_unique():
    """
    Print the GO terms of every protein listed in the notebook-level id_df.

    Reads the "gene_id" and "name" columns of id_df; returns nothing.
    """
    gene_ids = id_df["gene_id"].values
    gene_names = id_df["name"].values
    for gene_curie, gene_name in zip(gene_ids, gene_names):
        # drop the first five characters to keep only the numeric part
        # (presumably an "hgnc:" prefix -- confirm against id_df)
        hgnc_number = gene_curie[5:]
        print("GO terms for", gene_name)
        go_terms = get_go_terms_for_gene(("HGNC", hgnc_number))
        print(go_terms)
go_terms_for_unique()


INFO: [2024-07-10 08:32:47] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:32:47] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


GO terms for GLCE


INFO: [2024-07-10 08:32:47] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:32:47] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'GO:GO:0005509', name:'calcium ion binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005794', name:'Golgi apparatus', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0015012', name:'heparan sulfate proteoglycan biosynthetic process', type:'biological_process' }), (:BioEntity { id:'GO:GO:0042803', name:'protein homodimerization activity', type:'biological_process' }), (:BioEntity { id:'GO:GO:0047464', name:'heparosan-N-sulfate-glucuronate 5-epimerase activity', type:'biological_process' })]
GO terms for ACSL5


INFO: [2024-07-10 08:32:48] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:32:48] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'GO:GO:0001676', name:'long-chain fatty acid metabolic process', type:'biological_process' }), (:BioEntity { id:'GO:GO:0004467', name:'long-chain fatty acid-CoA ligase activity', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005515', name:'protein binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005654', name:'nucleoplasm', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0005730', name:'nucleolus', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0005739', name:'mitochondrion', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0005783', name:'endoplasmic reticulum', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0005886', name:'plasma membrane', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0010747', name:'positive regulation of long-chain fatty acid import across plasma membrane', type:'biological_process' }), (:BioEntity { id:'GO:GO:0016020', name:'membrane', type:'cellular_location' }), (:BioEntity { id:'GO:GO:00353

INFO: [2024-07-10 08:32:48] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:32:48] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'GO:GO:0001942', name:'hair follicle development', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005515', name:'protein binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005886', name:'plasma membrane', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0017147', name:'Wnt-protein binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0030178', name:'negative regulation of Wnt signaling pathway', type:'biological_process' }), (:BioEntity { id:'GO:GO:0042802', name:'identical protein binding', type:'biological_process' })]
GO terms for ADAMTSL2


INFO: [2024-07-10 08:32:49] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:32:49] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'GO:GO:0005515', name:'protein binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0030512', name:'negative regulation of transforming growth factor beta receptor signaling pathway', type:'biological_process' }), (:BioEntity { id:'GO:GO:0031012', name:'extracellular matrix', type:'cellular_location' })]
GO terms for CALML3


INFO: [2024-07-10 08:32:49] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:32:49] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'GO:GO:0005509', name:'calcium ion binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005515', name:'protein binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0070062', name:'extracellular exosome', type:'cellular_location' })]
GO terms for CEMIP2


INFO: [2024-07-10 08:32:50] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:32:50] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'GO:GO:0005509', name:'calcium ion binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005730', name:'nucleolus', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0005829', name:'cytosol', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0005886', name:'plasma membrane', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0043231', name:'intracellular membrane-bounded organelle', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0045296', name:'cadherin binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0070062', name:'extracellular exosome', type:'cellular_location' })]
GO terms for AMOT


INFO: [2024-07-10 08:32:51] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:32:51] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'GO:GO:0001525', name:'angiogenesis', type:'biological_process' }), (:BioEntity { id:'GO:GO:0001726', name:'ruffle', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0003365', name:'establishment of cell polarity involved in ameboidal cell migration', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005515', name:'protein binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005654', name:'nucleoplasm', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0005884', name:'actin filament', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0005886', name:'plasma membrane', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0005923', name:'bicellular tight junction', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0008180', name:'COP9 signalosome', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0009897', name:'external side of plasma membrane', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0016525', name:'negative regulation of 

INFO: [2024-07-10 08:32:51] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:32:51] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[]
GO terms for PLA2G4A


INFO: [2024-07-10 08:32:52] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:32:52] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'GO:GO:0000139', name:'Golgi membrane', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0001516', name:'prostaglandin biosynthetic process', type:'biological_process' }), (:BioEntity { id:'GO:GO:0004622', name:'lysophospholipase activity', type:'biological_process' }), (:BioEntity { id:'GO:GO:0004623', name:'phospholipase A2 activity', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005509', name:'calcium ion binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005544', name:'calcium-dependent phospholipid binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005634', name:'nucleus', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0005635', name:'nuclear envelope', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0005737', name:'cytoplasm', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0005783', name:'endoplasmic reticulum', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0005794', name:'Golgi apparatus', type:'c

INFO: [2024-07-10 08:32:52] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:32:52] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'GO:GO:0005509', name:'calcium ion binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005515', name:'protein binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005730', name:'nucleolus', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0005783', name:'endoplasmic reticulum', type:'cellular_location' })]
GO terms for TTC9


INFO: [2024-07-10 08:32:53] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:32:53] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[]
GO terms for FABP4


INFO: [2024-07-10 08:32:53] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:32:53] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'GO:GO:0005504', name:'fatty acid binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005634', name:'nucleus', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0005829', name:'cytosol', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0015908', name:'fatty acid transport', type:'biological_process' }), (:BioEntity { id:'GO:GO:0070062', name:'extracellular exosome', type:'cellular_location' })]
GO terms for GPCPD1


INFO: [2024-07-10 08:32:54] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:32:54] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'GO:GO:0046475', name:'glycerophospholipid catabolic process', type:'biological_process' }), (:BioEntity { id:'GO:GO:0047389', name:'glycerophosphocholine phosphodiesterase activity', type:'biological_process' })]
GO terms for VSNL1


INFO: [2024-07-10 08:32:55] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:32:55] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'GO:GO:0005509', name:'calcium ion binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005515', name:'protein binding', type:'biological_process' })]
GO terms for CRYBB1


INFO: [2024-07-10 08:32:55] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:32:55] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'GO:GO:0002088', name:'lens development in camera-type eye', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005212', name:'structural constituent of eye lens', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005515', name:'protein binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0007601', name:'visual perception', type:'biological_process' })]
GO terms for PDZD8


INFO: [2024-07-10 08:32:56] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2024-07-10 08:32:56] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


[(:BioEntity { id:'GO:GO:0005515', name:'protein binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005789', name:'endoplasmic reticulum membrane', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0007010', name:'cytoskeleton organization', type:'biological_process' }), (:BioEntity { id:'GO:GO:0016020', name:'membrane', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0022604', name:'regulation of cell morphogenesis', type:'biological_process' }), (:BioEntity { id:'GO:GO:0044233', name:'mitochondria-associated endoplasmic reticulum membrane', type:'cellular_location' }), (:BioEntity { id:'GO:GO:0051560', name:'mitochondrial calcium ion homeostasis', type:'biological_process' }), (:BioEntity { id:'GO:GO:1990456', name:'mitochondrion-endoplasmic reticulum membrane tethering', type:'biological_process' })]
GO terms for FNDC3A
[(:BioEntity { id:'GO:GO:0003723', name:'RNA binding', type:'biological_process' }), (:BioEntity { id:'GO:GO:0005737', name:'cytoplasm', type:'

## Exploring Gene Enrichment
The INDRA discrete gene list analysis reports upstream entities, GO terms, and Reactome/WikiPathways pathways for the list of proteins as a whole, but not in relation to CTNNB1. The Reactome/WikiPathways results have already been analyzed; the upstream entities and GO terms have not. In the upstream dataframe, some of the chemicals have the same “footprint” as b-catenin, and it also contains proteins that are intermediates of the Wnt signaling pathway and regulate proteins of interest.

In [103]:
# this method uses the indra_upstream csv to get a dataframe that is the intersection of the upstream molecules 
# and the bioentities that CTNNB1 has direct INDRA relationships with

def shared_entities(upstream_csv="/Users/ariaagarwal/Desktop/discrete.csv", entities_df=None):
    """
    Intersect the INDRA upstream analysis results with the bioentities related to CTNNB1.

    Parameters
    ----------
    upstream_csv: path of the CSV downloaded from the gene list analysis; it must
                  contain a "Name" column. Defaults to the original desktop location.
    entities_df: dataframe with a "name" column to intersect with. When omitted, the
                 notebook-level protein_df is used (the bioentities that CTNNB1 has
                 a direct INDRA relationship with).

    Returns
    -------
    shared_proteins: list of names found both in the upstream CSV and in entities_df
                     (built from a set, so its order is arbitrary)
    shared_indra: rows of the upstream dataframe whose "Name" is in shared_proteins,
                  grouped in the same order as shared_proteins, with the original row
                  labels kept in an "index" column. When nothing is shared this is an
                  empty dataframe with the same columns instead of an error.

    """
    if entities_df is None:
        entities_df = protein_df

    # downloaded the gene list analysis as a csv
    indra_upstream_df = pd.read_csv(upstream_csv)

    # names shared between the upstream results and the comparison dataframe
    shared_proteins = list(
        set(indra_upstream_df["Name"].values).intersection(entities_df["name"].values)
    )

    # This filters indra_upstream_df; entities_df could be filtered the same way
    # if the rows of that dataframe are wanted instead.
    matches = [
        indra_upstream_df[indra_upstream_df["Name"] == name]
        for name in shared_proteins
    ]
    if matches:
        # concatenate once, after all per-name slices have been collected
        shared_indra = pd.concat(matches)
    else:
        # pd.concat rejects an empty list, so return an empty slice that keeps the columns
        shared_indra = indra_upstream_df.iloc[0:0]
    shared_indra = shared_indra.reset_index()

    # NOTE: to narrow the result to one kind of bioentity (e.g. protein_family_complex,
    # small_molecule), filter on a "type" column of a dataframe that carries one
    # (presumably protein_df -- confirm before relying on it).

    return shared_proteins, shared_indra
    
shared_proteins, shared_indra = shared_entities()
shared_indra

Unnamed: 0,index,CURIE,Name,p-value,q-value
0,113,hgnc:21231,DACT2,6.050000e-05,0.026000
1,69,hgnc:11838,TLE2,1.590000e-05,0.010800
2,24,chebi:4031,cyclosporin A,2.030000e-06,0.003950
3,88,hgnc:3236,EGFR,2.920000e-05,0.016100
4,5,hgnc:6551,LEF1,1.540000e-08,0.000126
...,...,...,...,...,...
101,58,hgnc:16265,WNT5B,9.650000e-06,0.007900
102,148,hgnc:9449,PRNP,1.130000e-04,0.037200
103,153,hgnc:8528,OXT,1.240000e-04,0.039200
104,122,hgnc:2500,CCN2,7.770000e-05,0.030200


In [104]:
# this method finds the shared go terms between what the analysis returns and CTNNB1s GO terms
# the csv did have other rows that mention the wnt signaling pathway which could be of interest

def finding_protein_complexes(go_csv="/Users/ariaagarwal/Desktop/goterms.csv", target_go_terms=None):
    """
    Keep the GO-term rows of the gene analysis that are also GO terms of CTNNB1.

    Parameters
    ----------
    go_csv: path of the GO-term CSV exported from the gene analysis; it must contain
            a "CURIE" column. Defaults to the original desktop location.
    target_go_terms: iterable of GO CURIEs to match against. When omitted, the
                     notebook-level ctnnb1_go list is used.

    Returns
    -------
    shared_df: rows of the GO-term dataframe whose "CURIE" appears in target_go_terms,
               in the order they have in the CSV and with their original row labels.
               Empty (same columns) when no GO term is shared.

    """
    if target_go_terms is None:
        target_go_terms = ctnnb1_go

    go_terms_df = pd.read_csv(go_csv)
    # A single boolean mask selects every shared row at once; unlike concatenating
    # per-term slices it also works when there is no overlap at all.
    is_shared = go_terms_df["CURIE"].isin(set(target_go_terms))
    shared_df = go_terms_df[is_shared]
    return shared_df
shared_df = finding_protein_complexes()
shared_df


Unnamed: 0,CURIE,Name,p-value,q-value
1,go:0005515,protein binding,3.18e-07,0.00219
11,go:1990907,beta-catenin-TCF complex,2.25e-05,0.0259


In [105]:
# Load the Reactome and WikiPathways results exported from the gene list analysis
# and stack them into one dataframe (each file's rows keep their own row labels).
# No further analysis is done here because shared pathways were already explored.

reactome_df = pd.read_csv("/Users/ariaagarwal/Desktop/reactome.csv")
wikipathways_df = pd.read_csv("/Users/ariaagarwal/Desktop/wikipathways.csv")
pathways_df = pd.concat([reactome_df, wikipathways_df])
pathways_df

Unnamed: 0,CURIE,Name,p-value,q-value
0,reactome:R-HSA-4411364,Binding of TCF/LEF:CTNNB1 to target gene promo...,8.1e-06,0.00912
1,reactome:R-HSA-8951430,RUNX3 regulates WNT signaling,8.1e-06,0.00912
2,reactome:R-HSA-1483115,Hydrolysis of LPC,1.04e-05,0.00912
3,reactome:R-HSA-9762292,Regulation of CDH11 function,1.59e-05,0.0104
4,reactome:R-HSA-9796292,Formation of axial mesoderm,2.63e-05,0.0138
5,reactome:R-HSA-9754189,Germ layer formation at gastrulation,3.92e-05,0.0172
6,reactome:R-HSA-9733709,Cardiogenesis,0.000101,0.0319
7,reactome:R-HSA-9759475,Regulation of CDH11 Expression and Function,0.000109,0.0319
8,reactome:R-HSA-525793,Myogenesis,0.000117,0.0319
9,reactome:R-HSA-9764260,Regulation of Expression and Function of Type ...,0.000134,0.0319
