In [1]:
import pandas as pd
import numpy as np
import multiprocessing as mp
import os
from neo4j import GraphDatabase, basic_auth
from tqdm import tqdm


In [2]:
# Connection settings for the SPOKE Neo4j instance.
# Credentials can be overridden through the SPOKE_USER / SPOKE_PASSWORD
# environment variables so they do not have to live in the notebook.
SPOKE_USER = os.environ.get("SPOKE_USER", "neo4j")
# NOTE(review): the fallback password is kept only for backward compatibility;
# prefer exporting SPOKE_PASSWORD and removing this literal.
SPOKE_PASSWORD = os.environ.get("SPOKE_PASSWORD", "SPOKEdev")
# The port belongs directly after the host ("host:port"); the previous
# "host/:7687" form placed it in the URI path instead.
URI = "bolt://spokedev.cgl.ucsf.edu:7687"


In [219]:
def get_metapath_count(source_node, target_node, metapath_data):
    """Count the paths between an Organism and a Compound along one metapath.

    Parameters
    ----------
    source_node : str
        Value matched against the ``name`` property of the ``Organism`` node.
    target_node : str
        Value matched against the ``identifier`` property of the ``Compound`` node.
    metapath_data : mapping
        Must provide ``'nhops'`` (number of edges, >= 2) and the edge types under
        the keys ``'col_1'`` ... ``'col_<nhops>'``.

    Returns
    -------
    int
        Number of paths matching the metapath.

    Raises
    ------
    ValueError
        If ``nhops`` is smaller than 2 (the query pattern always contains a
        first and a last edge).
    """
    nhop = int(metapath_data['nhops'])
    if nhop < 2:
        raise ValueError("metapath must contain at least 2 hops, got %d" % nhop)
    edge_types = [metapath_data[f"col_{i}"] for i in range(1, nhop + 1)]

    # Relationship types cannot be bound as Cypher parameters, so they are
    # interpolated into the pattern; the node property values are bound as
    # parameters so quotes in a name cannot break the query.
    pattern = '(o:Organism{name:$source_name})-[:%s]->(n1)' % edge_types[0]
    for node_number, edge_type in enumerate(edge_types[1:-1], start=2):
        pattern += '-[:%s]-(n%d)' % (edge_type, node_number)
    pattern += '-[:%s]-(c:Compound{identifier:$target_id})' % edge_types[-1]
    # Let the database count the paths instead of shipping every path back.
    cypher = 'MATCH path=' + pattern + ' RETURN count(path) AS metapath_count'

    auth = basic_auth(SPOKE_USER, SPOKE_PASSWORD)
    sdb = GraphDatabase.driver(URI, auth=auth)
    try:
        with sdb.session() as session:
            with session.begin_transaction() as tx:
                record = tx.run(cypher, source_name=source_node, target_id=target_node).single()
                metapath_count = record['metapath_count']
    finally:
        # Always release the driver's connections, even when the query fails.
        sdb.close()
    return metapath_count


In [3]:
# selected_metapaths = [
#     {
#         'col_1' : 'ENCODES_OeP',
#         'col_2' : 'HAS_PhEC',
#         'col_3' : 'CATALYZES_ECcR',
#         'col_4' : 'PRODUCES_RpC'
#     },
#     {
#         'col_1' : 'ENCODES_OeP',
#         'col_2' : 'HAS_PhEC',
#         'col_3' : 'CATALYZES_ECcR',
#         'col_4' : 'CONSUMES_RcC'
#     },
#     {
#         'col_1' : 'ENCODES_OeP',
#         'col_2' : 'ENCODES_PGeP',
#         'col_3' : 'ENCODES_PGeP',
#         'col_4' : 'BINDS_CbP'
#     },
#     {
#         'col_1' : 'ENCODES_OeP',
#         'col_2' : 'INTERACTS_PiP',
#         'col_3' : 'BINDS_CbP'
#     },
#     {
#         'col_1' : 'ENCODES_OeP',
#         'col_2' : 'INTERACTS_PiP',
#         'col_3' : 'HAS_PhEC',
#         'col_4' : 'CATALYZES_ECcR',
#         'col_5' : 'PRODUCES_RpC'
#     },
#     {
#         'col_1' : 'ENCODES_OeP',
#         'col_2' : 'INTERACTS_PiP',
#         'col_3' : 'HAS_PhEC',
#         'col_4' : 'CATALYZES_ECcR',
#         'col_5' : 'CONSUMES_RcC'
#     },
#     {
#         'col_1' : 'ENCODES_OeP',
#         'col_2' : 'PARTOF_PDpP',
#         'col_3' : 'BINDS_CbPD'
#     },
#     {
#         'col_1' : 'ENCODES_OeP',
#         'col_2' : 'BINDS_CbP'
#     }
# ]

# org_cmp_metapath = pd.DataFrame(selected_metapaths)
# org_cmp_metapath.loc[:, "nhops"] = org_cmp_metapath.count(axis=1)

In [249]:

# Load the compound mapping, the SPOKE bacterial species table and the MiMeDB
# microbe list from the user's home directory.
home_dir = os.path.expanduser('~')
cmp_df = pd.read_csv(
    os.path.join(home_dir, 'UCSF/bcmm_project/bcmm_data/mappings/bcmm_compounds_combined_refined.csv')
)
# read_table defaults to a tab separator, which is what the .tsv file uses.
org_df = pd.read_table(
    os.path.join(home_dir, 'UCSF/bcmm_project/bcmm_data/mappings/spoke_bacteria_species.tsv')
)
microbe_from_mimedb_df = pd.read_csv(
    os.path.join(home_dir, 'UCSF/bcmm_project/bcmm_data/mimedb_microbes_v1.csv')
)


In [190]:
%%time

# Compound-Reaction edges
# Fetch the CONSUMES_RcC / PRODUCES_RpC edges pointing at the compounds listed
# in cmp_df.

# Bind the identifiers as a query parameter instead of formatting the Python
# list into the Cypher text; missing values are dropped and duplicates removed
# (membership in the IN list is unaffected).
rxn_cmp_compound_ids = cmp_df.spoke_identifier.dropna().unique().tolist()
query = '''
    MATCH(r:Reaction)-[r1]->(c:Compound)
    WHERE TYPE(r1) IN ['CONSUMES_RcC', 'PRODUCES_RpC']
    AND c.identifier IN $compound_ids
    RETURN r.identifier AS r_id, TYPE(r1) as rel_type, c.identifier AS c_id
'''

auth = basic_auth(SPOKE_USER, SPOKE_PASSWORD)
sdb = GraphDatabase.driver(URI, auth=auth)
try:
    with sdb.session() as session:
        with session.begin_transaction() as tx:
            result = tx.run(query, compound_ids=rxn_cmp_compound_ids)
            rxn_cmp = [(row['r_id'], row['rel_type'], row['c_id']) for row in result]
finally:
    # Close the driver even if the query raises.
    sdb.close()
# source = Reaction identifier, target = Compound identifier.
rxn_cmp_df = pd.DataFrame(rxn_cmp, columns=['source', 'relation', 'target'])

CPU times: user 79.5 ms, sys: 11.9 ms, total: 91.4 ms
Wall time: 388 ms


In [58]:
%%time

# EC-Reaction edges
# Fetch the CATALYZES_ECcR edges that point at the reactions found in rxn_cmp_df.

# The reaction identifiers are bound as a parameter (deduplicated first) rather
# than formatted into the query text.
ec_rxn_reaction_ids = rxn_cmp_df.source.dropna().unique().tolist()
query = '''
    MATCH(e:EC)-[r1:CATALYZES_ECcR]->(r:Reaction)
    WHERE r.identifier IN $reaction_ids
    RETURN e.identifier AS e_id, TYPE(r1) AS rel_type, r.identifier AS r_id
'''

auth = basic_auth(SPOKE_USER, SPOKE_PASSWORD)
sdb = GraphDatabase.driver(URI, auth=auth)
try:
    with sdb.session() as session:
        with session.begin_transaction() as tx:
            result = tx.run(query, reaction_ids=ec_rxn_reaction_ids)
            ec_rxn = [(row['e_id'], row['rel_type'], row['r_id']) for row in result]
finally:
    # Close the driver even if the query raises.
    sdb.close()
# source = EC identifier, target = Reaction identifier.
ec_rxn_df = pd.DataFrame(ec_rxn, columns=['source', 'relation', 'target'])


CPU times: user 50.4 ms, sys: 16.4 ms, total: 66.7 ms
Wall time: 387 ms


In [60]:
%%time

# Protein-EC
# Fetch the HAS_PhEC edges that point at the EC nodes found in ec_rxn_df.

# The EC identifiers are bound as a parameter (deduplicated first) rather than
# formatted into the query text.
prot_ec_ec_ids = ec_rxn_df.source.dropna().unique().tolist()
query = '''
    MATCH(p:Protein)-[r1:HAS_PhEC]->(e:EC)
    WHERE e.identifier IN $ec_ids
    RETURN p.identifier AS p_id, TYPE(r1) AS rel_type, e.identifier AS e_id
'''

auth = basic_auth(SPOKE_USER, SPOKE_PASSWORD)
sdb = GraphDatabase.driver(URI, auth=auth)
try:
    with sdb.session() as session:
        with session.begin_transaction() as tx:
            result = tx.run(query, ec_ids=prot_ec_ec_ids)
            prot_ec = [(row['p_id'], row['rel_type'], row['e_id']) for row in result]
finally:
    # Close the driver even if the query raises.
    sdb.close()
# source = Protein identifier, target = EC identifier.
prot_ec_df = pd.DataFrame(prot_ec, columns=['source', 'relation', 'target'])


CPU times: user 4.37 s, sys: 536 ms, total: 4.91 s
Wall time: 12.1 s


In [161]:
%%time

# Protein-EC (all edges)
# Unlike prot_ec_df, this query has no WHERE clause: it returns every HAS_PhEC
# edge in the graph.

query = '''
    MATCH(p:Protein)-[r1:HAS_PhEC]->(e:EC)
    RETURN p.identifier AS p_id, TYPE(r1) AS rel_type, e.identifier AS e_id
'''

auth = basic_auth(SPOKE_USER, SPOKE_PASSWORD)
sdb = GraphDatabase.driver(URI, auth=auth)
try:
    with sdb.session() as session:
        with session.begin_transaction() as tx:
            result = tx.run(query)
            prot_ec_total = [(row['p_id'], row['rel_type'], row['e_id']) for row in result]
finally:
    # Close the driver even if the query raises.
    sdb.close()
# source = Protein identifier, target = EC identifier.
prot_ec_total_df = pd.DataFrame(prot_ec_total, columns=['source', 'relation', 'target'])


CPU times: user 1min 59s, sys: 14.3 s, total: 2min 13s
Wall time: 5min 15s


In [167]:
%%time

# EC_Rxn (all edges)
# Unlike ec_rxn_df, this query has no WHERE clause: it returns every
# CATALYZES_ECcR edge in the graph.

query = '''
    MATCH(e:EC)-[r1:CATALYZES_ECcR]->(r:Reaction)
    RETURN e.identifier AS e_id, TYPE(r1) AS rel_type, r.identifier AS r_id
'''

auth = basic_auth(SPOKE_USER, SPOKE_PASSWORD)
sdb = GraphDatabase.driver(URI, auth=auth)
try:
    with sdb.session() as session:
        with session.begin_transaction() as tx:
            result = tx.run(query)
            ec_rxn_total = [(row['e_id'], row['rel_type'], row['r_id']) for row in result]
finally:
    # Close the driver even if the query raises.
    sdb.close()
# source = EC identifier, target = Reaction identifier.
ec_rxn_total_df = pd.DataFrame(ec_rxn_total, columns=['source', 'relation', 'target'])


CPU times: user 440 ms, sys: 56.7 ms, total: 497 ms
Wall time: 1.12 s


In [36]:
%%time

# Protein-Compound edges
# Fetch BINDS_CbP edges between the compounds in cmp_df and proteins whose
# org_ncbi_id matches one of the organisms in org_df.

# Both identifier lists are bound as query parameters rather than formatted
# into the query text. org_ncbi_id is compared against string ids, hence the
# astype('str').
prot_cmp_org_ids = org_df.spoke_id.astype('str').unique().tolist()
prot_cmp_compound_ids = cmp_df.spoke_identifier.dropna().unique().tolist()
query = '''
    MATCH(r:Protein)<-[r1:BINDS_CbP]-(c:Compound)
    WHERE r.org_ncbi_id IN $org_ids AND c.identifier IN $compound_ids
    RETURN r.identifier AS r_id, TYPE(r1) as rel_type, c.identifier AS c_id
'''

auth = basic_auth(SPOKE_USER, SPOKE_PASSWORD)
sdb = GraphDatabase.driver(URI, auth=auth)
try:
    with sdb.session() as session:
        with session.begin_transaction() as tx:
            result = tx.run(query, org_ids=prot_cmp_org_ids, compound_ids=prot_cmp_compound_ids)
            prot_cmp = [(row['r_id'], row['rel_type'], row['c_id']) for row in result]
finally:
    # Close the driver even if the query raises.
    sdb.close()
prot_cmp_df = pd.DataFrame(prot_cmp, columns=['source', 'relation', 'target'])
# The edge runs Compound -> Protein, so swap the labels: afterwards
# 'source' is the Compound identifier and 'target' the Protein identifier.
prot_cmp_df = prot_cmp_df.rename(columns={'source': 'target', 'target': 'source'})


CPU times: user 19.8 ms, sys: 4.49 ms, total: 24.3 ms
Wall time: 798 ms


In [40]:
%%time

# Organism-Protein
# Fetch the ENCODES_OeP edges leaving the organisms listed in org_df.

# The organism identifiers are bound as a query parameter rather than formatted
# into the query text; .tolist() yields plain Python values.
org_prot_org_ids = org_df.spoke_id.dropna().unique().tolist()
query = '''
    MATCH(r:Organism)-[r1:ENCODES_OeP]->(c:Protein)
    WHERE r.identifier IN $org_ids
    RETURN r.identifier AS r_id, TYPE(r1) as rel_type, c.identifier AS c_id
'''

auth = basic_auth(SPOKE_USER, SPOKE_PASSWORD)
sdb = GraphDatabase.driver(URI, auth=auth)
try:
    with sdb.session() as session:
        with session.begin_transaction() as tx:
            result = tx.run(query, org_ids=org_prot_org_ids)
            org_prot = [(row['r_id'], row['rel_type'], row['c_id']) for row in result]
finally:
    # Close the driver even if the query raises.
    sdb.close()
# source = Organism identifier, target = Protein identifier.
org_prot_df = pd.DataFrame(org_prot, columns=['source', 'relation', 'target'])


CPU times: user 8min 46s, sys: 1min 5s, total: 9min 52s
Wall time: 24min 59s


In [101]:
%%time

# Protein-Protein
# Fetch every INTERACTS_PiP edge in the graph (no filter).

query = '''
    MATCH(p1:Protein)-[r1:INTERACTS_PiP]->(p2:Protein)
    RETURN p1.identifier AS p1_id, TYPE(r1) as rel_type, p2.identifier AS p2_id
'''

auth = basic_auth(SPOKE_USER, SPOKE_PASSWORD)
sdb = GraphDatabase.driver(URI, auth=auth)
try:
    with sdb.session() as session:
        with session.begin_transaction() as tx:
            result = tx.run(query)
            prot_prot = [(row['p1_id'], row['rel_type'], row['p2_id']) for row in result]
finally:
    # Close the driver even if the query raises.
    sdb.close()
# source = p1 (edge start), target = p2 (edge end).
prot_prot_df = pd.DataFrame(prot_prot, columns=['source', 'relation', 'target'])


CPU times: user 49.8 s, sys: 5.98 s, total: 55.7 s
Wall time: 2min 13s


In [92]:
%%time

# Organism-Organism
# Fetch the ISA_OiO edges that point at the organisms listed in org_df.

# The organism identifiers are bound as a query parameter rather than formatted
# into the query text; .tolist() yields plain Python values.
org_org_org_ids = org_df.spoke_id.dropna().unique().tolist()
query = '''
    MATCH(r:Organism)<-[r1:ISA_OiO]-(c:Organism)
    WHERE r.identifier IN $org_ids
    RETURN r.identifier AS r_id, TYPE(r1) as rel_type, c.identifier AS c_id
'''

auth = basic_auth(SPOKE_USER, SPOKE_PASSWORD)
sdb = GraphDatabase.driver(URI, auth=auth)
try:
    with sdb.session() as session:
        with session.begin_transaction() as tx:
            result = tx.run(query, org_ids=org_org_org_ids)
            org_org = [(row['r_id'], row['rel_type'], row['c_id']) for row in result]
finally:
    # Close the driver even if the query raises.
    sdb.close()
# NOTE: 'source' holds r (the node the ISA_OiO edge points to) and 'target'
# holds c (the node the edge starts from).
org_org_df = pd.DataFrame(org_org, columns=['source', 'relation', 'target'])


CPU times: user 80 ms, sys: 17.2 ms, total: 97.2 ms
Wall time: 919 ms


In [55]:
%%time

# Metapath : ENCODES_OeP -> BINDS_CbP
# Both frames carry the Protein identifier in their 'target' column, so joining
# on it chains Organism -> Protein <- Compound.
org_prot_cmp_df = org_prot_df.merge(prot_cmp_df, on='target')

CPU times: user 13.4 s, sys: 2.6 s, total: 16 s
Wall time: 16.7 s


In [333]:
%%time

# Metapath : ENCODES_OeP -> HAS_PhEC -> CATALYZES_ECcR ->CONSUMES/PRODUCES_Rc/pC
#
# Final column layout (one node column after each relation column):
#   source_x (Organism), relation_x, target_x (Protein), relation_y,
#   target_y (EC), relation_z, target_z (Reaction), relation_z2,
#   target_z2 (Compound)

# Organism -> Protein -> EC; the duplicated Protein key from the right frame is dropped.
org_prot_ec_df = (
    org_prot_df
    .merge(prot_ec_df, left_on='target', right_on='source')
    .drop(columns='source_y')
)
# ... -> Reaction; the duplicated EC key from the right frame is dropped.
org_prot_ec_rxn_df = (
    org_prot_ec_df
    .merge(ec_rxn_df, left_on='target_y', right_on='source')
    .drop(columns='source')
)
# ... -> Compound. Both sides are renamed *before* merging: relying on the
# default '_x'/'_y' suffixes here would recreate the already-present
# relation_x/target_x/relation_y/target_y names, which pandas >= 2.0 rejects
# with a MergeError (older versions silently produced duplicate column names).
org_prot_ec_rxn_cmp_df = (
    org_prot_ec_rxn_df
    .rename(columns={'relation': 'relation_z', 'target': 'target_z'})
    .merge(
        rxn_cmp_df.rename(columns={'source': 'rxn_id_right', 'relation': 'relation_z2', 'target': 'target_z2'}),
        left_on='target_z',
        right_on='rxn_id_right',
    )
    .drop(columns='rxn_id_right')
)

# Split by the type of the final Reaction -> Compound edge.
org_prot_ec_rxn_cmp_df_consumes = org_prot_ec_rxn_cmp_df[org_prot_ec_rxn_cmp_df.relation_z2 == 'CONSUMES_RcC']
org_prot_ec_rxn_cmp_df_produces = org_prot_ec_rxn_cmp_df[org_prot_ec_rxn_cmp_df.relation_z2 == 'PRODUCES_RpC']



CPU times: user 13.5 s, sys: 2.57 s, total: 16.1 s
Wall time: 16.6 s




In [194]:
%%time

# Metapath : ENCODES_OeP -> INTERACTS_PiP -> HAS_PhEC -> CATALYZES_ECcR ->CONSUMES/PRODUCES_Rc/pC
#
# Final column layout (one node column after each relation column):
#   source_x (Organism), relation_x, target_x (Protein), relation_y,
#   target_y (Protein), relation_z, target_z (EC), relation_z2,
#   target_z2 (Reaction), relation_z3, target_z3 (Compound)

# prot_prot_df rows are directed (source -> target). The organism's protein is
# matched on either end so the interaction is traversed in both directions.
org_prot_prot_df_1 = (
    org_prot_df
    .merge(prot_prot_df, left_on='target', right_on='source')
    .drop(columns='source_y')
)
org_prot_prot_df_2 = pd.merge(org_prot_df, prot_prot_df, left_on='target', right_on='target')
org_prot_prot_df_2 = (
    org_prot_prot_df_2[['source_x', 'relation_x', 'target', 'relation_y', 'source_y']]
    .rename(columns={'target': 'target_x', 'source_y': 'target_y'})
)
org_prot_prot_df = pd.concat([org_prot_prot_df_1, org_prot_prot_df_2], ignore_index=True)

# Second protein -> EC (uses the unfiltered Protein-EC table).
org_prot_prot_ec_df = pd.merge(org_prot_prot_df, prot_ec_total_df, left_on='target_y', right_on='source')
# EC -> Reaction. Columns are renamed *before* merging: the default '_x'/'_y'
# suffixes would recreate the already-present source_x/relation_x/target_x/
# relation_y/target_y names, which pandas >= 2.0 rejects with a MergeError
# (older versions silently produced duplicate column names).
org_prot_prot_ec_rxn_df = (
    org_prot_prot_ec_df
    .drop(columns='source')  # duplicate of target_y (second protein)
    .rename(columns={'relation': 'relation_z', 'target': 'target_z'})
    .merge(
        ec_rxn_total_df.rename(columns={'source': 'ec_id_right', 'relation': 'relation_z2', 'target': 'target_z2'}),
        left_on='target_z',
        right_on='ec_id_right',
    )
    .drop(columns='ec_id_right')
)
# Reaction -> Compound.
org_prot_prot_ec_rxn_cmp_df = (
    org_prot_prot_ec_rxn_df
    .merge(rxn_cmp_df, left_on='target_z2', right_on='source')
    .drop(columns='source')
    .rename(columns={'relation': 'relation_z3', 'target': 'target_z3'})
)

# Split by the type of the final Reaction -> Compound edge.
org_prot_prot_ec_rxn_cmp_df_consumes = org_prot_prot_ec_rxn_cmp_df[org_prot_prot_ec_rxn_cmp_df.relation_z3 == 'CONSUMES_RcC']
org_prot_prot_ec_rxn_cmp_df_produces = org_prot_prot_ec_rxn_cmp_df[org_prot_prot_ec_rxn_cmp_df.relation_z3 == 'PRODUCES_RpC']


CPU times: user 33.7 s, sys: 11.6 s, total: 45.3 s
Wall time: 48 s




In [495]:
# Compounds reached by at least one of the three metapath tables.
compound_ids_all_metapaths = [
    *org_prot_cmp_df.source_y.unique(),
    *org_prot_ec_rxn_cmp_df.target_z2.unique(),
    *org_prot_prot_ec_rxn_cmp_df.target_z3.unique(),
]
final_compounds = list(set(compound_ids_all_metapaths))
cmp_df_selected = cmp_df[cmp_df.spoke_identifier.isin(final_compounds)]

# Organisms that start at least one path in any of the three metapath tables.
organism_ids_all_metapaths = [
    *org_prot_cmp_df.source_x.unique(),
    *org_prot_ec_rxn_cmp_df.source_x.unique(),
    *org_prot_prot_ec_rxn_cmp_df.source_x.unique(),
]
final_org = list(set(organism_ids_all_metapaths))
org_df_selected = org_df[org_df.spoke_id.isin(final_org)]
# org_df_selected = org_df_selected[org_df_selected.spoke_id.isin(microbe_from_mimedb_df.ncbi_tax_id)]


In [335]:
# Manually selected Organism -> Compound metapaths. Each metapath is a dict
# mapping 'col_<k>' to the k-th edge type along the path.
selected_metapaths = [
    {f'col_{position}': edge_type for position, edge_type in enumerate(edge_types, start=1)}
    for edge_types in (
        ('ENCODES_OeP', 'BINDS_CbP'),
        ('ENCODES_OeP', 'HAS_PhEC', 'CATALYZES_ECcR', 'CONSUMES_RcC'),
        ('ENCODES_OeP', 'HAS_PhEC', 'CATALYZES_ECcR', 'PRODUCES_RpC'),
        ('ENCODES_OeP', 'INTERACTS_PiP', 'HAS_PhEC', 'CATALYZES_ECcR', 'CONSUMES_RcC'),
        ('ENCODES_OeP', 'INTERACTS_PiP', 'HAS_PhEC', 'CATALYZES_ECcR', 'PRODUCES_RpC'),
    )
]

# One row per metapath; shorter metapaths leave the trailing col_<k> cells
# empty, so the per-row count of non-null cells is the number of hops.
org_cmp_metapath = pd.DataFrame(selected_metapaths)
org_cmp_metapath["nhops"] = org_cmp_metapath.count(axis=1)

# Path tables, node-label sequences and edge-type sequences for each metapath.
# All three dicts share the same keys, in the same order as selected_metapaths.
metapath_data_dict = {
    0 : org_prot_cmp_df,
    1 : org_prot_ec_rxn_cmp_df_consumes,
    2 : org_prot_ec_rxn_cmp_df_produces,
    3 : org_prot_prot_ec_rxn_cmp_df_consumes,
    4 : org_prot_prot_ec_rxn_cmp_df_produces
}
# Node labels along each metapath (one more entry than the number of edges).
metapath_node_sequence_dict = {
    0 : ['Organism', 'Protein', 'Compound'],
    1 : ['Organism', 'Protein', 'EC', 'Reaction', 'Compound'],
    2 : ['Organism', 'Protein', 'EC', 'Reaction', 'Compound'],
    3 : ['Organism', 'Protein', 'Protein', 'EC', 'Reaction', 'Compound'],
    4 : ['Organism', 'Protein', 'Protein', 'EC', 'Reaction', 'Compound']
}
# Edge types along each metapath. Entries 1-4 previously repeated the node
# labels from metapath_node_sequence_dict; they now list the edge types.
metapath_edge_sequence_dict = {
    0 : ['ENCODES_OeP', 'BINDS_CbP'],
    1 : ['ENCODES_OeP', 'HAS_PhEC', 'CATALYZES_ECcR', 'CONSUMES_RcC'],
    2 : ['ENCODES_OeP', 'HAS_PhEC', 'CATALYZES_ECcR', 'PRODUCES_RpC'],
    3 : ['ENCODES_OeP', 'INTERACTS_PiP', 'HAS_PhEC', 'CATALYZES_ECcR', 'CONSUMES_RcC'],
    4 : ['ENCODES_OeP', 'INTERACTS_PiP', 'HAS_PhEC', 'CATALYZES_ECcR', 'PRODUCES_RpC']
}

# import joblib
# joblib.dump(metapath_data_dict, os.path.join(os.path.expanduser('~'), 'UCSF/bcmm_project/bcmm_data/metapath_data_dict.joblib'))


In [370]:
%%time

# Compute edge-type specific node degree
#
# For every relation column of every metapath table, query the graph for the
# number of edges of that type attached to each node appearing on either side
# of the relation. The result has one row per (node_id, relation_type).

node_degree_frames = []
auth = basic_auth(SPOKE_USER, SPOKE_PASSWORD)
# One driver/session is shared by all queries instead of opening a new driver
# per query; it is closed in the finally block even if a query fails.
sdb = GraphDatabase.driver(URI, auth=auth)
try:
    with sdb.session() as session:
        for i in range(len(metapath_data_dict)):
            item = metapath_data_dict[i]
            if item.empty:
                # No paths for this metapath, so there is no relation type to read.
                continue
            node_columns = [element for element in list(item.columns) if 'relation' not in element]
            relation_columns = [element for element in list(item.columns) if 'relation' in element]
            for index, relation in enumerate(relation_columns):
                rel_type = item[relation].unique()[0]
                # The k-th relation connects the k-th and (k+1)-th node columns.
                for rel_index in (index, index + 1):
                    node_type = metapath_node_sequence_dict[i][rel_index]
                    node_column = node_columns[rel_index]
                    # .tolist() yields plain Python values; the ids are bound as a
                    # query parameter rather than repr-formatted into the query.
                    node_id_list = item[node_column].unique().tolist()
                    # Labels and relationship types cannot be query parameters,
                    # so they are still interpolated.
                    degree_query = '''
                        MATCH(n1:{})-[r:{}]-(n2)
                        WHERE n1.identifier IN $node_ids
                        RETURN n1.identifier AS n1_id, COUNT(r) AS degree
                    '''.format(node_type, rel_type)
                    with session.begin_transaction() as tx:
                        result = tx.run(degree_query, node_ids=node_id_list)
                        node_degree = [(row['n1_id'], rel_type, row['degree']) for row in result]
                    node_degree_frames.append(
                        pd.DataFrame(node_degree, columns=['node_id', 'relation_type', 'degree'])
                    )
finally:
    sdb.close()

# The same node/relation pair is queried from several metapaths; keep one row each.
node_edgetype_specific_degree = pd.concat(node_degree_frames, ignore_index=True).drop_duplicates()


CPU times: user 7.3 s, sys: 953 ms, total: 8.25 s
Wall time: 1min 13s


In [387]:
# node_edgetype_specific_degree.to_csv(os.path.join(os.path.expanduser('~'), 'UCSF/bcmm_project/bcmm_data/node_edgetype_specific_degree.csv'))

# Compute damped degree
# Each degree is raised to the (negative) damping exponent, so high-degree
# nodes receive a smaller value.
DAMPING_FACTOR = -0.4
node_edgetype_specific_degree['damped_degree'] = node_edgetype_specific_degree['degree'] ** DAMPING_FACTOR
node_edgetype_specific_degree

Unnamed: 0,node_id,relation_type,degree,damped_degree
0,1766,ENCODES_OeP,27249,0.016821
1,53502,ENCODES_OeP,7,0.459157
2,436114,ENCODES_OeP,1712,0.050886
3,B2V8E3,ENCODES_OeP,1,1.000000
4,O52691,ENCODES_OeP,1,1.000000
...,...,...,...,...
327018,inchikey:IAKHMKGGTNLKSZ-INIZCTEOSA-N,PRODUCES_RpC,1,1.000000
327019,inchikey:OEYIOHPDSNJKLS-UHFFFAOYSA-N,PRODUCES_RpC,27,0.267581
327020,inchikey:QIVBCDIJIAJPQS-VIFPVBQESA-N,PRODUCES_RpC,11,0.383215
327021,inchikey:UEDUENGHJMELGK-HYDKPPNVSA-N,PRODUCES_RpC,2,0.757858


In [414]:
%%time

# Compute PDP
#
# For every path (row) of every metapath table, look up the damped degree of
# the node on each side of each relation and multiply them all together.
# The result is stored in a 'pdp' column.

# Only the join keys and the damped degree are needed from the degree table.
degree_lookup = node_edgetype_specific_degree[['node_id', 'relation_type', 'damped_degree']]

metapath_data_dict_with_degree = {}
for i in range(len(metapath_data_dict)):
    item = metapath_data_dict[i]
    node_columns = [element for element in list(item.columns) if 'relation' not in element]
    relation_columns = [element for element in list(item.columns) if 'relation' in element]
    df1 = item
    damped_degree_columns = []
    for rel_index, relation_column in enumerate(relation_columns):
        # The k-th relation connects the k-th ('src') and (k+1)-th ('dst') node columns.
        for side, node_column in (('src', node_columns[rel_index]), ('dst', node_columns[rel_index + 1])):
            # Give every looked-up value its own column name. Merging the same
            # 'damped_degree' column repeatedly makes the default '_x'/'_y'
            # suffixes collide, which pandas >= 2.0 rejects with a MergeError.
            damped_column = f'damped_degree_{rel_index}_{side}'
            df1 = (
                df1
                .merge(
                    degree_lookup.rename(columns={'damped_degree': damped_column}),
                    left_on=[node_column, relation_column],
                    right_on=['node_id', 'relation_type'],
                )
                .drop(columns=['node_id', 'relation_type'])
            )
            damped_degree_columns.append(damped_column)
    # assign() returns a new frame, so the source table is never mutated.
    df1 = df1.assign(pdp=df1[damped_degree_columns].prod(axis=1))
    metapath_data_dict_with_degree[i] = df1
        





CPU times: user 3.61 s, sys: 536 ms, total: 4.15 s
Wall time: 4.24 s


In [445]:
# Sum the per-path 'pdp' values over all paths sharing the same first and last
# node; one frame per metapath with columns source, target, dwpc.
dwpc = []
for i in range(len(metapath_data_dict)):
    endpoint_candidates = [name for name in metapath_data_dict[i].columns if 'relation' not in name]
    first_node_col, last_node_col = endpoint_candidates[0], endpoint_candidates[-1]
    pdp_sums = (
        metapath_data_dict_with_degree[i]
        .groupby([first_node_col, last_node_col])['pdp']
        .sum()
        .reset_index()
    )
    dwpc.append(
        pdp_sums.rename(columns={first_node_col: 'source', last_node_col: 'target', 'pdp': 'dwpc'})
    )
    



In [520]:
# Discard the index left over from filtering and expose a fresh 0..n-1
# position as an explicit column (org_index / cmp_index).
org_df_selected_with_index = (
    org_df_selected
    .reset_index()
    .drop(columns='index')
    .reset_index()
    .rename(columns={'index': 'org_index'})
)
cmp_df_selected_with_index = (
    cmp_df_selected
    .reset_index()
    .drop(columns='index')
    .reset_index()
    .rename(columns={'index': 'cmp_index'})
)



In [538]:
# Dense tensor indexed as [organism position, compound position, metapath];
# cells without any path stay at 0.
n_metapaths = len(metapath_data_dict)
dwpc_mat = np.zeros((org_df_selected.shape[0], cmp_df_selected.shape[0], n_metapaths))
for i in range(n_metapaths):
    # Translate source/target identifiers into row/column positions.
    dwpc_feature_df = (
        dwpc[i]
        .merge(org_df_selected_with_index, left_on='source', right_on='spoke_id')
        .drop(columns='source')
        .merge(cmp_df_selected_with_index, left_on='target', right_on='spoke_identifier')
        .drop(columns='target')
    )
    org_positions = dwpc_feature_df['org_index'].to_numpy().astype(int)
    cmp_positions = dwpc_feature_df['cmp_index'].to_numpy().astype(int)
    dwpc_mat[org_positions, cmp_positions, i] = dwpc_feature_df['dwpc'].to_numpy()


In [557]:
# Collapse the metapath axis, keeping the largest value per organism/compound pair.
dwpc_mat_2d = dwpc_mat.max(axis=2)


0.0011271774079332716

In [588]:
# np.save(os.path.join(os.path.expanduser('~'), 'UCSF/bcmm_project/bcmm_data/dwpc_mat_3d.npy'), dwpc_mat, allow_pickle=False)
# np.save(os.path.join(os.path.expanduser('~'), 'UCSF/bcmm_project/bcmm_data/dwpc_mat_2d.npy'), dwpc_mat_2d, allow_pickle=False)

# Persist the index <-> identifier lookup tables for the matrix axes.
home_dir = os.path.expanduser('~')
for lookup_frame, relative_path in (
    (cmp_df_selected_with_index, 'UCSF/bcmm_project/bcmm_data/cmp_df_selected_with_index.csv'),
    (org_df_selected_with_index, 'UCSF/bcmm_project/bcmm_data/org_df_selected_with_index.csv'),
):
    lookup_frame.to_csv(os.path.join(home_dir, relative_path))


In [559]:
# Percentage of organism/compound cells whose value is exactly zero.
zero_cell_count = dwpc_mat_2d.size - np.count_nonzero(dwpc_mat_2d)
sparsity_percentage = (zero_cell_count / dwpc_mat_2d.size) * 100
print(sparsity_percentage)

82.92777317063539


In [761]:
# Rank organisms by their value in dwpc_mat_2d for a single compound.
cmp_name = 'ursodiol'
matching_cmp_index = cmp_df_selected_with_index.loc[
    cmp_df_selected_with_index.compound_name == cmp_name, 'cmp_index'
]
if matching_cmp_index.empty:
    # Fail with a readable message instead of a bare IndexError.
    raise ValueError(f"compound {cmp_name!r} not found in cmp_df_selected_with_index")
# If several rows share the name, the first one is used.
column_ind = matching_cmp_index.iloc[0]
# NOTE: despite its name this frame holds the *organisms*, with the selected
# compound's matrix column attached as 'dwpc'. The name is kept in case other
# cells refer to it.
cmp_df_selected_with_index_copy = org_df_selected_with_index.assign(dwpc=dwpc_mat_2d[:, column_ind])
cmp_df_selected_with_index_copy.sort_values(by='dwpc', ascending=False)


Unnamed: 0,org_index,spoke_id,spoke_name,dwpc
865,865,29369,Clostridium sardiniense,0.125283
3642,3642,29347,[Clostridium] scindens,0.033690
887,887,74426,Collinsella aerofaciens,0.003949
0,0,1817405,Abyssicoccus albus,0.000000
2451,2451,484770,Pelosinus sp. UFO1,0.000000
...,...,...,...,...
1227,1227,1260,Finegoldia magna,0.000000
1228,1228,1752063,Fischerella sp. NIES-3754,0.000000
1229,1229,373891,Flammeovirga kamogawensis,0.000000
1230,1230,2494373,Flammeovirga pectinis,0.000000


In [725]:
# org_cmp_metapath.to_csv(os.path.join(os.path.expanduser('~'), 'UCSF/bcmm_project/bcmm_data/org_cmp_manually_selected_metapath.csv'), index=False, header=True)

