In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from paths import *
from neo4j import GraphDatabase, basic_auth

In [2]:
def get_names_from_inchikeys(inchikeys, timeout=30):
    """Look up IUPAC names for a batch of InChIKeys through PubChem PUG REST.

    All keys are sent in a single request, joined with commas in the URL path.

    Parameters
    ----------
    inchikeys : iterable of str
        InChIKeys to look up.
    timeout : float, optional
        Seconds to wait for PubChem before giving up (default 30).

    Returns
    -------
    list
        One entry per record in the PubChem response, in response order:
        the record's ``IUPACName``, or ``None`` when the record has none.
        An empty input yields an empty list without any network call.
        NOTE(review): the result follows the order and length of the
        response, which is not guaranteed here to line up one-to-one with
        ``inchikeys`` -- confirm before zipping the two together.

    Raises
    ------
    requests.HTTPError
        If PubChem answers with an error status.
    KeyError
        If the response JSON lacks ``PropertyTable`` / ``Properties``.
    """
    inchikeys = list(inchikeys)
    if not inchikeys:
        # Nothing to look up; avoids building a malformed URL.
        return []

    # Imported here because no import cell visible in this notebook brings
    # `requests` into scope explicitly.
    import requests

    base_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'
    url = f"{base_url}/compound/inchikey/{','.join(inchikeys)}/property/IUPACName/JSON"
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of tripping over a missing
    # 'PropertyTable' key further down.
    response.raise_for_status()
    data = response.json()
    # `.get` returns None for records without a name, without hiding
    # unrelated errors the way a bare `except:` would.
    return [record.get('IUPACName') for record in data['PropertyTable']['Properties']]



In [3]:
# Per-compound degree table; later cells read its `node_id` and `degree` columns.
filepath = "../spoke_35M_data/compound_degree.csv"
df = pd.read_csv(filepath_or_buffer=filepath)

In [4]:
# Keep only the text after the last "Compound:" marker in each node id;
# ids without the marker are left as they are.
df["node_id"] = df["node_id"].apply(
    lambda raw_id: raw_id.split("Compound:")[-1]
)
df.shape

(562200, 2)

In [5]:
# Degree cut-off: the 99th percentile of the degree distribution.
degree_percentile = 99
threshold = np.percentile(df['degree'], degree_percentile)

# Keep only compounds whose degree is strictly above the cut-off.
is_above_threshold = df['degree'] > threshold
filtered_df = df[is_above_threshold]

n_selected = filtered_df.shape[0]
lowest_degree = filtered_df.degree.min()
highest_degree = filtered_df.degree.max()
print("There are {} Compounds with degree greater than the selected threshold".format(n_selected))
print("Minimum degree of that group = {}".format(lowest_degree))
print("Maximum degree of that group = {}".format(highest_degree))

There are 5553 Compounds with degree greater than the selected threshold
Minimum degree of that group = 786
Maximum degree of that group = 494633


In [13]:
%%time

# Fetch the name of every high-degree compound from the graph database.
# The identifier list is passed as a query parameter ($identifiers) rather
# than being formatted into the Cypher text, so quoting inside identifiers
# cannot break or alter the query.
query = """
            MATCH(n:Compound)
            WHERE n.identifier IN $identifiers
            RETURN n.identifier AS n_id, n.name AS n_name
"""

# Plain Python list of the distinct node ids selected above.
compound_identifiers = filtered_df["node_id"].unique().tolist()

# NOTE(review): SPOKE_USER, SPOKE_PASSWORD and URI are not defined in the
# visible cells; presumably they come from `from paths import *` -- confirm.
auth = basic_auth(SPOKE_USER, SPOKE_PASSWORD)
sdb = GraphDatabase.driver(URI, auth=auth)
node_list = []
try:
    with sdb.session() as session:
        with session.begin_transaction() as tx:
            result = tx.run(query, {"identifiers": compound_identifiers})
            # Collect (identifier, name) pairs for the DataFrame built later.
            for row in result:
                node_list.append((row["n_id"], row["n_name"]))
finally:
    # Release the driver even when the query fails.
    sdb.close()


CPU times: user 216 ms, sys: 26.1 ms, total: 242 ms
Wall time: 1.44 s


In [15]:
# One row per (identifier, name) pair returned by the graph query.
node_df = pd.DataFrame(node_list, columns=["node_id", "node_name"])

# Attach names to the high-degree compounds, joining on the shared `node_id`.
filtered_df_with_name_and_degree = filtered_df.merge(node_df, on="node_id")

# Persist the annotated table without the index column.
output_path = "../spoke_35M_data/compound_degree_greater_than_99_percentile.csv"
filtered_df_with_name_and_degree.to_csv(output_path, index=False, header=True)


In [19]:
# Compound names singled out for removal; this cell only displays their rows.
compounds_to_remove = ["Hydron", "Water", "Hydrogen", "Water O-15", "Oxygen"]

is_marked_for_removal = filtered_df_with_name_and_degree["node_name"].isin(compounds_to_remove)
filtered_df_with_name_and_degree[is_marked_for_removal]


Unnamed: 0,node_id,degree,node_name
137,inchikey:MYMOFIZGZYHOMD-UHFFFAOYSA-N,64493,Oxygen
307,inchikey:XLYOFNOQVPJJNP-UHFFFAOYSA-N,267316,Water
3586,inchikey:UFHFLCQGNIYNRP-UHFFFAOYSA-N,262288,Hydrogen
3902,inchikey:GPRLSGONYQIRFK-UHFFFAOYSA-N,274946,Hydron
4107,inchikey:XLYOFNOQVPJJNP-BJUDXGSMSA-N,255874,Water O-15
