<a href="https://colab.research.google.com/github/lbdlab/NGS-LBD/blob/master/notebooks/LBD_Bilirubin_Modulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **LBD - Bilirubin Modulation**

In [None]:
!pip install neo4j pandas seaborn matplotlib biopython py2neo

Collecting py2neo
[?25l  Downloading https://files.pythonhosted.org/packages/ab/66/398d7b1e612b6bb0977f4be90dab246f1140549f3c24fc5da539d6b2282a/py2neo-2021.1.5-py2.py3-none-any.whl (204kB)
[K     |█▋                              | 10kB 15.6MB/s eta 0:00:01[K     |███▏                            | 20kB 20.8MB/s eta 0:00:01[K     |████▉                           | 30kB 23.1MB/s eta 0:00:01[K     |██████▍                         | 40kB 25.3MB/s eta 0:00:01[K     |████████                        | 51kB 27.0MB/s eta 0:00:01[K     |█████████▋                      | 61kB 28.2MB/s eta 0:00:01[K     |███████████▏                    | 71kB 29.4MB/s eta 0:00:01[K     |████████████▉                   | 81kB 29.2MB/s eta 0:00:01[K     |██████████████▍                 | 92kB 25.8MB/s eta 0:00:01[K     |████████████████                | 102kB 25.7MB/s eta 0:00:01[K     |█████████████████▋              | 112kB 25.7MB/s eta 0:00:01[K     |███████████████████▏            | 122kB

In [None]:
# Libraries
%matplotlib inline
from datetime import datetime
import pandas as pd
# from neo4j import GraphDatabase
from py2neo import Graph



In [None]:
# Search in PubMed

from Bio import Entrez

Entrez.email = 'ziberna@gmail.com'


In [None]:
# # Connect to Neo4j
# database_name = 'semmed4321'
# driver = GraphDatabase.driver('neo4j://212.235.239.171:7687', auth=('neo4j', 'semMedDb2020'))

# def read_query(query, params={}):
#     with driver.session(database=database_name) as session:
#         result = session.run(query, params)
#         return pd.DataFrame([r.values() for r in result], columns=result.keys())

In [None]:
# Custom functions

# Connect to graph function
def get_graph_instance(*,
                       uri: str = "bolt://212.235.239.171:7687",
                       user: str = 'neo4j',
                       password: str = 'semMedDb2020',
                       name: str = 'semmed4321'):
    """
    Open a py2neo ``Graph`` connection to the Neo4j database.

    Called without arguments it connects exactly as before; every setting
    can be overridden by keyword to target a different server or database.

    :param uri: Bolt URI of the Neo4j server
    :param user: user name used for authentication
    :param password: password used for authentication
    :param name: name of the database to open on the server
    :return: the ``Graph`` object built from these settings
    """
    # NOTE(review): the default credentials are embedded in the notebook;
    # consider supplying them from outside the source before sharing it.
    return Graph(uri, user=user, password=password, name=name)

# Create PubMed link
def generate_pubmed_url(*, pmid: int) -> str:
    """Build the PubMed article URL that belongs to a PMID.

    :param pmid: PubMed ID number of an article
    :return: URL string made of the PubMed base address followed by the PMID
    """
    return f'https://pubmed.ncbi.nlm.nih.gov/{pmid!s}'

# Create dataframe and html file from output
def _extract_journal_data(article) -> dict:
    """
    Collect the PMID and journal fields of one parsed PubMed article record.

    Each journal field is looked up on its own, so a record that lacks e.g.
    the ISSN still keeps its title; only the missing field becomes
    'Not found'.

    :param article: one element of ``record['PubmedArticle']``
    :return: dict with keys 'pmid', 'journal_title',
             'journal_title_abbreviation' and 'journal_issn'
    """
    citation = article['MedlineCitation']
    pmid = str(citation['PMID'])

    try:
        journal = citation['Article']['Journal']
    except (KeyError, TypeError):
        journal = {}

    entry = {'pmid': pmid}
    incomplete = False
    for column, tag in (('journal_title', 'Title'),
                        ('journal_title_abbreviation', 'ISOAbbreviation'),
                        ('journal_issn', 'ISSN')):
        try:
            entry[column] = str(journal[tag])
        except (KeyError, TypeError):
            entry[column] = 'Not found'
            incomplete = True

    if incomplete:
        print('Not found PubMed data for PMID: ' + pmid)

    return entry


def create_df_html_from_query_output(*, output: list, html_file_name: str) -> pd.DataFrame:
    """
    Create dataframe and save html file from Neo4J query results data.

    The rows are enriched with the journal title and ISSN fetched from PubMed
    (via Entrez) and with a PubMed URL built from the 'c_xy.pmid' column.
    When ``output`` is empty the PubMed lookup is skipped and an empty table
    is written.

    :param output: data output of a specific Neo4J query
    :param html_file_name: filename of the saved html file
    :return: dataframe of the output
    :raises KeyError: if a non-empty output has no 'c_xy.pmid' column
    """
    import os

    # Create dataframe from output
    df = pd.DataFrame(output)

    if df.empty:
        # Nothing to look up; an efetch call without ids would fail.
        print('Query output is empty; skipping PubMed lookup.')
    else:
        # Use string PMIDs throughout so the lookup keys match the keys
        # built from the Entrez records regardless of the column dtype.
        pmid_keys = df['c_xy.pmid'].map(str)

        # Ask PubMed for every PMID only once, keeping first-seen order.
        unique_pmids = list(dict.fromkeys(pmid_keys))

        handle = Entrez.efetch(db='pubmed', id=unique_pmids, retmode="xml")
        try:
            record = Entrez.read(handle)
        finally:
            handle.close()

        # Extract journal data, keyed by PMID
        pmid_dict = {}
        for article in record['PubmedArticle']:
            entry = _extract_journal_data(article)
            pmid_dict[entry['pmid']] = entry

        # Add journal data to dataframe
        df['journal_title'] = pmid_keys.apply(
            lambda x: add_values_from_dict(input_dict=pmid_dict, key1=x, key2='journal_title'))
        df['journal_issn'] = pmid_keys.apply(
            lambda x: add_values_from_dict(input_dict=pmid_dict, key1=x, key2='journal_issn'))

        # Add PubMed URL
        df['PubMed_URL'] = df['c_xy.pmid'].apply(lambda x: generate_pubmed_url(pmid=x))

    # Make sure the target directory exists before writing into it
    target_dir = os.path.dirname(html_file_name)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Save dataframe to html
    html_string = df.to_html(render_links=True, notebook=False, show_dimensions=True, justify='center')
    with open(html_file_name, 'w', encoding='utf-8') as html_file:
        html_file.write(html_string)

    # Return generated dataframe
    return df
    

def add_values_from_dict(*, input_dict: dict, key1: str, key2: str) -> str:
    """
    Look up ``input_dict[key1][key2]`` in a two level dictionary.

    :param input_dict: dictionary whose values are themselves dictionaries
    :param key1: key of the outer dictionary
    :param key2: key of the inner dictionary
    :return: the stored value, or 'Not found' when the lookup fails
    """
    try:
        return input_dict[key1][key2]
    except Exception as error:
        # Report the failed lookup and fall through to the placeholder.
        print('Error in extracting data from dict.')
        print(repr(error))

    return 'Not found'





---
## Heme Oxygenase-1

### Different names

Heme Oxygenase-1 (C0538674)
HMOX1 protein, human (C1565861)

### Partially connected
Heme Oxygenase (Decyclizing) (C0018969)
Hmox1 protein, mouse (C1565862)
Hmox1 protein, rat (C1453914)

heme oxygenase-2 (C0537535)
heme oxygenase-3 protein, human (C1435054)

### SemBT relations
http://sembt.mf.uni-lj.si/user_guide/SemBT_relation_types_and_instances_counts.html

AFFECTS	1008068	2124063
INTERACTS_WITH	956926	1824826

ASSOCIATED_WITH	544318	1316494
STIMULATES	442904	845725
INHIBITS	424125	749490
AUGMENTS




In [None]:
# Cypher query for concepts related to Heme Oxygenase-1.
# - y is limited to the CUIs C0538674 and C1565861 (listed above as
#   "Heme Oxygenase-1" and "HMOX1 protein, human").
# - x must carry one of the labels phsu, aapp, chem, orch, inch, sbst, clnd
#   (presumably semantic type abbreviations - confirm against the schema).
# - Only the listed relation types and their NEG_ variants are accepted.
# - The 100 relations with the highest r_xy.freq are kept; for each of them
#   the subquery returns at most 10 (c_xy, s_xy, i_xy) combinations in which
#   the instance predicate equals the relation type, x is the instance
#   subject and y is the instance object.
query = """
MATCH (x:Concept)-[r_xy]-(y:Concept)
WHERE 
(x:phsu OR x:aapp OR x:chem OR x:orch OR x:inch OR x:sbst OR x:clnd) AND
// (x:phsu OR x:aapp) AND
(r_xy:AFFECTS OR r_xy:NEG_AFFECTS OR 
r_xy:ASSOCIATED_WITH OR r_xy:NEG_ASSOCIATED_WITH OR 
r_xy:STIMULATES OR r_xy:NEG_STIMULATES OR 
r_xy:INHIBITS OR r_xy:NEG_INHIBITS OR 
r_xy:INTERACTS_WITH OR r_xy:NEG_INTERACTS_WITH OR 
r_xy:AUGMENTS OR r_xy:NEG_AUGMENTS) AND
y.cui in ['C0538674', 'C1565861']

WITH x, r_xy, y ORDER BY r_xy.freq DESC LIMIT 100

CALL {
    WITH x, r_xy, y
    MATCH (c_xy)<-[r_in_xy:IS_IN]-(s_xy)<-[r_extr_xy:Extracted_From]-(i_xy:Instance {predicate: type(r_xy)})-[r_sub_x:Inst_Subject]->(x),
      (i_xy)-[r_obj_y:Inst_Object]->(y)
    RETURN c_xy, s_xy, i_xy LIMIT 10
}

RETURN DISTINCT x.name,
type(r_xy), 
y.name,
r_xy.freq, r_xy.min_pyear,
c_xy.pmid, c_xy.pyear, c_xy.issn,
i_xy.predicate, i_xy.indicator_type,
s_xy.normalized_section_header, s_xy.sentence

"""

In [None]:
# Execute the query against the graph database and measure how long it takes
start_time = datetime.now()

graph = get_graph_instance()
output = graph.run(query).data()

end_time = datetime.now()

# Report the elapsed time and the number of returned rows
print(f'Time needed: {end_time - start_time}')
print(f'Output rows: {len(output)}')


In [None]:
# Turn the query output into a dataframe and save it as an HTML table
hox1_df = create_df_html_from_query_output(
    output=output,
    html_file_name='outputs/heme_oxygenase-1.html',
)


In [None]:
hox1_df

In [None]:
# Unique compounds: the ten most frequent x.name values
hox1_df['x.name'].value_counts().head(10)
