In [18]:
# get all drugs
import pandas as pd
import re

# Load the raw drug table; only its "name" column is used below.
drug_file = pd.read_csv("drugs.csv")
drugs = drug_file["name"]

# Keep only single-token names made of ASCII letters, digits and hyphens.
pattern = "^[a-zA-Z0-9-]+$"
# pandas represents a missing name as a float NaN, and re.match() raises
# TypeError on non-string input, so non-string entries are skipped explicitly.
drug_list = [
    drug.lower()
    for drug in drugs
    if isinstance(drug, str) and re.match(pattern, drug)
]

# Persist the cleaned, lower-cased names as a one-column CSV.
drug_dict = {"drug names": drug_list}
df = pd.DataFrame(drug_dict)
df.to_csv("drug_all.csv", index = False)

In [22]:
# get all genes

# Load the tab-separated gene table; only its "Symbol" column is used below.
gene_file = pd.read_csv("genes.tsv", sep = "\t")
genes = list(gene_file["Symbol"])

# Keep only single-token symbols made of ASCII letters, digits and hyphens.
pattern = "^[a-zA-Z0-9-]+$"
# pandas represents a missing symbol as a float NaN, and re.match() raises
# TypeError on non-string input, so non-string entries are skipped explicitly.
gene_list = [
    gene.lower()
    for gene in genes
    if isinstance(gene, str) and re.match(pattern, gene)
]

# Persist the cleaned, lower-cased symbols as a one-column CSV.
gene_dict = {"gene names": gene_list}
df = pd.DataFrame(gene_dict)
df.to_csv("gene_all.csv", index = False)

In [12]:
# preprocess the abstract

from nltk import tokenize

# Reload the cleaned, lower-cased lexicons from their CSV files.
drug_file = pd.read_csv("drug_all.csv")
gene_file = pd.read_csv("gene_all.csv")
drug_list = drug_file["drug names"]
gene_list = gene_file["gene names"]

# Widely used words that are excluded from the gene lexicon so that they are
# not reported as gene mentions.
common_words = {
    "was", "set", "rest", "max", "son", "bad", "she", "met", "cat", "hr",
    "kin", "arc", "sell", "camp", "palm", "kit", "fry", "bid", "clock", "mice",
}

# Sets give constant-time membership tests while scanning tokens.
drugs = set(drug_list)
genes = set(gene_list) - common_words

trial = pd.read_csv("abstract.csv")

valid_sentences = [] # sentences that mention exactly one drug and one gene
drug_name = []
gene_name = []

# Iterate over the abstract texts as plain strings. Indexing the frame with
# trial.iloc[i] returns a whole row (a Series), on which .replace() does
# value replacement rather than substring replacement and which
# sent_tokenize() cannot process; trial.size also counts cells, not rows.
# NOTE(review): assumes the abstract text is the first column of abstract.csv -- confirm.
abstracts = trial.iloc[:, 0].dropna().astype(str)

for sentences in abstracts:
    # NOTE(review): line breaks are removed without inserting a space, so words
    # split across lines are joined -- confirm this is intended.
    sentences = sentences.replace("\n", "")
    for sentence in tokenize.sent_tokenize(sentences):
        # Strip every period, then terminate the sentence with a single one.
        sentence = sentence.replace(".", "") + "."
        drug_found = []
        gene_found = []

        for word in tokenize.word_tokenize(sentence):
            lowered = word.lower()
            if lowered in drugs:
                drug_found.append(word)
            # A token already counted as a drug is never counted as a gene;
            # gene mentions must additionally start with an upper-case character.
            elif lowered in genes and word[0].isupper():
                gene_found.append(word)

        # Keep only sentences with exactly one drug and one gene mention.
        if len(drug_found) == 1 and len(gene_found) == 1:
            valid_sentences.append(sentence)
            drug_name.append(drug_found[0])
            gene_name.append(gene_found[0])


sentence_dict = {"Sentence": valid_sentences, "Drug": drug_name, "Gene": gene_name}
df = pd.DataFrame(sentence_dict)
df.to_csv("valid_sentence_all.csv", index = False)


In [13]:
# dependency parsing

import os
from nltk.parse import stanford
import networkx

# Machine-specific setting: adjust this path to the local Java executable.
java_path = '../../Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home/bin/java.exe'

# Export the Java location and the Stanford parser jars through the environment.
os.environ.update({
    'JAVAHOME': java_path,
    'STANFORD_PARSER': 'stanford-parser-full-2014-10-31/stanford-parser.jar',
    'STANFORD_MODELS': 'stanford-parser-full-2014-10-31/stanford-parser-3.5.0-models.jar',
})

# Module-level parser shared by the path-extraction function below.
dependency_parser = stanford.StanfordDependencyParser(
    path_to_jar=os.environ['STANFORD_PARSER'],
    path_to_models_jar=os.environ['STANFORD_MODELS'],
)

def extract_path(start, end, sentence):
    """Return the dependency path that links `start` to `end` in `sentence`.

    The sentence is parsed with the module-level `dependency_parser`; every
    dependency triple becomes an undirected edge between the head word and the
    dependent word, and the shortest path between `start` and `end` is taken
    from that graph. Nodes are the word strings themselves, so repeated words
    in a sentence share one node.

    Args:
        start: word the path starts from (the drug mention in this notebook).
        end: word the path ends at (the gene mention in this notebook).
        sentence: raw sentence text handed to the parser.

    Returns:
        A list of strings mixing dependency labels and intermediate words,
        ordered from `start` towards `end`. A "prep" edge and the neighbouring
        "pobj" edge are merged into a single "prep_<preposition>" entry.

    Errors raised by the parser or by `networkx.shortest_path` are not caught.
    """
    parse = next(dependency_parser.raw_parse(sentence))

    # (head word, dependent word, label) for each dependency, in parser order.
    labelled_edges = [
        (triple[0][0], triple[2][0], triple[1]) for triple in parse.triples()
    ]
    word_pairs = [(head, dependent) for head, dependent, _ in labelled_edges]

    graph = networkx.Graph(word_pairs)
    hops = networkx.shortest_path(graph, start, end)
    hops = hops[::-1]  # walk from `end` back towards `start`

    tokens = []
    for here, there in zip(hops, hops[1:]):
        # Use the first recorded dependency that joins the two words, in
        # either direction.
        for head, dependent, label in labelled_edges:
            climbing = here == dependent and there == head
            descending = here == head and there == dependent
            if not (climbing or descending):
                continue

            if label == "prep" and here == dependent:
                # `here` is the preposition and was appended by the previous
                # hop; turn that entry into "prep_<preposition>".
                preposition = tokens.pop()
                tokens.append("prep_" + preposition)
                tokens.append(there)
            elif label == "prep" and here == head:
                # `there` is the preposition; emit it as a merged label.
                tokens.append("prep_" + there)
            elif label == "pobj":
                # Companion edge of a "prep" edge: only the word is recorded.
                tokens.append(there)
            else:
                tokens.append(label)
                tokens.append(there)
            break

    # Drop the last recorded entry (the final hop's target), then put the
    # sequence back in start-to-end order.
    tokens.pop()
    tokens.reverse()
    return tokens


# Reload the filtered sentences together with their drug / gene mentions.
parsing_file = pd.read_csv("valid_sentence_all.csv")
lists = parsing_file["Sentence"]
drugs = parsing_file["Drug"]
genes = parsing_file["Gene"]

# One dependency path per sentence, in the same order as the rows of the file.
relation = [
    extract_path(drug, gene, sentence)
    for drug, gene, sentence in zip(drugs, genes, lists)
]

# Write the sentences back out with their extracted paths attached.
df = pd.DataFrame({
    "Sentence": lists,
    "Drug": drugs,
    "Gene": genes,
    "Relation": relation,
})
df.to_csv("valid_sentence_final.csv", index = False)


Please use nltk.parse.corenlp.CoreNLPDependencyParser instead.
  if sys.path[0] == '':


In [None]:
# This creates a matrix for drug-gene relationships and the paths

# Keep only paths with more than three entries, i.e. drop drug-gene pairs that
# are connected directly or through a single word.
kept = [
    (str(path), drugs[i].lower() + "_" + genes[i])
    for i, path in enumerate(relation)
    if len(path) > 3
]
relations = [path_text for path_text, _ in kept]
drug_gene = [pair for _, pair in kept]

new_df = pd.DataFrame({"Drug_Gene": drug_gene,
                       "Relation": relations})

# Count how often each path (columns) occurs for each drug_gene pair (rows);
# row and column totals are not included.
my_crosstab = pd.crosstab(index = new_df["Drug_Gene"],
                          columns = new_df["Relation"],
                          margins = False)

my_crosstab.to_csv("matrix.csv", index = True)


In [1]:
# The part below is only for comparing the parser result with the original data in paper

import os
from nltk.parse import stanford
import networkx

# Machine-specific setting: adjust this path to the local Java executable.
java_path = '../../Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home/bin/java.exe'

# Export the Java location and the Stanford parser jars through the environment.
os.environ.update({
    'JAVAHOME': java_path,
    'STANFORD_PARSER': 'stanford-parser-full-2014-10-31/stanford-parser.jar',
    'STANFORD_MODELS': 'stanford-parser-full-2014-10-31/stanford-parser-3.5.0-models.jar',
})

# Module-level parser used by the debugging path extractor below.
dependency_parser = stanford.StanfordDependencyParser(
    path_to_jar=os.environ['STANFORD_PARSER'],
    path_to_models_jar=os.environ['STANFORD_MODELS'],
)


def extract_path_dbg(start, end, sentence):
    """Verbose variant of the path extraction, printing intermediate results.

    Parses `sentence` with the module-level `dependency_parser`, links head and
    dependent words in an undirected graph keyed by word string, and takes the
    shortest path between `start` and `end`. The endpoints, the word path and
    the full list of (head, dependent, label) relations are printed before the
    path is converted to its label/word form.

    Args:
        start: word the path starts from.
        end: word the path ends at.
        sentence: raw sentence text handed to the parser.

    Returns:
        A list of strings mixing dependency labels and intermediate words,
        ordered from `start` towards `end`; "prep" and "pobj" edges are merged
        into "prep_<preposition>" entries.

    Errors raised by the parser or by `networkx.shortest_path` are not caught.
    """
    print("START from {start} and end from {end}".format(start = start, end = end))

    tree = next(dependency_parser.raw_parse(sentence))

    # (head word, dependent word, label) for each dependency, in parser order.
    all_relations = [(t[0][0], t[2][0], t[1]) for t in tree.triples()]
    links = [(head, child) for head, child, _ in all_relations]

    word_path = networkx.shortest_path(networkx.Graph(links), start, end)
    print("PATH: ")
    print(word_path, "\n")
    print("ALL RELATIONS: ")
    print(all_relations, "\n")

    # Walk from `end` back towards `start`.
    backwards = word_path[::-1]

    out = []
    for position in range(len(backwards) - 1):
        current, following = backwards[position], backwards[position + 1]
        # Use the first recorded relation joining the two words, either way round.
        for head, child, label in all_relations:
            joined = (current, following) in ((child, head), (head, child))
            if not joined:
                continue

            if label == "prep" and current == child:
                # `current` is the preposition appended by the previous hop;
                # rewrite that entry as "prep_<preposition>".
                previous = out.pop()
                out.append("prep_" + previous)
                out.append(following)
            elif label == "prep" and current == head:
                # `following` is the preposition; emit it as a merged label.
                out.append("prep_" + following)
            elif label == "pobj":
                out.append(following)
            else:
                out.append(label)
                out.append(following)
            break

    # Drop the last recorded entry, then restore start-to-end order.
    out.pop()
    out.reverse()
    return out

# Example sentences used to compare the parser output with the reference paths.
trial_sentences = [
    "CYP3A4 mRNA expression was significantly increased by rifampicin exposure in human hepatocytes.",
    "Geldanamycin (GA), an HSP90 inhibitor, is able to suppress 1,25-induced differentiation of HL60 cells.",
    "Amodiaquine is mainly metabolized hepatically towards its major active metabolite desethylamodiaquine, by the polymorphic P450 isoform CYP2C8.",
    "These results suggest that TRPV2 is specifically activated by probenecid and that this chemical might be useful for investigation of pain-related TRPV2 function.",
    "The results of preclinical studies demonstrated that CYP3A4 is involved in the metabolism of gefitinib and that gefitinib is a weak inhibitor of CYP2D6 activity. ",
]

# Drug and gene mention to connect in each sentence (same order as above).
tmp_drugs = ["rifampicin", "Geldanamycin", "Amodiaquine", "probenecid", "gefitinib"]
tmp_genes = ["CYP3A4", "HSP90", "CYP2C8", "TRPV2", "CYP3A4"]


# Reference paths, kept as strings so they print next to the computed ones.
references = [
    "['nn', 'exposure', 'agent', 'increased', 'nsubjpass', 'expression', 'amod']",
    "['appos', 'inhibitor', 'amod']",
    "['nsubjpass', 'metabolized', 'agent']",
    "['agent', 'activated', 'nsubjpass']",
    "['prep_of', 'metabolism', 'prep_in', 'involved', 'nsubjpass']",
]

# Print the computed path for each example followed by its reference.
for i, (drug, gene, sentence) in enumerate(zip(tmp_drugs, tmp_genes, trial_sentences)):
    print(i)
    print(extract_path_dbg(drug, gene, sentence))
    print("Reference: \n{ref}".format(ref = references[i]))
    print("")



Please use nltk.parse.corenlp.CoreNLPDependencyParser instead.
  if __name__ == '__main__':


0
START from rifampicin and end from CYP3A4
PATH: 
['rifampicin', 'exposure', 'by', 'increased', 'expression', 'CYP3A4'] 

ALL RELATIONS: 
[('increased', 'expression', 'nsubjpass'), ('expression', 'CYP3A4', 'num'), ('expression', 'mRNA', 'nn'), ('increased', 'was', 'auxpass'), ('increased', 'significantly', 'advmod'), ('increased', 'by', 'prep'), ('by', 'exposure', 'pobj'), ('exposure', 'rifampicin', 'nn'), ('exposure', 'in', 'prep'), ('in', 'hepatocytes', 'pobj'), ('hepatocytes', 'human', 'amod')] 

['nn', 'exposure', 'prep_by', 'increased', 'nsubjpass', 'expression', 'num']
Reference: 
['nn', 'exposure', 'agent', 'increased', 'nsubjpass', 'expression', 'amod']

1
START from Geldanamycin and end from HSP90
PATH: 
['Geldanamycin', 'inhibitor', 'HSP90'] 

ALL RELATIONS: 
[('able', 'Geldanamycin', 'nsubj'), ('Geldanamycin', 'GA', 'appos'), ('Geldanamycin', 'inhibitor', 'appos'), ('inhibitor', 'an', 'det'), ('inhibitor', 'HSP90', 'nn'), ('able', 'is', 'cop'), ('able', 'suppress', 'xcomp')