In [1]:
import pickle
import random
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [2]:
def _phrase_predicate(question, sel_predicate, phrase):
    """Swap the stand-alone ``" PREDICATE "`` token in ``question`` for ``phrase``."""
    head, token, tail = question.partition(" " + sel_predicate + " ")
    if not token:
        # The predicate only occurred after the ". Attr" cut-off; leave the text alone.
        return question
    return head + " " + phrase + " " + tail


def generate_questions(sel_disease, sel_predicate, questions_per_disease_per_predicate=10):
    """Sample edge statements for one disease/predicate pair and verbalise them.

    Reads the notebook-level ``diseases_selected``, ``disease_predicates``,
    ``data`` and ``data_df``. The first context whose text contains both the
    disease id and the disease name is split into lines; every line containing
    ``" <sel_predicate> "`` is kept and truncated at ``". Attr"``.
    (Assumes one edge per line in the context text -- TODO confirm against the
    code that produced the pickle.)

    Parameters
    ----------
    sel_disease : str
        Key of ``diseases_selected``.
    sel_predicate : str
        Key of ``disease_predicates``.
    questions_per_disease_per_predicate : int, default 10
        Number of statements to draw with ``random.sample``. When fewer
        statements exist, all of them are returned in their original order.

    Returns
    -------
    tuple[list[str], list[str], list[str]]
        ``(verbalised statements, statements as found in the context,
        sel_predicate repeated once per statement)``.

    Raises
    ------
    KeyError
        If ``sel_disease`` is not in ``diseases_selected``, or statements were
        found but ``sel_predicate`` is not in ``disease_predicates``.
    IndexError
        If no context mentions both the disease id and the disease name.
    """
    sel_disease_id = diseases_selected[sel_disease]
    # regex=False: ids and names are literal text, not patterns.
    mentions_id = data_df.context.str.contains(sel_disease_id, regex=False)
    mentions_name = data_df.context.str.contains(sel_disease, regex=False)
    sel_index = data_df[mentions_id & mentions_name].index.values[0]

    predicate_token = " " + sel_predicate + " "
    questions_pool = [
        edge.split(". Attr")[0]
        for edge in data[sel_index].split("\n")
        if predicate_token in edge
    ]

    try:
        questions_selected = random.sample(questions_pool, questions_per_disease_per_predicate)
    except ValueError:
        # Pool smaller than the requested sample size: keep everything.
        questions_selected = questions_pool

    predicate_list = [sel_predicate] * len(questions_selected)
    # Split on the space-delimited token (the same one used for filtering) so that
    # entity names which merely contain the predicate text are not cut apart.
    questions_selected_transformed = [
        _phrase_predicate(question, sel_predicate, disease_predicates[sel_predicate])
        for question in questions_selected
    ]
    return questions_selected_transformed, questions_selected, predicate_list
        



In [7]:
# Pickle file holding the disease contexts that the questions are built from.
DATA_PATH = "../../../data/disease_context_from_dev.pickle"

# Upper bound on the questions sampled for each (disease, predicate) pair.
QUESTIONS_PER_DISEASE_PER_PREDICATE = 15
# Number of questions placed in the validation split; the remainder become the test split.
VALIDATION_SIZE = 1000


In [8]:
# Load the disease contexts from disk.
# NOTE(review): pickle.load can execute arbitrary code -- only point DATA_PATH at a trusted file.
with open(DATA_PATH, "rb") as f:
    data = pickle.load(f)
    

In [9]:
# Diseases to generate questions for: disease name -> identifier ("DOID:..." string).
# Both the name and the identifier are used as substrings to locate the disease's context.
diseases_selected = {
    "lung cancer" : "DOID:1324",
    "breast cancer" : "DOID:1612",
    "prostate cancer" : "DOID:10283",
    "lymphoma" : "DOID:0060058",
    "pancreatic cancer" : "DOID:1793",
    "COVID-19" : "DOID:0080600",
    "hepatitis C" : "DOID:1883",
    "hepatitis B" : "DOID:2043",
    "pneumonia" : "DOID:552",
    "tuberculosis" : "DOID:399",
    "type 2 diabetes mellitus": "DOID:9352",
    "hypertension" : "DOID:10763",
    "Addison's disease" : "DOID:13774",
    "cardiac arrest" : "DOID:0060319",
    "renal fibrosis" : "DOID:0050855",
    "Huntington's disease" : "DOID:12858",
    "cystic fibrosis" : "DOID:1485",
    "sickle cell anemia" : "DOID:10923",
    "phenylketonuria" : "DOID:9281",
    "Tay-Sachs disease" : "DOID:3320",
    "scurvy" : "DOID:13724",
    "osteoporosis" : "DOID:11476",
    "rickets" : "DOID:10609",
    "anemia" : "DOID:2355",
    "riboflavin deficiency" : "DOID:8454"
}




In [10]:
# Edge predicate as written in the contexts -> wording used in the generated question.
# Single-word predicates are simply lower-cased; fused ones are spelled out as a phrase.
disease_predicates = {
    "ISA": "is a",
    "LOCALIZES": "localizes",
    "PRESENTS": "presents",
    "RESEMBLES": "resembles",
    "ASSOCIATES": "associates",
    "TREATS": "treats",
    "CONTRAINDICATES": "contraindicates",
    "CAUSES": "causes",
    "EXPRESSEDIN": "is expressed in",
    "PREVALENCE": "prevalence",
    "INCREASEDIN": "is increased in",
    "DECREASEDIN": "is decreased in",
}


In [11]:
# Put the loaded contexts into a one-column frame so they can be searched with `.str.contains`.
# NOTE(review): presumably `data` is a flat sequence of context strings -- confirm against the pickle.
data_df = pd.DataFrame(data, columns=["context"])


In [305]:
%%time

# generate_questions draws with random.sample; seed the stdlib RNG here so that a
# fresh "Restart & Run All" regenerates exactly the same question set.
QUESTION_SAMPLING_SEED = 42
random.seed(QUESTION_SAMPLING_SEED)

# Three parallel lists, one entry per generated question:
#   verbalised question, question as found in the context, predicate label.
generated_questions_transformed = []
generated_questions = []
predicate_list = []
for sel_disease in tqdm(diseases_selected):
    for sel_predicate in disease_predicates:
        questions_selected_transformed, questions_selected, predicates = generate_questions(
            sel_disease,
            sel_predicate,
            questions_per_disease_per_predicate=QUESTIONS_PER_DISEASE_PER_PREDICATE,
        )
        generated_questions_transformed.extend(questions_selected_transformed)
        generated_questions.extend(questions_selected)
        predicate_list.extend(predicates)
        
        

100%|███████████████████████████████████████████| 25/25 [01:24<00:00,  3.39s/it]

CPU times: user 1min 24s, sys: 191 ms, total: 1min 24s
Wall time: 1min 24s





In [306]:
# Spot-check one generated question in its untransformed form (predicate still upper-case).
generated_questions[25]

'lung cancer ASSOCIATES Gene SNAI1'

In [307]:
# Number of generated questions per predicate; used to spot predicates that are too
# rare to be split in a stratified fashion. (`Counter` is imported in the top import cell.)
count_dict = dict(Counter(predicate_list))
count_dict

{'ISA': 126,
 'PRESENTS': 323,
 'ASSOCIATES': 362,
 'EXPRESSEDIN': 60,
 'LOCALIZES': 317,
 'RESEMBLES': 157,
 'TREATS': 236,
 'CONTRAINDICATES': 121,
 'CAUSES': 19,
 'INCREASEDIN': 15,
 'DECREASEDIN': 1,
 'PREVALENCE': 15}

In [327]:
# From the count dictionary above, the "DECREASEDIN" edge has only 1 question. It is
# removed to obviate issues when splitting the questions into validation and test sets
# in a stratified fashion.
EXCLUDED_PREDICATES = ["DECREASEDIN"]

# Assemble the three parallel lists into one frame first and filter it ONCE on the
# predicate label. Filtering each list separately by text search (and re-joining on the
# index afterwards) can drop different rows from each frame when an entity name happens
# to contain the searched text.
all_questions_df = pd.DataFrame(
    {
        "questions": generated_questions_transformed,
        "questions_as_in_database": generated_questions,
        "predicate": predicate_list,
    }
)
keep_mask = ~all_questions_df["predicate"].isin(EXCLUDED_PREDICATES)

# .copy() makes this an independent frame (no SettingWithCopyWarning downstream);
# the original row labels are kept.
generated_questions_transformed_df = all_questions_df.loc[keep_mask].copy()

# Companion single-column frames used by later cells; row-aligned with the frame above.
generated_questions_df = generated_questions_transformed_df[["questions_as_in_database"]].rename(
    columns={"questions_as_in_database": "questions"}
)
predicate_list_df = generated_questions_transformed_df[["predicate"]].rename(
    columns={"predicate": "predicates"}
)




In [328]:
# Preview the generated rows whose verbalised question mentions lung cancer.
generated_questions_transformed_df.loc[
    generated_questions_transformed_df["questions"].str.contains("lung cancer")
]

# generated_questions_transformed_df


Unnamed: 0,questions,questions_as_in_database,predicate
0,Disease main bronchus cancer is a lung cancer,Disease main bronchus cancer ISA lung cancer,ISA
1,Disease lung adenocarcinoma is a lung cancer,Disease lung adenocarcinoma ISA lung cancer,ISA
2,Disease pulmonary blastoma is a lung cancer,Disease pulmonary blastoma ISA lung cancer,ISA
3,Disease epithelioid trophoblastic tumor is a l...,Disease epithelioid trophoblastic tumor ISA lu...,ISA
4,Disease Pancoast tumor is a lung cancer,Disease Pancoast tumor ISA lung cancer,ISA
5,Disease lung pleomorphic carcinoma is a lung c...,Disease lung pleomorphic carcinoma ISA lung ca...,ISA
6,Disease pulmonary neuroendocrine tumor is a lu...,Disease pulmonary neuroendocrine tumor ISA lun...,ISA
7,lung cancer is a Disease lung disease,lung cancer ISA Disease lung disease,ISA
8,Disease lung lymphoma is a lung cancer,Disease lung lymphoma ISA lung cancer,ISA
9,Disease lung meningioma is a lung cancer,Disease lung meningioma ISA lung cancer,ISA


In [329]:
random_seed = 42
# Stratified split: VALIDATION_SIZE rows go to validation, every remaining row to test.
# Both the split size and the strata are taken from the frame that is being split, so
# they cannot drift out of step with it (train_size=N leaves the complement as the test set).
validation_questions, test_questions = train_test_split(
    generated_questions_transformed_df,
    train_size=VALIDATION_SIZE,
    stratify=generated_questions_transformed_df["predicate"].values,
    random_state=random_seed,
)


In [334]:
# Display the validation split (row labels are the positions in the pre-split frame).
validation_questions

Unnamed: 0,questions,questions_as_in_database,predicate
345,pancreatic cancer localizes Anatomy common hep...,pancreatic cancer LOCALIZES Anatomy common hep...,LOCALIZES
1035,cardiac arrest presents Symptom Hyperventilation,cardiac arrest PRESENTS Symptom Hyperventilation,PRESENTS
1633,anemia localizes Anatomy colostrum,anemia LOCALIZES Anatomy colostrum,LOCALIZES
205,Compound Rosuvastatin treats prostate cancer,Compound Rosuvastatin TREATS prostate cancer,TREATS
169,prostate cancer localizes Anatomy epithelium,prostate cancer LOCALIZES Anatomy epithelium,LOCALIZES
...,...,...,...
446,COVID-19 presents Symptom Halitosis,COVID-19 PRESENTS Symptom Halitosis,PRESENTS
1116,"Huntington's disease presents Symptom Apraxia,...","Huntington's disease PRESENTS Symptom Apraxia,...",PRESENTS
1145,Compound 3-(4-Amino-2-methyl-pyrimidin-5-ylmet...,Compound 3-(4-Amino-2-methyl-pyrimidin-5-ylmet...,TREATS
774,type 2 diabetes mellitus localizes Anatomy ret...,type 2 diabetes mellitus LOCALIZES Anatomy ret...,LOCALIZES


In [335]:
# Write both splits to CSV, with a header row and without the row labels.
for split_df, csv_path in (
    (validation_questions, "../data/validation_questions_for_retrieval_performance.csv"),
    (test_questions, "../data/test_questions_for_retrieval_performance.csv"),
):
    split_df.to_csv(csv_path, index=False, header=True)


In [154]:
# Scratch walkthrough of the context lookup for a single disease / predicate pair.
sel_disease = "Addison's disease"
sel_predicate = "ASSOCIATES"

# Locate the first context that mentions both the disease id and the disease name.
sel_disease_id = diseases_selected[sel_disease]
data_df_selected = data_df[
    data_df.context.str.contains(sel_disease_id)
    & data_df.context.str.contains(sel_disease)
]
sel_index = data_df_selected.index.values[0]

# One edge per line; keep the lines naming the predicate and cut each at ". Attr".
data_edges = data[sel_index].split("\n")
edges_selected = [edge for edge in data_edges if sel_predicate in edge]
questions_selected = [edge.split(". Attr")[0] for edge in edges_selected]

# random.sample(questions_selected, QUESTIONS_PER_DISEASE_PER_PREDICATE)

In [158]:
# Rebuild each question with the predicate lower-cased between its first and last segment.
[
    question.split(sel_predicate)[0] + sel_predicate.lower() + question.split(sel_predicate)[-1]
    for question in questions_selected
]


["Addison's disease associates Gene NUDT10",
 "Addison's disease associates Gene SCNN1G",
 "Addison's disease associates Gene SCNN1A",
 "Addison's disease associates Gene SERPINA6",
 "Addison's disease associates Gene POMC",
 "Addison's disease associates Gene AIRE",
 "Addison's disease associates Gene CYP11B1",
 "Addison's disease associates Gene SUFU",
 "Addison's disease associates Gene CYP11B2",
 "Addison's disease associates Gene GLI3",
 "Addison's disease associates Gene TRAF7",
 "Addison's disease associates Gene TERT",
 "Addison's disease associates Gene PEX6",
 "Addison's disease associates Gene CYP11A1",
 "Addison's disease associates Gene PEX26",
 "Addison's disease associates Gene PEX14",
 "Addison's disease associates Gene PEX12",
 "Addison's disease associates Gene PEX3",
 "Addison's disease associates Gene TBX19",
 "Addison's disease associates Gene PIK3CA",
 "Addison's disease associates Gene PEX16",
 "Addison's disease associates Gene CYP17A1",
 "Addison's disease asso

['osteoporosis ISA Disease bone resorption disease. Attributes of this relationship are:',
 'Disease idiopathic juvenile osteoporosis ISA osteoporosis. Attributes of this relationship are:',
 'Disease glucocorticoid-induced osteoporosis ISA osteoporosis. Attributes of this relationship are:']