In [None]:
# Create a symbolic link named `proj1_dataset` in the working directory that points at
# the clinical-trials data directory given by the absolute path below.
# NOTE(review): plain `ln -s` does not replace an existing `proj1_dataset`, so re-running
# this cell after the link exists will not recreate it -- confirm this is acceptable.
!ln -s /user/data/courses_data/clinical_trials proj1_dataset


In [4]:
import xml.etree.ElementTree as ET
import trec
import pandas as pd
import numpy as np
import pprint as pp
import pickle
from Query import Query


# Paths to the query topics (parsed below as XML) and to the relevance judgments file.
Queries = "./topics-2014_2015-summary.topics"
Qrels = "./qrels-clinical_trials.txt"

with open(Queries, 'r') as queries_reader:
    txt = queries_reader.read()

root = ET.fromstring(txt)

# cases:    query number -> Query object
# queryLen: query number -> length of the query title in characters
cases = {}
queryLen = {}

# Every <TOP> element contributes one query, identified by <NUM> and described by <TITLE>.
for query in root.iter('TOP'):
    q_num = query.find('NUM').text
    q_title = query.find('TITLE').text
    cases[q_num] = Query(q_num, q_title)
    queryLen[q_num] = len(q_title)

# NOTE: `eval` shadows the Python builtin of the same name. The name is kept because
# the following cells read `eval.judged_docs` and `eval.relevance_judgments`.
eval = trec.TrecEvaluation(cases, Qrels)

# Store the results on disk. The `with` blocks guarantee that each file is flushed
# and closed instead of relying on garbage collection of an anonymous file object.
with open("./cases.p", "wb") as cases_file:
    pickle.dump(cases, cases_file)
with open("./eval.p", "wb") as eval_file:
    pickle.dump(eval, eval_file)
with open("./queriesLen.p", "wb") as query_len_file:
    pickle.dump(queryLen, query_len_file)

In [5]:
import xml.etree.ElementTree as ET
import tarfile
import os
import pickle
import numpy as np
from ClinicalTrial import ClinicalTrial
from Query import Query

# Archive members of this size (in bytes) or smaller are skipped. The threshold is
# unchanged from the original code; presumably such members are too small to hold a
# real trial record -- TODO confirm.
MIN_MEMBER_SIZE = 500


def _last_text(root, tag, strip=False):
    """Return the text of the last ``tag`` element found under ``root``.

    Returns "" when no such element exists or when the last one has no text.
    When ``strip`` is true, surrounding whitespace is removed.
    """
    value = ""
    for element in root.iter(tag):
        value = element.text or ""
    return value.strip() if strip else value


def _children_text(root, tag):
    """Concatenate the text of every direct child of every ``tag`` element under ``root``.

    Children without text contribute nothing (instead of raising a TypeError).
    """
    return "".join(
        child.text or ""
        for parent in root.iter(tag)
        for child in parent
    )


def _split_criteria(root):
    """Return ``(inclusion, exclusion)`` extracted from the ``criteria`` elements.

    The text of each child is stripped, its line breaks are removed, and it is then
    split on the literal headings "Inclusion Criteria" / "Exclusion Criteria".
    A part that is not present in any child stays "".
    """
    inclusion = ""
    exclusion = ""
    for criteria in root.iter('criteria'):
        for child in criteria:
            text = ''.join((child.text or "").strip().splitlines())
            # Both headings are tested explicitly. The previous condition
            # ("Inclusion Criteria" and "Exclusion Criteria" in text) only tested the
            # second heading, which made the exclusion-only branch unreachable.
            has_inclusion = "Inclusion Criteria" in text
            has_exclusion = "Exclusion Criteria" in text
            if has_inclusion and has_exclusion:
                after_inclusion = text.split("Inclusion Criteria")[-1]
                parts = after_inclusion.split("Exclusion Criteria")
                inclusion = parts[0]
                exclusion = parts[-1]
            elif has_inclusion:
                inclusion = text.split("Inclusion Criteria")[-1]
            elif has_exclusion:
                exclusion = text.split("Exclusion Criteria")[-1]
    return inclusion, exclusion


# Dictionary with all judged clinical trials: nct_id -> ClinicalTrial
clinical_trials = {}

# The context manager closes the archive even if parsing one of the members fails.
with tarfile.open("../clinicaltrials.gov-16_dec_2015.tgz", "r:gz") as tar:
    for tarinfo in tar:
        if tarinfo.size <= MIN_MEMBER_SIZE:
            continue

        member = tar.extractfile(tarinfo)
        if member is None:
            # extractfile() returns None for members that are not files or links.
            continue
        with member:
            txt = member.read().decode("utf-8", "strict")
        root = ET.fromstring(txt)

        # Keep only trials with at least one nct_id contained in eval.judged_docs.
        nct_ids = [doc_id.text for doc_id in root.iter('nct_id')]
        if not any(doc_id in eval.judged_docs for doc_id in nct_ids):
            continue

        # When a tag occurs several times, the last occurrence wins (as before).
        clinical_trial = {"nct_id": nct_ids[-1]}
        clinical_trial["brief_title"] = _last_text(root, 'brief_title')
        clinical_trial["detailed_description"] = _children_text(root, 'detailed_description')
        clinical_trial["brief_summary"] = _children_text(root, 'brief_summary').strip()

        inclusion, exclusion = _split_criteria(root)
        clinical_trial["inclusion_criteria"] = inclusion
        clinical_trial["exclusion_criteria"] = exclusion

        for field in ('phase', 'study_type', 'study_design', 'condition'):
            clinical_trial[field] = _last_text(root, field)

        # Every child of an <intervention> element is stored under its own tag name;
        # only intervention_name and intervention_type are read back below.
        clinical_trial["intervention_name"] = ""
        clinical_trial["intervention_type"] = ""
        for intervention in root.iter('intervention'):
            for child in intervention:
                clinical_trial[child.tag] = (child.text or "").strip()

        for field in ('gender', 'minimum_age', 'maximum_age', 'healthy_volunteers'):
            clinical_trial[field] = _last_text(root, field)

        clinical_trial["mesh_term"] = _last_text(root, 'mesh_term', strip=True)

        # Create the clinical trial (positional argument order is unchanged).
        ct = ClinicalTrial(
            clinical_trial["nct_id"], clinical_trial["brief_title"],
            clinical_trial["detailed_description"], clinical_trial["brief_summary"],
            clinical_trial["inclusion_criteria"], clinical_trial["exclusion_criteria"],
            clinical_trial["phase"], clinical_trial["study_type"],
            clinical_trial["study_design"], clinical_trial["condition"],
            clinical_trial["intervention_name"], clinical_trial["intervention_type"],
            clinical_trial["gender"], clinical_trial["minimum_age"],
            clinical_trial["maximum_age"], clinical_trial["healthy_volunteers"],
            clinical_trial["mesh_term"])

        # Add it to the dictionary of trials
        clinical_trials[ct.nct_id] = ct

# Store the clinical trials in a file
with open("./clinical_trials.p", "wb") as clinical_trials_file:
    pickle.dump(clinical_trials, clinical_trials_file)

In [2]:
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import random
import xml.etree.ElementTree as ET
import tarfile
import os
import pickle
import numpy as np
from ClinicalTrial import ClinicalTrial
from Query import Query
import xml.etree.ElementTree as ET
import trec
import pandas as pd
import pprint as pp

# Load the artefacts written by the previous cells.
# NOTE: pickle.load can execute arbitrary code -- only load files produced by this notebook.
with open("./clinical_trials.p", "rb") as clinical_trials_file:
    clinical_trials = pickle.load(clinical_trials_file)
with open("./cases.p", "rb") as cases_file:
    cases = pickle.load(cases_file)
with open("./eval.p", "rb") as eval_file:
    eval = pickle.load(eval_file)

# Only these free-text attributes of a ClinicalTrial go into the corpus. The remaining
# attributes (phase, study_type, study_design, condition, intervention_name,
# intervention_type, gender, minimum_age, maximum_age, healthy_volunteers, mesh_term)
# are deliberately left out.
TEXT_FIELDS = ('brief_title', 'detailed_description', 'brief_summary',
               'inclusion_criteria', 'exclusion_criteria')

# corpus: field name -> list with that field's value for every clinical trial,
# in the iteration order of clinical_trials.
corpus = {field: [] for field in TEXT_FIELDS}

for ct in clinical_trials.values():
    for field in TEXT_FIELDS:
        corpus[field].append(getattr(ct, field))

with open("./corpus.p", "wb") as corpus_file:
    pickle.dump(corpus, corpus_file)

In [4]:
# q_revDocs: query id (str) -> list of doc ids (str) judged with rel >= 1 for that query.
judgments = eval.relevance_judgments

q_revDocs = {case: [] for case in cases.keys()}

# The three columns are read by the same index i, i.e. they are treated as parallel.
for i, rel in enumerate(judgments['rel']):
    if rel >= 1:
        query = judgments['query_id'][i]
        doc = judgments['docid'][i]
        q_revDocs[str(query)].append(str(doc))

# `with` makes sure the file is flushed and closed once the dump is done.
with open("./q_revDocs.p", "wb") as q_rev_docs_file:
    pickle.dump(q_revDocs, q_rev_docs_file)

{'20141': ['NCT00000492', 'NCT00005127', 'NCT00005485', 'NCT00005757', 'NCT00129233', 'NCT00141583', 'NCT00143195', 'NCT00149227', 'NCT00162344', 'NCT00175279', 'NCT00225355', 'NCT00373828', 'NCT00479908', 'NCT00683813', 'NCT00797953', 'NCT00802230', 'NCT00808652', 'NCT00809029', 'NCT00832442', 'NCT00952744', 'NCT00979199', 'NCT01162902', 'NCT01203696', 'NCT01231165', 'NCT01253486', 'NCT01397994', 'NCT01407146', 'NCT01429194', 'NCT01660594', 'NCT01682057', 'NCT01724567', 'NCT01829659', 'NCT02001545', 'NCT02062424', 'NCT02144636', 'NCT02271568', 'NCT02272920', 'NCT02357212', 'NCT02381340', 'NCT02532699', 'NCT02608255'], '20142': ['NCT00237016', 'NCT00711399', 'NCT01017081', 'NCT01099943', 'NCT01253980', 'NCT01269554', 'NCT01707485', 'NCT02269761', 'NCT02380352', 'NCT02618655'], '20143': ['NCT00001776', 'NCT00003562', 'NCT00024908', 'NCT00047801', 'NCT00124761', 'NCT00172575', 'NCT00303901', 'NCT00578084', 'NCT00579852', 'NCT00633035', 'NCT00897650', 'NCT00918320', 'NCT00963651', 'NCT010

In [7]:
# q_docs: query id (str) -> list of every doc id (str) judged for that query,
# regardless of the relevance value.
judgments = eval.relevance_judgments

q_docs = {case: [] for case in cases.keys()}
# y_train is not filled in this cell; it is rebuilt from scratch in a later cell.
y_train = []

# The relevance value itself is not needed here, only the row positions.
for i in range(len(judgments['rel'])):
    query = judgments['query_id'][i]
    doc = judgments['docid'][i]
    q_docs[str(query)].append(str(doc))

# `with` makes sure the file is flushed and closed once the dump is done.
with open("./q_docs.p", "wb") as q_docs_file:
    pickle.dump(q_docs, q_docs_file)

In [8]:
# q_doc_scores: "<query id>_<doc id>" -> empty list, one entry per judged
# (query, document) pair, in the order of cases and then of q_docs[query].
q_doc_scores = {
    "{q}_{ct}".format(q=q, ct=ct): []
    for q in cases.keys()
    for ct in q_docs[q]
}

# `with` makes sure the file is flushed and closed once the dump is done.
with open("./q_doc_scores.p", "wb") as q_doc_scores_file:
    pickle.dump(q_doc_scores, q_doc_scores_file)


In [9]:
# NOTE: pickle.load can execute arbitrary code -- only load files produced by this notebook.
with open("./q_revDocs.p", "rb") as q_rev_docs_file:
    q_revDocs = pickle.load(q_rev_docs_file)

# Sets give constant-time membership tests instead of scanning a list per key.
relevant_docs = {q: set(docs) for q, docs in q_revDocs.items()}

# y_train: one 0/1 target per key of q_doc_scores (same order);
# 1 when the document is listed as relevant for the query, 0 otherwise.
y_train = []
for key in q_doc_scores.keys():
    # Keys have the form "<query id>_<doc id>"; split only on the first underscore
    # so a doc id that itself contains "_" does not break the unpacking.
    q, doc = key.split("_", 1)
    y_train.append(1 if doc in relevant_docs[q] else 0)

with open("./y_train.p", "wb") as y_train_file:
    pickle.dump(y_train, y_train_file)

In [14]:
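# Label legend (gender -> age band = label).
# Notation: "30-" presumably means below 30 and "60+" above 60 -- TODO confirm boundaries.
#   male,   30-    -> 1
#   male,   30-60  -> 2
#   male,   60+    -> 3
#   female, 30-    -> 4
#   female, 30-60  -> 5
#   female, 60+    -> 6
#   (no gender given), 30-    -> 7
#   (no gender given), 30-60  -> 8
#   (no gender given), 60+    -> 9
#   otherwise ("c.c", presumably "caso contrario") -> 10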

import pickle
import Utils 

# Reload the inputs from disk so this cell does not depend on kernel state.
# NOTE: pickle.load can execute arbitrary code -- only load files produced by this notebook.
with open("./clinical_trials.p", "rb") as clinical_trials_file:
    clinical_trials = pickle.load(clinical_trials_file)
with open("./q_revDocs.p", "rb") as q_rev_docs_file:
    q_revDocs = pickle.load(q_rev_docs_file)

# q_label: query id -> label computed by Utils.assignLabels from the relevant
# documents of each query and the clinical trials.
q_label = Utils.assignLabels(q_revDocs, clinical_trials)

print("Labels",q_label)

# `with` makes sure the file is flushed and closed once the dump is done.
with open("./q_label.p", "wb") as q_label_file:
    pickle.dump(q_label, q_label_file)


Labels {'20141': 10, '20142': 10, '20143': 10, '20144': 7, '20145': 9, '20146': 10, '20147': 10, '20148': 10, '20149': 10, '201410': 9, '201411': 9, '201412': 10, '201413': 10, '201414': 9, '201415': 10, '201416': 10, '201417': 9, '201418': 10, '201419': 9, '201420': 7, '201421': 10, '201422': 10, '201423': 10, '201424': 10, '201425': 7, '201426': 7, '201427': 10, '201428': 7, '201429': 10, '201430': 9, '20151': 9, '20152': 9, '20153': 9, '20154': 9, '20155': 8, '20156': 10, '20157': 10, '20158': 10, '20159': 8, '201510': 10, '201511': 9, '201512': 10, '201513': 10, '201514': 8, '201515': 9, '201516': 10, '201517': 10, '201518': 9, '201519': 9, '201520': 9, '201521': 7, '201522': 9, '201523': 10, '201524': 10, '201525': 10, '201526': 10, '201527': 10, '201528': 7, '201529': 10, '201530': 7}


In [None]:
# Split the query ids into a training set (80%) and a test set (20%),
# stratified by the label assigned to each query.
# A fixed seed keeps the split identical across "Restart & Run All".
SPLIT_SEED = 42

q_ids = list(cases.keys())
labels = [q_label[q] for q in q_ids]
print(labels)

# TODO: stratify does not work (translated from the original note). Presumably some
# label occurs too rarely to be represented in both sets -- verify against `labels`.
[training_set, test_set] = train_test_split(
    q_ids, test_size=0.2, train_size=0.8, random_state=SPLIT_SEED,
    shuffle=True, stratify=labels)

# `with` makes sure each file is flushed and closed once the dump is done.
with open("./training_set.p", "wb") as training_set_file:
    pickle.dump(training_set, training_set_file)
print("Training set: ", training_set)
with open("./test_set.p", "wb") as test_set_file:
    pickle.dump(test_set, test_set_file)
print("Test set: ", test_set)


In [15]:
# ct_keys: the keys of clinical_trials as a list, in the dictionary's iteration order.
ct_keys = list(clinical_trials.keys())

# `with` makes sure the file is flushed and closed once the dump is done.
with open("./ct_keys.p", "wb") as ct_keys_file:
    pickle.dump(ct_keys, ct_keys_file)

In [17]:
import pickle

# NOTE: pickle.load can execute arbitrary code -- only load files produced by this notebook.
with open("./corpus.p", "rb") as corpus_file:
    corpus = pickle.load(corpus_file)

# allCorpus: the per-field lists of corpus collected into one list of lists,
# in the key order of corpus.
allCorpus = list(corpus.values())

# `with` makes sure the file is flushed and closed once the dump is done.
with open("./allCorpus.p", "wb") as all_corpus_file:
    pickle.dump(allCorpus, all_corpus_file)