In [1]:
#import xml.etree.ElementTree as ET
import itertools
import lxml.etree as ET
import os
import csv
from tqdm.notebook import tqdm
import psycopg2

import drug_central_sql

#https://www.w3schools.com/xml/xpath_syntax.asp

#tree = ET.parse('country_data.xml')
#root = tree.getroot()

In [2]:
#In the AllPublicXML release from clinicaltrials.gov, each clinical trial is
# stored in a directory NCT1234xxxx. So to get NCT12345678.xml we need the
# file path NCT1234xxxx/NCT12345678.xml . All this code does is construct
# that directory header and append it to the front of the file path.
def getNCTFilePath(nct):
    dirname = nct[0:7] + "xxxx"
    p = os.path.join(dirname,nct)
    p2 = os.path.join("..","AllPublicXML")
    p = os.path.join(p2,p)
    return p

#Opens the XML file passed in through the "path" parameter. 
def getRootFromPath(path):
    tree = ET.parse(path)
    root = tree.getroot()
    return root

def getPropertyFromRoot(root,prop):
    x = root.find(prop)
    if(x==None):return "N/A"
    return x.text

#Searches
# the XML for a "condition" element immediately off of the first level
# of the XML tree. If it cannot find it, we return N/A. If we find it
# return the text (the condition).
def getConditionFromRoot(root):
    y = getPropertyFromRoot(root, "condition")
    return y

def getPhaseFromRoot(root):
    y = getPropertyFromRoot(root, "phase")
    return y

def getLink(NCT):
    link = "https://clinicaltrials.gov/ct2/show/"
    return link + NCT

def getPVal_iter(root):
    return root.iter("p_value")

#If we can find a single p_value field with value <0.05, return True immediately.
def check005LessPVal(root):
    pval_iter = getPVal_iter(root)
    for x in find005LessPValIter(pval_iter):
        return True
    return False

#Builds a python iterator object. Goes through the p_value's from the XML
# tree one at a time. If the value of the text is <=0.05, yield it.
def find005LessPValIter(p_val_iter):
    for p_val in p_val_iter:
        outcome_type = p_val.xpath("../../../type")[0].text
        
        if(outcome_type!="Primary"):continue
        
        ptext = p_val.text

        ptext = ptext.replace(" ","")
        ptext = ptext.replace("=","")
        ptext = ptext.replace("p","")
        ptext = ptext.replace(",",".")
#       print(trial_name,ptext)

        if(ptext[0]==">"):
            p_float = float(ptext.replace(">",""))
            #We found a good probablility, stop searching
            if(p_float < 0.05):
                yield p_val
        elif("<" in ptext):
            p_float = float(ptext.replace("<",""))
            #We found a good probablility, stop searching
            if(p_float <= 0.05):
                yield p_val
        else:
            p_float = float(ptext)
            #We found a good probablility, stop searching.
            if(p_float <= 0.05):
                yield p_val
    return False

def pvalueToFloat(ptext):
    ptext = ptext.replace(" ","")
    ptext = ptext.replace("=","")
    ptext = ptext.replace("p","")
    ptext = ptext.replace(",",".")
    ptext = ptext.replace(">","")
    ptext = ptext.replace("<","")
    return float(ptext)
    
def getEnrollmentAndType(root):
    x = root.find("enrollment")
    if(x==None): return ("N/A","N/A")
    typ = x.get("type")
    text = x.text
    return (typ,text)

def getIntervention(root):
    inds = root.xpath("//clinical_study/intervention")
    drug_cnt = 0
    bio_cnt = 0
    ind_info = []
    ind_list = []
    for ind in inds:
        ind_type = getPropertyString(ind,"intervention_type")
        ind_name = getPropertyString(ind,"intervention_name")
        ind_description = getPropertyString(ind,"description") 
        if(ind_type=="Drug"): drug_cnt+=1
        if(ind_type=="Biological"): bio_cnt+=1
        ind_info.append(f"{ind_type}:{ind_name}/{ind_description}")
        ind_list.append((ind_type,ind_name,ind_description))
    int_info_str = "|".join(ind_info)
    return (int_info_str,drug_cnt,bio_cnt, ind_list)
        

#Iterates through all XML files we have saved from clincal trials
# and yields an LXML object which parses that XML.
def getTrialIter():
    with open("p-value-cnt.txt") as f:
        next(f) #header
        for line in tqdm(f):
            (fname, _) = line.split(":")
            path = getNCTFilePath(fname)
            root = getRootFromPath(path)
            yield root
    
def getTrialIter005LessPVal():
    for root in getTrialIter():
        if(check005LessPVal(root)): yield root
            
def getGroupInfo(outcome):
    group_info = []
    groups = outcome.xpath("measure/analyzed_list/analyzed/count_list/count")
    for group in groups:
        group_id = group.get("group_id")
        value = group.get("value")
        group_info.append(group_id + "/" + value)
    return "|".join(group_info)

def getPropertyString(node,property_xpath,exclude_NA=False):
    vals = node.xpath(property_xpath)
    #XPath returns a list of all xml nodes matching our pattern. What
    # we do is grab the text from each node, and combine them in a list.
    if(exclude_NA): vals_text_list = [val.text for val in vals if val.text!="NA"]
    else:vals_text_list = [val.text for val in vals]
            #Combines all text fields into a single value. So
            # 3 fields would be stored in a list [0.01,0.02,0.04]
            # becomes 0.01|0.02|0.04
    if(len(vals_text_list)==0):return "N/A"
    property_string = "|".join(vals_text_list)
    return property_string

def getPropertyList(node,property_xpath,exclude_NA=False,getTextOnly=False):
    vals = node.xpath(property_xpath)
    #XPath returns a list of all xml nodes matching our pattern. What
    # we do is grab the text from each node, and combine them in a list.
    if(exclude_NA): val_list = [val for val in vals if val.text!="NA"]
    else: val_list = [val for val in vals]
    
    if(len(val_list)==0):return []
    
    if(getTextOnly): return [val.text for val in val_list]
    else: return val_list

In [3]:
t = getNCTFilePath("NCT00006392.xml")

In [4]:
tree = ET.parse(t)
root = tree.getroot()

In [5]:
x = root.find("condition").text

In [10]:
with open("p-value-cnt.txt") as f, open('clinicaltrial-pvalue.csv', 'w', newline='', encoding='utf-8') as csvfile:
    #spamwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    #spamwriter.writerow(["NCT","P-Value Count","Condition","Phase"])
    fieldnames = ["NCT","P-Value-Count","Condition","Phase","Link"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    next(f) #header
    cnt = 0
    for line in tqdm(f):
        
        trial_name = line.split('.')[0]
        (fname, pval) = line.split(":")
        path = getNCTFilePath(fname)
        root = getRootFromPath(path)
        condition = getConditionFromRoot(root)
        phase = getPhaseFromRoot(root)
        
        link = getLink(trial_name)
        #spamwriter.writerow([trial_name,int(pval),condition,phase])
        row_dict = {"NCT":trial_name,
                    "P-Value-Count":int(pval),
                    "Condition":condition,
                    "Phase":phase,
                    "Link":link
                 }
        writer.writerow(row_dict)
        cnt+=1
        if(cnt>10):break

0it [00:00, ?it/s]

In [11]:
with open("p-value-cnt.txt") as f:
    next(f) #header
    cnt = 0
    for line in tqdm(f):
        
        trial_name = line.split('.')[0]
        (fname, pval) = line.split(":")
        path = getNCTFilePath(fname)
        root = getRootFromPath(path)
        condition = getConditionFromRoot(root)
        phase = getPhaseFromRoot(root)
        
        link = getLink(trial_name)
        #spamwriter.writerow([trial_name,int(pval),condition,phase])
        row_dict = {"NCT":trial_name,
                    "P-Value-Count":int(pval),
                    "Condition":condition,
                    "Phase":phase,
                    "Link":link
                 }
#        writer.writerow(row_dict)
        cnt+=1
        iter_ps = getP_Val_iter(root)
        #print(trial_name)
        less_than_point_zero_five = find005LessPVal(iter_ps)
        #print(trial_name,less_than_point_zero_five,getLink(trial_name))
        #if(cnt>100):break

0it [00:00, ?it/s]

NameError: name 'getP_Val_iter' is not defined

In [12]:
#NCT
#Condition
#Intervention/Treatment
#Study type
#Enrollment (estimated)
#Ages eligible for study
#Sexes eligible for study
#P-values
#Endpoint type (primary/secondary) related to the p-values
#All other data related to that p-value, including comparison group, comments, etc.
#Link to ct record
#Associated publications
#MeSH terms

In [13]:
with open("p-value-cnt.txt") as f:
    next(f) #header
    cnt = 0
    for line in tqdm(f):
        
        nct = line.split('.')[0]
        (fname, pval) = line.split(":")
        path = getNCTFilePath(fname)
        root = getRootFromPath(path)
        (enrollment_type, enrollment) = getEnrollmentAndType(root)
        
        #For each property, we want, we explore the XML tree using XPath.
        # when we find the property, we grab it and put it in another
        # dictonary.
        xpath_dict = {
            "gender":"//clinical_study/eligibility/gender",
            "minimum_age":"//clinical_study/eligibility/minimum_age",
            "maximum_age":"//clinical_study/eligibility/maximum_age"
        }
        
        property_dict = {"NCT":nct,
                        "enrollment":enrollment,
                        "enrollment_type":enrollment_type}
        
        for key in xpath_dict:
            xpath_for_key = xpath_dict[key]
            
            vals = root.xpath(xpath_for_key)
            #Gets every property found using the provided XPath, gets
            #it's text, and makes it a list.
            property_text = [val.text for val in vals]
            #Combines all text fields into a single value. So
            # 3 fields would be stored in a list [0.01,0.02,0.04]
            # becomes 0.01|0.02|0.04
            property_for_key = "|".join(property_text)
            property_dict[key] = property_for_key
        
        cnt+=1
        if(cnt>1000):
            break

0it [00:00, ?it/s]

### Get drug properties if available. 

In [6]:
drugname_to_props = {}
with open('clinical-trial-drugs-properties.csv',encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for d in reader:
        drug_name = d["drug_name"]
        drugname_to_props[drug_name] = d
def getInterventionProp(intervention,prop):
    if(intervention not in drugname_to_props): return None
    ind_dict = drugname_to_props[intervention]
    return ind_dict.get(prop,None)

In [None]:
with open("p-value-cnt.txt") as f, open('clinicaltrial-pvalue.csv', 'w', newline='', encoding='utf-8') as csvfile:
    next(f) #header
    fieldnames = ["nct","brief_title","phase","start_date","completion_date","condition","interventions","drug_interventions","enrollment",
                  "enrollment_type","gender","minimum_age",
                  "maximum_age","healthy_volunteers","reference_pmids","result_pmids",
                  "condition_mesh_terms","intervention_mesh_terms","link","outcome_type",
                  "outcome_title","outcome_timeframe","outcome_description","outcome_groups","analysis_method",
                  "analysis_groups","analysis_pvalue","pvalue_number"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    next(f) #header
    cnt = 0
    for line in tqdm(f):
        
        #Our list of files is stored as entries like NCT00278278.xml, we want NCT00278278, so we remove the end piece.
        nct = line.split('.')[0]
        link = getLink(nct)

        (fname, pval) = line.split(":")
        path = getNCTFilePath(fname)
        root = getRootFromPath(path)
        
        #Checks the current study to see if *ANY* of the p-values in it's statistical tests are <0.05.
        # If this is not the case, we "continue" the loop, essentially bypassing this trial from our output.
        if(not check005LessPVal(root)):continue

        condition = getConditionFromRoot(root)
        phase = getPhaseFromRoot(root)
        
        #This needs to be processed specially because enrollment
        # type is stored as a tag on the <enrollment> xml element.
        (enrollment_type, enrollment) = getEnrollmentAndType(root)
        
        (int_info_str,drug_cnt,bio_cnt, inter_list) = getIntervention(root)
        
        #Removes clinical trials without a single drug intervention.
        if(drug_cnt==0 & bio_cnt==0):continue
        
        #For each property, we want, we explore the XML tree using XPath.
        # when we find the property, we grab it and put it in another
        # dictonary.
        xpath_dict = {
            "gender":"//clinical_study/eligibility/gender",
            "start_date":"//clinical_study/start_date",            
            "completion_date":"//clinical_study/completion_date",
            "minimum_age":"//clinical_study/eligibility/minimum_age",
            "maximum_age":"//clinical_study/eligibility/maximum_age",
            "phase":"//clinical_study/phase",
            "condition":"//clinical_study/condition",
            "healthy_volunteers":"//clinical_study/eligibility/healthy_volunteers",
            "reference_pmids":"//clinical_study/results_reference/PMID",
            "result_pmids":"//clinical_study/reference/PMID",
            "condition_mesh_terms":"//clinical_study/condition_browse/mesh_term",
            "intervention_mesh_terms":"//clinical_study/intervention_browse/mesh_term",
            "link":"//clinical_study/required_header/url",
            "brief_title":"//clinical_study/brief_title",
            "nct":"//clinical_study/id_info/nct_id"
        }
        
        property_dict = {"enrollment":enrollment,
                         "enrollment_type":enrollment_type,
                         "interventions":int_info_str,
                         "drug_interventions":drug_cnt
                        }

        
        for key in xpath_dict:
            xpath_for_key = xpath_dict[key]
            
            #vals = root.xpath(xpath_for_key)
            #Gets every property found using the provided XPath, gets
            #it's text, and makes it a list.
            #property_text = [val.text for val in vals]
            #Combines all text fields into a single value. So
            # 3 fields would be stored in a list [0.01,0.02,0.04]
            # becomes 0.01|0.02|0.04
            #property_for_key = "|".join(property_text)
            property_for_key = getPropertyString(root,xpath_for_key,exclude_NA=True)
            
            property_dict[key] = property_for_key

        outcomes = root.xpath("//clinical_study/clinical_results/outcome_list/outcome")
        for outcome in outcomes:
            #XKCD
            property_dict["outcome_type"] = getPropertyString(outcome,"type")
            
            if(property_dict["outcome_type"]!="Primary"):continue
            
            property_dict["outcome_title"] = getPropertyString(outcome,"title")
            property_dict["outcome_description"] = getPropertyString(outcome,"description")
            property_dict["outcome_timeframe"] = getPropertyString(outcome,"time_frame")
            
            property_dict["outcome_groups"] = getGroupInfo(outcome)
            

            for p_val in outcome.xpath("analysis_list/analysis/p_value"):
                property_dict["analysis_method"] = getPropertyString(p_val,"../method")
                property_dict["analysis_groups"] = getPropertyString(p_val,"../group_id_list/group_id")
                property_dict["analysis_pvalue"] = p_val.text
                try:
                    property_dict["pvalue_number"] = pvalueToFloat(p_val.text)
                #Some strange clinical trials don't give a number for their p-value... We skip those.
                except ValueError:
                    continue
                writer.writerow(property_dict)
        cnt+=1

In [7]:
with open("p-value-cnt.txt") as f, open('clinicaltrial-pvalue-interventions-mapped-to-drugcentral.csv', 'w', newline='', encoding='utf-8') as csvfile:
    next(f) #header
    fieldnames = ["nct","brief_title","phase","start_date","completion_date","condition","interventions","drug_interventions","enrollment",
                  "enrollment_type","gender","minimum_age",
                  "maximum_age","healthy_volunteers","reference_pmids","result_pmids",
                  "condition_mesh_terms","intervention_mesh_terms","link","intervention_status","intervention_description", "intervention_cas_no", "intervention_fda_label", "intervention_drugcentral_id", "intervention_link","intervention_fda_approval","intervention_ema_approval","intervention_pmda_approval",
                  "outcome_type","outcome_title","outcome_timeframe","outcome_description","outcome_groups","analysis_method",
                  "analysis_groups","analysis_pvalue","pvalue_number"]
    

    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    next(f) #header
    cnt = 0
    for line in tqdm(f):
        
        #Our list of files is stored as entries like NCT00278278.xml, we want NCT00278278, so we remove the end piece.
        nct = line.split('.')[0]
        link = getLink(nct)

        (fname, pval) = line.split(":")
        path = getNCTFilePath(fname)
        root = getRootFromPath(path)
        
        #Checks the current study to see if *ANY* of the p-values in it's statistical tests are <0.05.
        # If this is not the case, we "continue" the loop, essentially bypassing this trial from our output.
        if(not check005LessPVal(root)):continue

        condition = getConditionFromRoot(root)
        phase = getPhaseFromRoot(root)
        
        #This needs to be processed specially because enrollment
        # type is stored as a tag on the <enrollment> xml element.
        (enrollment_type, enrollment) = getEnrollmentAndType(root)
        
        (int_info_str,drug_cnt,bio_cnt, inter_list) = getIntervention(root)
        
        #Removes clinical trials without a single drug intervention.
        if(drug_cnt==0 & bio_cnt==0):continue
        
        #For each property, we want, we explore the XML tree using XPath.
        # when we find the property, we grab it and put it in another
        # dictonary.
        xpath_dict = {
            "gender":"//clinical_study/eligibility/gender",
            "start_date":"//clinical_study/start_date",            
            "completion_date":"//clinical_study/completion_date",
            "minimum_age":"//clinical_study/eligibility/minimum_age",
            "maximum_age":"//clinical_study/eligibility/maximum_age",
            "phase":"//clinical_study/phase",
            "condition":"//clinical_study/condition",
            "healthy_volunteers":"//clinical_study/eligibility/healthy_volunteers",
            "reference_pmids":"//clinical_study/results_reference/PMID",
            "result_pmids":"//clinical_study/reference/PMID",
            "condition_mesh_terms":"//clinical_study/condition_browse/mesh_term",
            "intervention_mesh_terms":"//clinical_study/intervention_browse/mesh_term",
            "link":"//clinical_study/required_header/url",
            "brief_title":"//clinical_study/brief_title",
            "nct":"//clinical_study/id_info/nct_id"
        }
        
        property_dict = {"enrollment":enrollment,
                         "enrollment_type":enrollment_type,
                         "interventions":int_info_str,
                         "drug_interventions":drug_cnt
                        }

        
        for key in xpath_dict:
            xpath_for_key = xpath_dict[key]

            property_for_key = getPropertyString(root,xpath_for_key,exclude_NA=True)
            
            property_dict[key] = property_for_key

        for (inter_type,intervention,desc) in inter_list:
            property_dict["intervention_description"] = getInterventionProp(intervention,"description")
            property_dict["intervention_cas_no"] = getInterventionProp(intervention,"cas_number")
            property_dict["intervention_fda_label"] = getInterventionProp(intervention,"fda_label")
            property_dict["intervention_drugcentral_id"] = getInterventionProp(intervention,"drugcentral_id")
            property_dict["intervention_link"] = getInterventionProp(intervention,"link")
            property_dict["intervention_status"] = getInterventionProp(intervention,"status")
            property_dict["intervention_fda_approval"] = getInterventionProp(intervention,"fda_approval")
            property_dict["intervention_ema_approval"] = getInterventionProp(intervention,"ema_approval")
            property_dict["intervention_pmda_approval"] = getInterventionProp(intervention,"pmda_approval")
            property_dict["interventions"] = f"{inter_type}:{intervention}"

            outcomes = root.xpath("//clinical_study/clinical_results/outcome_list/outcome")
            for outcome in outcomes:
                #XKCD
                property_dict["outcome_type"] = getPropertyString(outcome,"type")

                if(property_dict["outcome_type"]!="Primary"):continue

                property_dict["outcome_title"] = getPropertyString(outcome,"title")
                property_dict["outcome_description"] = getPropertyString(outcome,"description")
                property_dict["outcome_timeframe"] = getPropertyString(outcome,"time_frame")

                property_dict["outcome_groups"] = getGroupInfo(outcome)


                for p_val in outcome.xpath("analysis_list/analysis/p_value"):
                    property_dict["analysis_method"] = getPropertyString(p_val,"../method")
                    property_dict["analysis_groups"] = getPropertyString(p_val,"../group_id_list/group_id")
                    property_dict["analysis_pvalue"] = p_val.text
                    try:
                        property_dict["pvalue_number"] = pvalueToFloat(p_val.text)
                    #Some strange clinical trials don't give a number for their p-value... We skip those.
                    except ValueError:
                        continue
                    writer.writerow(property_dict)
            cnt+=1

0it [00:00, ?it/s]