In [1]:
# -*- coding: utf-8 -*-
#eucrt_scrape.py

#Program to scrape data on EU clinical trials from clinicaltrialsregister.eu
#
#Creates 5 CSV files, linked by trial_id:
#
#1) eucrt_data has one observation per trial
#2) eucrt_sponsor_data has one observation per trial-sponsor (because trials can have multiple sponsors)
#3) eucrt_imp_data has one observation per trial-drug (investigational medicinal product); trials usually have multiple IMPs
#4) eucrt_country_data has one observation per trial, with a list of countries in which trial has sites; need to add member states
#   to this list and remove duplicates
#5) eucrt_ms_data has one observation per trial-member state
#
#Variables are described in 2009_02_04_guideline_en.pdf
#
#Created on Fri Jul 10 12:13:51 2015
# 
#@author: margaretkyle
#"""
 
import urllib
import urllib.parse
import urllib.request
import ssl
import csv

In [2]:
# TLS configuration for all requests made by this notebook.
#
# The previous version forced ssl.PROTOCOL_SSLv3 (obsolete, insecure and absent
# from most current Python/OpenSSL builds, so the cell raised AttributeError) and
# built the request context with the private ssl._create_unverified_context(),
# which disables certificate and hostname checks.  A single default context is
# used instead: it negotiates the best TLS version available and verifies the
# server certificate.
# NOTE(review): if certificate verification fails on this machine, install the
# system / certifi CA bundle rather than switching verification off again.
context = ssl.create_default_context()

# Install a global opener that uses the same context, so plain urlopen() calls
# made without an explicit `context=` argument behave consistently.
https_handler = urllib.request.HTTPSHandler(context=context)
https_sslv3_handler = https_handler  # historical name, kept for backward compatibility
opener = urllib.request.build_opener(https_handler)
urllib.request.install_opener(opener)

In [3]:
# Output locations: every CSV is written into the same directory.
# NOTE(review): this is a machine-specific absolute path; adjust it before
# running the notebook anywhere else.
datapath = '/Users/juqiu/Documents/projectIP/Data/EU/'
csv_file, sponsor_file, imp_file, country_file, ms_file = (
    datapath + filename
    for filename in (
        "eucrt_data.csv",          # one row per trial
        "eucrt_sponsor_data.csv",  # one row per trial-sponsor
        "eucrt_imp_data.csv",      # one row per trial-IMP
        "eucrt_country_data.csv",  # one row per trial, list of country lines
        "eucrt_ms_data.csv",       # one row per trial-member state
    )
)

  
# Variables to extract
#
# Trial-level field labels.  The list serves two purposes:
#   * it is written verbatim as the header row of the trial-level CSV;
#   * each entry (except index 0) is searched for as a substring of every line
#     of a trial record, and the matching line supplies that column's value.
# Index 0 ("EudraCT Number:") is not searched for; that column is filled with
# the trial ID directly.
# The strings must therefore match the downloaded text exactly, including
# spacing and punctuation - do not "tidy" them.
taglist = [
    "EudraCT Number:", 
    "Link:",
    "A.3 Full title of the trial:",
    "Start Date:",
    "A.4.1 Sponsor's protocol code number:",
    "A.5.1 ISRCTN (International Standard Randomised Controlled Trial) number:",
    "A.5.2 US NCT (ClinicalTrials.gov registry) number:",
    "A.7 Trial is part of a Paediatric Investigation Plan:",
    "A.8 EMA Decision number of Paediatric Investigation Plan:",
    "E.1.1 Medical condition(s) being investigated:",
    "E.1.1.1 Medical condition in easily understood language:",
    "E.1.1.2 Therapeutic area:",
    "E.1.2 Medical condition or disease under investigation:",
    "E.1.2 Version:",
    "E.1.2 Level:",
    "E.1.2 Classification code:",
    "E.1.2 Term:",
    "E.1.2 System Organ Class:",
    # NOTE(review): the six E.1.2 labels are repeated below.  Because matching
    # is by label text, both copies always receive the same value, so this
    # yields six duplicated columns - confirm whether that is intended.
    "E.1.2 Medical condition or disease under investigation:",
    "E.1.2 Version:",
    "E.1.2 Level:",
    "E.1.2 Classification code:",
    "E.1.2 Term:",
    "E.1.2 System Organ Class:",
    "E.1.3 Condition being studied is a rare disease:",
    "E.2.1 Main objective of the trial:",
    "E.2.2 Secondary objectives of the trial:",
    "E.2.3 Trial contains a sub-study:",
    "E.5.1 Primary end point(s):",
    "E.5.1.1 Timepoint(s) of evaluation of this end point:",
    "E.5.2 Secondary end point(s):",
    "E.5.2.1 Timepoint(s) of evaluation of this end point:",
    "E.6.1 Diagnosis:",
    "E.6.2 Prophylaxis:",
    "E.6.3 Therapy:",
    "E.6.4 Safety:",
    "E.6.5 Efficacy:",
    "E.6.6 Pharmacokinetic:",
    "E.6.7 Pharmacodynamic:",
    "E.6.8 Bioequivalence:",
    "E.6.9 Dose response:",
    "E.6.10 Pharmacogenetic:",
    "E.6.11 Pharmacogenomic:",
    "E.6.12 Pharmacoeconomic:",
    "E.6.13 Others:",
    "E.7.1 Human pharmacology (Phase I):",
    "E.7.1.1 First administration to humans:",
    "E.7.1.2 Bioequivalence study:",
    "E.7.1.3 Other:",
    # The next label ends with a space after the colon, so it only matches
    # lines in which the label is followed by a space.
    "E.7.1.3.1 Other trial type description: ",
    "E.7.2 Therapeutic exploratory (Phase II):",
    "E.7.3 Therapeutic confirmatory (Phase III):",
    "E.7.4 Therapeutic use (Phase IV):",
    "E.8.1 Controlled:",
    "E.8.1.1 Randomised:",
    "E.8.1.2 Open:",
    "E.8.1.3 Single blind:",
    "E.8.1.4 Double blind:",
    "E.8.1.5 Parallel group:",
    "E.8.1.6 Cross over:",
    "E.8.1.7 Other:",
    "E.8.2.1 Other medicinal product(s):",
    "E.8.2.2 Placebo:",
    "E.8.2.3 Other:",
    "E.8.2.3.1 Comparator description:",
    "E.8.2.4 Number of treatment arms in the trial:",
    "E.8.5 The trial involves multiple Member States:",
    "E.8.5.1 Number of sites anticipated in the EEA:",
    "E.8.6.1 Trial being conducted both within and outside the EEA:",
    "E.8.6.2 Trial being conducted completely outside of the EEA:",
    "E.8.6.3 Specify the countries outside of the EEA in which trial sites are planned:",
    "E.8.7 Trial has a data monitoring committee:",
    "E.8.8 Definition of the end of the trial and justification where it is not the last visit of the last subject undergoing the trial:",
    "E.8.9.2 In all countries concerned by the trial years:",
    "E.8.9.2 In all countries concerned by the trial months:",
    "E.8.9.2 In all countries concerned by the trial days:",
    "F.1.1 Trial has subjects under 18:",
    "F.1.1.1 In Utero:",
    "F.1.1.2 Preterm newborn infants (up to gestational age < 37 weeks):",
    "F.1.1.3 Newborns (0-27 days):",
    "F.1.1.4 Infants and toddlers (28 days-23 months):",
    "F.1.1.5 Children (2-11years):",
    "F.1.1.6 Adolescents (12-17 years):",
    "F.1.2 Adults (18-64 years):",
    "F.1.2.1 Number of subjects for this age range:",
    "F.1.3 Elderly (>=65 years):",
    "F.1.3.1 Number of subjects for this age range:",
    "F.2.1 Female:",
    "F.2.2 Male:",
    "F.3.1 Healthy volunteers:",
    "F.3.2 Patients:",
    "F.3.3 Specific vulnerable populations:",
    "F.3.3.1 Women of childbearing potential not using contraception :",
    "F.3.3.2 Women of child-bearing potential using contraception:",
    "F.3.3.3 Pregnant women:",
    "F.3.3.4 Nursing women:",
    "F.3.3.5 Emergency situation:",
    "F.3.3.6 Subjects incapable of giving consent personally:",
    "F.3.3.7 Others:",
    "F.4.2.1 In the EEA:",
    "F.4.2.2 In the whole clinical trial:",
    "P. Date of the global end of the trial:"]

# Member-state-level field labels: header row of the member-state CSV and the
# substrings searched for in each record line.  Index 0 is filled with the
# trial ID rather than parsed.  One row is written for every record, including
# records whose trial ID has already been seen.
mslist = [
    "EudraCT Number:",
    "Sponsor's Protocol Code Number:",
    "National Competent Authority:",
    "Clinical Trial Type:",
    "Trial Status:",
    "Date on which this record was first entered in the EudraCT database:",
    "A.1 Member State Concerned:",
    "E.8.3 The trial involves single site in the Member State concerned:",
    "E.8.4 The trial involves multiple sites in the Member State concerned:",
    "E.8.4.1 Number of sites anticipated in Member State concerned:",
    "E.8.9.1 In the Member State concerned years:",
    "E.8.9.1 In the Member State concerned months:",
    "E.8.9.1 In the Member State concerned days:",
    "F.4.1 In the member state:",
    "N. Competent Authority Decision:",
    "N. Date of Competent Authority Decision:",
    "N. Ethics Committee Opinion of the trial application:",
    # This label contains a colon in the middle of its text.
    "N. Ethics Committee Opinion: Reason(s) for unfavourable opinion:",
    "N. Date of Ethics Committee Opinion:",
    "P. End of Trial Status:"]
    

# Sponsor-level field labels: header row of the sponsor CSV and the substrings
# searched for in each line of a sponsor section.  Index 0 is filled with the
# trial ID.  Three entries carry only the field code (no descriptive text, no
# colon); "B.4.1 " additionally ends with a space, which is part of the match.
sponsorlist = [
    "EudraCT Number:", 
    "B.1.1 Name of Sponsor:",
    "B.1.3.4",
    "B.3.1",
    "B.4.1 ",
    "B.4.2 Country:"]

# IMP-level field labels: header row of the IMP CSV and the substrings searched
# for in each line of an IMP section.  Index 0 is filled with the trial ID.
# The strings must match the downloaded text exactly; do not normalise them.
implist = [
    "EudraCT Number:", 
    "D.1.2 and D.1.3 IMP Role:",
    "D.2.1 IMP to be used in the trial has a marketing authorisation:",
    "D.2.1.1.1 Trade name:",
    "D.2.1.1.2 Name of the Marketing Authorisation holder:",
    "D.2.1.2 Country which granted the Marketing Authorisation:",
    "D.2.5 The IMP has been designated in this indication as an orphan drug in the Community:",
    # The next label ends with a space after the colon, so it only matches
    # lines in which the label is followed by a space.
    "D.2.5.1 Orphan drug designation number: ",
    "D.3.1 Product name:",
    "D.3.2 Product code:",
    "D.3.4 Pharmaceutical form:",
    "D.3.4.1 Specific paediatric formulation:",
    "D.3.7 Routes of administration for this IMP:",
    "D.3.8 INN - Proposed INN:",
    "D.3.9.1 CAS number:",
    "D.3.9.2 Current sponsor code:",
    "D.3.9.3 Other descriptive name:",
    "D.3.9.4 EV Substance Code:",
    "D.3.10.1 Concentration unit:",
    "D.3.10.2 Concentration type:",
    "D.3.10.3 Concentration number:",
    "D.3.11.1 Active substance of chemical origin:",
    "D.3.11.2 Active substance of biological/ biotechnological origin (other than Advanced Therapy IMP (ATIMP):",
    "D.3.11.3 Advanced Therapy IMP (ATIMP):",
    "D.3.11.3.1 Somatic cell therapy medicinal product:",
    "D.3.11.3.2 Gene therapy medical product:",
    "D.3.11.3.3 Tissue Engineered Product:",
    "D.3.11.3.4 Combination ATIMP (i.e. one involving a medical device):",
    "D.3.11.3.5 Committee on Advanced therapies (CAT) has issued a classification for this product:",
    "D.3.11.4 Combination product that includes a device, but does not involve an Advanced Therapy:",
    "D.3.11.5 Radiopharmaceutical medicinal product:",
    "D.3.11.6 Immunological medicinal product (such as vaccine, allergen, immune serum):",
    "D.3.11.7 Plasma derived medicinal product:",
    "D.3.11.8 Extractive medicinal product:",
    "D.3.11.9 Recombinant medicinal product:",
    "D.3.11.10 Medicinal product containing genetically modified organisms:",
    "D.3.11.11 Herbal medicinal product:",
    "D.3.11.12 Homeopathic medicinal product:",
    "D.3.11.13 Another type of medicinal product:"]

# Header row of the country CSV.  Unlike the other lists these entries are not
# searched for in the record text: column 0 receives the trial ID and column 1
# receives the list of lines found in the record's E.8.6.3 section.
countrylist = [
    "EudraCT Number:", 
    "Country Sites"]


# Start every output file afresh (mode 'w' truncates) with its header row.
# The files are pipe-delimited; later cells append data rows to them.
for header_path, header_row in (
        (csv_file, taglist),
        (sponsor_file, sponsorlist),
        (imp_file, implist),
        (country_file, countrylist),
        (ms_file, mslist)):
    with open(header_path, encoding='utf-8', mode='w', newline='') as c:
        datafile = csv.writer(c, delimiter='|')
        datafile.writerow(header_row)

In [None]:
# This keeps a list of trial IDs, to avoid re-reading duplicate records
trialid_set = set()

# Define first part of URL
# This selects all trials, all fields to download
base_url = 'https://www.clinicaltrialsregister.eu/ctr-search/rest/download/full?query=&page='

# Pages 1..last_page are requested.
# NOTE(review): presumably the number of result pages when the script was
# written - confirm against the register before re-running.
last_page = 1613


def _append_row(path, row):
    """Append one pipe-delimited row to the CSV file at `path`."""
    with open(path, encoding='utf-8', mode='a', newline='') as c:
        csv.writer(c, delimiter='|').writerow(row)


def _extract_fields(text, tags, trial_id):
    """Build one output row from `text` for the labels in `tags`.

    Column 0 is always `trial_id`.  For every other label, each line of `text`
    containing the label (substring match) supplies the value; if several lines
    match, the last one wins.  Labels that never match stay None.

    The value is the text that follows the label.  For labels that do not end
    in a colon (e.g. "B.1.3.4") the remainder of the descriptive text up to and
    including the next colon is skipped as well.  Slicing after the label,
    rather than after the first colon of the line, keeps labels that contain a
    colon themselves from leaking label text into the value.
    """
    row = [None] * len(tags)
    row[0] = trial_id
    for line in text.splitlines():
        if not line:
            continue
        for tagnum in range(1, len(tags)):
            tag = tags[tagnum]
            pos = line.find(tag)
            if pos == -1:
                continue
            value = line[pos + len(tag):]
            if not tag.rstrip().endswith(':'):
                _, colon, tail = value.partition(':')
                if colon:
                    value = tail
            row[tagnum] = value.strip()
    return row


def _split_numbered_sections(text, marker):
    """Split `text` into the chunks belonging to items 1, 2, 3, ...

    Item N (N >= 2) starts at the first occurrence of `marker + str(N)` after
    item N-1; everything before the item-2 marker belongs to item 1.  Splitting
    stops at the first number whose marker is absent, so any number of items is
    supported.  (The previous hand-unrolled version looked for item 3 inside a
    chunk that had already been cut off at the item-3 marker, so items 3+ were
    always lost.)
    """
    sections = []
    remainder = text
    number = 2
    while True:
        head, found, remainder = remainder.partition(marker + str(number))
        sections.append(head)
        if not found:
            return sections
        number += 1


def _process_record(record):
    """Parse one trial record and append its rows to the output files.

    A member-state row is written for every record.  Trial-level, country,
    sponsor and IMP rows are written only the first time a trial ID is seen.
    Records without an 'EudraCT Number:' line are skipped.
    """
    id_lines = record.partition('EudraCT Number:')[2].splitlines()
    if not id_lines:
        # e.g. an empty fragment produced by the split; nothing to link to.
        return
    trial_id = id_lines[0].strip()

    first_seen = trial_id not in trialid_set
    if first_seen:
        trialid_set.add(trial_id)
        _append_row(csv_file, _extract_fields(record, taglist, trial_id))

    # Member-state specific details (every record, new or duplicate trial)
    _append_row(ms_file, _extract_fields(record, mslist, trial_id))
    if not first_seen:
        return

    # Partition the record to yield only the list of countries in which trial is conducted
    countryinfo = record.partition('E.8.6.3 If E.8.6.1 or E.8.6.2 are Yes, specify the regions in which trial sites are planned:\r\n')[2]
    if countryinfo == '':
        countryinfo = record.partition('E.8.6.3 Specify the countries outside of the EEA in which trial sites are planned:')[2]
    countryinfo = countryinfo.partition('E.8.7 Trial has a data monitoring committee:')[0]
    if len(countryinfo) > 5:
        # The second column holds the Python list of lines, as before.
        _append_row(country_file, [trial_id, countryinfo.splitlines()])

    # Partition the record to yield only the sponsor information
    sponsorinfo = record.partition('B. Sponsor Information')[2]
    sponsorinfo = sponsorinfo.partition('D. IMP Identification')[0]
    # NOTE(review): IMP sections are split on 'D.IMP: N'; confirm that sponsor
    # sections really are labelled 'Sponsor N' in the download.
    for sponsordata in _split_numbered_sections(sponsorinfo, 'Sponsor '):
        if sponsordata != '':
            _append_row(sponsor_file, _extract_fields(sponsordata, sponsorlist, trial_id))

    # Partition the record to yield only the IMP information
    impinfo = record.partition('D. IMP Identification')[2]
    impinfo = impinfo.partition('D.8 Information on Placebo')[0]
    for impdata in _split_numbered_sections(impinfo, 'D.IMP: '):
        if impdata != '':
            _append_row(imp_file, _extract_fields(impdata, implist, trial_id))


# Loop through pages
for pagenum in range(1, last_page + 1):
    # This adds which page to download and all trials on that page
    page_url = base_url + str(pagenum) + '&mode=current_page'

    # Download.  Best effort: any failure here skips the page, but unlike a
    # bare `except:` this does not swallow KeyboardInterrupt, and the reason
    # is reported.
    try:
        with urllib.request.urlopen(page_url, context=context, timeout=30) as f:
            plaintext = f.read()
    except Exception as exc:
        print("Could not fetch %s (%s)" % (page_url, exc))
        continue
    print("Downloaded %s" % page_url)

    try:
        pagetext = plaintext.decode(encoding='utf-8')
    except UnicodeDecodeError as exc:
        print("Could not decode %s (%s)" % (page_url, exc))
        continue

    # Remove header, then split into list of records
    recordtext = pagetext.partition("Summary")[2]
    for record in recordtext.split("Summary\r\n"):
        # A failure in one record must not discard the rest of the page, and
        # must not be reported as a download problem.
        try:
            _process_record(record)
        except Exception as exc:
            print("Could not process a record on %s (%r)" % (page_url, exc))


Downloaded https://www.clinicaltrialsregister.eu/ctr-search/rest/download/full?query=&page=1&mode=current_page
Downloaded https://www.clinicaltrialsregister.eu/ctr-search/rest/download/full?query=&page=2&mode=current_page
Downloaded https://www.clinicaltrialsregister.eu/ctr-search/rest/download/full?query=&page=3&mode=current_page
Downloaded https://www.clinicaltrialsregister.eu/ctr-search/rest/download/full?query=&page=4&mode=current_page
Downloaded https://www.clinicaltrialsregister.eu/ctr-search/rest/download/full?query=&page=5&mode=current_page
Downloaded https://www.clinicaltrialsregister.eu/ctr-search/rest/download/full?query=&page=6&mode=current_page
Downloaded https://www.clinicaltrialsregister.eu/ctr-search/rest/download/full?query=&page=7&mode=current_page
Downloaded https://www.clinicaltrialsregister.eu/ctr-search/rest/download/full?query=&page=8&mode=current_page
Downloaded https://www.clinicaltrialsregister.eu/ctr-search/rest/download/full?query=&page=9&mode=current_page
D