In [1]:
import pandas as pd
import numpy as np
import csv
import pickle
from difflib import SequenceMatcher

### US SPONSORS & ENROLMENT DATA

In [2]:
### us_sponsors
df_us_sponsors = pd.read_csv('../trial_sponsors/usa_sponsor.csv', sep='|')
# Determining duplicates
mask = df_us_sponsors.duplicated(keep=False)
double = df_us_sponsors[mask]  #len(double) = 0 ==> no duplicates in df_us_years

### us_enrolment
df_us_enrolment = pd.read_csv('../trial_enrolment/us_enrolment.csv', sep='|')
# Determining duplicates
mask = df_us_enrolment.duplicated(keep=False)
double = df_us_enrolment[mask]  #len(double) = 0 ==> no duplicates in df_us_years

### merge 
df_us = pd.merge(df_us_sponsors, df_us_enrolment, on='trial_id')
df_us[0:3]

Unnamed: 0,trial_id,primary_spons,sponsor_type,enrolment
0,NCT03260985,Stanford University,Non-Commercial,75.0
1,NCT03268473,Bezmialem Vakif University,Non-Commercial,60.0
2,NCT03262610,"Rhythm Pharmaceuticals, Inc.",Commercial,


In [3]:
len(df_us)

268859

### EU SPONSORS & ENROLMENT DATA

In [4]:
### eu_sponsors
df_eu_sponsors = pd.read_csv('../trial_sponsors/eu_sponsor.csv', sep='|')
# Determining duplicates
mask = df_eu_sponsors.duplicated(keep=False)
double = df_eu_sponsors[mask]  #len(double) = 0 ==> no duplicates in df_us_years

### eu_enrolment
df_eu_enrolment = pd.read_csv('../trial_enrolment/eu_enrolment.csv', sep='|')
# Determining duplicates
mask = df_eu_enrolment.duplicated(keep=False)
double = df_eu_enrolment[mask]  #len(double) = 0 ==> no duplicates in df_us_years

### merge
df_eu = pd.merge(df_eu_sponsors, df_eu_enrolment, on='trial_id')
df_eu[0:3]

Unnamed: 0,trial_id,primary_spons1,sponsor_type1,primary_spons2,sponsor_type2,primary_spons3,sponsor_type3,primary_spons4,sponsor_type4,enrolment
0,2004-000007-18,GlaxoSmithKline AB,Commercial,,,,,,,1110.0
1,2004-000012-13,Wyeth Research Division of Wyeth Pharmaceutica...,Commercial,,,,,,,360.0
2,2004-000015-25,Wyeth Research Division of Wyeth Pharmaceutica...,Commercial,,,,,,,1236.0


In [5]:
df_eu = df_eu.rename(index=str, columns={"primary_spons1": "primary_spons", 
                                         "sponsor_type1": "prim_sponsor_type", 
                                         "primary_spons2": "secondary_spons1", 
                                         "sponsor_type2": "sec_sponsor_type1",
                                         "primary_spons3": "secondary_spons2",
                                         "sponsor_type3": "sec_sponsor_type2",
                                         "primary_spons4": "secondary_spons3" ,
                                         "sponsor_type4": "sec_sponsor_type3",})
df_eu[0:1]

Unnamed: 0,trial_id,primary_spons,prim_sponsor_type,secondary_spons1,sec_sponsor_type1,secondary_spons2,sec_sponsor_type2,secondary_spons3,sec_sponsor_type3,enrolment
0,2004-000007-18,GlaxoSmithKline AB,Commercial,,,,,,,1110.0


In [6]:
len(df_eu)

32237

### ICTRP SPONSORS & ENROLMENT DATA

In [7]:
### ictrp_sponsors
df_ictrp_sponsors = pd.read_csv('../trial_sponsors/ictrp_sponsor.csv', sep='|')
# Determining duplicates
mask = df_ictrp_sponsors.duplicated(keep=False)
double = df_ictrp_sponsors[mask]  #len(double) = 0 ==> no duplicates in df_us_years

### ictrp_enrolment
df_ictrp_enrolment = pd.read_csv('../trial_enrolment/ictrp_enrolment.csv', sep='|')
# Determining duplicates
mask = df_ictrp_enrolment.duplicated(keep=False)
double = df_ictrp_enrolment[mask]  #len(double) = 0 ==> no duplicates in df_us_years

### merge
df_ictrp = pd.merge(df_ictrp_sponsors, df_ictrp_enrolment, on='trial_id')
df_ictrp[0:3]

Unnamed: 0,trial_id,primary_spons,sponsor_type,enrolment
0,ACTRN12605000058673,Commercial sector/Industry Pharmalab,Commercial,190.0
1,ACTRN12607000355471,Individual Maree Teesson,Commercial,1858.0
2,ACTRN12609000649213,Individual Professor Jon Currie,Commercial,56.0


In [8]:
len(df_ictrp)

114137

In [9]:
#To make it possible to compare sponsor_type
#Industry == commercial 
#U.S.Fed == Non-commercial 
#NIH == Non-commercial 

for i in range(0,len(df_us_sponsors)):
    if (df_us_sponsors.sponsor_type[i] == "Industry") or (df_us_sponsors.sponsor_type[i] == "industry") :
        df_us_sponsors.sponsor_type[i] = "Commercial"

for i in range(0,len(df_us_sponsors)):
    if (df_us_sponsors.sponsor_type[i] == "U.S.Fed") or (df_us_sponsors.sponsor_type[i] == "NIH") or (df_us_sponsors.sponsor_type[i] == "U.S. Fed") :
        df_us_sponsors.sponsor_type[i] = "Non-Commercial"
        

In [10]:
df_us_sponsors.sponsor_type[0:2]

0    Non-Commercial
1    Non-Commercial
Name: sponsor_type, dtype: object

### FUNCTIONS

In [11]:
### calculate similarity with "SequenceMatcher" where space isJunk 
def similar(a, b):
    return SequenceMatcher(lambda x: x == " ", a.lower(), b.lower()).ratio()

In [12]:
def is_nan(x):
    return (x is np.nan or x != x)

In [13]:
### Comparing the "Sponsor Type" of resulting lists
def list_sponsor_types(l1,l2):
    new_list = []
    for a, b in zip(l1,l2):
        
        if (is_nan(a) or is_nan(b)):
            new_list.append(np.nan)
        elif (a == b):
            new_list.append(1)
        else: 
            new_list.append(0)
            
    return(new_list)    

In [14]:
### Comparing the "Sponsor Name" of resulting lists
def list_sponsor_names(l1,l2):
    new_list = []
    for a, b in zip(l1, l2):
        if (is_nan(a) or is_nan(b)) :
            new_list.append(np.nan)
        else : 
            sim = similar(a, b)
            new_list.append(round(sim, 5))
    return(new_list)     

In [15]:
### Comparing the "enrolments" of resulting lists
def list_enrolments(l1,l2):
    new_list = []
    for a, b in zip(l1, l2):
        if (is_nan(a) or is_nan(b)) :
            new_list.append(np.nan)
        else : 
            diff = abs(a - b)
            new_list.append(diff)

    return(new_list)

## 01- Working for EU and US pairs

#### EU_US (ltitles_years) DATA

In [16]:
df_eu_us_ltitles_years = pd.read_excel('../_results/01-years_ltitles/new excels/eu_us_years_ltitles_list.xlsx')
df_eu_us = df_eu_us_ltitles_years.rename(index=str, columns={"similarity": "title_similarity"})
df_eu_us[0:3]

Unnamed: 0,id_EU,year_EU,long_title_EU,id_US,year_US,long_title_US,title_similarity
0,2004-000012-13,2004.0,"A Double-Blind, Placebo-Controlled, Parallel, ...",NCT00095342,,"A Double-Blind, Placebo-Controlled, Parallel, ...",1.0
1,2004-000012-13,2004.0,"A Double-Blind, Placebo-Controlled, Parallel, ...",NCT00076206,2003.0,"A Double-Blind, Parallel, Placebo-Controlled, ...",0.809783
2,2004-000012-13,2004.0,"A Double-Blind, Placebo-Controlled, Parallel, ...",NCT00141830,2005.0,"A Randomized, Parallel, Double-Blind, Placebo-...",0.812865


In [17]:
len(df_eu_us)

17306

In [18]:
#Create lists of enrolment, sponsor type & name for the pairs 
#then calculate the similarity and the diffrence 
#and then inject the results in the previous dataframe

In [None]:
#Create lists of enrolment, sponsor type & name for the pairs 
list_EU_prim_sponsor_type = []
list_EU_sec_sponsor1_type = []
list_EU_sec_sponsor2_type = []
list_EU_sec_sponsor3_type = []

list_US_sponsor_type = []

list_EU_prim_sponsor_name = []
list_EU_sec_sponsor1_name = []
list_EU_sec_sponsor2_name = []
list_EU_sec_sponsor3_name = []

list_US_sponsor_name = []

list_EU_enrol = []
list_US_enrol = []

for i in range(0,len(df_eu_us)):
    id_EU = df_eu_us.id_EU[i]
    id_US = df_eu_us.id_US[i]
    
    for j in range(0,len(df_eu)): 
        if (id_EU == df_eu.trial_id[j]) : 
            
            
            list_EU_prim_sponsor_name.append(df_eu.primary_spons[j])
            list_EU_prim_sponsor_type.append(df_eu.prim_sponsor_type[j])
            
            list_EU_sec_sponsor1_name.append(df_eu.secondary_spons1[j])
            list_EU_sec_sponsor1_type.append(df_eu.sec_sponsor_type1[j])
            
            list_EU_sec_sponsor2_name.append(df_eu.secondary_spons2[j])
            list_EU_sec_sponsor2_type.append(df_eu.sec_sponsor_type2[j])
            
            list_EU_sec_sponsor3_name.append(df_eu.secondary_spons3[j])
            list_EU_sec_sponsor3_type.append(df_eu.sec_sponsor_type3[j])
            
            list_EU_enrol.append(df_eu.enrolment[j])
            
            print(i,"id_EU",id_EU)
            
    for k in range(0,len(df_us)):
        if (id_US == df_us.trial_id[k]) :
            
            list_US_sponsor_type.append(df_us.sponsor_type[k])
            list_US_sponsor_name.append(df_us.primary_spons[k])
            list_US_enrol.append(df_us.enrolment[k])
            
            print(i,"id_US:",id_US)
            print('-----------------------')


0 id_EU 2004-000012-13
0 id_US: NCT00095342
-----------------------
1 id_EU 2004-000012-13
1 id_US: NCT00076206
-----------------------
2 id_EU 2004-000012-13
2 id_US: NCT00141830
-----------------------
3 id_EU 2004-000015-25
3 id_US: NCT00083993
-----------------------
4 id_EU 2004-000020-32
4 id_US: NCT00095550
-----------------------
5 id_EU 2004-000020-32
5 id_US: NCT00095394
-----------------------
6 id_EU 2004-000022-75
6 id_US: NCT00095290
-----------------------
7 id_EU 2004-000023-15
7 id_US: NCT00097591
-----------------------
8 id_EU 2004-000028-34
8 id_US: NCT00087711
-----------------------
9 id_EU 2004-000029-31
9 id_US: NCT00287716
-----------------------
10 id_EU 2004-000029-31
10 id_US: NCT00287729
-----------------------
11 id_EU 2004-000033-11
11 id_US: NCT00095550
-----------------------
12 id_EU 2004-000033-11
12 id_US: NCT00095394
-----------------------
13 id_EU 2004-000039-27
13 id_US: NCT00129766
-----------------------
14 id_EU 2004-000046-21
14 id_US: NCT002

In [22]:
#Sponsor Names
EUprim_USprim_sponsor_name = list_sponsor_names(list_EU_prim_sponsor_name, list_US_sponsor_name)
EUsec1_USprim_sponsor_name = list_sponsor_names(list_EU_sec_sponsor1_name, list_US_sponsor_name)
EUsec2_USprim_sponsor_name = list_sponsor_names(list_EU_sec_sponsor2_name, list_US_sponsor_name)
EUsec3_USprim_sponsor_name = list_sponsor_names(list_EU_sec_sponsor3_name, list_US_sponsor_name)

EUprim_USprim_sponsor_name[0:2]

[0.23529, 0.57576]

In [23]:
#Sponsor Types
EUprim_USprim_sponsor_type = list_sponsor_types(list_EU_prim_sponsor_type, list_US_sponsor_type)
EUsec1_USprim_sponsor_type = list_sponsor_types(list_EU_sec_sponsor1_type, list_US_sponsor_type)
EUsec2_USprim_sponsor_type = list_sponsor_types(list_EU_sec_sponsor2_type, list_US_sponsor_type)
EUsec3_USprim_sponsor_type = list_sponsor_types(list_EU_sec_sponsor3_type, list_US_sponsor_type)

EUsec2_USprim_sponsor_type[0:2]

[nan, nan]

In [24]:
#Enrolments
EU_US_enrol = list_enrolments(list_EU_enrol, list_US_enrol)
EU_US_enrol[0:2]

[30.0, 181.0]

In [25]:
#Save the final dataframe

df_eu_us['enrolment_EU'] = list_EU_enrol
df_eu_us['enrolment_US'] = list_US_enrol
df_eu_us['enrolment_diff'] = EU_US_enrol


df_eu_us['spon_US'] = list_US_sponsor_name

df_eu_us['primary_spon_EU'] = list_EU_prim_sponsor_name
df_eu_us['prim_spon_similarity'] = EUprim_USprim_sponsor_name

df_eu_us['secondary_spon1_EU'] = list_EU_sec_sponsor1_name
df_eu_us['sec_spon_similarity1'] = EUsec1_USprim_sponsor_name

df_eu_us['secondary_spon2_EU'] = list_EU_sec_sponsor2_name
df_eu_us['sec_spon_similarity2'] = EUsec2_USprim_sponsor_name

df_eu_us['secondary_spon3_EU'] = list_EU_sec_sponsor3_name
df_eu_us['sec_spon_similarity3'] = EUsec3_USprim_sponsor_name



df_eu_us['spon_type_US'] = list_US_sponsor_type

df_eu_us['primary_spon_type_EU'] = list_EU_prim_sponsor_type
df_eu_us['same_prim_spon_type'] = EUprim_USprim_sponsor_type

df_eu_us['secondary_spon1_type_EU'] = list_EU_sec_sponsor1_type
df_eu_us['same_sec_spon_type1'] = EUsec1_USprim_sponsor_type

df_eu_us['secondary_spon2_type_EU'] = list_EU_sec_sponsor2_type
df_eu_us['same_sec_spon_type2'] = EUsec2_USprim_sponsor_type

df_eu_us['secondary_spon3_type_EU'] = list_EU_sec_sponsor3_type
df_eu_us['same_sec_spon_type3'] = EUsec3_USprim_sponsor_type

In [26]:
df_eu_us[:2]

Unnamed: 0,id_EU,year_EU,long_title_EU,id_US,year_US,long_title_US,title_similarity,enrolment_EU,enrolment_US,enrolment_diff,...,sec_spon_similarity3,spon_type_US,primary_spon_type_EU,same_prim_spon_type,secondary_spon1_type_EU,same_sec_spon_type1,secondary_spon2_type_EU,same_sec_spon_type2,secondary_spon3_type_EU,same_sec_spon_type3
0,2004-000012-13,2004.0,"A Double-Blind, Placebo-Controlled, Parallel, ...",NCT00095342,,"A Double-Blind, Placebo-Controlled, Parallel, ...",0.875,360.0,390.0,30.0,...,,Commercial,Commercial,1.0,,,,,,
1,2004-000020-32,2005.0,The Efficacy and Safety of Irbesartan/HCTZ Com...,NCT00095394,2004.0,The Efficacy and Safety of Irbesartan/HCTZ Com...,0.817204,826.0,645.0,181.0,...,,Commercial,Commercial,1.0,,,,,,


In [29]:
len(df_eu_us)

9571

In [28]:
df_eu_us.to_excel('../_results/02-sponsors_enrolments/NEW_final_eu_us.xlsx')

### TEST

In [110]:
df_eu_us_SUB = df_eu_us_ltitles_years[0:50]
df_eu_us_SUB[0:2]

Unnamed: 0,id_EU,year_EU,long_title_EU,id_US,year_US,long_title_US,similarity
0,2004-000012-13,2004.0,"A Double-Blind, Placebo-Controlled, Parallel, ...",NCT00095342,,"A Double-Blind, Placebo-Controlled, Parallel, ...",0.875
1,2004-000020-32,2005.0,The Efficacy and Safety of Irbesartan/HCTZ Com...,NCT00095394,2004.0,The Efficacy and Safety of Irbesartan/HCTZ Com...,0.817204


In [130]:
list_sponsor_types(list_EU_sponsor_type, list_US_sponsor_type)[0:2]

[1, 1, 1, 1, 1]

In [104]:
list_sponsor_names(list_EU_sponsor_name, list_US_sponsor_name)[0:2]

[0.23529, 0.57576, 0.60606, 0.84, 0.84, 0.33333, 0.33333, 0.57576, 0.57576, 0.64286, 0.64, 0.54545, 0.71429, 0.71429, 1.0, 1.0, 0.85714, 0.60606, 0.23729, 0.66667, 0.71429, 0.8, 0.65455, 0.66667, 0.59091, 0.16393, 0.36111, 0.84, 0.66667, 0.66667, 0.71429, 0.71429, 0.71429, 0.71429, 0.68, 0.24762, 0.28571, 0.28571, 0.19355, 0.8, 0.8, 0.8, 0.14286, 0.52381, 0.52381, 0.52381, 0.71429, 0.71429, 0.71429, 0.52381]


In [92]:
list_enrolments(list_EU_enrol, list_US_enrol)[0:2]

[30.0, 181.0, 215.0, 619.0, 13.0, 165.0, 256.0, 0.0, 149.0, 35.0, 9.0, 285.0, 1.0, 44.0, 34.0, 0.0, 448.0, 'NaN', 1094.0, 38.0, 7537.0, 0.0, 0.0, 1.0, 35.0, 201.0, 269.0, 14.0, 'NaN', 284.0, 2.0, 3.0, 2.0, 3.0, 196.0, 175.0, 3.0, 0.0, 1.0, 322.0, 0.0, 0.0, 0.0, 441.0, 401.0, 53.0, 216.0, 176.0, 172.0, 79.0]


In [112]:
df_eu_us_SUB['spon_type_EU'] = list_EU_sponsor_type
df_eu_us_SUB['spon_type_US'] = list_US_sponsor_type
df_eu_us_SUB['same_spon_type'] = US_EU_sponsor_type

df_eu_us_SUB['sponsor_EU'] = list_EU_sponsor_name
df_eu_us_SUB['sponsor_US'] = list_US_sponsor_name
df_eu_us_SUB['spon_similarity'] = US_EU_sponsor_name

df_eu_us_SUB['enrolment_EU'] = list_EU_enrol
df_eu_us_SUB['enrolment_US'] = list_US_enrol
df_eu_us_SUB['enrolment_diff'] = US_EU_enrol

In [113]:
df_eu_us_SUB[0:2]

Unnamed: 0,id_EU,year_EU,long_title_EU,id_US,year_US,long_title_US,title_similarity,spon_type_EU,spon_type_US,same_spon_type,sponsor_EU,sponsor_US,spon_similarity,enrolment_EU,enrolment_US,enrolment_diff
0,2004-000012-13,2004.0,"A Double-Blind, Placebo-Controlled, Parallel, ...",NCT00095342,,"A Double-Blind, Placebo-Controlled, Parallel, ...",0.875,Commercial,Commercial,1,Wyeth Research Division of Wyeth Pharmaceutica...,Wyeth is now a wholly owned subsidiary of Pfizer,0.23529,360.0,390.0,30
1,2004-000020-32,2005.0,The Efficacy and Safety of Irbesartan/HCTZ Com...,NCT00095394,2004.0,The Efficacy and Safety of Irbesartan/HCTZ Com...,0.817204,Commercial,Commercial,1,Bristol Myers Squibb International Corporation,Bristol-Myers Squibb,0.57576,826.0,645.0,181


In [114]:
df_eu_us_SUB.to_excel('test_eu_us.xlsx')

## 02- Working for EU and ICTRP pairs

#### EU_ICTRP (ltitles_years) DATA

In [251]:
df_eu_ictrp_ltitles_years = pd.read_excel('../_results/01-years_ltitles/Excel/eu_ictrp_years_ltitles_list.xlsx')
df_eu_ictrp = df_eu_ictrp_ltitles_years.rename(index=str, columns={"similarity": "title_similarity"})
df_eu_ictrp[0:3]

Unnamed: 0,id_EU,year_EU,long_title_EU,id_ICTRP,year_ICTRP,long_title_ICTRP,title_similarity
0,2004-000629-32,2006.0,Multicentre international study of capecitabin...,ISRCTN45133151,2005.0,Multicentre international study of capecitabin...,1.0
1,2004-000675-34,2004.0,"A Prospective, Randomized, Controlled, Multi-c...",NTR217,2004.0,"A Prospective, Randomized, Controlled, Multi-c...",0.971429
2,2004-000905-24,2005.0,Cryotherapy versus salicylic acid for the trea...,ISRCTN18994246,2006.0,Cryotherapy versus salicylic acid for the trea...,0.994709


In [252]:
len(df_eu_ictrp)

903

In [253]:
#Create lists of enrolment, sponsor type & name for the pairs 
list_EU_prim_sponsor_type = []
list_EU_sec_sponsor1_type = []
list_EU_sec_sponsor2_type = []
list_EU_sec_sponsor3_type = []

list_ICTRP_sponsor_type = []

list_EU_prim_sponsor_name = []
list_EU_sec_sponsor1_name = []
list_EU_sec_sponsor2_name = []
list_EU_sec_sponsor3_name = []

list_ICTRP_sponsor_name = []

list_EU_enrol = []
list_ICTRP_enrol = []

for i in range(0,len(df_eu_ictrp)):
    id_EU = df_eu_ictrp.id_EU[i]
    id_ICTRP = df_eu_ictrp.id_ICTRP[i]
    
    for j in range(0,len(df_eu)): 
        if (id_EU == df_eu.trial_id[j]) : 
            
            
            list_EU_prim_sponsor_name.append(df_eu.primary_spons[j])
            list_EU_prim_sponsor_type.append(df_eu.prim_sponsor_type[j])
            
            list_EU_sec_sponsor1_name.append(df_eu.secondary_spons1[j])
            list_EU_sec_sponsor1_type.append(df_eu.sec_sponsor_type1[j])
            
            list_EU_sec_sponsor2_name.append(df_eu.secondary_spons2[j])
            list_EU_sec_sponsor2_type.append(df_eu.sec_sponsor_type2[j])
            
            list_EU_sec_sponsor3_name.append(df_eu.secondary_spons3[j])
            list_EU_sec_sponsor3_type.append(df_eu.sec_sponsor_type3[j])
            
            list_EU_enrol.append(df_eu.enrolment[j])
            
            print(i,"id_EU",id_EU)
            
    for k in range(0,len(df_ictrp)):
        if (id_ICTRP == df_ictrp.trial_id[k]) :
            
            list_ICTRP_sponsor_type.append(df_ictrp.sponsor_type[k])
            list_ICTRP_sponsor_name.append(df_ictrp.primary_spons[k])
            list_ICTRP_enrol.append(df_ictrp.enrolment[k])
            
            print(i,"id_ICTRP:",id_ICTRP)
            print('-----------------------')


0 id_EU 2004-000629-32
0 id_ICTRP: ISRCTN45133151
-----------------------
1 id_EU 2004-000675-34
1 id_ICTRP: NTR217
-----------------------
2 id_EU 2004-000905-24
2 id_ICTRP: ISRCTN18994246
-----------------------
3 id_EU 2004-001773-26
3 id_ICTRP: ACTRN12605000644662
-----------------------
4 id_EU 2004-001786-18
4 id_ICTRP: NTR355
-----------------------
5 id_EU 2004-002547-27
5 id_ICTRP: ISRCTN22471573
-----------------------
6 id_EU 2004-002799-41
6 id_ICTRP: ISRCTN84211155
-----------------------
7 id_EU 2004-002902-31
7 id_ICTRP: ACTRN12605000324617
-----------------------
8 id_EU 2004-002904-13
8 id_ICTRP: ACTRN12605000324617
-----------------------
9 id_EU 2004-002914-12
9 id_ICTRP: ACTRN12605000323628
-----------------------
10 id_EU 2004-003753-56
10 id_ICTRP: ISRCTN95698259
-----------------------
11 id_EU 2004-003797-28
11 id_ICTRP: ISRCTN78483537
-----------------------
12 id_EU 2004-003939-32
12 id_ICTRP: ISRCTN82088636
-----------------------
13 id_EU 2004-004029-87
13 i

108 id_ICTRP: ACTRN12607000371493
-----------------------
109 id_EU 2006-005007-34
109 id_ICTRP: NTR949
-----------------------
110 id_EU 2006-005044-86
110 id_ICTRP: NTR775
-----------------------
111 id_EU 2006-005174-42
111 id_ICTRP: NTR1014
-----------------------
112 id_EU 2006-005399-42
112 id_ICTRP: ISRCTN40898239
-----------------------
113 id_EU 2006-005602-31
113 id_ICTRP: NTR797
-----------------------
114 id_EU 2006-005715-10
114 id_ICTRP: ISRCTN61735247
-----------------------
115 id_EU 2006-005772-41
115 id_ICTRP: NTR929
-----------------------
116 id_EU 2006-005796-18
116 id_ICTRP: NTR1383
-----------------------
117 id_EU 2006-005800-15
117 id_ICTRP: ISRCTN89489788
-----------------------
118 id_EU 2006-005802-29
118 id_ICTRP: ISRCTN53794597
-----------------------
119 id_EU 2006-005889-40
119 id_ICTRP: NTR1013
-----------------------
120 id_EU 2006-006032-22
120 id_ICTRP: ISRCTN41829447
-----------------------
121 id_EU 2006-006058-83
121 id_ICTRP: NTR1348
------------

217 id_ICTRP: NTR1273
-----------------------
218 id_EU 2008-001568-35
218 id_ICTRP: ISRCTN07083722
-----------------------
219 id_EU 2008-001579-32
219 id_ICTRP: ISRCTN66772971
-----------------------
220 id_EU 2008-001597-33
220 id_ICTRP: NTR1563
-----------------------
221 id_EU 2008-001677-15
221 id_ICTRP: ISRCTN81349394
-----------------------
222 id_EU 2008-001786-28
222 id_ICTRP: NTR1949
-----------------------
223 id_EU 2008-001827-76
223 id_ICTRP: NTR2134
-----------------------
224 id_EU 2008-001843-20
224 id_ICTRP: ISRCTN02628492
-----------------------
225 id_EU 2008-001968-36
225 id_ICTRP: ISRCTN22663589
-----------------------
226 id_EU 2008-002003-75
226 id_ICTRP: NTR1342
-----------------------
227 id_EU 2008-002110-22
227 id_ICTRP: NTR1432
-----------------------
228 id_EU 2008-002110-22
228 id_ICTRP: NTR1448
-----------------------
229 id_EU 2008-002244-42
229 id_ICTRP: ISRCTN29378424
-----------------------
230 id_EU 2008-002378-37
230 id_ICTRP: ISRCTN64117538
------

327 id_ICTRP: JPRN-JapicCTI-101064
-----------------------
328 id_EU 2009-013701-34
328 id_ICTRP: ISRCTN72824329
-----------------------
329 id_EU 2009-013817-93
329 id_ICTRP: ISRCTN96095682
-----------------------
330 id_EU 2009-013897-42
330 id_ICTRP: ISRCTN88782125
-----------------------
331 id_EU 2009-014037-25
331 id_ICTRP: NTR1881
-----------------------
332 id_EU 2009-014054-14
332 id_ICTRP: ISRCTN31806847
-----------------------
333 id_EU 2009-014315-12
333 id_ICTRP: NTR2316
-----------------------
334 id_EU 2009-014402-33
334 id_ICTRP: NTR1903
-----------------------
335 id_EU 2009-014455-68
335 id_ICTRP: NTR2477
-----------------------
336 id_EU 2009-014523-23
336 id_ICTRP: NTR2205
-----------------------
337 id_EU 2009-014593-18
337 id_ICTRP: CTRI/2014/08/004922
-----------------------
338 id_EU 2009-014770-17
338 id_ICTRP: NTR2070
-----------------------
339 id_EU 2009-014886-21
339 id_ICTRP: NTR2170
-----------------------
340 id_EU 2009-014907-29
340 id_ICTRP: ISRCTN0774

436 id_EU 2010-023175-26
436 id_ICTRP: NTR2716
-----------------------
437 id_EU 2010-023185-42
437 id_ICTRP: NTR2834
-----------------------
438 id_EU 2010-023254-36
438 id_ICTRP: ISRCTN51790058
-----------------------
439 id_EU 2010-023436-16
439 id_ICTRP: NTR2929
-----------------------
440 id_EU 2010-023452-87
440 id_ICTRP: PER-063-11
-----------------------
441 id_EU 2010-023453-11
441 id_ICTRP: PER-063-11
-----------------------
442 id_EU 2010-023469-22
442 id_ICTRP: NTR2577
-----------------------
443 id_EU 2010-023471-26
443 id_ICTRP: CTRI/2011/05/001740
-----------------------
444 id_EU 2010-023507-95
444 id_ICTRP: NTR2617
-----------------------
445 id_EU 2010-023625-38
445 id_ICTRP: ISRCTN81045654
-----------------------
446 id_EU 2010-023636-17
446 id_ICTRP: NTR3359
-----------------------
447 id_EU 2010-023654-37
447 id_ICTRP: NTR3103
-----------------------
448 id_EU 2010-023777-19
448 id_ICTRP: NTR2768
-----------------------
449 id_EU 2010-023969-21
449 id_ICTRP: ISRCTN

544 id_EU 2012-000333-40
544 id_ICTRP: NTR3690
-----------------------
545 id_EU 2012-000344-10
545 id_ICTRP: ISRCTN37666216
-----------------------
546 id_EU 2012-000345-12
546 id_ICTRP: NTR3362
-----------------------
547 id_EU 2012-000406-29
547 id_ICTRP: NTR3473
-----------------------
548 id_EU 2012-000413-36
548 id_ICTRP: ISRCTN79038750
-----------------------
549 id_EU 2012-000425-45
549 id_ICTRP: NTR3526
-----------------------
550 id_EU 2012-000432-26
550 id_ICTRP: NTR4722
-----------------------
551 id_EU 2012-000438-21
551 id_ICTRP: ISRCTN65844716
-----------------------
552 id_EU 2012-000654-55
552 id_ICTRP: ISRCTN80658312
-----------------------
553 id_EU 2012-000671-16
553 id_ICTRP: PER-002-13
-----------------------
554 id_EU 2012-000712-29
554 id_ICTRP: ISRCTN25212012
-----------------------
555 id_EU 2012-000747-26
555 id_ICTRP: ISRCTN44502774
-----------------------
556 id_EU 2012-000793-30
556 id_ICTRP: NTR4720
-----------------------
557 id_EU 2012-000850-75
557 id_

650 id_EU 2013-001823-38
650 id_ICTRP: NTR4430
-----------------------
651 id_EU 2013-001864-50
651 id_ICTRP: NTR4089
-----------------------
652 id_EU 2013-001939-47
652 id_ICTRP: PER-031-14
-----------------------
653 id_EU 2013-002023-41
653 id_ICTRP: NTR4583
-----------------------
654 id_EU 2013-002144-82
654 id_ICTRP: NTR4288
-----------------------
655 id_EU 2013-002166-39
655 id_ICTRP: NTR4460
-----------------------
656 id_EU 2013-002260-10
656 id_ICTRP: NTR4000
-----------------------
657 id_EU 2013-002274-41
657 id_ICTRP: NTR4001
-----------------------
658 id_EU 2013-002512-27
658 id_ICTRP: NTR4253
-----------------------
659 id_EU 2013-002563-25
659 id_ICTRP: NTR4440
-----------------------
660 id_EU 2013-002616-28
660 id_ICTRP: NTR4654
-----------------------
661 id_EU 2013-002663-25
661 id_ICTRP: NTR4031
-----------------------
662 id_EU 2013-002722-23
662 id_ICTRP: NTR3976
-----------------------
663 id_EU 2013-002769-20
663 id_ICTRP: NTR4021
-----------------------
664

761 id_ICTRP: NTR5509
-----------------------
762 id_EU 2014-003932-38
762 id_ICTRP: NTR5820
-----------------------
763 id_EU 2014-004094-17
763 id_ICTRP: NTR5100
-----------------------
764 id_EU 2014-004265-25
764 id_ICTRP: NTR5221
-----------------------
765 id_EU 2014-004432-18
765 id_ICTRP: NTR5115
-----------------------
766 id_EU 2014-004445-26
766 id_ICTRP: ACTRN12614000870651
-----------------------
767 id_EU 2014-004621-40
767 id_ICTRP: NTR4906
-----------------------
768 id_EU 2014-004735-39
768 id_ICTRP: NTR5041
-----------------------
769 id_EU 2014-004739-38
769 id_ICTRP: NTR5106
-----------------------
770 id_EU 2014-005368-13
770 id_ICTRP: CTRI/2017/06/008758
-----------------------
771 id_EU 2014-005375-91
771 id_ICTRP: CTRI/2017/07/009081
-----------------------
772 id_EU 2014-005596-90
772 id_ICTRP: NTR5881
-----------------------
773 id_EU 2014-005639-15
773 id_ICTRP: CTRI/2016/05/006966
-----------------------
774 id_EU 2014-005701-20
774 id_ICTRP: NTR5445
-------

869 id_ICTRP: NTR6709
-----------------------
870 id_EU 2016-003321-42
870 id_ICTRP: NTR6417
-----------------------
871 id_EU 2016-003410-28
871 id_ICTRP: CTRI/2017/09/009732
-----------------------
872 id_EU 2016-003428-21
872 id_ICTRP: NTR6128
-----------------------
873 id_EU 2016-003469-24
873 id_ICTRP: CTRI/2017/11/010582
-----------------------
874 id_EU 2016-003643-10
874 id_ICTRP: NTR6567
-----------------------
875 id_EU 2016-003739-40
875 id_ICTRP: NTR6252
-----------------------
876 id_EU 2016-003818-26
876 id_ICTRP: CTRI/2017/07/009172
-----------------------
877 id_EU 2016-003957-14
877 id_ICTRP: CTRI/2017/11/010561
-----------------------
878 id_EU 2016-004008-71
878 id_ICTRP: NTR6149
-----------------------
879 id_EU 2016-004303-32
879 id_ICTRP: NTR6425
-----------------------
880 id_EU 2016-004433-24
880 id_ICTRP: NTR6591
-----------------------
881 id_EU 2016-004486-37
881 id_ICTRP: NTR6449
-----------------------
882 id_EU 2016-004574-17
882 id_ICTRP: PER-045-11
----

In [254]:
#Sponsor Names
EUprim_ICTRPprim_sponsor_name = list_sponsor_names(list_EU_prim_sponsor_name, list_ICTRP_sponsor_name)
EUsec1_ICTRPprim_sponsor_name = list_sponsor_names(list_EU_sec_sponsor1_name, list_ICTRP_sponsor_name)
EUsec2_ICTRPprim_sponsor_name = list_sponsor_names(list_EU_sec_sponsor2_name, list_ICTRP_sponsor_name)
EUsec3_ICTRPprim_sponsor_name = list_sponsor_names(list_EU_sec_sponsor3_name, list_ICTRP_sponsor_name)

EUprim_ICTRPprim_sponsor_name[0:2]

[0.88889, 0.65347]

In [255]:
#Sponsor Types
EUprim_ICTRPprim_sponsor_type = list_sponsor_types(list_EU_prim_sponsor_type, list_ICTRP_sponsor_type)
EUsec1_ICTRPprim_sponsor_type = list_sponsor_types(list_EU_sec_sponsor1_type, list_ICTRP_sponsor_type)
EUsec2_ICTRPprim_sponsor_type = list_sponsor_types(list_EU_sec_sponsor2_type, list_ICTRP_sponsor_type)
EUsec3_ICTRPprim_sponsor_type = list_sponsor_types(list_EU_sec_sponsor3_type, list_ICTRP_sponsor_type)

EUsec2_ICTRPprim_sponsor_type[0:2]

[nan, nan]

In [256]:
#Enrolments
EU_ICTRP_enrol = list_enrolments(list_EU_enrol, list_ICTRP_enrol)
EU_ICTRP_enrol[0:2]

[0.0, 0.0]

In [257]:
#Save the final dataframe

df_eu_ictrp['enrolment_EU'] = list_EU_enrol
df_eu_ictrp['enrolment_ICTRP'] = list_ICTRP_enrol
df_eu_ictrp['enrolment_diff'] = EU_ICTRP_enrol


df_eu_ictrp['spon_ICTRP'] = list_ICTRP_sponsor_name

df_eu_ictrp['primary_spon_EU'] = list_EU_prim_sponsor_name
df_eu_ictrp['prim_spon_similarity'] = EUprim_ICTRPprim_sponsor_name

df_eu_ictrp['secondary_spon1_EU'] = list_EU_sec_sponsor1_name
df_eu_ictrp['sec_spon_similarity1'] = EUsec1_ICTRPprim_sponsor_name

df_eu_ictrp['secondary_spon2_EU'] = list_EU_sec_sponsor2_name
df_eu_ictrp['sec_spon_similarity2'] = EUsec2_ICTRPprim_sponsor_name

df_eu_ictrp['secondary_spon3_EU'] = list_EU_sec_sponsor3_name
df_eu_ictrp['sec_spon_similarity3'] = EUsec3_ICTRPprim_sponsor_name



df_eu_ictrp['spon_type_ICTRP'] = list_ICTRP_sponsor_type

df_eu_ictrp['primary_spon_type_EU'] = list_EU_prim_sponsor_type
df_eu_ictrp['same_prim_spon_type'] = EUprim_ICTRPprim_sponsor_type

df_eu_ictrp['secondary_spon1_type_EU'] = list_EU_sec_sponsor1_type
df_eu_ictrp['same_sec_spon_type1'] = EUsec1_ICTRPprim_sponsor_type

df_eu_ictrp['secondary_spon2_type_EU'] = list_EU_sec_sponsor2_type
df_eu_ictrp['same_sec_spon_type2'] = EUsec2_ICTRPprim_sponsor_type

df_eu_ictrp['secondary_spon3_type_EU'] = list_EU_sec_sponsor3_type
df_eu_ictrp['same_sec_spon_type3'] = EUsec3_ICTRPprim_sponsor_type


In [258]:
df_eu_ictrp[:2]

Unnamed: 0,id_EU,year_EU,long_title_EU,id_ICTRP,year_ICTRP,long_title_ICTRP,title_similarity,enrolment_EU,enrolment_ICTRP,enrolment_diff,...,sec_spon_similarity3,spon_type_ICTRP,primary_spon_type_EU,same_prim_spon_type,secondary_spon1_type_EU,same_sec_spon_type1,secondary_spon2_type_EU,same_sec_spon_type2,secondary_spon3_type_EU,same_sec_spon_type3
0,2004-000629-32,2006.0,Multicentre international study of capecitabin...,ISRCTN45133151,2005.0,Multicentre international study of capecitabin...,1.0,2240.0,2240.0,0.0,...,,Non-Commercial,Non-Commercial,1.0,,,,,,
1,2004-000675-34,2004.0,"A Prospective, Randomized, Controlled, Multi-c...",NTR217,2004.0,"A Prospective, Randomized, Controlled, Multi-c...",0.971429,150.0,150.0,0.0,...,,Non-Commercial,Non-Commercial,1.0,,,,,,


In [30]:
len(df_eu_ictrp)

NameError: name 'df_eu_ictrp' is not defined

In [259]:
df_eu_ictrp.to_excel('../_results/02-sponsors_enrolments/final_eu_ictrp.xlsx')

## 03- Working for ICTRP and US pairs

#### ICTRP_US (ltitles_years) DATA

In [139]:
df_ictrp_us_ltitles_years = pd.read_excel('../_results/01-years_ltitles/Excel/ictrp_us_years_ltitles_list.xlsx')
df_ictrp_us = df_ictrp_us_ltitles_years.rename(index=str, columns={"similarity": "title_similarity"})
df_ictrp_us[0:3]

Unnamed: 0,id_ICTRP,year_ICTRP,long_title_ICTRP,id_US,year_US,long_title_US,title_similarity
0,ACTRN12605000111673,2004.0,HPV DNA testing versus conventional management...,NCT00119509,2004.0,HPV DNA Testing Versus Conventional Management...,0.908571
1,ACTRN12605000116628,2005.0,The effect of 6 month high intensity progressi...,NCT00465660,2005.0,The Effect of 6 Month High Intensity Progressi...,0.900709
2,ACTRN12605000132640,2006.0,"A multi-centre, open-label, randomised study t...",NCT00126308,2005.0,"A Multi-Centre, Open-Label, Randomised Study t...",0.874317


In [140]:
len(df_ictrp_us)

2077

In [142]:
#Create lists of enrolment, sponsor type & name for the pairs 
list_US_sponsor_type = []
list_ICTRP_sponsor_type = []
list_US_sponsor_name = []
list_ICTRP_sponsor_name = []
list_US_enrol = []
list_ICTRP_enrol = []

for i in range(0,len(df_ictrp_us)):
    id_US = df_ictrp_us.id_US[i]
    id_ICTRP = df_ictrp_us.id_ICTRP[i]
    
    for j in range(0,len(df_us)): 
        if (id_US == df_us.trial_id[j]) : 
            
            list_US_sponsor_type.append(df_us.sponsor_type[j])
            list_US_sponsor_name.append(df_us.primary_spons[j])
            list_US_enrol.append(df_us.enrolment[j])
            
            print(i,"id_US",id_US)
            
    for k in range(0,len(df_ictrp)):
        if (id_ICTRP == df_ictrp.trial_id[k]) :
            
            list_ICTRP_sponsor_type.append(df_ictrp.sponsor_type[k])
            list_ICTRP_sponsor_name.append(df_ictrp.primary_spons[k])
            list_ICTRP_enrol.append(df_ictrp.enrolment[k])
            
            print(i,"id_ICTRP:",id_ICTRP)
            print('-----------------------')


0 id_US NCT00119509
0 id_ICTRP: ACTRN12605000111673
-----------------------
1 id_US NCT00465660
1 id_ICTRP: ACTRN12605000116628
-----------------------
2 id_US NCT00126308
2 id_ICTRP: ACTRN12605000132640
-----------------------
3 id_US NCT00273260
3 id_ICTRP: ACTRN12605000139673
-----------------------
4 id_US NCT00763256
4 id_ICTRP: ACTRN12605000260628
-----------------------
5 id_US NCT00139048
5 id_ICTRP: ACTRN12605000293662
-----------------------
6 id_US NCT00226876
6 id_ICTRP: ACTRN12605000307606
-----------------------
7 id_US NCT00124020
7 id_ICTRP: ACTRN12605000324617
-----------------------
8 id_US NCT00107952
8 id_ICTRP: ACTRN12605000324617
-----------------------
9 id_US NCT00118521
9 id_ICTRP: ACTRN12605000463673
-----------------------
10 id_US NCT00456482
10 id_ICTRP: ACTRN12605000485639
-----------------------
11 id_US NCT00468871
11 id_ICTRP: ACTRN12605000485639
-----------------------
12 id_US NCT00192647
12 id_ICTRP: ACTRN12605000488606
-----------------------
13 id_

105 id_ICTRP: ACTRN12611000111976
-----------------------
106 id_US NCT01472575
106 id_ICTRP: ACTRN12611000559910
-----------------------
107 id_US NCT01247350
107 id_ICTRP: ACTRN12611000568910
-----------------------
108 id_US NCT01573182
108 id_ICTRP: ACTRN12611000640909
-----------------------
109 id_US NCT01495832
109 id_ICTRP: ACTRN12611000642987
-----------------------
110 id_US NCT01421108
110 id_ICTRP: ACTRN12611000885998
-----------------------
111 id_US NCT01671787
111 id_ICTRP: ACTRN12611001206910
-----------------------
112 id_US NCT01791387
112 id_ICTRP: ACTRN12612000140853
-----------------------
113 id_US NCT01554085
113 id_ICTRP: ACTRN12612000176864
-----------------------
114 id_US NCT01590407
114 id_ICTRP: ACTRN12612000176864
-----------------------
115 id_US NCT01793610
115 id_ICTRP: ACTRN12612000219886
-----------------------
116 id_US NCT01689740
116 id_ICTRP: ACTRN12612000219886
-----------------------
117 id_US NCT01708720
117 id_ICTRP: ACTRN12612000465853
------

208 id_US NCT02366364
208 id_ICTRP: ACTRN12616000856415
-----------------------
209 id_US NCT03402178
209 id_ICTRP: ACTRN12616000856415
-----------------------
210 id_US NCT03054194
210 id_ICTRP: ACTRN12616000856415
-----------------------
211 id_US NCT02862548
211 id_ICTRP: ACTRN12616000898459
-----------------------
212 id_US NCT02205424
212 id_ICTRP: ACTRN12616000970448
-----------------------
213 id_US NCT02353494
213 id_ICTRP: ACTRN12616000978460
-----------------------
214 id_US NCT02971423
214 id_ICTRP: ACTRN12616000995471
-----------------------
215 id_US NCT02528903
215 id_ICTRP: ACTRN12616001040459
-----------------------
216 id_US NCT02538874
216 id_ICTRP: ACTRN12616001040459
-----------------------
217 id_US NCT03335553
217 id_ICTRP: ACTRN12616001040459
-----------------------
218 id_US NCT03276728
218 id_ICTRP: ACTRN12616001040459
-----------------------
219 id_US NCT03199313
219 id_ICTRP: ACTRN12616001040459
-----------------------
220 id_US NCT02790125
220 id_ICTRP: ACTR

311 id_US NCT02884726
311 id_ICTRP: ACTRN12617001641381
-----------------------
312 id_US NCT00680940
312 id_ICTRP: CTRI/2007/091/000037
-----------------------
313 id_US NCT00547404
313 id_ICTRP: CTRI/2008/091/000004
-----------------------
314 id_US NCT00882063
314 id_ICTRP: CTRI/2008/091/000004
-----------------------
315 id_US NCT00609817
315 id_ICTRP: CTRI/2008/091/000004
-----------------------
316 id_US NCT00652613
316 id_ICTRP: CTRI/2008/091/000045
-----------------------
317 id_US NCT02425670
317 id_ICTRP: CTRI/2008/091/000046
-----------------------
318 id_US NCT00666133
318 id_ICTRP: CTRI/2008/091/000118
-----------------------
319 id_US NCT00840190
319 id_ICTRP: CTRI/2008/091/000236
-----------------------
320 id_US NCT00772876
320 id_ICTRP: CTRI/2008/091/000236
-----------------------
321 id_US NCT01117337
321 id_ICTRP: CTRI/2009/091/000020
-----------------------
322 id_US NCT00899054
322 id_ICTRP: CTRI/2009/091/000115
-----------------------
323 id_US NCT00783718
323 id_

413 id_US NCT01644448
413 id_ICTRP: CTRI/2012/04/002549
-----------------------
414 id_US NCT01644435
414 id_ICTRP: CTRI/2012/04/002549
-----------------------
415 id_US NCT01643629
415 id_ICTRP: CTRI/2012/04/002549
-----------------------
416 id_US NCT02612974
416 id_ICTRP: CTRI/2012/04/002564
-----------------------
417 id_US NCT01668862
417 id_ICTRP: CTRI/2012/04/002611
-----------------------
418 id_US NCT01644435
418 id_ICTRP: CTRI/2012/04/002611
-----------------------
419 id_US NCT01643629
419 id_ICTRP: CTRI/2012/04/002611
-----------------------
420 id_US NCT01643629
420 id_ICTRP: CTRI/2012/04/002613
-----------------------
421 id_US NCT01668862
421 id_ICTRP: CTRI/2012/04/002614
-----------------------
422 id_US NCT01644435
422 id_ICTRP: CTRI/2012/04/002614
-----------------------
423 id_US NCT01643629
423 id_ICTRP: CTRI/2012/04/002614
-----------------------
424 id_US NCT01587144
424 id_ICTRP: CTRI/2012/04/002616
-----------------------
425 id_US NCT01309945
425 id_ICTRP: CTRI

516 id_US NCT01865656
516 id_ICTRP: CTRI/2014/12/005275
-----------------------
517 id_US NCT02394808
517 id_ICTRP: CTRI/2014/12/005328
-----------------------
518 id_US NCT02746991
518 id_ICTRP: CTRI/2014/12/005337
-----------------------
519 id_US NCT02438982
519 id_ICTRP: CTRI/2015/01/005364
-----------------------
520 id_US NCT02382575
520 id_ICTRP: CTRI/2015/01/005364
-----------------------
521 id_US NCT02595814
521 id_ICTRP: CTRI/2015/01/005389
-----------------------
522 id_US NCT02236793
522 id_ICTRP: CTRI/2015/01/005410
-----------------------
523 id_US NCT01774019
523 id_ICTRP: CTRI/2015/01/005437
-----------------------
524 id_US NCT02587286
524 id_ICTRP: CTRI/2015/02/005560
-----------------------
525 id_US NCT01884337
525 id_ICTRP: CTRI/2015/03/005598
-----------------------
526 id_US NCT02511379
526 id_ICTRP: CTRI/2015/03/005658
-----------------------
527 id_US NCT02380248
527 id_ICTRP: CTRI/2015/03/005658
-----------------------
528 id_US NCT01377844
528 id_ICTRP: CTRI

619 id_US NCT02906930
619 id_ICTRP: CTRI/2017/06/008830
-----------------------
620 id_US NCT03405064
620 id_ICTRP: CTRI/2017/06/008843
-----------------------
621 id_US NCT03198260
621 id_ICTRP: CTRI/2017/06/008875
-----------------------
622 id_US NCT03054987
622 id_ICTRP: CTRI/2017/06/008890
-----------------------
623 id_US NCT03195920
623 id_ICTRP: CTRI/2017/07/008974
-----------------------
624 id_US NCT02542865
624 id_ICTRP: CTRI/2017/07/008979
-----------------------
625 id_US NCT03123653
625 id_ICTRP: CTRI/2017/07/008991
-----------------------
626 id_US NCT03145168
626 id_ICTRP: CTRI/2017/07/009001
-----------------------
627 id_US NCT03130114
627 id_ICTRP: CTRI/2017/07/009017
-----------------------
628 id_US NCT03136484
628 id_ICTRP: CTRI/2017/07/009081
-----------------------
629 id_US NCT02648204
629 id_ICTRP: CTRI/2017/07/009081
-----------------------
630 id_US NCT03155269
630 id_ICTRP: CTRI/2017/07/009084
-----------------------
631 id_US NCT03219710
631 id_ICTRP: CTRI

722 id_US NCT03120754
722 id_ICTRP: ChiCTR-IOR-17011079
-----------------------
723 id_US NCT03218657
723 id_ICTRP: ChiCTR-IOR-17011653
-----------------------
724 id_US NCT03216993
724 id_ICTRP: ChiCTR-IOR-17011914
-----------------------
725 id_US NCT03262090
725 id_ICTRP: ChiCTR-IOR-17012415
-----------------------
726 id_US NCT03258749
726 id_ICTRP: ChiCTR-IOR-17012417
-----------------------
727 id_US NCT03295708
727 id_ICTRP: ChiCTR-IOR-17012697
-----------------------
728 id_US NCT03376152
728 id_ICTRP: ChiCTR-IOR-17013373
-----------------------
729 id_US NCT02409745
729 id_ICTRP: ChiCTR-IPC-14005419
-----------------------
730 id_US NCT02348229
730 id_ICTRP: ChiCTR-IPC-15005778
-----------------------
731 id_US NCT02644876
731 id_ICTRP: ChiCTR-IPC-15006603
-----------------------
732 id_US NCT02562183
732 id_ICTRP: ChiCTR-IPC-15007039
-----------------------
733 id_US NCT02806128
733 id_ICTRP: ChiCTR-IPD-16008508
-----------------------
734 id_US NCT02331745
734 id_ICTRP: ChiC

825 id_US NCT02313415
825 id_ICTRP: ChiCTR-OPC-14005553
-----------------------
826 id_US NCT02559830
826 id_ICTRP: ChiCTR-OPC-15005802
-----------------------
827 id_US NCT02126228
827 id_ICTRP: ChiCTR-OPC-15005817
-----------------------
828 id_US NCT02073058
828 id_ICTRP: ChiCTR-OPC-15006147
-----------------------
829 id_US NCT02520570
829 id_ICTRP: ChiCTR-OPC-15006751
-----------------------
830 id_US NCT02506634
830 id_ICTRP: ChiCTR-OPC-15006780
-----------------------
831 id_US NCT02868060
831 id_ICTRP: ChiCTR-OPC-15007012
-----------------------
832 id_US NCT02631109
832 id_ICTRP: ChiCTR-OPC-15007481
-----------------------
833 id_US NCT03045471
833 id_ICTRP: ChiCTR-OPC-16010056
-----------------------
834 id_US NCT03181815
834 id_ICTRP: ChiCTR-OPC-16010166
-----------------------
835 id_US NCT03029013
835 id_ICTRP: ChiCTR-OPC-17010463
-----------------------
836 id_US NCT02754570
836 id_ICTRP: ChiCTR-OPC-17013229
-----------------------
837 id_US NCT03380039
837 id_ICTRP: ChiC

928 id_ICTRP: DRKS00003341
-----------------------
929 id_US NCT01522794
929 id_ICTRP: DRKS00003478
-----------------------
930 id_US NCT01566955
930 id_ICTRP: DRKS00003582
-----------------------
931 id_US NCT02356770
931 id_ICTRP: DRKS00003586
-----------------------
932 id_US NCT01609153
932 id_ICTRP: DRKS00003603
-----------------------
933 id_US NCT00072462
933 id_ICTRP: DRKS00003649
-----------------------
934 id_US NCT00749723
934 id_ICTRP: DRKS00003778
-----------------------
935 id_US NCT00020566
935 id_ICTRP: DRKS00003782
-----------------------
936 id_US NCT00895674
936 id_ICTRP: DRKS00004113
-----------------------
937 id_US NCT01879774
937 id_ICTRP: DRKS00004278
-----------------------
938 id_US NCT01702545
938 id_ICTRP: DRKS00004392
-----------------------
939 id_US NCT01639014
939 id_ICTRP: DRKS00004394
-----------------------
940 id_US NCT00287105
940 id_ICTRP: DRKS00005309
-----------------------
941 id_US NCT00047112
941 id_ICTRP: DRKS00005547
-----------------------


1035 id_US NCT02136329
1035 id_ICTRP: ISRCTN06134609
-----------------------
1036 id_US NCT00978315
1036 id_ICTRP: ISRCTN07270894
-----------------------
1037 id_US NCT01790984
1037 id_ICTRP: ISRCTN09001687
-----------------------
1038 id_US NCT01954693
1038 id_ICTRP: ISRCTN09739757
-----------------------
1039 id_US NCT02980367
1039 id_ICTRP: ISRCTN10110685
-----------------------
1040 id_US NCT02696174
1040 id_ICTRP: ISRCTN10261528
-----------------------
1041 id_US NCT01808898
1041 id_ICTRP: ISRCTN10687405
-----------------------
1042 id_US NCT00555386
1042 id_ICTRP: ISRCTN11023622
-----------------------
1043 id_US NCT00210353
1043 id_ICTRP: ISRCTN11144129
-----------------------
1044 id_US NCT02721498
1044 id_ICTRP: ISRCTN11220163
-----------------------
1045 id_US NCT02894008
1045 id_ICTRP: ISRCTN11285604
-----------------------
1046 id_US NCT03170622
1046 id_ICTRP: ISRCTN11314352
-----------------------
1047 id_US NCT03052036
1047 id_ICTRP: ISRCTN11343602
-----------------------

1142 id_US NCT02139904
1142 id_ICTRP: ISRCTN44518069
-----------------------
1143 id_US NCT02687087
1143 id_ICTRP: ISRCTN44528835
-----------------------
1144 id_US NCT02539992
1144 id_ICTRP: ISRCTN44765636
-----------------------
1145 id_US NCT00103116
1145 id_ICTRP: ISRCTN45563569
-----------------------
1146 id_US NCT02835391
1146 id_ICTRP: ISRCTN45856014
-----------------------
1147 id_US NCT02759783
1147 id_ICTRP: ISRCTN45961438
-----------------------
1148 id_US NCT01603407
1148 id_ICTRP: ISRCTN46102316
-----------------------
1149 id_US NCT02197299
1149 id_ICTRP: ISRCTN46349186
-----------------------
1150 id_US NCT01999998
1150 id_ICTRP: ISRCTN46587767
-----------------------
1151 id_US NCT02059655
1151 id_ICTRP: ISRCTN46696624
-----------------------
1152 id_US NCT01181856
1152 id_ICTRP: ISRCTN46804531
-----------------------
1153 id_US NCT01557153
1153 id_ICTRP: ISRCTN46911260
-----------------------
1154 id_US NCT03278847
1154 id_ICTRP: ISRCTN47404296
-----------------------

1249 id_US NCT01444274
1249 id_ICTRP: ISRCTN99031557
-----------------------
1250 id_US NCT01288456
1250 id_ICTRP: ISRCTN99031557
-----------------------
1251 id_US NCT01171339
1251 id_ICTRP: ISRCTN99526053
-----------------------
1252 id_US NCT01171339
1252 id_ICTRP: ISRCTN99691973
-----------------------
1253 id_US NCT00969800
1253 id_ICTRP: JPRN-JMA-IIA00033
-----------------------
1254 id_US NCT02835742
1254 id_ICTRP: JPRN-JMA-IIA00205
-----------------------
1255 id_US NCT02471586
1255 id_ICTRP: JPRN-JMA-IIA00211
-----------------------
1256 id_US NCT00144495
1256 id_ICTRP: JPRN-JapicCTI-050012
-----------------------
1257 id_US NCT00144482
1257 id_ICTRP: JPRN-JapicCTI-050013
-----------------------
1258 id_US NCT00229177
1258 id_ICTRP: JPRN-JapicCTI-050098
-----------------------
1259 id_US NCT00530075
1259 id_ICTRP: JPRN-JapicCTI-050109
-----------------------
1260 id_US NCT00691028
1260 id_ICTRP: JPRN-JapicCTI-050146
-----------------------
1261 id_US NCT00346047
1261 id_ICTRP:

1348 id_ICTRP: JPRN-JapicCTI-101354
-----------------------
1349 id_US NCT01251796
1349 id_ICTRP: JPRN-JapicCTI-101355
-----------------------
1350 id_US NCT01069757
1350 id_ICTRP: JPRN-JapicCTI-101355
-----------------------
1351 id_US NCT01436084
1351 id_ICTRP: JPRN-JapicCTI-101374
-----------------------
1352 id_US NCT00417391
1352 id_ICTRP: JPRN-JapicCTI-111460
-----------------------
1353 id_US NCT01326689
1353 id_ICTRP: JPRN-JapicCTI-111462
-----------------------
1354 id_US NCT01231347
1354 id_ICTRP: JPRN-JapicCTI-111468
-----------------------
1355 id_US NCT01344876
1355 id_ICTRP: JPRN-JapicCTI-111478
-----------------------
1356 id_US NCT01290471
1356 id_ICTRP: JPRN-JapicCTI-111484
-----------------------
1357 id_US NCT01088464
1357 id_ICTRP: JPRN-JapicCTI-111490
-----------------------
1358 id_US NCT00778128
1358 id_ICTRP: JPRN-JapicCTI-111490
-----------------------
1359 id_US NCT01005355
1359 id_ICTRP: JPRN-JapicCTI-111490
-----------------------
1360 id_US NCT00778128
1360

1447 id_US NCT02069119
1447 id_ICTRP: JPRN-JapicCTI-142448
-----------------------
1448 id_US NCT02085460
1448 id_ICTRP: JPRN-JapicCTI-142467
-----------------------
1449 id_US NCT00693004
1449 id_ICTRP: JPRN-JapicCTI-142499
-----------------------
1450 id_US NCT02079909
1450 id_ICTRP: JPRN-JapicCTI-142499
-----------------------
1451 id_US NCT01746979
1451 id_ICTRP: JPRN-JapicCTI-142511
-----------------------
1452 id_US NCT01818752
1452 id_ICTRP: JPRN-JapicCTI-142523
-----------------------
1453 id_US NCT02143271
1453 id_ICTRP: JPRN-JapicCTI-142537
-----------------------
1454 id_US NCT02166697
1454 id_ICTRP: JPRN-JapicCTI-142567
-----------------------
1455 id_US NCT02168920
1455 id_ICTRP: JPRN-JapicCTI-142578
-----------------------
1456 id_US NCT02276274
1456 id_ICTRP: JPRN-JapicCTI-142584
-----------------------
1457 id_US NCT02192567
1457 id_ICTRP: JPRN-JapicCTI-142586
-----------------------
1458 id_US NCT02160951
1458 id_ICTRP: JPRN-JapicCTI-142589
-----------------------
1459

1546 id_US NCT02394483
1546 id_ICTRP: JPRN-JapicCTI-163466
-----------------------
1547 id_US NCT03030066
1547 id_ICTRP: JPRN-JapicCTI-163479
-----------------------
1548 id_US NCT03018691
1548 id_ICTRP: JPRN-JapicCTI-173484
-----------------------
1549 id_US NCT03155061
1549 id_ICTRP: JPRN-JapicCTI-173496
-----------------------
1550 id_US NCT03055962
1550 id_ICTRP: JPRN-JapicCTI-173504
-----------------------
1551 id_US NCT03048747
1551 id_ICTRP: JPRN-JapicCTI-173512
-----------------------
1552 id_US NCT03092765
1552 id_ICTRP: JPRN-JapicCTI-173515
-----------------------
1553 id_US NCT03157635
1553 id_ICTRP: JPRN-JapicCTI-173517
-----------------------
1554 id_US NCT03106623
1554 id_ICTRP: JPRN-JapicCTI-173535
-----------------------
1555 id_US NCT03096223
1555 id_ICTRP: JPRN-JapicCTI-173543
-----------------------
1556 id_US NCT01982955
1556 id_ICTRP: JPRN-JapicCTI-173546
-----------------------
1557 id_US NCT03117049
1557 id_ICTRP: JPRN-JapicCTI-173560
-----------------------
1558

1652 id_ICTRP: KCT0001058
-----------------------
1653 id_US NCT01880502
1653 id_ICTRP: KCT0001063
-----------------------
1654 id_US NCT02114359
1654 id_ICTRP: KCT0001069
-----------------------
1655 id_US NCT02162316
1655 id_ICTRP: KCT0001100
-----------------------
1656 id_US NCT01973218
1656 id_ICTRP: KCT0001103
-----------------------
1657 id_US NCT01808573
1657 id_ICTRP: KCT0001143
-----------------------
1658 id_US NCT01872065
1658 id_ICTRP: KCT0001179
-----------------------
1659 id_US NCT02322606
1659 id_ICTRP: KCT0001179
-----------------------
1660 id_US NCT02114268
1660 id_ICTRP: KCT0001179
-----------------------
1661 id_US NCT01983358
1661 id_ICTRP: KCT0001179
-----------------------
1662 id_US NCT02168595
1662 id_ICTRP: KCT0001179
-----------------------
1663 id_US NCT02162199
1663 id_ICTRP: KCT0001179
-----------------------
1664 id_US NCT02175056
1664 id_ICTRP: KCT0001184
-----------------------
1665 id_US NCT01959074
1665 id_ICTRP: KCT0001194
-----------------------
1

1766 id_US NCT01261689
1766 id_ICTRP: NTR1744
-----------------------
1767 id_US NCT01292681
1767 id_ICTRP: NTR1764
-----------------------
1768 id_US NCT00015873
1768 id_ICTRP: NTR182
-----------------------
1769 id_US NCT01027585
1769 id_ICTRP: NTR1821
-----------------------
1770 id_US NCT00028717
1770 id_ICTRP: NTR184
-----------------------
1771 id_US NCT00940355
1771 id_ICTRP: NTR1844
-----------------------
1772 id_US NCT01139489
1772 id_ICTRP: NTR1861
-----------------------
1773 id_US NCT01014117
1773 id_ICTRP: NTR1971
-----------------------
1774 id_US NCT01317485
1774 id_ICTRP: NTR2037
-----------------------
1775 id_US NCT01101347
1775 id_ICTRP: NTR2054
-----------------------
1776 id_US NCT01111253
1776 id_ICTRP: NTR2069
-----------------------
1777 id_US NCT01134068
1777 id_ICTRP: NTR2163
-----------------------
1778 id_US NCT01820585
1778 id_ICTRP: NTR2167
-----------------------
1779 id_US NCT01088477
1779 id_ICTRP: NTR2234
-----------------------
1780 id_US NCT00220610

1883 id_ICTRP: NTR5136
-----------------------
1884 id_US NCT02498106
1884 id_ICTRP: NTR5176
-----------------------
1885 id_US NCT02376309
1885 id_ICTRP: NTR5290
-----------------------
1886 id_US NCT02602925
1886 id_ICTRP: NTR5301
-----------------------
1887 id_US NCT02200302
1887 id_ICTRP: NTR5302
-----------------------
1888 id_US NCT03071471
1888 id_ICTRP: NTR5302
-----------------------
1889 id_US NCT02285790
1889 id_ICTRP: NTR5305
-----------------------
1890 id_US NCT02715141
1890 id_ICTRP: NTR5331
-----------------------
1891 id_US NCT00738218
1891 id_ICTRP: NTR535
-----------------------
1892 id_US NCT02268357
1892 id_ICTRP: NTR5364
-----------------------
1893 id_US NCT02271828
1893 id_ICTRP: NTR5429
-----------------------
1894 id_US NCT02953756
1894 id_ICTRP: NTR5462
-----------------------
1895 id_US NCT02271828
1895 id_ICTRP: NTR5493
-----------------------
1896 id_US NCT02508025
1896 id_ICTRP: NTR5494
-----------------------
1897 id_US NCT02594683
1897 id_ICTRP: NTR553

1991 id_ICTRP: PACTR201709002541488
-----------------------
1992 id_US NCT03301025
1992 id_ICTRP: PACTR201710002516280
-----------------------
1993 id_US NCT02576574
1993 id_ICTRP: PER-004-16
-----------------------
1994 id_US NCT01515007
1994 id_ICTRP: PER-006-15
-----------------------
1995 id_US NCT02358031
1995 id_ICTRP: PER-016-15
-----------------------
1996 id_US NCT01703208
1996 id_ICTRP: PER-017-13
-----------------------
1997 id_US NCT02403674
1997 id_ICTRP: PER-022-15
-----------------------
1998 id_US NCT02131233
1998 id_ICTRP: PER-031-14
-----------------------
1999 id_US NCT02531438
1999 id_ICTRP: PER-039-15
-----------------------
2000 id_US NCT03161483
2000 id_ICTRP: PER-045-11
-----------------------
2001 id_US NCT02104674
2001 id_ICTRP: PER-045-15
-----------------------
2002 id_US NCT02260804
2002 id_ICTRP: PER-057-15
-----------------------
2003 id_US NCT02220894
2003 id_ICTRP: PER-061-14
-----------------------
2004 id_US NCT03161483
2004 id_ICTRP: PER-062-15
-----

In [178]:
list_US_id = ['NCT00119509','NCT00465660','NCT00126308','NCT00273260','NCT00119509']

In [177]:
df3 = df_us[df_us['trial_id'].isin(list_US_id)] 
df3[:2]

Unnamed: 0,trial_id,primary_spons,sponsor_type,enrolment
36593,NCT00465660,University of Sydney,Non-Commercial,60.0
90113,NCT00273260,Menzies Institute for Medical Research,Non-Commercial,400.0


In [143]:
ICTRP_US_sponsor_type = list_sponsor_types(list_US_sponsor_type, list_ICTRP_sponsor_type)
ICTRP_US_sponsor_type[0:2]

[1, 1]

In [148]:
ICTRP_US_sponsor_name = list_sponsor_names(list_US_sponsor_name, list_ICTRP_sponsor_name)
ICTRP_US_sponsor_name[0:2]

[0.78431, 0.34188]

In [149]:
ICTRP_US_enrol = list_enrolments(list_US_enrol, list_ICTRP_enrol)
ICTRP_US_enrol[0:2]

[0.0, 0.0]

In [150]:
#Save the final dataframe
df_ictrp_us['spon_type_ICTRP'] = list_ICTRP_sponsor_type
df_ictrp_us['spon_type_US'] = list_US_sponsor_type
df_ictrp_us['same_spon_type'] = ICTRP_US_sponsor_type

df_ictrp_us['sponsor_ICTRP'] = list_ICTRP_sponsor_name
df_ictrp_us['sponsor_US'] = list_US_sponsor_name
df_ictrp_us['spon_similarity'] = ICTRP_US_sponsor_name

df_ictrp_us['enrolment_ICTRP'] = list_ICTRP_enrol
df_ictrp_us['enrolment_US'] = list_US_enrol
df_ictrp_us['enrolment_diff'] = ICTRP_US_enrol

In [151]:
df_ictrp_us[0:2]

Unnamed: 0,id_ICTRP,year_ICTRP,long_title_ICTRP,id_US,year_US,long_title_US,title_similarity,spon_type_ICTRP,spon_type_US,same_spon_type,sponsor_ICTRP,sponsor_US,spon_similarity,enrolment_ICTRP,enrolment_US,enrolment_diff
0,ACTRN12605000111673,2004.0,HPV DNA testing versus conventional management...,NCT00119509,2004.0,HPV DNA Testing Versus Conventional Management...,0.908571,Non-Commercial,Non-Commercial,1,University University of Sydney,University of Sydney,0.78431,300.0,300.0,0
1,ACTRN12605000116628,2005.0,The effect of 6 month high intensity progressi...,NCT00465660,2005.0,The Effect of 6 Month High Intensity Progressi...,0.900709,Non-Commercial,Non-Commercial,1,"University University of Sydney, Faculty of He...",University of Sydney,0.34188,60.0,60.0,0


In [152]:
df_ictrp_us.to_excel('../_results/02-sponsors_enrolments/final_ictrp_us.xlsx')

### Save the final pickles

In [23]:
with open('../_results/02-sponsors_enrolments/final_eu_us.pickle', 'wb') as f:
    pickle.dump(df_eu_us, f) 

In [24]:
with open('../_results/02-sponsors_enrolments/final_eu_ictrp.pickle', 'wb') as f:
    pickle.dump(df_eu_ictrp, f) 

In [None]:
with open('../_results/02-sponsors_enrolments/final_ictrp_us.pickle', 'wb') as f:
    pickle.dump(df_ictrp_us, f) 