In [1]:
import pandas as pd
import csv
import re
import string
from functools import reduce
from difflib import SequenceMatcher

### US_data

In [288]:
### us_years
df_us_years = pd.read_csv('../trial_years/us_year.csv', sep='|')
# Determining duplicates
mask = df_us_years.duplicated(keep=False)
double = df_us_years[mask]  #len(double) = 0 ==> no duplicates in df_us_years

### us_titles
df_us_titles = pd.read_csv('../trial_title/us_title.csv', sep='|')
# Determining duplicates
mask = df_us_titles.duplicated(keep=False)
double = df_us_titles[mask]  #len(double) = 0 ==> no duplicates in df_us_titles

### merge us_years with us_titles
df_us = pd.merge(df_us_years, df_us_titles, on='trial_id')
df_us[0:3]

Unnamed: 0,trial_id,start_year,short_title,long_title
0,NCT03260985,2017.0,Precision Psychiatry Continuity Clinic Project,Precision Psychiatry Continuity Clinic Project
1,NCT03268473,2017.0,Effect of Non-surgical Periodontal Treatment o...,Effect of Non-surgical Periodontal Treatment o...
2,NCT03262610,,Setemelanotide in a Single Patient With Partia...,Expanded-access for the Use of Setemelanotide ...


In [289]:
len(df_us)

268859

### EU_data

In [290]:
### eu_years
df_eu_years = pd.read_csv('../trial_years/eu_year.csv', sep='|')
df_eu_years = df_eu_years.drop(['id','EU','US','ICTRP','Japan','ANZCTR','Iran','China','India','Africa','Korea','Brazil','Thai','Peru','Cuba','SriLanka','check'], axis=1)
# Determining duplicates
mask = df_eu_years.duplicated(keep=False)
double = df_eu_years[mask]  #len(double) = 0 ==> no duplicates in df_eu_years


### eu_titles
df_eu_titles = pd.read_csv('../trial_title/eu_title.csv', sep='|')
# Determining duplicates
mask = df_eu_titles.duplicated(keep=False)
double = df_eu_titles[mask]  #len(double) = 0 ==> no duplicates in df_us_titles

### merge eu_years with eu_titles
df_eu = pd.merge(df_eu_years, df_eu_titles, on='trial_id')
df_eu[0:3]


Unnamed: 0,trial_id,start_year,long_title
0,2004-000007-18,2004.0,"A Multicentre, Randomised, Double-Blind, Paral..."
1,2004-000012-13,2004.0,"A Double-Blind, Placebo-Controlled, Parallel, ..."
2,2004-000015-25,2004.0,"A phase 3 randomized, placebo-controlled, doub..."


In [291]:
len(df_eu)

29588

### ICTRP_data

In [292]:
### ictrp_years
df_ictrp_years = pd.read_csv('../trial_years/ictrp_year.csv', sep='|')
df_ictrp_years = df_ictrp_years.drop(['id','EU','US','ICTRP','Japan','ANZCTR','Iran','China','India','Africa','Korea','Brazil','Thai','Peru','Cuba','SriLanka','check'], axis=1)
# Determining duplicates
mask = df_ictrp_years.duplicated(keep=False)
double = df_ictrp_years[mask]  #len(double) = 0 ==> no duplicates in df_ictrp_years


### ictrp_titles
df_ictrp_titles = pd.read_csv('../trial_title/ictrp_title.csv', sep='|')
# Determining duplicates
mask = df_ictrp_titles.duplicated(keep=False)
double = df_ictrp_titles[mask]  #len(double) = 0 ==> no duplicates in df_ictrp_titles

### merge ictrp_years with ictrp_titles
df_ictrp = pd.merge(df_ictrp_years, df_ictrp_titles, on='trial_id')
df_ictrp[0:3]


Unnamed: 0,trial_id,start_year,short_title,long_title
0,ACTRN12605000058673,2005.0,Magnesium in Aneurysmal Subarachnoid Haemorrhage,"A multi-centre, single blinded, randomised con..."
1,ACTRN12605000059662,2005.0,"Multicentre, Unblinded, Randomised, Controlled...","Multicentre, Unblinded, Randomised, Controlled..."
2,ACTRN12605000060640,2003.0,A trial of G-CSF in septic shock excluding mel...,A single centre double blinded randomised cont...


In [293]:
len(df_ictrp)

108931

### 1 - Divide the data where the long_title/short_title infos EXIST or NOT

First, we noticed that there are some data where the "long_title" and "short_title" infos are missing, and this will not help in the comparison, 
we will then divide the dfs data into 2 sub_dfs : df_complete & df_with_nan

In [294]:
# US
df_us_complete = df_us.dropna(subset=['long_title','short_title'])
df_us_with_nan = df_us.merge(df_us_complete, how='outer', indicator=True).query('_merge == "left_only"').drop('_merge', 1)

print('--- FOR US ---')
print('US data with complete info :', len(df_us_complete))
print('US data with missing info :', len(df_us_with_nan)) 

--- FOR US ---
US data with complete info : 258398
US data with missing info : 10461


In [295]:
# EU
df_eu_complete = df_eu.dropna(subset=['long_title'])
df_eu_with_nan = df_eu.merge(df_eu_complete, how='outer', indicator=True).query('_merge == "left_only"').drop('_merge', 1)

print('--- FOR EU ---')
print('EU data with complete info :', len(df_eu_complete))
print('EU data with missing info :', len(df_eu_with_nan)) 

--- FOR EU ---
EU data with complete info : 29399
EU data with missing info : 189


In [297]:
# ICTRP
df_ictrp_complete = df_ictrp.dropna(subset=['long_title', 'short_title'])
df_ictrp_with_nan = df_ictrp.merge(df_ictrp_complete, how='outer', indicator=True).query('_merge == "left_only"').drop('_merge', 1)

print('--- FOR ICTRP ---')
print('ICTRP data with complete info :', len(df_ictrp_complete))
print('ICTRP data with missing info :', len(df_ictrp_with_nan)) 

--- FOR ICTRP ---
ICTRP data with complete info : 73704
ICTRP data with missing info : 35227


In [166]:
df_us_with_nan[0:2]

Unnamed: 0,trial_id,start_year,short_title,long_title
24,NCT03267225,2016.0,Study on the Genetic Determinants of Clindamyc...,
578,NCT03260153,2017.0,Deproteinised Calf Blood Serum Injection for t...,


In [285]:
len(df_ictrp_complete)

73704

In [None]:
#save the data
df_us_complete.to_csv('../data/us_ltitles_complete.csv',sep='|')
df_eu_complete.to_csv('../data/eu_ltitles_complete.csv',sep='|')
df_ictrp_complete.to_csv('../data/ictrp_ltitles_complete.csv',sep='|')

### 2 - Create lists: (trial_id, start_year, long_title)

In [167]:
#US
list_us = df_us_complete[['trial_id','start_year', 'long_title']].values.tolist()
list_us[0:1]

[['NCT03260985', 2017.0, 'Precision Psychiatry Continuity Clinic Project']]

In [168]:
#EU
list_eu = df_eu_complete[['trial_id','start_year', 'long_title']].values.tolist()
list_eu[0:1]

[['2004-000007-18',
  2004.0,
  'A Multicentre, Randomised, Double-Blind, Parallel Group, 24 Week Study to Compare the Effect of the Salmeterol/Fluticasone Propionate Combination Product (SERETIDE) 50/250mcg with Salmeterol 50mcg Both Delivered Twice Daily via the DISKUS/ACCUHALER Inhaler on Lung Function and Dyspnoea in Subjects With Chronic Obstructive Pulmonary Disease (COPD).']]

In [169]:
#ICTRP
list_ictrp = df_ictrp_complete[['trial_id','start_year', 'long_title']].values.tolist()
list_ictrp[0:1]

[['ACTRN12605000058673',
  2005.0,
  'A multi-centre, single blinded, randomised controlled trial of two target ranges for serum magnesium in patients with Subarachnoid Haemorrhage to reduce vasospasm']]

### 3 - Functions

In [2]:
### calculate similarity with "SequenceMatcher" where space isJunk 
def similar(a, b):
    return SequenceMatcher(lambda x: x == " ", a.lower(), b.lower()).ratio()

In [4]:
### LIST COMBINATIONS with calculating the difference in the years and titles length
def lists_combinations(list1,list2):
    combination_list = []
    #list = [id,year,title]
    for i in list1:
        for j in list2:
            
            id1 = str(i[0])
            id2 = str(j[0])
            year1 = i[1]
            year2 = j[1]
            title1 = i[2]
            title2 = j[2]
            
            ltitle1 = re.escape(title1).lower()
            ltitle1 = ltitle1.translate(str.maketrans('','',string.punctuation + string.whitespace))
            ltitle2 = re.escape(title2).lower()
            ltitle2 = ltitle2.translate(str.maketrans('','',string.punctuation + string.whitespace))
            
            sim = similar(ltitle1,ltitle2)
            
            if (pd.isnull(year1)) or (pd.isnull(year2)):
                
                if ((ltitle1 in ltitle2) or (ltitle2 in ltitle1) or (sim > 0.8)) :
                #or (sim > 0.22) After testing, we found that when the similarity is 0.8 and above, the two titles are almost identical
                 
                    new_tuple = (id1,year1,title1,id2,year2,title2,sim)
                    combination_list.append(new_tuple) 
                    
            else:
                
                diff = abs(year1 - year2)
                if ((diff < 2) and ((ltitle1 in ltitle2) or (ltitle2 in ltitle1)or (sim > 0.8))) :
                    
                    new_tuple = (id1,year1,title1,id2,year2,title2,sim)
                    combination_list.append(new_tuple) 

    return(combination_list)

### TEST

In [216]:
### eu_ictrp
eu = list_eu[0:15]
ictrp = list_ictrp[6000:6000]
eu_ictrp = lists_combinations(eu,ictrp) 
df_eu_ictrp = pd.DataFrame(eu_ictrp, columns=['id_EU','year_EU','long_title_EU','id_ICTRP','year_ICTRP','long_title_ICTRP','similarity'])
df_eu_ictrp

Unnamed: 0,id_EU,year_EU,long_title_EU,id_ICTRP,year_ICTRP,long_title_ICTRP,similarity


In [60]:
print(df_eu_ictrp.long_title_EU[0])
print("////////////")
print(df_eu_ictrp.long_title_ICTRP[0])

A Randomized, Double-Blind, Placebo-Controlled, Phase 3 Study of the Safety and Efficacy of Interferon gamma-Ib in patients with Idiopathic Pulmonary Fibrosis
////////////
A Randomized, Double-Blind, Placebo-Controlled Pharmacokinetic Study of the Chrono Nicotine Replacement Therapy System for adult male smokers


In [281]:
### us_ictrp
us = list_us[300:400]
ictrp = list_ictrp[7500:7750]
us_ictrp = lists_combinations(us,ictrp) 
df_us_ictrp = pd.DataFrame(us_ictrp, columns=['id_US','year_US','long_title_US','id_ICTRP','year_ICTRP','long_title_ICTRP','similarity'])
df_us_ictrp


Unnamed: 0,id_US,year_US,long_title_US,id_ICTRP,year_ICTRP,long_title_ICTRP,similarity


In [267]:
### us_ictrp
us = list_us[0:5]
ictrp = list_ictrp[8600:8800]
us_ictrp = lists_combinations(us,ictrp) 
df_us_ictrp = pd.DataFrame(us_ictrp, columns=['id_US','year_US','long_title_US','id_ICTRP','year_ICTRP','long_title_ICTRP','similarity'])
df_us_ictrp

Unnamed: 0,id_US,year_US,long_title_US,id_ICTRP,year_ICTRP,long_title_ICTRP,similarity


### TEST to justify the similarity

In [19]:
df_test_eu_us = pd.read_csv('../test for similarity/test_for_title_similarity.csv', sep='|')
df_test_eu_us = df_test_eu_us

In [20]:
len(df_test_eu_us)

2645

In [21]:
df_test_eu_us.long_title_eu[0]

'Phase I/II Open-Label, Pharmacokinetic and Safety Study of a Novel Protease Inhibitor (BMS-232632, ATAZANAVIR, ATV, REYATAZâ\x84¢) in Combination Regimens in Antiretroviral Therapy (ART)-NaÃ¯ve and Experienced HIV-Infected Infants, Children, and Adolescents'

In [22]:
sim_list = []
for index, row in df_test_eu_us.iterrows():
    eu_title = str(df_test_eu_us.long_title_eu[index])
    us_title = str(df_test_eu_us.long_title_us[index])
    
    eu_title = re.escape(eu_title).lower()
    eu_title = eu_title.translate(str.maketrans('','',string.punctuation + string.whitespace))
    us_title = re.escape(us_title).lower()
    us_title = us_title.translate(str.maketrans('','',string.punctuation + string.whitespace))
    
    if pd.isnull(eu_title) or pd.isnull(us_title) :
        sim_list.append(999999999)
    else :
        sim_list.append(similar(eu_title,us_title))
    
df_test_eu_us['title_similarity'] = sim_list

In [23]:
sim_list

[0.9855769230769231,
 0.7086614173228346,
 0.9923076923076923,
 1.0,
 0.9925925925925926,
 0.9272237196765498,
 1.0,
 0.8298507462686567,
 0.9379310344827586,
 1.0,
 0.8491379310344828,
 0.43103448275862066,
 1.0,
 1.0,
 0.6416184971098265,
 1.0,
 0.9181636726546906,
 1.0,
 0.9798387096774194,
 0.9195979899497487,
 1.0,
 0.988835725677831,
 1.0,
 0.9171717171717172,
 1.0,
 1.0,
 1.0,
 0.9970326409495549,
 1.0,
 1.0,
 1.0,
 1.0,
 0.05309734513274336,
 1.0,
 0.7349397590361446,
 0.5590062111801242,
 0.02666666666666667,
 0.02666666666666667,
 0.9652777777777778,
 0.9977827050997783,
 0.9798994974874372,
 0.994328922495274,
 0.38489208633093525,
 1.0,
 0.8571428571428571,
 0.6075085324232082,
 1.0,
 0.02843601895734597,
 0.02197802197802198,
 1.0,
 0.885,
 1.0,
 1.0,
 1.0,
 0.9755102040816327,
 1.0,
 1.0,
 1.0,
 1.0,
 0.5436893203883495,
 1.0,
 1.0,
 0.7931873479318735,
 1.0,
 0.9949748743718593,
 0.1794871794871795,
 1.0,
 1.0,
 1.0,
 0.3008849557522124,
 1.0,
 0.9447852760736196,
 1.0,


In [24]:
df_test_eu_us.to_excel('../test for similarity/test_similarity_eu_us2.xlsx')

In [76]:
## SEQUENCE MATCHER
a2 = similar('An Open-label, Non-comparative, Multi-centre Study to Assess the Efficacy and Safety of Bicalutamide When Used in Combination With Anastrozole for the Treatment of Gonadotropin-independent Precocious Puberty in Boys With Testotoxicosis', 
             '')
a2

0.0

In [63]:
## SEQUENCE MATCHER
b2 = similar('An Open-label, Non-comparative, Multi-centre Study to Assess the Efficacy and Safety of Bicalutamide When Used in Combination With Anastrozole for the Treatment of Gonadotropin-independent Precocious Puberty in Boys With Testotoxicosis', 
             'An open-label, non-comparative, multi-centre study to assess the efficacy and safety of bicalutamide when used in combination with anastrozole for the treatment of gonadotropin-independent precocious puberty in boys with testotoxicosis.')
b2

0.9978768577494692

In [197]:
## SEQUENCE MATCHER
c2 = similar('A Phase 2, Multicenter, Randomized, Double-blind, Placebo-controlled, Parallel-group, Dose-ranging Study Evaluating the Efficacy and Safety of CNTO 148 Administered Subcutaneously in Symptomatic Subjects With Severe Persistent Asthma', 
             'A Phase 2, Randomized, Double-Blind, Crossover Study to Examine the Pharmacodynamics, Safety and Tolerability, and Pharmacokinetics of Single Doses of TD-4208 in Subjects Diagnosed with Chronic Obstructive Pulmonary Disease|Effects of TD-4208 on FEV1 in Subjects with COPD')
c2

0.22574257425742575

In [154]:
ch1 = re.escape('A Phase 2, Randomized, Double-Blind, Placebo-Controlled Study to Evaluate the Safety, Efficacy, Pharmacodynamics, and Pharmacokinetics of RG-012 for Injection in Subjects With Alport Syndrome').lower()
ch2 = re.escape('A Phase 2, Randomized, Double-Blind, Placebo-Controlled Study to Evaluate the Safety, Pharmacodynamics, Pharmacokinetics, Dose Selection, and Preliminary Efficacy of Weekly RG 012 Injections in Patients with Alport Syndrome').lower()

ch1 = ch1.translate(str.maketrans('','',string.punctuation + string.whitespace))
ch2 = ch2.translate(str.maketrans('','',string.punctuation + string.whitespace))

if ch1 in ch2 : 
    print("chaine1 is in chaine2")
else :
    print("chaine1 is not in chaine2")


chaine1 is not in chaine2


In [123]:
print(re.escape('A Safety and Immunogenicity Study of Quadrivalent HPV.... '))

A\ Safety\ and\ Immunogenicity\ Study\ of\ Quadrivalent\ HPV\.\.\.\.\ 


In [96]:
re.escape('A Safety') in re.escape('A Safety and Immunogenicity Study of Quadrivalent HPV.... ')


True

In [124]:
def compare(s1, s2):
    remove = string.punctuation + string.whitespace
    return s1.translate(str.maketrans('','',string.punctuation)) in s2.translate(str.maketrans('','',string.punctuation))

In [125]:
compare(ch1,ch2)

False

In [135]:
ch1.translate(str.maketrans('','',string.punctuation)) in ch2.translate(str.maketrans('','',string.punctuation))

True

In [134]:
ch2.translate(str.maketrans('','',string.punctuation))

'A Safety and Immunogenicity Study of Quadrivalent HPV A Long Term Immunogenicity Safety and Effectiveness Study of GARDASILâ\x84¢ Human Papillomavirus Types 6 11 16 18 Recombinant Vaccine Among Adolescents Who Received GARDASILâ\x84¢ at 918 Years of Age'

In [133]:
ch1.translate(str.maketrans('','',string.punctuation))

'A Safety and Immunogenicity Study of Quadrivalent HPV '

In [3]:
## VERIFICATION 

title1 = 'Atropine'
title2 = "drug delivery system, inhaled atropine, MicroDose Therapeutx"


ltitle1 = re.escape(title1).lower()
ltitle1 = ltitle1.translate(str.maketrans('','',string.punctuation + string.whitespace))
ltitle2 = re.escape(title2).lower()
ltitle2 = ltitle2.translate(str.maketrans('','',string.punctuation + string.whitespace))
            
            
sim = similar(title1,title2)            
            
print(sim)            
            
            

0.23529411764705882
