In [2]:
import csv
import pandas as pd
from difflib import SequenceMatcher
import itertools
from fuzzywuzzy import fuzz, process
import pickle

from dask import delayed, compute

In [3]:
### us_title

df_us = pd.read_csv('../treatment_Data/trial_title/us_title.csv', sep='|')
# Add an indicator column
df_us['trial_indicator'] = 'US'
# Determining duplicates.
# The CSV carries a saved row counter ('Unnamed: 0') that differs on every row, so it
# must be left out of the comparison; otherwise no row can ever be flagged as duplicate.
mask = df_us.drop(columns=['Unnamed: 0'], errors='ignore').duplicated(keep=False)
double = df_us[mask]   # every row whose trial_id/titles occur more than once; len(double) == 0 ==> no duplicates in df_us
df_us[0:4]

Unnamed: 0,trial_id,short_title,long_title,trial_indicator
0,NCT03260985,Precision Psychiatry Continuity Clinic Project,Precision Psychiatry Continuity Clinic Project,US
1,NCT03268473,Effect of Non-surgical Periodontal Treatment o...,Effect of Non-surgical Periodontal Treatment o...,US
2,NCT03262610,Setemelanotide in a Single Patient With Partia...,Expanded-access for the Use of Setemelanotide ...,US
3,NCT03264261,Constraint Induced Movement Therapy for Walkin...,Constraint Induced Movement Therapy for Walkin...,US


In [4]:
# Number of rows loaded from the US title file
df_us.shape[0]

268859

In [5]:
### eu_title

df_eu = pd.read_csv('../treatment_Data/trial_title/eu_title.csv', sep='|')
# Add an indicator column
df_eu['trial_indicator'] = 'EU'
# Determining duplicates.
# The CSV carries a saved row counter ('Unnamed: 0') that differs on every row, so it
# must be left out of the comparison; otherwise no row can ever be flagged as duplicate.
mask = df_eu.drop(columns=['Unnamed: 0'], errors='ignore').duplicated(keep=False)
double = df_eu[mask]   # every row whose trial_id/title occur more than once; len(double) == 0 ==> no duplicates in df_eu
df_eu[0:4]

Unnamed: 0,trial_id,long_title,trial_indicator
0,2004-000007-18,"A Multicentre, Randomised, Double-Blind, Paral...",EU
1,2004-000012-13,"A Double-Blind, Placebo-Controlled, Parallel, ...",EU
2,2004-000015-25,"A phase 3 randomized, placebo-controlled, doub...",EU
3,2004-000016-10,"A 3-MONTH, RANDOMIZED, DOUBLE-BLIND, PLACEBO- ...",EU


In [6]:
# Number of rows loaded from the EU title file
df_eu.shape[0]

29592

In [7]:
### ictrp_title

df_ictrp = pd.read_csv('../treatment_Data/trial_title/ictrp_title.csv', sep='|')
# Add an indicator column
df_ictrp['trial_indicator'] = 'ICTRP'
# Determining duplicates.
# The CSV carries a saved row counter ('Unnamed: 0') that differs on every row, so it
# must be left out of the comparison; otherwise no row can ever be flagged as duplicate.
mask = df_ictrp.drop(columns=['Unnamed: 0'], errors='ignore').duplicated(keep=False)
double = df_ictrp[mask]   # every row whose trial_id/titles occur more than once; len(double) == 0 ==> no duplicates in df_ictrp
df_ictrp[0:4]

Unnamed: 0,trial_id,short_title,long_title,trial_indicator
0,ACTRN12605000058673,Magnesium in Aneurysmal Subarachnoid Haemorrhage,"A multi-centre, single blinded, randomised con...",ICTRP
1,ACTRN12605000059662,"Multicentre, Unblinded, Randomised, Controlled...","Multicentre, Unblinded, Randomised, Controlled...",ICTRP
2,ACTRN12605000060640,A trial of G-CSF in septic shock excluding mel...,A single centre double blinded randomised cont...,ICTRP
3,ACTRN12605000061639,Randomised controlled trial of maintenance flu...,Randomised controlled trial of maintenance flu...,ICTRP


In [None]:
# Number of rows loaded from the ICTRP title file
df_ictrp.shape[0]

### Functions

In [None]:
### SequenceMatcher similarity of two strings, with the blank character declared as junk
def similar(a, b):
    """Return ``difflib.SequenceMatcher(...).ratio()`` for ``a`` versus ``b``.

    A single space is passed as the junk character, so blanks are not used
    as anchors when the matcher looks for matching blocks.
    """
    def _is_blank(char):
        return char == " "

    matcher = SequenceMatcher(_is_blank, a, b)
    return matcher.ratio()

In [None]:
### number of whitespace-separated words in a title
def word_count(string):
    """Return how many tokens ``string.split()`` yields (0 for an empty or blank string)."""
    return len(string.split())

In [None]:
### LIST COMBINATIONS with calculating the similarity
def lists_combinations_similarity(list1, list2, threshold=0.8):
    """Pair records of ``list1`` and ``list2`` whose titles are similar.

    Parameters
    ----------
    list1, list2 : sequences of indexable records
        Each record is read as ``(trial_id, trial_indicator, title)`` via
        indices 0, 1 and 2. Values are converted with ``str``.
    threshold : float, default 0.8
        A pair is kept only when ``similar(title1, title2)`` is strictly
        greater than this value.

    Returns
    -------
    list of tuple
        ``(id1 + ' _ ' + id2, indicator1 + ' _ ' + indicator2,
        title1 + ' _ ' + title2, similarity)`` for every kept pair, ordered
        by ``list1`` then ``list2``.

    Notes
    -----
    Records whose title is missing (``None``/NaN) are skipped: stringifying
    them would produce ``'None'``/``'nan'`` and make all missing titles look
    identical to each other.
    """
    # Convert list2 once instead of once per list1 record.
    candidates = [
        (str(j[0]), str(j[1]), str(j[2]))
        for j in list2
        if not pd.isna(j[2])
    ]

    combination_list = []
    for i in list1:
        if pd.isna(i[2]):
            continue
        id1, indicator1, title1 = str(i[0]), str(i[1]), str(i[2])
        for id2, indicator2, title2 in candidates:
            similarity = similar(title1, title2)
            if similarity > threshold:
                combination_list.append((
                    id1 + ' _ ' + id2,
                    indicator1 + ' _ ' + indicator2,
                    title1 + ' _ ' + title2,
                    similarity,
                ))

    return combination_list

In [None]:
### LIST COMBINATIONS with calculating the difference in titles length
def lists_combinations_length(list1, list2, max_word_diff=1):
    """Pair records of ``list1`` and ``list2`` whose titles have a similar word count.

    Parameters
    ----------
    list1, list2 : iterables of indexable records
        Each record is read as ``(trial_id, trial_indicator, title)`` via
        indices 0, 1 and 2. Values are converted with ``str``.
    max_word_diff : int, default 1
        Largest accepted absolute difference between the two titles' word
        counts (whitespace-separated tokens). The default keeps pairs that
        differ by fewer than 2 words.

    Returns
    -------
    list of tuple
        ``(id1 + ' _ ' + id2, indicator1 + ' _ ' + indicator2,
        title1 + ' _ ' + title2, word_count_difference)`` for every kept
        pair, ordered by ``list1`` then ``list2``.

    Notes
    -----
    Records whose title is missing (``None``/NaN) are skipped: stringifying
    them would produce the one-word titles ``'None'``/``'nan'`` and pair them
    with every other short title.
    """
    def _prepare(records):
        # Stringify each record and count its words exactly once.
        prepared = []
        for record in records:
            if pd.isna(record[2]):
                continue
            title = str(record[2])
            prepared.append((str(record[0]), str(record[1]), title, len(title.split())))
        return prepared

    candidates = _prepare(list2)

    combination_list = []
    for id1, indicator1, title1, n_words1 in _prepare(list1):
        for id2, indicator2, title2, n_words2 in candidates:
            diff_length_l = abs(n_words1 - n_words2)
            if diff_length_l <= max_word_diff:
                combination_list.append((
                    id1 + ' _ ' + id2,
                    indicator1 + ' _ ' + indicator2,
                    title1 + ' _ ' + title2,
                    diff_length_l,
                ))

    return combination_list

### For the short titles

In [8]:
# One (trial_id, trial_indicator, short_title) tuple per US trial
rows_short_us = df_us.loc[:, ['trial_id', 'trial_indicator', 'short_title']]
list_short_us = rows_short_us.apply(tuple, axis=1).tolist()
list_short_us[:2]

[('NCT03260985', 'US', 'Precision Psychiatry Continuity Clinic Project'),
 ('NCT03268473',
  'US',
  'Effect of Non-surgical Periodontal Treatment on Oxidative Stress and Antioxidant Status in OSAS')]

In [None]:
# One (trial_id, trial_indicator, short_title) tuple per ICTRP trial
rows_short_ictrp = df_ictrp.loc[:, ['trial_id', 'trial_indicator', 'short_title']]
list_short_ictrp = rows_short_ictrp.apply(tuple, axis=1).tolist()
list_short_ictrp[:2]

In [None]:
### test for short titles
# First batch: the first 50000 US records against the first 50000 ICTRP records
us = list_short_us[:50000]
ictrp = list_short_ictrp[:50000]
us_ictrp = lists_combinations_length(us, ictrp)
df_us_ictrp = pd.DataFrame(
    data=us_ictrp,
    columns=['id1_id2', 'indicator1_indicator2', 'shorttitle1_shorttitle2', 'diff_length_s'],
)

In [None]:
# Preview the first four candidate pairs
df_us_ictrp.iloc[:4]

In [None]:
# Number of candidate pairs found so far
df_us_ictrp.shape[0]

In [None]:
### test for short titles CONTINUES
# Second batch: the first 10000 US records against ICTRP records 10000..108930
us = list_short_us[0:10000]
ictrp_con = list_short_ictrp[10000:108931]
us_ictrp_con = lists_combinations_length(us,ictrp_con)

# Give the new pairs the same column layout as the first batch before combining.
df_us_ictrp_con = pd.DataFrame(us_ictrp_con, columns=df_us_ictrp.columns)

# DataFrame.append never modified the frame in place (and no longer exists in
# pandas >= 2.0), so the combined frame has to be assigned back.
# This batch overlaps the first one (US 0..9999 x ICTRP 10000..49999), and the cell
# may be re-run, so identical rows are dropped to keep each pair only once.
df_us_ictrp = (
    pd.concat([df_us_ictrp, df_us_ictrp_con], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Number of candidate pairs produced by the second ("CONTINUES") batch
len(us_ictrp_con)

In [None]:
# Row count of the short-title candidate frame at this point
df_us_ictrp.shape[0]

In [None]:
# Export the short-title candidate pairs to an Excel workbook in the working directory
df_us_ictrp.to_excel(excel_writer='us_ictrp.xlsx')

In [None]:
# Persist the short-title candidate pairs as a pickle file
with open('./Pickles/df_us_ictrp.pickle', 'wb') as pickle_file:
    pickle.dump(df_us_ictrp, pickle_file)

In [None]:
# Full run: every US short title against every ICTRP short title
list_short_title = lists_combinations_length(
    list1=list_short_us,
    list2=list_short_ictrp,
)

In [None]:
# Tabulate the full US x ICTRP short-title run.
# `list_short_title` is the result list built by the preceding cell; the name used here
# before (`list_us_ictrp`) is not defined anywhere in the notebook.
df_all_short_titles = pd.DataFrame(list_short_title, columns=['id1_id2', 'indicator1_indicator2', 'shorttitle1_shorttitle2', 'diff_length_s'])
df_all_short_titles[0:5]

### For the long titles

In [None]:
# One (trial_id, trial_indicator, long_title) tuple per US trial
rows_long_us = df_us.loc[:, ['trial_id', 'trial_indicator', 'long_title']]
list_long_us = rows_long_us.apply(tuple, axis=1).tolist()
list_long_us[:2]

In [None]:
# One (trial_id, trial_indicator, long_title) tuple per ICTRP trial
rows_long_ictrp = df_ictrp.loc[:, ['trial_id', 'trial_indicator', 'long_title']]
list_long_ictrp = rows_long_ictrp.apply(tuple, axis=1).tolist()
list_long_ictrp[:2]

In [None]:
# One (trial_id, trial_indicator, long_title) tuple per EU trial
rows_long_eu = df_eu.loc[:, ['trial_id', 'trial_indicator', 'long_title']]
list_long_eu = rows_long_eu.apply(tuple, axis=1).tolist()
list_long_eu[:2]

In [None]:
# Long titles: US against ICTRP
list_long_us_ictrp = lists_combinations_length(
    list1=list_long_us,
    list2=list_long_ictrp,
)

In [None]:
# Long titles: US against EU
list_long_us_eu = lists_combinations_length(
    list1=list_long_us,
    list2=list_long_eu,
)

In [None]:
# Long titles: ICTRP against EU
list_long_ictrp_eu = lists_combinations_length(
    list1=list_long_ictrp,
    list2=list_long_eu,
)

In [None]:
# All long-title candidate pairs, in the order US-ICTRP, US-EU, ICTRP-EU
list_long_title = [
    *list_long_us_ictrp,
    *list_long_us_eu,
    *list_long_ictrp_eu,
]

In [None]:
# Tabulate every long-title candidate pair and preview the first five rows
long_title_columns = ['id1_id2', 'indicator1_indicator2', 'longtitle1_longtitle2', 'diff_length_l']
df_all_long_titles = pd.DataFrame(data=list_long_title, columns=long_title_columns)
df_all_long_titles.iloc[:5]


In [None]:
### test for long titles us and ictrp
# Small trial run: the first 100 US records against the first 100 ICTRP records
us = list_long_us[:100]
ictrp = list_long_ictrp[:100]
us_ictrp_long = lists_combinations_length(us, ictrp)
df_long_us_ictrp = pd.DataFrame(
    data=us_ictrp_long,
    columns=['id1_id2', 'indicator1_indicator2', 'longtitle1_longtitle2', 'diff_length_l'],
)


In [None]:
# Preview the first five long-title candidate pairs
df_long_us_ictrp.head(n=5)

In [None]:
### test with DASK DELAYED for long titles
# Trial run on the first 1000 records of each registry.
us = list_long_us[0:1000]
eu = list_long_eu[0:1000]
ictrp = list_long_ictrp[0:1000]

# Wrap each pairwise comparison in `delayed` so nothing runs until `.compute()` is called.
# The helper defined in this notebook is `lists_combinations_length`; the name
# `lists_combinations` does not exist. The length variant matches the 'diff_length_l'
# column that the result is given when it is tabulated further down.
us_ictrp_long_new = delayed(lists_combinations_length)(us,ictrp)
us_eu_long_new = delayed(lists_combinations_length)(us,eu)
eu_ictrp_long_new = delayed(lists_combinations_length)(eu,ictrp)

# Adding Delayed objects stays lazy; once computed this is the concatenation of the three result lists.
list_long_titles = us_ictrp_long_new + us_eu_long_new + eu_ictrp_long_new


In [None]:
# Draw the dask task graph behind the lazy `list_long_titles` object (nothing is computed here)
list_long_titles.visualize()

In [None]:
# Execute the delayed graph and display the combined result
list_long_titles.compute()

In [None]:
# Run the delayed graph and tabulate the resulting pairs, then preview five rows
long_title_pairs = list_long_titles.compute()
df_all_long_titles = pd.DataFrame(
    data=long_title_pairs,
    columns=['id1_id2', 'indicator1_indicator2', 'longtitle1_longtitle2', 'diff_length_l'],
)
df_all_long_titles.iloc[:5]