In [None]:
%matplotlib inline
import tk
import numpy as np
import pandas as pd 
import datetime
%config InlineBackend.figure_formats = ['svg']
from matplotlib import pyplot as plt 
import seaborn as sns; sns.set()
import plotly.graph_objects as go
import chart_studio
import chart_studio.plotly as py
import requests 
import random 
import plotly.express as px
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="YOUR_TOOL_HERE")
from geopy.exc import GeocoderTimedOut
from geopy.extra.rate_limiter import RateLimiter




The following cell is only necessary if you want to upload the maps to chart-studio later.

In [None]:
# Store the Chart Studio credentials used by the optional upload of the maps.
# The username and API key below are placeholders and must be replaced with real values.
chart_studio.tools.set_credentials_file(username='YOUR_USERNAME_HERE', api_key='YOUR_KEY_HERE')

# Loading and preprocessing data

In [None]:
# Read one spreadsheet per indication and tag every row with its indication code.
df_ms = pd.read_excel("1_multiple_sclerosis/ms_digital_trials.xlsx").assign(Indication="MS")
df_alz = pd.read_excel("2_alzheimer/az_digital_trials.xlsx").assign(Indication="ALZ")
df_park = pd.read_excel("3_parkinson/pk_digital_trials.xlsx").assign(Indication="PARK")
df_epi = pd.read_excel("4_epilepsy/ep_digital_trials.xlsx").assign(Indication="EPI")


In [None]:
# Stack the four indication tables into one frame (the original row labels are kept)
# and report the total number of rows.
indication_frames = [df_ms, df_alz, df_park, df_epi]
all_ind_df = pd.concat(indication_frames)
print(all_ind_df.shape[0])

In [None]:
# Keep only the rows whose criteria_met flag equals 1 and report how many remain.
meets_criteria = all_ind_df["criteria_met"].eq(1)
all_ind_df = all_ind_df.loc[meets_criteria]
print(all_ind_df.shape[0])

In [None]:
# Replace the row labels with a fresh 0..n-1 range; the old labels are discarded (drop=True).
# Later cells index rows by position and by label interchangeably, which relies on this.
all_ind_df = all_ind_df.reset_index(drop = True)

In [None]:
# Turn study_type into a categorical with a fixed category order
# (values outside these three categories become missing).
study_type_order = ["Observational [Patient Registry]", "Observational", "Interventional"]
all_ind_df["study_type"] = pd.Categorical(all_ind_df["study_type"], categories=study_type_order)


Read in the official trial titles in a format that is suitable for display later. The next two cells can be skipped if the dataset already contains official titles.

In [None]:
def title_reader(id):
    """Return the official title of one trial from the ClinicalTrials.gov study_fields query.

    Parameters
    ----------
    id : str
        Search expression inserted into the query URL; callers in this notebook
        pass the trial's ``nct_id``.

    Returns
    -------
    The ``OfficialTitle`` value of the first row of the CSV response.

    Notes
    -----
    The first 10 lines of the response are skipped before parsing
    (presumably a metadata preamble; confirm against a live response).
    NOTE(review): this is the ``/api/query/study_fields`` endpoint; confirm it is
    still served before relying on this cell.
    """
    base_url = "https://www.clinicaltrials.gov/api/query/study_fields?expr="
    query_suffix = "&fields=OfficialTitle&min_rnk=1&max_rnk=&fmt=csv"
    # str.join (like the original concatenation) rejects non-string ids with a TypeError
    titles = pd.read_csv("".join((base_url, id, query_suffix)), skiprows=10)
    return titles["OfficialTitle"][0]



In [None]:
# Look up the official title of every trial by its nct_id (one web request per row).
all_ind_df['official_title'] = all_ind_df["nct_id"].map(title_reader)

In [None]:
# Count the rows per nct_id, most frequent first, and show the first 11 entries.
rows_per_trial = all_ind_df.groupby("nct_id", sort=True)["nct_id"].count()
print_me = rows_per_trial.sort_values(ascending=False)  # 10 studies with duplicates
print_me.head(11)

# Analysis

## Longitudinal study type development


In [None]:
# Years covered by the longitudinal analysis
rel_years = list(range(2010, 2022))
# Number of rows of all_ind_df registered in each of those years
dht_per_year = [len(all_ind_df[all_ind_df["year"] == year]) for year in rel_years]

# Hard-coded yearly totals used as the denominator (presumably all registered
# trials per year; the source is not shown in this notebook, so confirm)
all_per_year = [431, 442, 472, 508, 542, 604, 625, 631, 743, 825, 719, 882]
rate_df = pd.DataFrame({"All_Trials": all_per_year, "DHT_Trials": dht_per_year})
# Share of the yearly total in percent, rounded to two decimals
rate_df["rate"] = (rate_df["DHT_Trials"] / rate_df["All_Trials"] * 100).round(2)
rate_df




In [None]:
###Summary Graph with rate
# Stacked yearly histogram of the trials by study type (left axis) with the
# yearly rate from rate_df drawn as a dashed line on a twin right axis.

fig = plt.figure(figsize=(8.05,3))
sns.set_theme(style="white")
ax = fig.add_subplot(111)

fig1 = sns.histplot(ax = ax, data = all_ind_df, x = "year", binwidth = 1, discrete = True, hue = "study_type", palette = ["skyblue", "indianred", "gold"], multiple = "stack", shrink = 0.8)
# Build a legend on the histogram axis only to collect its handles, then remove it:
# one combined legend is attached to the twin axis further down.
ax.legend(["Interventional", "Observational", "Registries"], fontsize = "x-small")
study_type_legend = ax.get_legend()
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and later dropped
# the old name, so support both spellings.
if hasattr(study_type_legend, "legend_handles"):
    handles1 = study_type_legend.legend_handles
else:
    handles1 = study_type_legend.legendHandles
study_type_legend.remove()
plt.ylabel("Number of Studies")
plt.xlabel("Year")


ax2 = ax.twinx()
fig2 = sns.lineplot(ax = ax2, x = rel_years, y = rate_df.rate, color = "#0A2342", linestyle = "--",label = '% of all ClinicalTrials.gov\n Registered Trials')
handles2,labels2 = ax2.get_legend_handles_labels()
# Combined legend: histogram handles first, then the rate line
handles = handles1+handles2
labels= ["Interventional", "Observational", "Registries", '% of All ClinicalTrials.gov\n Registered Trials']
ax2.legend(handles, labels,fontsize = "x-small", frameon = True)
plt.ylabel("% of All Studies")
ax.tick_params(left=True, bottom=False, right = False)
ax2.tick_params(left=False, bottom=False, right = True)
sns.despine(left = True)

plt.title("Development of Study Types")
# One tick per analysed year (the same years the rate line is drawn over)
plt.xticks(rel_years)

plt.savefig('Graphs/99_AGG_Frequency_plot_by_study_type_and_rate.svg', dpi = 600, bbox_inches = 'tight')
plt.savefig('Graphs/99_AGG_Frequency_plot_by_study_type_and_rate.eps',  bbox_inches = 'tight')


plt.show()

In [None]:
# Turn Indication into a categorical with a fixed category order.
all_ind_df["Indication"] = pd.Categorical(
    all_ind_df["Indication"],
    categories=["EPI", "ALZ", "MS", "PARK"],
)

## Defining colors

In [None]:
# 15-colour "hls" palette; the next cell picks individual colours from it by index.
pal = sns.color_palette("hls", 15)


In [None]:
# Pick four colours (as hex strings) from a 15-colour "hls" palette.
# The palette is regenerated here instead of being read from `pal`, because `pal`
# is overwritten at the end of this cell; the cell can therefore be re-run safely.
index_list = [0, 2, 8, 11]
hls_hex = sns.color_palette("hls", 15).as_hex()
colors = [hls_hex[i] for i in index_list]
# Palette used by the plots below
pal = sns.color_palette(["indianred", "gold", "skyblue", "#9457db"])

## Longitudinal indication development

In [None]:
###Orientation 2
# Stacked yearly histogram of the trials, split by indication.

indication_fig = plt.figure(figsize=(8.05, 3))
sns.set_theme(style="white")
ax = sns.histplot(
    data=all_ind_df,
    x="year",
    binwidth=1,
    discrete=True,
    hue="Indication",
    palette=pal,
    multiple="stack",
    shrink=0.8,
    label="Stackbar",
)

# Legend: reversed handle order with short display names for the indications
handles, labels = ax.get_legend_handles_labels()
handles = list(reversed(handles))
labels = ["Epi.", "AD", "MS", "PD"]
ax.legend(handles, labels, fontsize="x-small")

# One tick per year; hide the bottom tick marks and the left spine
ax.set_xticks(list(range(2010, 2022)))
ax.tick_params(left=True, bottom=False, right=False)
sns.despine(left=True)

ax.set_ylabel("Number of Studies")
ax.set_xlabel("Year")
ax.set_title("Development of Studies by Indication")
indication_fig.savefig('Graphs/99_AGG_Frequency_plot_by_indication.svg', dpi=600, bbox_inches='tight')
indication_fig.savefig('Graphs/99_AGG_Frequency_plot_by_indication.eps', bbox_inches='tight')

plt.show()

## Other longitudinal developments (such as industry sponsorship)

In [None]:
# Turn the industry flag into a categorical ordered 1 first, then 0.
all_ind_df["industry"] = pd.Categorical(all_ind_df["industry"], categories=[1, 0])

In [None]:
###Orientation 3
# Stacked yearly histogram of the trials, split by the industry flag.

plt.figure(figsize=(8,3))
sns.set_theme(style="white")
ax = sns.histplot(data = all_ind_df , x = "year",   binwidth = 1, discrete = True, hue = "industry", palette= [ "coral","gray"], multiple= "stack", shrink = 0.8, label = "Theplot")
# Reverse the handle order and attach explicit display labels
handles,labels = ax.get_legend_handles_labels()
handles = handles[::-1]
labels = ["Industry sponsored", "Not industry sponsored"]
ax.legend(handles, labels,fontsize = "x-small")
# NOTE(review): this second, identical ax.legend call replaces the legend created on
# the previous line; SaveLegendForLater is not read anywhere in the visible notebook.
SaveLegendForLater = ax.legend(handles, labels,fontsize = "x-small")
# One tick per year; hide the bottom tick marks and the left spine
plt.xticks([2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021])
ax.tick_params(left=True, bottom=False, right = False)
sns.despine(left = True)


plt.ylabel("Number of Studies")
plt.xlabel("Year")
plt.title("Development of Industry Sponsoring")
plt.savefig('Graphs/99_AGG_Frequency_plot_by_study_sponsor.svg', dpi = 600, bbox_inches = 'tight')
plt.show()




In [None]:
# Hard-coded yearly counts, one entry per year (12 values each).
# NOTE(review): presumably 2010-2021 totals and industry-sponsored subsets; confirm the source.
add_line_graph_data = pd.DataFrame(
    {
        "studies_year": [3, 7, 4, 12, 18, 32, 31, 37, 55, 74, 82, 86],
        "industry_year": [1, 2, 0, 3, 3, 5, 11, 8, 8, 13, 22, 14],
    }
)
                                  


In [None]:
# Display the hard-coded yearly counts defined in the previous cell.
add_line_graph_data

In [None]:
# NOTE(review): this references the bound method without calling it, so the cell only
# shows the method's repr; it looks like leftover debugging.
ax.get_legend

In [None]:
# Per-indication overview: total rows, rows with industry == 1, and their share in percent.
# NOTE(review): the counts come from the per-indication frames as loaded, not from
# all_ind_df (which is filtered on criteria_met); confirm this is intended.
names = ["Epi.", "Alz.", "MS", "Park."]
indication_frames = [df_epi, df_alz, df_ms, df_park]
data_count = [len(frame) for frame in indication_frames]
industry_sponsered_num = [len(frame[frame['industry'] == 1]) for frame in indication_frames]
data_industry_percentage = [
    100 * sponsored / total
    for sponsored, total in zip(industry_sponsered_num, data_count)
]
overview_df = pd.DataFrame(
    {
        "Name": names,
        "Count": data_count,
        "Industry_Percentage": data_industry_percentage,
        "Industry_count": industry_sponsered_num,
    }
)


In [None]:

# Bar chart of study counts per indication (left axis) overlaid with the
# percentage of industry-sponsored studies (twin right axis).
sns.set_theme(style="white")
# Create the figure once, with the intended size. Previously plt.figure(figsize=...)
# was immediately followed by plt.subplots(), which left a stray empty figure behind
# and drew the chart at the default size.
fig, ax1 = plt.subplots(figsize=(8,3))
ax2 = ax1.twinx()
sns.barplot(data = overview_df , x = "Name", y = "Count" ,    palette= pal, label = "Theplot", alpha = 0.7, ax = ax1)
sns.barplot(data = overview_df , x = "Name", y = "Industry_Percentage" , palette= ["darkblue"], label = "Theplot", alpha = 0.2, ax = ax2, edgecolor="w")
# Percent axis: a little headroom above 100
ax2.set_ylim(0,105)
ax1.set_xlabel('Indication')
ax1.set_ylabel('Study Counts')
ax2.set_ylabel('Percentage Industry Sponsored', color='b')

plt.title("Industry Sponsoring across Indications")
plt.savefig('Graphs/99_AGG_Industry_by_Indication.svg', dpi = 600, bbox_inches = 'tight')
plt.show()

In [None]:
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

# Stacked bars of the trials per indication, split by the industry flag.
plt.figure(figsize=(5,3))
sns.set_theme(style="white")
sponsor_palette = ["coral", "gray"]
# hue_order pins industry == 1 to the first palette colour and industry == 0 to the
# second, so the legend entries built below always match the bar colours.
ax = sns.histplot(data = all_ind_df , x = "Indication", hue ="industry", hue_order = [1, 0], multiple = "stack", discrete =True,  alpha = 0.7, binwidth = 1, shrink = 0.5, palette= sponsor_palette, legend = False)

# Build the legend from proxy patches defined in this cell. Previously the cell reused
# (and re-reversed) `handles` left over from an earlier plot, so its result depended on
# which cells had been run before and flipped on every re-run.
labels = ["Industry sponsored", "Not industry sponsored"]
handles = [Patch(facecolor=color, alpha=0.7) for color in sponsor_palette]
ax.legend(handles, labels,fontsize = "x-small")

ax.set(xticklabels = ["Epi.", "AD", "MS", "PD"])
ax.tick_params(left=True, bottom=False, right = False)
sns.despine(left = True)

plt.title("Indication Studies by Sponsoring")
plt.savefig('Graphs/99_Studynumber_by_Sponsoring.svg', dpi = 600, bbox_inches = 'tight')
plt.show()

## Treemap of frequency

In [None]:
# Per indication: [rows with industry == 0, rows with industry == 1]
tb_MS, tb_PARK, tb_EPI, tb_ALZ = (
    [len(frame[frame['industry'] == 0]), len(frame[frame['industry'] == 1])]
    for frame in (df_ms, df_park, df_epi, df_alz)
)
tb_all = tb_MS + tb_PARK + tb_EPI + tb_ALZ

# Per indication: [total rows]; tb_all2 is ordered Epi., AD, MS, PD for the treemap
tb_MS2, tb_PARK2, tb_EPI2, tb_ALZ2 = (
    [len(frame)] for frame in (df_ms, df_park, df_epi, df_alz)
)
tb_all2 = tb_EPI2 + tb_ALZ2 + tb_MS2 + tb_PARK2


In [None]:
import squarify

# Treemap with one tile per indication, sized by the row counts in tb_all2
treemap_labels = ["Epi.", "AD", "MS", "PD"]
squarify.plot(sizes=tb_all2, label=treemap_labels, color=pal, alpha=.8, pad=True)
plt.axis('off')
plt.title("Composition of Studies by Indication")
plt.savefig('Graphs/99_AGG_Indication_Treemap.svg', dpi=600, bbox_inches='tight')
plt.show()


## Overview of most commonly used tools

In [None]:
# Columns 53..1070 (by position) are treated as the per-tool columns.
# NOTE(review): these positions are hard-coded; confirm they still match the spreadsheet layout.
tools_df = all_ind_df.iloc[:, 53:1071]
# Column totals, largest first, keeping only tools with a positive total
tool_totals = tools_df.sum().sort_values(ascending=False)
tools_overview = tool_totals[tool_totals > 0]
print(tools_overview)
tools_overview.to_excel("Outputs/99_AGG_Tools_Overview.xlsx")



In [None]:
# Number of rows per value of the phase column (missing values are not counted).
all_ind_df.phase.value_counts()
#sum of all phases = 50

## Analysis of features and used tools

In [None]:
# Flag columns that identify phones, tablets, and any mobile device/app
phone_cols = ["android", "smartphone", "smart_phone", "galaxy_s4", "galaxy_s5", "galaxy_s6",
              "galaxy_s7", "galaxy_s8", "galaxy_sii", "iphone"]
tablet_cols = ["galaxy_tab", "ipad"]
mobile_cols = phone_cols + tablet_cols + ["mobile_app"]

# A trial counts once if at least one of the listed flags equals 1
n_trials = len(all_ind_df)
phones_total = len(all_ind_df[all_ind_df[phone_cols].eq(1).any(axis=1)])
tablets_total = len(all_ind_df[all_ind_df[tablet_cols].eq(1).any(axis=1)])
phone_tab_total = len(all_ind_df[all_ind_df[mobile_cols].eq(1).any(axis=1)])
# The "Percentage" figures are fractions of all trials (not multiplied by 100)
print("Phones total:  ", phones_total, "Percentage Phones: ", phones_total / n_trials,
      "\nTablets total: ", tablets_total, "Percentage Tablets: ", tablets_total / n_trials,
      "\nAll Smartphone/Tablets/Mobile Apps total: ", phone_tab_total, "Percentage: ", phone_tab_total / n_trials)


In [None]:
Ind = ["MS", "ALZ", "EPI", "PARK"]

# Row masks: at least one phone / tablet / mobile flag equals 1
phone_cols = ["android", "smartphone", "smart_phone", "galaxy_s4", "galaxy_s5", "galaxy_s6",
              "galaxy_s7", "galaxy_s8", "galaxy_sii", "iphone"]
tablet_cols = ["galaxy_tab", "ipad"]
uses_phone = all_ind_df[phone_cols].eq(1).any(axis=1)
uses_tablet = all_ind_df[tablet_cols].eq(1).any(axis=1)
uses_mobile = all_ind_df[phone_cols + tablet_cols + ["mobile_app"]].eq(1).any(axis=1)

# Same totals as the previous cell, restricted to one indication at a time;
# the "Percentage" figures are fractions of that indication's trials.
for i in Ind:
    in_indication = all_ind_df["Indication"] == i
    indication_size = len(all_ind_df[in_indication])
    phones_total = len(all_ind_df[in_indication & uses_phone])
    tablets_total = len(all_ind_df[in_indication & uses_tablet])
    phone_tab_total = len(all_ind_df[in_indication & uses_mobile])
    print(f'{i}', "Phones total: ", phones_total, "Percentage Phones: ", phones_total / indication_size,
          "\n" f'{i}', "Tablets total: ", tablets_total, "Percentage Tablets: ", tablets_total / indication_size,
          "\n" f'{i}', "All Smartphone/Tablets/Mobile Apps total: ", phone_tab_total, "Percentage: ", phone_tab_total / indication_size)


In [None]:
# MS trials in which at least one of the kinect / xbox / nintendo_wii flags equals 1
ms_rows = all_ind_df["Indication"] == "MS"
console_rows = all_ind_df[["kinect", "xbox", "nintendo_wii"]].eq(1).any(axis=1)
physio_total = len(all_ind_df[ms_rows & console_rows])
print("Physio Virtual Training:  ", physio_total, "Percentage: ", physio_total / len(all_ind_df[ms_rows]))


In [None]:

# Per indication: trials in which at least one of the kinect / xbox / nintendo_wii flags equals 1
console_rows = all_ind_df[["kinect", "xbox", "nintendo_wii"]].eq(1).any(axis=1)
for i in Ind:
    indication_rows = all_ind_df["Indication"] == i
    physio_total = len(all_ind_df[indication_rows & console_rows])
    print(f'{i}:', "Physio Virtual Training:  ", physio_total, "Percentage: ", physio_total / len(all_ind_df[indication_rows]))


In [None]:

# Per indication: number of trials with industry == 1 and their fraction of that
# indication's trials. The first label used to read "Percentage Industry Sponsored"
# although the value printed next to it is a count.
for i in Ind:
    indication_rows = all_ind_df["Indication"] == i
    industry_ind_total = len(all_ind_df[indication_rows & (all_ind_df["industry"] == 1)])
    print(f'{i}:', "Industry Sponsored:  ", industry_ind_total, "Percentage: ", industry_ind_total / len(all_ind_df[indication_rows]))


In [None]:
# Parkinson trials with kinesia == 1, and their fraction of all Parkinson trials.
# The printed count used to be physio_total (left over from an earlier cell) and the
# prefix relied on the loop variable `i` leaked by a previous loop.
park_rows = all_ind_df["Indication"] == 'PARK'
kinesia_total = len(all_ind_df[park_rows & (all_ind_df["kinesia"] == 1)])
print('PARK:', "Kinesia in Parkinson:  ", kinesia_total, "Percentage: ", kinesia_total / len(all_ind_df[park_rows]))


In [None]:
# Number of trials registered in 2010 and in 2021.
# Only the value of the last expression is rendered by the notebook, so the 2010
# count is computed but not displayed.
len(all_ind_df[(all_ind_df["year"] == 2010) ]) #3 of 431
# NOTE(review): the trailing comment says 880, while the hard-coded yearly totals used
# elsewhere in this notebook list 882 for 2021; confirm which figure is correct.
len(all_ind_df[(all_ind_df["year"] == 2021) ]) #86 of 880


In [None]:
# Number of 2021 trials with industry == 1.
# NOTE(review): the trailing "3 of 431" comment matches the 2010 figures of the previous
# cell and looks copied; confirm the intended numbers for 2021.
len(all_ind_df[((all_ind_df["year"] == 2021) & (all_ind_df["industry"] ==1 )) ]) #3 of 431


In [None]:
# Binarised outcomes: names of the 0/1 feature columns analysed below
b_outcomes = [
    "gamification",
    "motor_function",
    "exercising",
    "disease_or_symptom_tracking",
    "sleep_tracking",
    "speech_tracking",
    "carer_support",
    "cognition_tracking",
    "medication_adherence",
]


In [None]:
# Per outcome: number of trials with the flag set to 1 and the fraction of all trials
n_trials = len(all_ind_df)
for outcome in b_outcomes:
    all_stuff = len(all_ind_df[all_ind_df[outcome] == 1])
    print(f'{outcome}:', all_stuff, "Percentage: ", all_stuff / n_trials)


In [None]:
# Every second year from 2010 to 2020, plus 2021.
# Note: this rebinds rel_years; a later cell restores the full 2010-2021 list.
rel_years = list(range(2010, 2021, 2)) + [2021]

In [None]:
# Per outcome and indication: number of trials with the flag set to 1 and the
# fraction of that indication's trials
for outcome in b_outcomes:
    has_outcome = all_ind_df[outcome] == 1
    for i in Ind:
        indication_rows = all_ind_df["Indication"] == i
        all_stuff = len(all_ind_df[indication_rows & has_outcome])
        print(f'{outcome}:', i, all_stuff, "Percentage: ", all_stuff / len(all_ind_df[indication_rows]))


In [None]:
# Display the year column for inspection.
all_ind_df["year"]

In [None]:
# Number of trials per (year, indication) for the years currently in rel_years
for year in rel_years:
    registered_in_year = all_ind_df["year"] == year
    for i in Ind:
        number_of_studies = len(all_ind_df[(all_ind_df["Indication"] == i) & registered_in_year])
        print(f'{year}:', i, number_of_studies)


In [None]:
# Restore the full list of analysed years (2010 through 2021).
rel_years = list(range(2010, 2022))

In [None]:
# Number of trials registered in each analysed year
for year in rel_years:
    registered_in_year = all_ind_df["year"] == year
    print(year, len(all_ind_df[registered_in_year]))

In [None]:

# Number of trials per indication
for i in Ind:
    indication_rows = all_ind_df["Indication"] == i
    ind_num = len(all_ind_df[indication_rows])
    print(i, ind_num)


In [None]:
# Per indication: trials in which at least one of the kinect / xbox / nintendo_wii flags
# equals 1, with the fraction of that indication's trials.
# NOTE(review): this repeats an identical loop from an earlier cell.
for i in Ind:
    physio_total = len(all_ind_df[(all_ind_df["Indication"] == i) &((all_ind_df["kinect"] == 1) | (all_ind_df["xbox"] == 1) | (all_ind_df["nintendo_wii"] == 1) )])
    print( f'{i}:', "Physio Virtual Training:  ",physio_total,   "Percentage: ", physio_total/len(all_ind_df[all_ind_df["Indication"] == i]))


In [None]:
# Relative frequency of each gamification value within every year.
game_res = all_ind_df.groupby("year")["gamification"].value_counts(normalize=True, sort = False)
# Keep every second entry, starting with the second one.
# NOTE(review): this presumably selects the gamification == 1 share of each year; that only
# holds if every year contains exactly two distinct values. Confirm.
game_res = game_res[1::2]

In [None]:
# The following cells each show, per year, the relative frequency of the values of one
# feature column (normalize=True gives fractions within the year).
all_ind_df.groupby("year")["gamification"].value_counts(normalize=True, sort = False)


In [None]:
# Sleep tracking
all_ind_df.groupby("year")["sleep_tracking"].value_counts(normalize=True, sort = False)


In [None]:
# Speech tracking
all_ind_df.groupby("year")["speech_tracking"].value_counts(normalize=True, sort = False)


In [None]:
# Cognition tracking
all_ind_df.groupby("year")["cognition_tracking"].value_counts(normalize=True, sort = False)


In [None]:
# Exercising; the result is stored instead of displayed
exer_res = all_ind_df.groupby("year")["exercising"].value_counts(normalize=True, sort = False)


In [None]:
# Motor function
all_ind_df.groupby("year")["motor_function"].value_counts(normalize=True, sort = False)


In [None]:
# Carer support
all_ind_df.groupby("year")["carer_support"].value_counts(normalize=True, sort = False)


In [None]:
# Exercising (same computation as exer_res above, displayed this time)
all_ind_df.groupby("year")["exercising"].value_counts(normalize=True, sort = False)


In [None]:
# Disease or symptom tracking
all_ind_df.groupby("year")["disease_or_symptom_tracking"].value_counts(normalize=True, sort = False)


In [None]:
# Medication adherence
all_ind_df.groupby("year")["medication_adherence"].value_counts(normalize=True,sort = False)


### Analysis of longitudinal developments of features

In [None]:
# Per year, count the occurrences of each value of the four tracking columns.
line_graph_data = all_ind_df.groupby("year").agg({i:'value_counts' for i in ["motor_function", "sleep_tracking","cognition_tracking", "speech_tracking"]})
# Keep every second row, starting with the first one.
# NOTE(review): this assumes exactly two rows per year in a fixed value order; which value
# (0 or 1) the kept rows belong to cannot be told from this cell. Confirm with the
# printed table in the next cell.
line_graph_data = line_graph_data[::2]


# Relabel the remaining rows by year; this requires exactly 12 rows to be left.
line_graph_data.index = [2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021]


In [None]:
# Print the yearly counts for a visual check.
print(line_graph_data)

In [None]:
# Hard-coded yearly totals (same values as the denominator of the rate table above)
line_graph_data['all_trials'] = [431, 442, 472, 508, 542, 604, 625, 631, 743, 825, 719, 882]

# Share of each tracking modality in the yearly total, in percent
share_sources = {
    "share_motor": "motor_function",
    "share_sleep": "sleep_tracking",
    "share_cognition": "cognition_tracking",
    "share_speech": "speech_tracking",
}
for share_column, count_column in share_sources.items():
    line_graph_data[share_column] = line_graph_data[count_column] / line_graph_data["all_trials"] * 100

In [None]:
# Line Chart of Tracking Modalities
# 2x2 grid with one panel per tracking modality. Each panel shows the yearly counts
# from line_graph_data as bars (left axis) and the matching share column as a line
# on a twin right axis.
fig, axes = plt.subplots(2,2, figsize = (8,5.5), sharex = True, sharey = False, gridspec_kw = {'wspace':0.06, 'hspace':0.5})
sns.set_theme(style="white")

# Motor tracking (top left): the only panel that carries the combined legend.
# The line is drawn at the bar panel's own tick positions so that it lines up with the bars.
sns.barplot(ax = axes[0,0],  y= line_graph_data.motor_function, x= line_graph_data.index, color = "DarkOliveGreen", label ="Number of DHT Trials")
ax2 = axes[0,0].twinx()
plot2 = sns.lineplot(ax = ax2, x = axes[0,0].get_xticks(), y = line_graph_data.share_motor, color = "Orange", label= '% of all ClinicalTrials.gov\n Registered Trials')
handles1,labels1 = axes[0,0].get_legend_handles_labels()
handles2,labels2 = ax2.get_legend_handles_labels()
handles = handles1+handles2
labels= labels1+labels2
ax2.legend(handles, labels,fontsize = "x-small")

# Sleep tracking (top right)
sns.barplot(ax = axes[0,1], label = "sleep_tracking",  y= line_graph_data.sleep_tracking, x= line_graph_data.index, color = "DarkOliveGreen")
ax3 = axes[0,1].twinx()
plot3 = sns.lineplot(ax = ax3, x = axes[0,1].get_xticks(), y = line_graph_data.share_sleep, color = "Orange")

# Cognition tracking (bottom left). The x positions now come from this panel's own
# axis (axes[1,0]); they were previously read from axes[1,1], a different panel.
sns.barplot(ax = axes[1,0], label = "cognition_tracking",  y= line_graph_data.cognition_tracking, x= line_graph_data.index, color = "DarkOliveGreen")
ax4 = axes[1,0].twinx()
plot4 = sns.lineplot(ax = ax4, x = axes[1,0].get_xticks(), y = line_graph_data.share_cognition, color = "Orange")

# Speech tracking (bottom right)
sns.barplot(ax = axes[1,1], label = "speech_tracking",  y= line_graph_data.speech_tracking, x= line_graph_data.index, color = "DarkOliveGreen")
ax5 = axes[1,1].twinx()
plot5 = sns.lineplot(ax = ax5, x = axes[1,1].get_xticks(), y = line_graph_data.share_speech, color = "Orange")

# Panel titles; only the left column keeps the count axis label and ticks
axes[0,0].set_title("Motor Tracking", pad = 0.5)
axes[0,1].set_title("Sleep Tracking", pad = 0.5)
axes[1,0].set_title("Cognition Tracking", pad = 0.5)
axes[1,1].set_title("Speech Tracking", pad = 0.5)
axes[0,0].set_ylabel("Number of DHT Trials")
axes[0,1].set_ylabel("")
axes[0,1].set_yticklabels([])
axes[0,1].set_yticks([])
axes[1,0].set_ylabel("Number of DHT Trials")
axes[1,1].set_ylabel("")
axes[1,1].set_yticklabels([])
axes[1,1].set_yticks([])

# Rotate the year labels of the bottom row
axes[1,0].set_xticklabels(axes[1,0].get_xticklabels(), rotation = 45)
axes[1,1].set_xticklabels(axes[1,1].get_xticklabels(), rotation = 45)

# Common count range for all panels
axes[0,0].set_ylim(0,70)
axes[0,1].set_ylim(0,70)
axes[1,0].set_ylim(0,70)
axes[1,1].set_ylim(0,70)

# Common percent range for all twin axes; only the right column keeps ticks and label
ax2.set_ylim(0,8)
ax2.set_yticklabels([])
ax2.set_yticks([])
ax3.set_ylim(0,8)
ax4.set_ylim(0,8)
ax4.set_yticklabels([])
ax4.set_yticks([])
ax5.set_ylim(0,8)

ax2.set_ylabel("")
ax3.set_ylabel("% of All Trials")
ax4.set_ylabel("")
ax5.set_ylabel("% of All Trials")


sns.despine(left = True, bottom = False)
plt.savefig('Graphs/99_AGG_Tracking_Modalities_Over_Time_Hist_with_share.svg', dpi = 600, bbox_inches = 'tight')
plt.savefig('Graphs/99_AGG_Tracking_Modalities_Over_Time_Hist_with_share.eps', dpi = 600, bbox_inches = 'tight')

plt.show()

## Save tools used in studies

In [None]:
# For every trial, list the tool columns of tools_df whose value is greater than 0.
# The records are collected first and the frame is built once, instead of growing it
# cell by cell with .at; rows of all_ind_df and tools_df are paired by position.
tool_records = [
    {"ID": nct_id, "Tools": [tool for tool, value in tool_row.items() if value > 0]}
    for nct_id, (_, tool_row) in zip(all_ind_df["nct_id"], tools_df.iterrows())
]
# columns= keeps the two headers even when there are no trials at all
Tools_in_Studies = pd.DataFrame(tool_records, columns=["ID", "Tools"])

Tools_in_Studies.to_excel("Outputs/99_AGG_Tools_in_Studies.xlsx")


## Location and map analysis

In [None]:
# Replace missing site / country values with empty strings so they can be joined as text
for location_column in ("main_site", "main_country"):
    all_ind_df[location_column] = all_ind_df[location_column].fillna("")

In [None]:
# Build the geocoding query "<main_site>, <main_country>" for every trial
location_parts = all_ind_df[["main_site", "main_country"]]
all_ind_df["CombiCode"] = location_parts.apply(lambda row: ", ".join(row), axis=1)



In [None]:
def do_geocode(address, attempt=1, max_attempts=5, geocode=None):
    """Geocode ``address``, retrying when the geocoder times out.

    Parameters
    ----------
    address : str
        Query passed to the geocoder.
    attempt : int, default 1
        Number of the first attempt; retries continue while it is <= ``max_attempts``.
    max_attempts : int, default 5
        Number of retries allowed after a timeout before the timeout is re-raised.
    geocode : callable, optional
        Function called as ``geocode(address)``. Defaults to the notebook-level
        ``geolocator.geocode``.

    Returns
    -------
    Whatever the geocoder returns for ``address``.

    Raises
    ------
    GeocoderTimedOut
        If the geocoder still times out once the retries are exhausted.
    """
    if geocode is None:
        # The original called geopy.geocode, which does not exist (and `geopy` itself
        # is never imported as a name); the configured geolocator is the geocoder.
        geocode = geolocator.geocode
    # Iterative retry: honours the caller's max_attempts on every attempt
    # (the recursive version dropped it and fell back to the default).
    while True:
        try:
            return geocode(address)
        except GeocoderTimedOut:
            if attempt > max_attempts:
                raise
            attempt += 1

In [None]:
# Geocode every location string, waiting at least one second between requests
rate_limited_geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
all_ind_df["geocode"] = all_ind_df["CombiCode"].apply(rate_limited_geocode)



In [None]:
# Add random noise to the coordinates (done in the cells below, after latitude and
# longitude have been extracted from the geocoding results).

In [None]:
# Keep only the trials that could be geocoded. The explicit copy makes loc_df an
# independent frame, so the column assignments in the following cells neither trigger
# SettingWithCopyWarning nor depend on whether pandas returned a view.
loc_df = all_ind_df[all_ind_df["geocode"].notna()].copy()


In [None]:
# Pull latitude and longitude out of the geocoding results
loc_df["latitude"] = [location.latitude for location in loc_df["geocode"]]
loc_df["longitude"] = [location.longitude for location in loc_df["geocode"]]


In [None]:
# The bare expression below has no visible effect: it is not the last line of the cell.
loc_df["latitude"]
import random 

# Shift every coordinate by an independent uniform offset in [-0.05, 0.05].
# NOTE(review): no random seed is set, so the offsets differ on every run.
loc_df["latitude"] = loc_df["latitude"].apply(lambda x: np.add(random.uniform(-0.05, 0.05), float(x)))
loc_df["longitude"] = loc_df["longitude"].apply(lambda x: np.add(random.uniform(-0.05, 0.05), float(x)))


In [None]:
# Number of trials per main country, most frequent first.
all_coutries = all_ind_df.groupby("main_country", sort = True)["nct_id"].count().sort_values(ascending=False)

In [None]:
# Same computation as above, displayed instead of stored.
all_ind_df.groupby("main_country", sort = True)["nct_id"].count().sort_values(ascending=False)

In [None]:
# NOTE(review): identical to the previous cell.
all_ind_df.groupby("main_country", sort = True)["nct_id"].count().sort_values(ascending=False)

In [None]:
# Number of trials per (main country, indication) pair, written to a spreadsheet
country_indication_groups = all_ind_df.groupby(["main_country", "Indication"], sort=True)
countries_by_indication = country_indication_groups["nct_id"].count()
countries_by_indication.to_excel("Outputs/Countries_and_Indication.xlsx")

In [None]:
# Duplicate every geocoded trial under the pseudo-indication "All", so the animated
# heatmap gets one frame per indication plus one frame with all trials.
all_data_regardless_ind = loc_df.copy()
all_data_regardless_ind["Indication"] = "All"
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
all_data_regardless_ind = pd.concat([loc_df, all_data_regardless_ind], ignore_index=True)
# Replace the short indication codes with display names ("All" is left untouched)
nice_names = {"MS": "Multiple Sclerosis", "ALZ": "Alzheimer's Disease", "PARK" : "Parkinson's Disease", "EPI":"Epilepsy"}
all_data_regardless_ind["Indication"] = all_data_regardless_ind.Indication.replace(nice_names, regex = True)



In [None]:
# Give the hover columns display-friendly names
display_names = {'CombiCode': 'Location', 'official_title': "Official Title"}
all_data_regardless_ind = all_data_regardless_ind.rename(columns=display_names)

In [None]:
# Shift every coordinate of loc_df by another uniform offset in [-0.05, 0.05].
# NOTE(review): loc_df was already jittered in an earlier cell, and all_data_regardless_ind
# was copied before this second shift, so the marker and heatmap positions differ.
# Confirm that this repetition is intended.
loc_df["latitude"] = loc_df["latitude"].apply(lambda x: np.add(random.uniform(-0.05, 0.05), float(x)))
loc_df["longitude"] = loc_df["longitude"].apply(lambda x: np.add(random.uniform(-0.05, 0.05), float(x)))



In [None]:
# Display the current palette.
pal

In [None]:
# Convert the palette to hex strings and print them.
pal = pal.as_hex()
print(pal)

In [None]:
# Distinct indication values present among the geocoded trials.
loc_df["Indication"].unique()


In [None]:
# Display names for the indication codes
nice_names = {
    "MS": "Multiple Sclerosis",
    "ALZ": "Alzheimer's Disease",
    "PARK": "Parkinson's Disease",
    "EPI": "Epilepsy",
}

# Rename the categories of the Indication column, then give the hover columns
# display-friendly names
loc_df["Indication"] = loc_df["Indication"].cat.rename_categories(nice_names)
loc_df = loc_df.rename(columns={'CombiCode': 'Location', 'official_title': "Official Title"})

In [None]:
# NOTE(review): repeats the rename from the previous cell; once the columns are renamed
# this has no further effect.
loc_df.rename(columns={'CombiCode':'Location', 'official_title': "Official Title"}, inplace=True)

In [None]:
# Indications present among the geocoded trials, and the marker colour for each of them
all_ind = loc_df["Indication"].unique().tolist()
color_match = {
    "Epilepsy": '#cd5c5c',
    "Alzheimer's Disease": '#ffd700',
    "Multiple Sclerosis": '#87ceeb',
    "Parkinson's Disease": '#9457db',
}

In [None]:
# Base layer: grey density heatmap of the trial locations, animated with one frame per
# value of the Indication column; hovering shows the trial id, official title and location.
fig = px.density_mapbox(all_data_regardless_ind, lat="latitude", lon="longitude", radius=15, 
                        center=dict(lat=52, lon=10), zoom=2, color_continuous_scale = "Greys", animation_frame= "Indication", opacity= 0.6,
                        hover_name= "nct_id", hover_data= {"latitude": False, "longitude": False, "Official Title" :True, "Location" : True },
                        mapbox_style="carto-positron")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})

# Overlay: one marker trace per indication, coloured via color_match; hover is disabled
# for these traces so that the heatmap layer supplies the hover text.
for i in all_ind:
    ind_df = loc_df[(loc_df.Indication == i)]
    fig.add_trace(go.Scattermapbox(
        name = i,
        lon = ind_df["longitude"],
        lat = ind_df["latitude"],
        hoverinfo = "skip",
        line = dict(
        # width = 1,
            color = color_match[i]),
        mode = 'markers',
        opacity = 0.7,
        below =  "", 

        ))
# Hide the colour bar of the density layer
fig.update(layout_coloraxis_showscale=False)

#Optional line:
# Uploads the figure to Chart Studio; this needs the credentials set near the top of the notebook.
py.plot(fig, filename = 'DT_Locations_with_underlying_heatmap', auto_open=True)



# Also save a standalone HTML copy and show the figure inline
fig.write_html("Graphs/Location_with_underlying_heatmap.html")
fig.show()