In [1]:
# Imports
%run ./ClinicalTrialFunctions.py

# Load data
clinical_df = pd.read_parquet('./exampleFile/DTxClinicalTrials.parquet.gzip')
colors = {'Observational': "steelblue", 'Interventional': "orange"}


In [2]:
### Examples of conditions mapping to specific MeSH groups
# Each "Condition" entry is an iterable of strings; join it into one
# comma-separated string so a single regex can search all of a trial's conditions.
clinical_df["conditionsJoined"] = [",".join(c) for c in clinical_df["Condition"]]


def trials_matching(pattern):
    """Return the trials whose lower-cased joined conditions match the regex `pattern`."""
    return clinical_df[clinical_df["conditionsJoined"].str.lower().str.contains(pattern)]


def conditions_in_branch(trials, branch):
    """Return the joined condition strings of the `trials` rows whose main MeSH branch equals `branch`."""
    return trials[trials["conditionMeshMainBranch"] == branch]["conditionsJoined"].values


def explode_conditions(branch_mask):
    """Return the trials selected by the boolean `branch_mask`, one row per listed condition."""
    return clinical_df[branch_mask].explode("Condition")


# MeSH groups for DTx to treat addiction (e.g. smoking addiction)
print('***************')
print("Conditions containing terms 'addiction', 'cessation', 'smoking,' or 'drugs'")
drug_df = trials_matching("addiction|cessation|smoking|drugs")
print(drug_df["conditionMeshMainBranch"].value_counts())
print()
# Select the branch named in the heading explicitly; filtering on
# "not Chemically-Induced Disorders" would also list any other branch (or
# missing value) under this heading.
print("Conditions listed for Behavior and Behavior Mechanisms")
print(conditions_in_branch(drug_df, "Behavior and Behavior Mechanisms"))
print()
print("Conditions listed for Chemically-Induced Disorders")
print(conditions_in_branch(drug_df, "Chemically-Induced Disorders"))
print('\n***************\n')

# Mental health (e.g. depression and anxiety)
print("Conditions containing terms 'depression' or 'anxiety'")
mental_health_df = trials_matching("depression|anxiety")
print(mental_health_df["conditionMeshMainBranch"].value_counts())
print()

# Which conditions containing depression or anxiety are listed as "behavior" vs "mental disorder"?
print("Conditions listed for Behavior and Behavior Mechanisms")
print(conditions_in_branch(mental_health_df, "Behavior and Behavior Mechanisms"))
print()
print("Conditions listed for Mental Disorders")
print(conditions_in_branch(mental_health_df, "Mental Disorders"))
print('\n***************\n')

# Diabetes
print("Conditions containing terms 'diabetes', 'T1D', or 'T2D'")
diabetes_df = trials_matching("diabetes|t1d|t2d")
print(diabetes_df["conditionMeshMainBranch"].value_counts())
print(conditions_in_branch(diabetes_df, "Urogenital Diseases"))

print('\n***************\n')

# Ten most frequent individual conditions in the Nervous System Diseases branch
print("Most common Nervous System Diseases")
nervous_df = explode_conditions(clinical_df["conditionMeshMainBranch"] == "Nervous System Diseases")
print(nervous_df["Condition"].value_counts().iloc[:10])

print('\n***************\n')

# Ten most frequent individual conditions in the Nutritional and Metabolic Diseases branch
print("Most common Nutritional and Metabolic Diseases ")
metabolic_df = explode_conditions(clinical_df["conditionMeshMainBranch"] == "Nutritional and Metabolic Diseases")
print(metabolic_df["Condition"].value_counts().iloc[:10])

print('\n***************\n')

# Ten most frequent individual conditions in any branch whose name contains
# "Pathological Conditions"; na=False treats a missing branch as "no match"
# instead of producing a mask that cannot be used for boolean indexing.
print("Most common Pathological Conditions")
pathological_df = explode_conditions(
    clinical_df["conditionMeshMainBranch"].str.contains("Pathological Conditions", na=False)
)
print(pathological_df["Condition"].value_counts().iloc[:10])



***************
Conditions containing terms 'addiction', 'cessation', 'smoking,' or 'drugs'
Behavior and Behavior Mechanisms    7
Chemically-Induced Disorders        1
Name: conditionMeshMainBranch, dtype: int64

Conditions listed for Behavior and Behavior Mechanisms
['Smoking Cessation,Smoking Behaviors,Smoking Reduction,Smoking, Cigarette,Smoking,Nicotine Dependence'
 'Smoking Cessation,Smoking,Smoking Behaviors,Smoking Reduction,Smoking, Tobacco,Smoking, Cigarette,Hiv,HIV/AIDS'
 'HIV/AIDS,Smoking Cessation,Tobacco Use Disorder'
 'Depression,Addiction,Anxiety,Sleep Disturbance' 'Smoking Cessation'
 'Smoking,Smoking Cessation' 'Smoking Cessation']

Conditions listed for Chemically-Induced Disorders
['Smoking Cessation,Nicotine Addiction,Drug Addiction,Drug Dependence,Tobacco Dependence,Tobacco Use Disorder,Substance Use Disorder,Tobacco Smoking']

***************

Conditions containing terms 'depression' or 'anxiety'
Behavior and Behavior Mechanisms               16
Mental Disorders  

In [3]:
# Ten most frequent individual conditions among trials whose main MeSH branch
# name contains "Respiratory Tract Diseases". The heading names the branch that
# is actually filtered on.
print("Most common Respiratory Tract Diseases")
# na=False treats a missing branch as "no match" so the mask stays boolean.
respiratory_df = clinical_df[clinical_df["conditionMeshMainBranch"].str.contains("Respiratory Tract Diseases", na=False)]
# One row per listed condition, so each condition is counted separately
respiratory_df = respiratory_df.explode("Condition")
print(respiratory_df["Condition"].value_counts().iloc[:10])


Most common Pathological Conditions
Asthma                                   8
Chronic Obstructive Pulmonary Disease    3
Pulmonary Arterial Hypertension          1
Asthma Attack                            1
Asthma, Allergic                         1
COPD                                     1
Sleep Apnea, Obstructive                 1
Asthma in Children                       1
Chronic Respiratory Disease              1
Copd                                     1
Name: Condition, dtype: int64


In [143]:
### Do the phases/study design differ based on disease areas?
from scipy.stats import chi2_contingency

# Load data and keep only MeSH groups with more than 10 trials
clinical_df = pd.read_parquet('./exampleFile/DTxClinicalTrials.parquet.gzip')
values = clinical_df["conditionMeshMainBranch"].value_counts()
mesh_df = clinical_df[clinical_df["conditionMeshMainBranch"].isin(values[values>10].index)]

# Interventional trials only
mesh_df = mesh_df[mesh_df["StudyType"]=="Interventional"]

# One chi-square test of independence per design attribute, each against the
# MeSH main branch:
#   DesignAllocation        - randomized, non-randomized, or NA
#   DesignInterventionModel - 'Single Group Assignment', 'Parallel Assignment',
#                             'Sequential Assignment', 'Crossover Assignment',
#                             'Factorial Assignment'
#   PhaseClean              - trial phase
design_attributes = [
    ("DesignAllocation", "Design allocation"),
    ("DesignInterventionModel", "DesignInterventionModel"),
    ("PhaseClean", "Phase"),
]
for column, label in design_attributes:
    # The test needs the raw contingency table of counts. Row-normalised
    # proportions (normalize="index") discard the sample size, which makes the
    # statistic and its p-value meaningless.
    chi_df = pd.crosstab(mesh_df["conditionMeshMainBranch"], mesh_df[column])
    _, p_value, _, _ = chi2_contingency(chi_df)
    print("Significant difference in %s: %s" % (label, p_value < 0.05))


Significant difference in design allocation: False
Significant difference in DesignInterventionModel: False
Significant difference in Phase: False


In [99]:
### What is the status of these trials?
# Number of trials per OverallStatus value; the bare name on the last line
# makes the notebook display the resulting Series.
status_counts = clinical_df["OverallStatus"].value_counts()
status_counts


Recruiting                 170
Completed                  168
Not yet recruiting          54
Active, not recruiting      33
Enrolling by invitation     24
Name: OverallStatus, dtype: int64

In [2]:
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.models import HdpModel

### Comparison of BERTopic to LDA
## Load BERTopic values
spacy_nlp = getSpacyNLP()

# NOTE: `all_stopwords` is the pipeline's own default stop-word collection, so
# `|=` extends it in place (assuming it is a mutable set) rather than building
# a separate copy.
all_stopwords = spacy_nlp.Defaults.stop_words
all_stopwords |= {"patient", "subject", "participant", "studies", "study",
                  "individual", "e.g.",  "diagnosis", "participation", "participate"}

# Reload the full table so this cell does not depend on whatever filtering a
# previously executed cell left in `clinical_df`.
clinical_df = pd.read_parquet('./exampleFile/DTxClinicalTrials.parquet.gzip')

# Keep only trials whose phase is "Not Applicable" (drops phase 1-4 trials)
# and that have a MeSH main branch
clinical_df = clinical_df[clinical_df["PhaseClean"] == "Not Applicable"]
clinical_df = clinical_df[~clinical_df["conditionMeshMainBranch"].isna()]

# Keep interventional trials only, then only MeSH branches with more than 14 such trials
clinical_df = clinical_df[clinical_df["StudyType"] == "Interventional"]
mesh_ind = clinical_df["conditionMeshMainBranch"].value_counts().loc[lambda x: x>14].index
clinical_df = clinical_df[clinical_df["conditionMeshMainBranch"].isin(mesh_ind)]

## Calculate coherence scores
# Inclusion criteria
inc_bert_df = extractIndividualEligibility(clinical_df, criteria_col="InclusionCriteria", stopwords=all_stopwords)
model, inc_bert_df = extractBERTopics(inc_bert_df, spacy_nlp, criteria_col="InclusionCriteriaEmbedClean",
                                  seed=0, nr_topics='auto')

# get_topics() maps topic id -> sequence of entries whose first element is the word
inc_topics_by_id = model.get_topics()
inc_topic_values = [[entry[0] for entry in entries] for entries in inc_topics_by_id.values()]

# .copy() so that adding a column below writes to an independent frame, not a slice
inc_bert_df = inc_bert_df[["NCTId", "InclusionCriteriaEmbedClean", "Topics", "TopicProbs"]].copy()

# Name every topic by its words joined with "_", keyed by the model's actual
# topic ids rather than an assumed -1..k-2 numbering.
inc_topic_values_merged = {
    topic_id: "_".join(words)
    for topic_id, words in zip(inc_topics_by_id, inc_topic_values)
}
inc_bert_df["TopicName"] = inc_bert_df["Topics"].map(inc_topic_values_merged)

# BERTopic coherence score (u_mass) over the whitespace-tokenised criteria
texts = list(inc_bert_df["InclusionCriteriaEmbedClean"])
texts_split = [t.split() for t in texts]
dictionary = Dictionary(texts_split)
corpus = [dictionary.doc2bow(text) for text in texts_split]

cm = CoherenceModel(model=None, texts=texts_split, dictionary=dictionary,
                                     topics=inc_topic_values,
                                     corpus=corpus,
                                     coherence='u_mass')
coherence_bertopic = cm.get_coherence()
# Explicit separator so a negative score is not read as "coherence-8.9"
print("BERTopic coherence: " + str(coherence_bertopic))

# Export a subsample of criteria assigned to topics 0-4. The draw is seeded so
# the exported file is reproducible, and capped at the number of available rows
# so sampling cannot fail on a small frame.
inc_bert_df = inc_bert_df[inc_bert_df["Topics"].isin([0,1,2,3,4])]
inc_bert_df_sub = inc_bert_df.sample(n=min(200, len(inc_bert_df)), random_state=0)
inc_bert_df_sub.to_csv("./dataOutput/Inclusion_Subsample.csv")


2023-08-08 17:43:04.458 INFO    nmslib: Loading index from /Users/bmiao/.scispacy/datasets/7e3c2133fa65605a10eb67a4cfedf8d69bc553cf192dc9d883de80b803c89c5d.fb99c660e797fcb5f0a59c23a58316e9027046d6fb0519d1ae715099da1e5baa.nmslib_index.bin
2023-08-08 17:43:04.461 INFO    nmslib: Loading regular index.
2023-08-08 17:43:04.660 INFO    nmslib: Finished loading index
2023-08-08 17:43:04.661 INFO    nmslib: Set HNSW query-time parameters:
2023-08-08 17:43:04.662 INFO    nmslib: ef(Search)         =20
2023-08-08 17:43:04.662 INFO    nmslib: algoType           =2
2023-08-08 17:43:04.663 INFO    nmslib: Set HNSW query-time parameters:
2023-08-08 17:43:04.663 INFO    nmslib: ef(Search)         =200
2023-08-08 17:43:04.664 INFO    nmslib: algoType           =2
2023-08-08 17:43:26.829 INFO    gensim.corpora.dictionary: adding document #0 to Dictionary<0 unique tokens: []>
2023-08-08 17:43:26.857 INFO    gensim.corpora.dictionary: built Dictionary<3135 unique tokens: ['18', 'age', 'older', 'years', 

BERTopic coherence-8.886949941645875


In [3]:
## Calculate coherence scores
# Exclusion criteria
ex_bert_df = extractIndividualEligibility(clinical_df, criteria_col="ExclusionCriteria", stopwords=all_stopwords)
model, ex_bert_df = extractBERTopics(ex_bert_df, spacy_nlp, criteria_col="ExclusionCriteriaEmbedClean",
                                  seed=0, nr_topics='auto')

# get_topics() maps topic id -> sequence of entries whose first element is the word.
# Drop empty-string words: the test must look at the word (entry[0]), because
# comparing the whole entry with "" is always unequal and filters nothing.
ex_topics_by_id = model.get_topics()
ex_topic_values = [[entry[0] for entry in entries if entry[0] != ""]
                   for entries in ex_topics_by_id.values()]

# .copy() so that adding a column below writes to an independent frame, not a slice
ex_bert_df = ex_bert_df[["NCTId", "ExclusionCriteriaEmbedClean", "Topics", "TopicProbs"]].copy()

# Name every topic by its words joined with "_", keyed by this model's own
# topic ids (not by the number of topics found for the inclusion criteria).
ex_topic_values_merged = {
    topic_id: "_".join(words)
    for topic_id, words in zip(ex_topics_by_id, ex_topic_values)
}
ex_bert_df["TopicName"] = ex_bert_df["Topics"].map(ex_topic_values_merged)

# Export a subsample of criteria assigned to topics 0-4. The draw is seeded so
# the exported file is reproducible, and capped at the number of available rows
# so sampling cannot fail on a small frame.
ex_bert_df = ex_bert_df[ex_bert_df["Topics"].isin([0,1,2,3,4])]
ex_bert_df_sub = ex_bert_df.sample(n=min(200, len(ex_bert_df)), random_state=0)
ex_bert_df_sub.to_csv("./dataOutput/Exclusion_Subsample.csv")


In [2]:
### MeSH validation
%run ./clinicalTrialFunctions.py
clinical_df = pd.read_parquet('./exampleFile/DTxClinicalTrials.parquet.gzip')

## Remove unmapped values
clinical_df = clinical_df[clinical_df["conditionMeshMainBranch"] != "Unknown"]
mesh_df = clinical_df[["NCTId", "Condition", "ConditionMesh", "conditionMesh", "conditionMeshMainBranch"]]
mesh_df.to_csv("./dataOutput/MeSH_validation.csv")

