In [6]:
import pandas as pd
import json
import spacy
import requests

In [41]:

# List of study identifiers (NCT numbers)
# study_identifiers = ["NCT04550221", "NCT03875950","NCT03574129", "NCT03054051","NCT03049917","NCT02915367","NCT02735642","NCT02400671","NCT04736316",
# "NCT03928418","NCT03718871","NCT03435497","NCT03600142"] 

study_identifiers = ['NCT05882916', 'NCT05862857', 'NCT05845619', 'NCT05842122', 'NCT05768763', 'NCT05634265', 'NCT05599581', 'NCT05545449', 'NCT05467306', 'NCT05383755', 
                     'NCT05285670', 'NCT05219552', 'NCT05147519', 'NCT04982250', 'NCT04772469', 'NCT04588883', 'NCT04550221', 'NCT04472884', 'NCT04437667', 'NCT04432571', 
                     'NCT03988387', 'NCT03876483', 'NCT03875950', 'NCT03574129', 'NCT03447210', 'NCT03435887', 'NCT03342027', 'NCT03054051', 'NCT03049917', 'NCT03030768', 
                     'NCT02931422', 'NCT02928900', 'NCT02915367', 'NCT02735642', 'NCT02726607', 'NCT02718456', 'NCT02627365', 'NCT02527135', 'NCT02474992', 'NCT02400671', 
                     'NCT02338739', 'NCT02320799', 'NCT01962220', 'NCT01947764', 'NCT01912521', 'NCT01876199', 'NCT01850576', 'NCT01784783', 'NCT01756469', 'NCT01645865', 
                     'NCT01630304', 'NCT01557998', 'NCT01571128', 'NCT01503255', 'NCT01501864', 'NCT01157442', 'NCT01058694', 'NCT00792519', 'NCT00273780', 'NCT00241202', 
                     'NCT00194545', 'NCT00146380', 'NCT05525533', 'NCT05373095', 'NCT05374109', 'NCT05357144', 'NCT05306938', 'NCT05271903', 'NCT05248100', 'NCT05033002', 
                     'NCT04863898', 'NCT04696861', 'NCT04071873', 'NCT03600142', 'NCT03454373', 'NCT03098693', 'NCT02938533', 'NCT02888288', 'NCT02714140', 'NCT03023033', 
                     'NCT02376348', 'NCT02281578', 'NCT02018978', 'NCT01746758', 'NCT01693458', 'NCT00941876', 'NCT00631384', 'NCT00248469', 'NCT00203749', 'NCT05862857', 
                     'NCT05947539', 'NCT05771519', 'NCT05688709', 'NCT05685498', 'NCT05597865', 'NCT05600621', 'NCT05378607', 'NCT05307250', 'NCT05178979', 'NCT05131165', 
                     'NCT05124665', 'NCT05098015', 'NCT05084716', 'NCT04946071', 'NCT04774666', 'NCT04624061', 'NCT04528732', 'NCT04736316', 'NCT04286282', 'NCT04122144', 
                     'NCT04030520', 'NCT03915899', 'NCT03928418', 'NCT03916783', 'NCT03919695', 'NCT03878147', 'NCT03832530', 'NCT03648931', 'NCT03583541', 'NCT03718871', 
                     'NCT03494777', 'NCT03484533', 'NCT03492216', 'NCT03435497', 'NCT03386578', 'NCT03307226', 'NCT03315962', 'NCT02964169', 'NCT02890459', 'NCT02775357', 
                     'NCT02729337', 'NCT02556957', 'NCT02702895', 'NCT02545673', 'NCT02503072', 'NCT02497456', 'NCT02438930', 'NCT02396394', 'NCT02050763', 'NCT02038582', 
                     'NCT01971710', 'NCT01882998', 'NCT01802736', 'NCT01790373', 'NCT01773642', 'NCT01640561', 'NCT01447615', 'NCT01366690', 'NCT01144234', 'NCT00972192', 
                     'NCT00926003', 'NCT00889395', 'NCT00790959', 'NCT00648232']

# Initialize an empty list to store the extracted data
all_extracted_data = []
# Loop through each study identifier and retrieve data
for identifier in study_identifiers:
    url =  f"https://classic.clinicaltrials.gov/api/query/study_fields?expr={identifier}&fields=LeadSponsorName,InterventionDescription,BaselineMeasurePopulationDescription,DesignInterventionModelDescription,InterventionName,PrimaryOutcomeDescription&fmt=JSON"
    
    response = requests.get(url)  
    
    if response.status_code == 200:
        data = json.loads(response.text)
        
        extracted_data = {
            "NCTNumber": identifier,
            "LeadSponsorName": data["StudyFieldsResponse"]["StudyFields"][0]["LeadSponsorName"][0],
            "InterventionDescription": data["StudyFieldsResponse"]["StudyFields"][0]["InterventionDescription"][0] if data["StudyFieldsResponse"]["StudyFields"][0].get("InterventionDescription") else None,
            "BaselineMeasurePopulationDescription": data["StudyFieldsResponse"]["StudyFields"][0]["BaselineMeasurePopulationDescription"][0] if data["StudyFieldsResponse"]["StudyFields"][0].get("BaselineMeasurePopulationDescription") else None,
            "DesignInterventionModelDescription": data["StudyFieldsResponse"]["StudyFields"][0]["DesignInterventionModelDescription"][0] if data["StudyFieldsResponse"]["StudyFields"][0].get("DesignInterventionModelDescription") else None,
            "InterventionName": data["StudyFieldsResponse"]["StudyFields"][0]["InterventionName"][0],
            "PrimaryOutcomeDescription": data["StudyFieldsResponse"]["StudyFields"][0]["PrimaryOutcomeDescription"][0] if data["StudyFieldsResponse"]["StudyFields"][0].get("PrimaryOutcomeDescription") else None,
        }
        all_extracted_data.append(extracted_data)
    else:
        print(f"Failed to retrieve data for {identifier} with status code {response.status_code}")

In [42]:
# Once you have collected the data for all study identifiers, you can convert it into a Pandas DataFrame for further analysis or manipulation.
df = pd.DataFrame(all_extracted_data)
# # Save the data to a CSV file if needed
# df.to_csv("clinical_trial_data.csv", index=False)

# # You can also perform various data analysis and manipulation tasks with the DataFrame
print(df.head())



     NCTNumber                          LeadSponsorName  \
0  NCT05882916                            Sue Napierala   
1  NCT05862857  University of California, San Francisco   
2  NCT05845619  University of California, San Francisco   
3  NCT05842122            Fred Hutchinson Cancer Center   
4  NCT05768763  University of California, San Francisco   

                             InterventionDescription  \
0  Oral fluid-based HIV self-test kits for second...   
1  Patrons and employees of drinking venues that ...   
2  The pilot intervention will include the follow...   
3  Services delivered: 1) behavioral HIV risk ass...   
4  We will conduct a series of community engageme...   

  BaselineMeasurePopulationDescription  \
0                                 None   
1                                 None   
2                                 None   
3                                 None   
4                                 None   

                  DesignInterventionModelDescription  \

In [43]:
df.shape

(154, 7)

In [44]:
# let us assess the Primary Outcomes Description of each trial
# Extract all Pri outcomes into one variable
pri_outcome = df['PrimaryOutcomeDescription']

# sentences = ' '.join(pri_outcome.astype(str))

sentences = pri_outcome
print(pri_outcome)

0      The count of adult men from intervention versu...
1      The proportion of HIV-negative adults, receivi...
2      Plasma HIV RNA <50 copies/mL. Data will be col...
3      Number of participants that initiated (i.e., w...
4                     HIV incidences per 100 person-year
                             ...                        
149                                                 None
150    Kaufman Assessment Battery for Children, 2nd e...
151                                                 None
152                                                 None
153                                                 None
Name: PrimaryOutcomeDescription, Length: 154, dtype: object


In [5]:
# Extracting the words from the text is done via tokenization,

tokenized_sentences = [[t for t in sentence.split()] for sentence in sentences]
vocabulary = set([w for s in tokenized_sentences for w in s])

pd.DataFrame([[w, i] for i,w in enumerate(vocabulary)])

# the words have been numbered according to their first occurrence

Unnamed: 0,0,1
0,whether,0
1,encounters,1
2,exit,2
3,technology,3
4,5,4
...,...,...
312,90-day,312
313,taking,313
314,PrEP,314
315,from,315


In [23]:
# BAG OF WORD MODELS
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [24]:
# CountVectorizer works in two distinct phases: first it has to learn the vocabulary; afterward it can transform the documents to vectors.
cv.fit(sentences)

In [25]:
CountVectorizer (analyzer='word',
                binary=False,
                decode_error='strict',
                dtype='<class numpy.int64>', 
                encoding='utf-8', 
                input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

print(cv.get_feature_names_out())

['100' '1000' '126' '13' '14' '150' '21' '22' '28' '60' '71' '90' 'about'
 'acceptability' 'accepted' 'across' 'adapted' 'adherence' 'adolescent'
 'adult' 'after' 'aids' 'alcohol' 'all' 'among' 'an' 'and' 'another'
 'area' 'arm' 'arms' 'art' 'as' 'assess' 'assessed' 'assessment' 'at'
 'attainment' 'attend' 'attendance' 'attended' 'average' 'back' 'based'
 'bayer' 'be' 'behaviors' 'between' 'binomial' 'by' 'call' 'care'
 'caregivers' 'cclad' 'charge' 'checklist' 'child' 'choosing' 'clinic'
 'clinical' 'combined' 'communication' 'compared' 'comparison'
 'comparisons' 'competency' 'complete' 'composite' 'computed' 'conducting'
 'consensus' 'continuous' 'control' 'day' 'days' 'defined' 'delivered'
 'diagnosed' 'divided' 'domains' 'doses' 'drinking' 'each' 'early'
 'eligibility' 'eligible' 'encounters' 'enrollment' 'equations'
 'estimating' 'evaluate' 'examined' 'exit' 'facilities' 'failure'
 'families' 'feasibility' 'feedback' 'female' 'fetzer' 'first' 'follow'
 'for' 'from' 'gap' 'gee' 'g

In [26]:
# Transforming the document into vectors
# we will use CountVectorizer to transform the documents to the vector representation:
vt = cv.transform(sentences)

vt

<13x272 sparse matrix of type '<class 'numpy.int64'>'
	with 421 stored elements in Compressed Sparse Row format>

In [27]:
pd.DataFrame(vt.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,100,1000,126,13,14,150,21,22,28,60,...,was,were,whether,who,will,with,within,women,yes,young
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,2,2,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,1,0,0,1,0,0,0,0,...,1,3,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,1,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,1,0,1
7,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
print(df['InterventionDescription'])

0     The components of this intervention are: four ...
1     This intervention is a clinician training usin...
2     Clinic level intervention composed of an adole...
3     Tumaini is a scenario-based role-playing game ...
4            Conditional cash transfer upon HIV testing
5     Participants will receive daily reminders, wit...
6     Participants will be randomized to receive sta...
7                                                  None
8     Community-based client led antiretroviral ther...
9     Brief alcohol reduction counseling is provided...
10    HIV 1/2 antigen-antibody POC test (Oraquick©) ...
11    Game Changers is an intervention that aims to ...
12    Maisha is a brief, scalable, theory-based coun...
Name: InterventionDescription, dtype: object


In [38]:
# Calculating similarity
# Cosine similarity

# Scikit-learn simplifies this calculation by offering a cosine_similarity utility function.
# Let’s check the similarity of the first two sentences:

from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(vt[0], vt[1])

array([[0.15762498]])

In [39]:
pd.DataFrame(cosine_similarity(vt, vt))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.0,0.157625,0.096463,0.285738,0.082409,0.137649,0.0,0.0,0.076472,0.097823,0.110294,0.064889,0.128499
1,0.157625,1.0,0.450007,0.311182,0.256296,0.211407,0.209748,0.176657,0.070469,0.236628,0.319428,0.2915,0.236823
2,0.096463,0.450007,1.0,0.297556,0.179357,0.073582,0.178788,0.117119,0.0,0.190495,0.317658,0.200689,0.176633
3,0.285738,0.311182,0.297556,1.0,0.325381,0.135873,0.299572,0.147181,0.15097,0.265543,0.404379,0.208167,0.206116
4,0.082409,0.256296,0.179357,0.325381,1.0,0.215526,0.30548,0.300167,0.059868,0.229752,0.444072,0.2286,0.326947
5,0.137649,0.211407,0.073582,0.135873,0.215526,1.0,0.075593,0.074278,0.0,0.04264,0.206041,0.141421,0.140028
6,0.0,0.209748,0.178788,0.299572,0.30548,0.075593,1.0,0.175466,0.0,0.201456,0.40236,0.240535,0.158777
7,0.0,0.176657,0.117119,0.147181,0.300167,0.074278,0.175466,1.0,0.0,0.118771,0.242319,0.131306,0.20802
8,0.076472,0.070469,0.0,0.15097,0.059868,0.0,0.0,0.0,1.0,0.071067,0.022893,0.0,0.093352
9,0.097823,0.236628,0.190495,0.265543,0.229752,0.04264,0.201456,0.118771,0.071067,1.0,0.395353,0.180907,0.14927


In [32]:
# TF IDF

# a better measure for information compared to counting is calculating the inverted document frequency and using a penalty for very common words.
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer()

tfidf_dt = tfidf.fit_transform(vt)

pd.DataFrame(tfidf_dt.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,100,1000,126,13,14,150,21,22,28,60,...,was,were,whether,who,will,with,within,women,yes,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.242168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.065011,0.0,0.0,0.065011,0.0,0.0,0.0,0.0,0.065011,0.0,...,0.0,0.0,0.0,0.0,0.081533,0.081533,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066594,0.0,0.0,...,0.0,0.0,0.0,0.0,0.041759,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.145914,0.0,0.0,0.145914,0.0,0.0,0.0,0.0,...,0.111582,0.377492,0.0,0.125831,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.20535,0.149321,0.0,0.20535,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.184773,0.0,0.0,0.0,0.0,0.151516,0.0,0.241625,0.0,0.241625
7,0.0,0.208445,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.13071,0.13071,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.359747,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.264235,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
## NER
nlp = spacy.load('en_core_web_sm')
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [22]:
docs = ' '.join(pri_outcome.astype(str))

doc = nlp(docs)

for ent in doc.ents:
     print(ent.text, "|", ent.label_, "|",spacy.explain(ent.label_))

n=71 SP | ORG | Companies, agencies, institutions, etc.
7-28 | CARDINAL | Numerals that do not fall under another type
the Kenyan National AIDS & STI | ORG | Companies, agencies, institutions, etc.
0-13 | CARDINAL | Numerals that do not fall under another type
PrEP | PERSON | People, including fictional
100 | CARDINAL | Numerals that do not fall under another type
SP | NORP | Nationalities or religious or political groups
the "Taking Charge | ORG | Companies, agencies, institutions, etc.
4 | CARDINAL | Numerals that do not fall under another type
5 | DATE | Absolute or relative dates or periods
9 | DATE | Absolute or relative dates or periods
5 | DATE | Absolute or relative dates or periods
3 | CARDINAL | Numerals that do not fall under another type
22 | CARDINAL | Numerals that do not fall under another type
0 | CARDINAL | Numerals that do not fall under another type
150 | CARDINAL | Numerals that do not fall under another type
126 | CARDINAL | Numerals that do not fall under another 