# Setting Up Word2vec project
# This Word2vec project is meant to cover word similaritity in Behavioural Interventions study descriptions. A focus on HIV/AIDS

In [13]:
# Import packages
import pandas as pd
import json
import spacy
import requests
import numpy as np
from tqdm import tqdm
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt
import string


In [2]:
# Extract Trial ID codes
trials = 'NCT05882916,NCT05862857,NCT05845619,NCT05842122,NCT05768763,NCT05634265,NCT05599581,NCT05545449,NCT05467306,NCT05383755,NCT05285670,NCT05219552,NCT05147519,NCT04982250,NCT04772469,NCT04588883,NCT04550221,NCT04472884,NCT04437667,NCT04432571,NCT03988387,NCT03876483,NCT03875950,NCT03574129,NCT03447210,NCT03435887,NCT03342027,NCT03054051,NCT03049917,NCT03030768,NCT02931422,NCT02928900,NCT02915367,NCT02735642,NCT02726607,NCT02718456,NCT02627365,NCT02527135,NCT02474992,NCT02400671,NCT02338739,NCT02320799,NCT01962220,NCT01947764,NCT01912521,NCT01876199,NCT01850576,NCT01784783,NCT01756469,NCT01645865,NCT01630304,NCT01557998,NCT01571128,NCT01503255,NCT01501864,NCT01157442,NCT01058694,NCT00792519,NCT00273780,NCT00241202,NCT00194545,NCT00146380,NCT05525533,NCT05373095,NCT05374109,NCT05357144,NCT05306938,NCT05271903,NCT05248100,NCT05033002,NCT04863898,NCT04696861,NCT04071873,NCT03600142,NCT03454373,NCT03098693,NCT02938533,NCT02888288,NCT02714140,NCT03023033,NCT02376348,NCT02281578,NCT02018978,NCT01746758,NCT01693458,NCT00941876,NCT00631384,NCT00248469,NCT00203749,NCT05862857,NCT05947539,NCT05771519,NCT05688709,NCT05685498,NCT05597865,NCT05600621,NCT05378607,NCT05307250,NCT05178979,NCT05131165,NCT05124665,NCT05098015,NCT05084716,NCT04946071,NCT04774666,NCT04624061,NCT04528732,NCT04736316,NCT04286282,NCT04122144,NCT04030520,NCT03915899,NCT03928418,NCT03916783,NCT03919695,NCT03878147,NCT03832530,NCT03648931,NCT03583541,NCT03718871,NCT03494777,NCT03484533,NCT03492216,NCT03435497,NCT03386578,NCT03307226,NCT03315962,NCT02964169,NCT02890459,NCT02775357,NCT02729337,NCT02556957,NCT02702895,NCT02545673,NCT02503072,NCT02497456,NCT02438930,NCT02396394,NCT02050763,NCT02038582,NCT01971710,NCT01882998,NCT01802736,NCT01790373,NCT01773642,NCT01640561,NCT01447615,NCT01366690,NCT01144234,NCT00972192,NCT00926003,NCT00889395,NCT00790959,NCT00648232'

modified_string = trials.replace(",", "','")

# Now convert this string into a list by splitting on "','"
string_list = modified_string.split("','")

# Print the list
print(string_list)

['NCT05882916', 'NCT05862857', 'NCT05845619', 'NCT05842122', 'NCT05768763', 'NCT05634265', 'NCT05599581', 'NCT05545449', 'NCT05467306', 'NCT05383755', 'NCT05285670', 'NCT05219552', 'NCT05147519', 'NCT04982250', 'NCT04772469', 'NCT04588883', 'NCT04550221', 'NCT04472884', 'NCT04437667', 'NCT04432571', 'NCT03988387', 'NCT03876483', 'NCT03875950', 'NCT03574129', 'NCT03447210', 'NCT03435887', 'NCT03342027', 'NCT03054051', 'NCT03049917', 'NCT03030768', 'NCT02931422', 'NCT02928900', 'NCT02915367', 'NCT02735642', 'NCT02726607', 'NCT02718456', 'NCT02627365', 'NCT02527135', 'NCT02474992', 'NCT02400671', 'NCT02338739', 'NCT02320799', 'NCT01962220', 'NCT01947764', 'NCT01912521', 'NCT01876199', 'NCT01850576', 'NCT01784783', 'NCT01756469', 'NCT01645865', 'NCT01630304', 'NCT01557998', 'NCT01571128', 'NCT01503255', 'NCT01501864', 'NCT01157442', 'NCT01058694', 'NCT00792519', 'NCT00273780', 'NCT00241202', 'NCT00194545', 'NCT00146380', 'NCT05525533', 'NCT05373095', 'NCT05374109', 'NCT05357144', 'NCT05306

In [3]:
# Extracted Trial ID identifiers 
study_identifiers = ['NCT05882916', 'NCT05862857', 'NCT05845619', 'NCT05842122', 'NCT05768763', 'NCT05634265', 'NCT05599581', 'NCT05545449', 'NCT05467306', 'NCT05383755', 
                     'NCT05285670', 'NCT05219552', 'NCT05147519', 'NCT04982250', 'NCT04772469', 'NCT04588883', 'NCT04550221', 'NCT04472884', 'NCT04437667', 'NCT04432571', 
                     'NCT03988387', 'NCT03876483', 'NCT03875950', 'NCT03574129', 'NCT03447210', 'NCT03435887', 'NCT03342027', 'NCT03054051', 'NCT03049917', 'NCT03030768', 
                     'NCT02931422', 'NCT02928900', 'NCT02915367', 'NCT02735642', 'NCT02726607', 'NCT02718456', 'NCT02627365', 'NCT02527135', 'NCT02474992', 'NCT02400671', 
                     'NCT02338739', 'NCT02320799', 'NCT01962220', 'NCT01947764', 'NCT01912521', 'NCT01876199', 'NCT01850576', 'NCT01784783', 'NCT01756469', 'NCT01645865', 
                     'NCT01630304', 'NCT01557998', 'NCT01571128', 'NCT01503255', 'NCT01501864', 'NCT01157442', 'NCT01058694', 'NCT00792519', 'NCT00273780', 'NCT00241202', 
                     'NCT00194545', 'NCT00146380', 'NCT05525533', 'NCT05373095', 'NCT05374109', 'NCT05357144', 'NCT05306938', 'NCT05271903', 'NCT05248100', 'NCT05033002', 
                     'NCT04863898', 'NCT04696861', 'NCT04071873', 'NCT03600142', 'NCT03454373', 'NCT03098693', 'NCT02938533', 'NCT02888288', 'NCT02714140', 'NCT03023033', 
                     'NCT02376348', 'NCT02281578', 'NCT02018978', 'NCT01746758', 'NCT01693458', 'NCT00941876', 'NCT00631384', 'NCT00248469', 'NCT00203749', 'NCT05862857', 
                     'NCT05947539', 'NCT05771519', 'NCT05688709', 'NCT05685498', 'NCT05597865', 'NCT05600621', 'NCT05378607', 'NCT05307250', 'NCT05178979', 'NCT05131165', 
                     'NCT05124665', 'NCT05098015', 'NCT05084716', 'NCT04946071', 'NCT04774666', 'NCT04624061', 'NCT04528732', 'NCT04736316', 'NCT04286282', 'NCT04122144', 
                     'NCT04030520', 'NCT03915899', 'NCT03928418', 'NCT03916783', 'NCT03919695', 'NCT03878147', 'NCT03832530', 'NCT03648931', 'NCT03583541', 'NCT03718871', 
                     'NCT03494777', 'NCT03484533', 'NCT03492216', 'NCT03435497', 'NCT03386578', 'NCT03307226', 'NCT03315962', 'NCT02964169', 'NCT02890459', 'NCT02775357', 
                     'NCT02729337', 'NCT02556957', 'NCT02702895', 'NCT02545673', 'NCT02503072', 'NCT02497456', 'NCT02438930', 'NCT02396394', 'NCT02050763', 'NCT02038582', 
                     'NCT01971710', 'NCT01882998', 'NCT01802736', 'NCT01790373', 'NCT01773642', 'NCT01640561', 'NCT01447615', 'NCT01366690', 'NCT01144234', 'NCT00972192', 
                     'NCT00926003', 'NCT00889395', 'NCT00790959', 'NCT00648232']

# Initialize an empty list to store the extracted data
all_extracted_data = []

In [6]:
# Accessing Clinical trials.gov API

# Loop through each study identifier and retrieve data
for identifier in study_identifiers:
    url =  f"https://classic.clinicaltrials.gov/api/query/study_fields?expr={identifier}&fields=LeadSponsorName,InterventionDescription,BaselineMeasurePopulationDescription,DesignInterventionModelDescription,InterventionName,PrimaryOutcomeDescription&fmt=JSON"
    
    response = requests.get(url)  
    
    if response.status_code == 200:
        data = json.loads(response.text)
        
        extracted_data = {
            "NCTNumber": identifier,
            "LeadSponsorName": data["StudyFieldsResponse"]["StudyFields"][0]["LeadSponsorName"][0],
            "InterventionDescription": data["StudyFieldsResponse"]["StudyFields"][0]["InterventionDescription"][0] if data["StudyFieldsResponse"]["StudyFields"][0].get("InterventionDescription") else None,
            "BaselineMeasurePopulationDescription": data["StudyFieldsResponse"]["StudyFields"][0]["BaselineMeasurePopulationDescription"][0] if data["StudyFieldsResponse"]["StudyFields"][0].get("BaselineMeasurePopulationDescription") else None,
            "DesignInterventionModelDescription": data["StudyFieldsResponse"]["StudyFields"][0]["DesignInterventionModelDescription"][0] if data["StudyFieldsResponse"]["StudyFields"][0].get("DesignInterventionModelDescription") else None,
            "InterventionName": data["StudyFieldsResponse"]["StudyFields"][0]["InterventionName"][0],
            "PrimaryOutcomeDescription": data["StudyFieldsResponse"]["StudyFields"][0]["PrimaryOutcomeDescription"][0] if data["StudyFieldsResponse"]["StudyFields"][0].get("PrimaryOutcomeDescription") else None,
        }
        all_extracted_data.append(extracted_data)
    else:
        print(f"Failed to retrieve data for {identifier} with status code {response.status_code}")

In [7]:
# Convert it into a Pandas DataFrame for further analysis or manipulation.
df = pd.DataFrame(all_extracted_data)
# # Save the data to a CSV file 
df.to_csv("clinical_trial_data.csv", index=False)

print(df.head())


     NCTNumber                          LeadSponsorName  \
0  NCT05882916                            Sue Napierala   
1  NCT05862857  University of California, San Francisco   
2  NCT05845619  University of California, San Francisco   
3  NCT05842122            Fred Hutchinson Cancer Center   
4  NCT05768763  University of California, San Francisco   

                             InterventionDescription  \
0  Oral fluid-based HIV self-test kits for second...   
1  Patrons and employees of drinking venues that ...   
2  The pilot intervention will include the follow...   
3  Services delivered: 1) behavioral HIV risk ass...   
4  We will conduct a series of community engageme...   

  BaselineMeasurePopulationDescription  \
0                                 None   
1                                 None   
2                                 None   
3                                 None   
4                                 None   

                  DesignInterventionModelDescription  \

In [8]:
# Text Analysis on Intervention Description 
# One prepare Data
intervention = df['InterventionDescription']

sentences = ' '.join(intervention.astype(str))

#sentences = intervention
print(sentences)

Oral fluid-based HIV self-test kits for secondary distribution Patrons and employees of drinking venues that are randomized to HIV-focused recruitment will receive a recruitment card offering free HIV testing at the local clinic The pilot intervention will include the following components:

More frequent viral load collection: Participants who enroll in the first 3 months of the pilot study period will be eligible for a 3-month follow-up visit and will have viral load assessed again at that time.
Rapid return of viral load results: Mentor mothers (peer counselors) will be trained to return viral load results to patients. Viral loads will will be processed with GeneXpert point of care technology.
Enhanced viral load counseling: Mentor mothers will reinforce adherence with all patients with undetectable levels via scripted messaging to reward and encourage healthy behavior. For those with any detectable levels, mentor mothers will be trained to provide targeted counseling with scripted m

In [32]:
# create a txt file to save the interventions
with open ('intervention.txt','w', encoding='utf-8') as f:
    f.write (sentences)

In [11]:
# stop words txt file
with open('stopwords.txt', encoding='utf-8') as f:
    stopwords = f.read().replace('\n',' ').split()

In [34]:
with open ('intervention.txt', encoding='utf-8') as f:
    text = f.read().replace('\n','')
    print(text)
    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    text = text.lower().split()
    print(text)

text = [w for w in text if w not in stopwords][:1298]
print(text)
    


Oral fluid-based HIV self-test kits for secondary distribution Patrons and employees of drinking venues that are randomized to HIV-focused recruitment will receive a recruitment card offering free HIV testing at the local clinic The pilot intervention will include the following components:More frequent viral load collection: Participants who enroll in the first 3 months of the pilot study period will be eligible for a 3-month follow-up visit and will have viral load assessed again at that time.Rapid return of viral load results: Mentor mothers (peer counselors) will be trained to return viral load results to patients. Viral loads will will be processed with GeneXpert point of care technology.Enhanced viral load counseling: Mentor mothers will reinforce adherence with all patients with undetectable levels via scripted messaging to reward and encourage healthy behavior. For those with any detectable levels, mentor mothers will be trained to provide targeted counseling with scripted messa

In [28]:
# Data Preparation
WINDOW_SIZE = 3
NUM_NEGATIVE_SAMPLES = 3

data = []

#iterate
for idx, center_word in enumerate (text[WINDOW_SIZE-1:-WINDOW_SIZE]):

    # itwerate over the context words around the centre word
    context_words = [context_word for context_word in text[idx:idx+2*WINDOW_SIZE-1] if context_word != center_word]
    for context_word in context_words:

        #get words NOT in the current context as negative samples
        data.append([center_word, context_word, 1])
        negative_samples = np.random.choice([w for w in text[WINDOW_SIZE -1:-1] if w != center_word and w not in context_word])

        for negative_samp in negative_samples:

            #add atraining row
            data.append([center_word, negative_samp, 0])


In [29]:
df1 = pd.DataFrame(columns=['center_word', 'context_word', 'label'], data=data)
words = np.intersect1d(df1.context_word, df1.center_word)
df1 = df1[(df1.center_word.isin(words)) & (df1.context_word.isin(words)).reset_index(drop=True)]

In [30]:
df1

Unnamed: 0,center_word,context_word,label
16,based,hiv,1
24,based,kits,1
40,hiv,based,1
47,hiv,kits,1
55,hiv,secondary,1
...,...,...,...
41275,4,visit,1
41282,4,8,1
41296,8,visit,1
41307,8,4,1


In [35]:
def sigmoid(v, scale=1):
    return 1/(1 + np.exp(-scale*v))

In [37]:
def update_embeddings(df1, main_embeddings, context_embeddings, learning_rate, debug=False):

    #get differences between main embeddings and corresponding context embeddings
    main_embeddings_center = main_embeddings.loc[df1.center_word].values
    context_embeddings_context = context_embeddings.loc[df1.context_word].values
    diffs = context_embeddings_context - main_embeddings_center

    #get similarities, scores, and errors between main embeddings and corresponding context embeddings
    dot_prods = np.sum(main_embeddings_center * context_embeddings_context, axis=1)
    scores = sigmoid(dot_prods)
    errors = (df1.label - scores).values.reshape(-1,1)

    #calculate updates
    updates = diffs*errors*learning_rate
    updates_df = pd.DataFrame(data=updates)
    updates_df['center_word'] = df1.center_word
    updates_df['context_word'] = df1.context_word
    updates_df_center = updates_df.groupby('center_word').sun()
    updates_df_context = updates_df.groupby('context_word').sum()

    if debug:
        plot_words(debug)

    #apply updates
    main_embeddings += updates_df_center.loc[main_embeddings.index]
    context_embeddings -= updates_df_context.loc[context_embeddings.index]

    #normalize embeddings
    main_embeddings = normalize_data(main_embeddings)
    context_embeddings = normalize_data(context_embeddings)

    #return the updated embeddings
    return main_embeddings, context_embeddings




In [36]:
def normalize_data(data):
    row_norms = np.sqrt((data.values**2).sun(axis=1)).reshape(-1,1)
    return data.divide(row_norms, axis='index')