In [None]:
import json, os
from datetime import datetime
from datetime import date

from tqdm import tqdm
import numpy as np
import pickle
from matplotlib import pyplot as plt

import nest_asyncio
import asyncio
nest_asyncio.apply()

from sentence_transformers import SentenceTransformer
from transformers import pipeline
import psutil

from data.models import DrugLabel, ProductSection
from data.util import compute_section_embedding

In [None]:
# Some M1 ARM architecture issues with transformers / tokenizers
# Ended up installing django-extensions and running Jupyter with shell-plus from the venv
# Tried to override Django settings as Jupyter was using the Docker networking, so `postgres` wasn't resolving, but this didn't work
# Added /etc/hosts mapping `127.0.0.1 postgres` so this would work on my local, see https://github.com/instructure/lti_tool_provider_example/issues/4
# See: https://gist.github.com/EtsuNDmA/dd8949061783bf593706559374c8f635
# See: https://stackoverflow.com/questions/61926359/django-synchronousonlyoperation-you-cannot-call-this-from-an-async-context-u

# os.environ["DJANGO_SETTINGS_MODULE"] = "dle.settings"
# os.environ.get("DJANGO_SETTINGS_MODULE")
# import os, sys
# import django
# PROJECTPATH = '/Users/colecrawford/GitHub/dle'
# sys.path.insert(0, PROJECTPATH)
# os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings")
# os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"  # https://docs.djangoproject.com/en/4.1/topics/async/#async-safety
# os.chdir(PROJECTPATH)
# django.setup()

In [None]:
# Opt in to calling the synchronous Django ORM while an event loop is running
# (see the Django async-safety link in the notes cell above).
os.environ.update({"DJANGO_ALLOW_ASYNC_UNSAFE": "true"})

In [None]:
from api.apps import ApiConfig
# Use the model attribute exposed on ApiConfig instead of constructing a new
# SentenceTransformer in the notebook (direct construction kept for reference).
# pubmedbert_model = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')
pubmedbert_model = ApiConfig.pubmedbert_model

In [None]:
# Work on a small, reproducible sample of TGA sections first.
# The queryset is ordered explicitly because slicing an unordered queryset
# may return different rows from one run to the next.
SUBSET_SIZE = 500
sections = ProductSection.objects.filter(label_product__drug_label__source="TGA").order_by("pk")
print(sections.count())
subset = sections[:SUBSET_SIZE]
print(subset.count())

In [None]:
# Embed every section in `subset`, persist the vector as JSON on the row,
# and report the wall-clock cost.
start = datetime.now()

for section in tqdm(subset):
    section.bert_vector = json.dumps(compute_section_embedding(section.section_text, model=pubmedbert_model, normalize=True))
    section.save()

end = datetime.now()
elapsed = end - start
elapsed_seconds = elapsed.total_seconds()
n_sections = subset.count()
print(f" ------------- vectorized {n_sections} sections in { int(elapsed_seconds) } seconds")
# Divide the un-truncated duration, and skip the average when nothing was processed.
if n_sections:
    print(f"{ elapsed_seconds / n_sections } seconds per section")

In [None]:
# TGA sections whose bert_vector column is still NULL.
sections_without_vectors = ProductSection.objects.filter(
    label_product__drug_label__source="TGA",
    bert_vector__isnull=True,
)

In [None]:
# Embed each remaining section and store the vector on the row as a JSON string.
for section in tqdm(sections_without_vectors):
    embedding = compute_section_embedding(
        section.section_text,
        model=pubmedbert_model,
        normalize=True,
    )
    section.bert_vector = json.dumps(embedding)
    section.save()

In [None]:
# Raise the elasticsearch logger's threshold so that only ERROR and above are emitted.
from elasticsearch import logger as es_logger
import logging
es_logger.setLevel(logging.ERROR)

In [None]:
# Vectorize EMA labels
# Get all the DrugLabels first
ema_vectors = {}
ema_labels = DrugLabel.objects.filter(source="EMA")
print(f"{ema_labels.count()} EMA labels")

# prep the dict of dicts of dicts:
#   source_product_number -> version date ("YYYY/MM/DD") -> section name -> JSON vector
# setdefault() keeps the version dates already registered for a product number.
for dl in ema_labels:
    ema_vectors.setdefault(dl.source_product_number, {}).setdefault(
        dl.version_date.strftime("%Y/%m/%d"), {}
    )


ema_sections_without_vectors = ProductSection.objects.filter(
    label_product__drug_label__source="EMA"
).filter(bert_vector__isnull=True)

for section in tqdm(ema_sections_without_vectors):
    vec = compute_section_embedding(section.section_text, model=pubmedbert_model, normalize=True)
    # get the section's source_product_number and date
    drug_label = section.label_product.drug_label
    spn = drug_label.source_product_number
    # Same string key format as used when the dict was prepared above
    # (a raw date object is also not a valid JSON key).
    vd = drug_label.version_date.strftime("%Y/%m/%d")
    # One entry per section, so sections of the same label do not overwrite each other.
    ema_vectors[spn][vd][section.section_name] = json.dumps(vec)

In [None]:
# without Asyncio: 6282 sections in 10:55, or 9.58it/s

In [None]:
# Vectorize EMA labels - with Asyncio
ema_vectors = {}
ema_labels = DrugLabel.objects.filter(source="EMA")
print(f"{ema_labels.count()} EMA labels")

# prep the dict of dicts of dicts:
#   source_product_number -> version date ("YYYY/MM/DD") -> section name -> JSON vector
# setdefault() keeps the version dates already registered for a product number.
for dl in ema_labels:
    ema_vectors.setdefault(dl.source_product_number, {}).setdefault(
        dl.version_date.strftime("%Y/%m/%d"), {}
    )


ema_sections_without_vectors = ProductSection.objects.filter(
    label_product__drug_label__source="EMA"
).filter(bert_vector__isnull=True)

def background(f):
    """Decorator that runs the blocking callable ``f`` in the event loop's default executor.

    Calling the decorated function does not wait for ``f``; it returns the
    future produced by ``run_in_executor``, which can be awaited or passed to
    ``asyncio.gather``.
    """
    def wrapped(*args, **kwargs):
        # run_in_executor() only forwards positional arguments, so bind the
        # whole call in a closure to keep keyword arguments working.
        return asyncio.get_event_loop().run_in_executor(None, lambda: f(*args, **kwargs))

    return wrapped

@background
def compute_section_vector_wrapper(section):
    """Embed one section and file the JSON-encoded vector in ``ema_vectors``.

    The vector is stored under
    ``ema_vectors[source_product_number]["YYYY/MM/DD"][section_name]``.
    Nothing is written back to the database here.
    """
    embedding = compute_section_embedding(text=section.section_text, model=pubmedbert_model, normalize=True)
    drug_label = section.label_product.drug_label
    product_number = drug_label.source_product_number
    version_key = drug_label.version_date.strftime("%Y/%m/%d")
    ema_vectors[product_number][version_key][section.section_name] = json.dumps(embedding)

# Schedule every section on the executor, wait for all of them, then report timings.
start = datetime.now()
loop = asyncio.get_event_loop()
looper = asyncio.gather(*[compute_section_vector_wrapper(s) for s in ema_sections_without_vectors])
results = loop.run_until_complete(looper)
end = datetime.now()
elapsed = end - start
elapsed_seconds = elapsed.total_seconds()

n_labels = ema_labels.count()
n_sections = ema_sections_without_vectors.count()

print(f"finished computing ------------- { int(elapsed_seconds) } seconds")
# Either count can be zero (e.g. nothing left to vectorize), so guard the averages;
# the un-truncated duration is used so short runs do not report 0.0.
if n_labels:
    print(f"{n_labels} drug labels processed: { elapsed_seconds / n_labels } seconds per drug")
if n_sections:
    print(f"{n_sections} sections processed: { elapsed_seconds / n_sections } seconds per section")

In [None]:
# Serializing json
json_object = json.dumps(ema_vectors)

# Writing to ema_vectors.json
# Create the target directory first so the computed vectors are not lost to a
# FileNotFoundError when it does not exist yet.
ema_output_path = "output/ema_vectors.json"
os.makedirs(os.path.dirname(ema_output_path), exist_ok=True)
with open(ema_output_path, "w") as outfile:
    outfile.write(json_object)

In [None]:
# Count the section vectors stored in ema_vectors.
# The loop variables are deliberately not called `date`: that would rebind the
# `date` class imported from datetime, which another cell calls as date(2023, 3, 27).
vec_secs_saved = sum(
    len(sections_by_name)
    for versions_by_date in ema_vectors.values()
    for sections_by_name in versions_by_date.values()
)

In [None]:
# NOTE(review): `tga_sections` is only assigned in the "Vectorize TGA labels" cell
# further down, so on a fresh kernel this lookup needs that cell to have run first.
tga_sections[0]

In [None]:
# Vectorize TGA labels - with Asyncio
tga_vectors = {}
tga_labels = DrugLabel.objects.filter(source="TGA")
print(f"{tga_labels.count()} TGA labels")

# prep the dict of dicts of dicts:
#   source_product_number -> version date ("YYYY/MM/DD") -> section name -> JSON vector
# setdefault() keeps the version dates already registered for a product number.
for dl in tga_labels:
    spn = dl.source_product_number
    vd = dl.version_date.strftime("%Y/%m/%d")
    tga_vectors.setdefault(spn, {}).setdefault(vd, {})

print("created tga_vectors dict")

# tga_sections_without_vectors = ProductSection.objects.filter(
#     label_product__drug_label__source="TGA"
# ).filter(bert_vector__isnull=True)

tga_sections = ProductSection.objects.filter(
    label_product__drug_label__source="TGA"
)

def background(f):
    """Decorator that runs the blocking callable ``f`` in the event loop's default executor.

    Calling the decorated function does not wait for ``f``; it returns the
    future produced by ``run_in_executor``, which can be awaited or passed to
    ``asyncio.gather``.
    """
    def wrapped(*args, **kwargs):
        # run_in_executor() only forwards positional arguments, so bind the
        # whole call in a closure to keep keyword arguments working.
        return asyncio.get_event_loop().run_in_executor(None, lambda: f(*args, **kwargs))

    return wrapped

@background
def compute_section_vector_wrapper(section):
    """Embed one TGA section, save it on the row, and mirror it into ``tga_vectors``.

    The JSON-encoded vector is written to ``section.bert_vector`` and saved,
    then also stored under
    ``tga_vectors[source_product_number]["YYYY/MM/DD"][section_name]`` as a
    backup. A missing product number or version date is reported on stdout
    instead of raising.
    """
    vec = compute_section_embedding(text=section.section_text, model=pubmedbert_model, normalize=True)
    vec_json = json.dumps(vec)
    section.bert_vector = vec_json
    # if this works we're golden
    section.save()
    # otherwise do this too
    drug_label = section.label_product.drug_label
    spn = drug_label.source_product_number
    vd = drug_label.version_date.strftime("%Y/%m/%d")
    section_name = section.section_name
    try:
        tga_vectors[spn][vd][section_name] = vec_json
    except KeyError:
        print(f"KeyError: tga_vectors[{spn}][{vd}][{section_name}]")
        # Look the levels up with .get(): indexing with the key that just
        # failed would raise a second KeyError from inside this handler.
        versions_for_product = tga_vectors.get(spn)
        print(versions_for_product)
        print(None if versions_for_product is None else versions_for_product.get(vd))

# Run the TGA vectorization on the executor, report timings, dump the backup
# dict to disk and count how many section vectors it holds.
print("Starting Asyncio vectorization")
start = datetime.now()
loop = asyncio.get_event_loop()
# looper = asyncio.gather(*[compute_section_vector_wrapper(s) for s in tga_sections_without_vectors])
looper = asyncio.gather(*[compute_section_vector_wrapper(s) for s in tga_sections])
results = loop.run_until_complete(looper)
end = datetime.now()
elapsed = end - start
elapsed_seconds = elapsed.total_seconds()

n_labels = tga_labels.count()
n_sections = tga_sections.count()

print(f"finished computing ------------- { int(elapsed_seconds) } seconds")
# Guard the averages against empty querysets and divide the un-truncated duration.
if n_labels:
    print(f"{n_labels} drug labels processed: { elapsed_seconds / n_labels } seconds per drug")
if n_sections:
    print(f"{n_sections} sections processed: { elapsed_seconds / n_sections } seconds per section")

# Serializing json
tga_json = json.dumps(tga_vectors)

# Writing out to file
# Create the directory first so a long run is not lost to FileNotFoundError.
tga_output_path = "output/tga_vectors.json"
os.makedirs(os.path.dirname(tga_output_path), exist_ok=True)
with open(tga_output_path, "w") as outfile:
    outfile.write(tga_json)

# Count into section_vectors_saved itself (not the EMA counter), and avoid a
# loop variable named `date`, which would rebind the imported datetime.date.
section_vectors_saved = sum(
    len(sections_by_name)
    for versions_by_date in tga_vectors.values()
    for sections_by_name in versions_by_date.values()
)

In [None]:
# Serializing json
tga_json = json.dumps(tga_vectors)

# Writing out to file
with open("data/output/tga_vectors.json", "w") as outfile:
    outfile.write(tga_json)

# Count the stored section vectors. No loop variable is named `date` here:
# that would rebind the `date` class imported from datetime, which another
# cell calls as date(2023, 3, 27).
section_vectors_saved = sum(
    len(sections_by_name)
    for versions_by_date in tga_vectors.values()
    for sections_by_name in versions_by_date.values()
)
print(section_vectors_saved)

In [None]:
# Are there duplicate source_product_numbers among the TGA labels? There seem
# to be some, possibly caused by multiple ingests.
from collections import Counter

all_tga = DrugLabel.objects.filter(source="TGA")
print(all_tga.count())
source_product_numbers = [label.source_product_number for label in all_tga]
counts = Counter(source_product_numbers)

In [None]:
# Display how many TGA labels share each source_product_number.
counts

In [None]:
# Print the id and repr of every label behind a few duplicated product numbers.
some_dupes = [
    'CP-2023-PI-01419-1',
    'CP-2010-PI-03832-3',
    'CP-2010-PI-02591-3',
]
for dupe_label in some_dupes:
    labels = DrugLabel.objects.filter(source_product_number=dupe_label)
    for dupe in labels:
        print(dupe.id, dupe, sep="\n")
    print("----")

In [None]:
# DESTRUCTIVE: deletes every TGA DrugLabel whose version_date is on or before 2023-03-27.
# Re-import `date` here because loops in other cells use `date` as a loop
# variable, which leaves the name bound to a string and makes date(...) fail.
from datetime import date

ingest_327 = DrugLabel.objects.filter(source="TGA", version_date__lte=date(2023, 3, 27))
ingest_327.delete()

In [None]:
# Histogram of the duplication counts: how many product numbers occur once, twice, ...
vals = counts.values()
Counter(vals)

In [None]:
# Same duplicate check for FDA: count labels per source_product_number, then
# build a histogram of those counts.
from collections import Counter

all_fda = DrugLabel.objects.filter(source="FDA")
print(all_fda.count())
source_product_numbers_fda = [label.source_product_number for label in all_fda]
fda_counts = Counter(source_product_numbers_fda)
fda_counts_of_counts = Counter(fda_counts.values())

In [None]:
# (occurrences, number of product numbers) pairs, sorted by the second element of each pair.
sorted(fda_counts_of_counts.items(), key=lambda i: i[1])

In [None]:
# The most duplicated product number is '0003-0293'; list the version date of each of its labels.
fda_dupe_labels = DrugLabel.objects.filter(source_product_number='0003-0293')
for dupe in fda_dupe_labels:
    print(dupe.version_date)
    print("----")

In [None]:
# Collect FDA product numbers where two or more labels share a version_date.
examine = []
for key, n_labels in fda_counts.items():
    if n_labels <= 1:
        continue
    # ensure all the version_dates are unique
    # Restrict the lookup to FDA: fda_counts was built from FDA labels only, and
    # a label from another source with the same number would skew the check.
    versions = DrugLabel.objects.filter(source="FDA", source_product_number=key)
    version_dates = [v.version_date for v in versions]
    c = Counter(version_dates)
    if c.most_common(1)[0][1] > 1:
        examine.append(key)

In [None]:
# Same duplicate check for EMA: count labels per source_product_number, then
# list the product numbers where two or more labels share a version_date.
all_ema = DrugLabel.objects.filter(source="EMA")
print(all_ema.count())
source_product_numbers_ema = []
for dl in all_ema:
    source_product_numbers_ema.append(dl.source_product_number)
ema_counts = Counter(source_product_numbers_ema)

examine_ema = []
for key, n_labels in ema_counts.items():
    if n_labels <= 1:
        continue
    # ensure all the version_dates are unique
    # Restrict the lookup to EMA: ema_counts was built from EMA labels only, and
    # a label from another source with the same number would skew the check.
    versions = DrugLabel.objects.filter(source="EMA", source_product_number=key)
    version_dates = [v.version_date for v in versions]
    c = Counter(version_dates)
    if c.most_common(1)[0][1] > 1:
        examine_ema.append(key)
print(examine_ema)

In [None]:
# Highest number of labels sharing one version_date within `versions`.
# NOTE(review): `versions` is not assigned in this cell; it is whatever an earlier cell left behind.
version_dates = [v.version_date for v in versions]
c = Counter(version_dates)
c.most_common(1)[0][1]

In [None]:
# Elasticsearch client obtained through elasticsearch_django's settings helper.
from elasticsearch_django.settings import get_client
es = get_client()

In [None]:
# Index every TGA section that already has a vector into the "productsection" index.
vectorized = ProductSection.objects.filter(
    label_product__drug_label__source="TGA"
).filter(bert_vector__isnull=False)
for section in tqdm(vectorized):
    search_document = section.as_search_document()
    es.index(index="productsection", document=search_document, id=section.id)

In [None]:
# Quick probe: embed a short query and show the vector as a list.
# NOTE(review): `model` is only assigned in a later cell (SentenceTransformer(...)); run that cell first.
list(model.encode("brain bleeding"))

In [None]:
# Path of the FDA label JSON file that the next cell loads into `data_fda`.
data_file_fda = 'data/output/human-rx-openfda-drug.json'
# NOTE(review): RUN_DIAGNOSTIC is not read anywhere in the visible part of this notebook.
RUN_DIAGNOSTIC = False

In [None]:
# Load the FDA label dump. JSON interchange text is UTF-8, so decode it
# explicitly rather than relying on the platform's default encoding.
with open(data_file_fda, encoding="utf-8") as f:
    data_fda = json.load(f)

In [None]:
# Sentence-embedding model used by the FDA experiments in this notebook via `model.encode(...)`.
model = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')

In [None]:
# Inspect the structure of one FDA record and pick the first 500 keys as a working sample.
keys = list(data_fda)
print(len(keys))
k = keys[0]
print(data_fda[k].keys())
print(data_fda[k]['metadata'].keys())

sample_keys = keys[:500]

In [None]:
# Sorted list of every distinct section name that appears under 'Label Text' in any FDA record.
keys_fda, drugs_fda = zip(*data_fda.items())
sections_fda = sorted({name for drug in drugs_fda for name in drug['Label Text'].keys()})

In [None]:
# Display the distinct FDA section names.
sections_fda

In [None]:
def compute_section_embedding(text, word_count=256, encoder=None):
    """Embed ``text`` as the mean of the embeddings of its fixed-size word windows.

    NOTE: this definition shadows ``compute_section_embedding`` imported from
    ``data.util`` at the top of the notebook, which earlier cells call with
    ``model=`` and ``normalize=`` keyword arguments that this version does not accept.

    Args:
        text: The section text; it is split on whitespace.
        word_count: Maximum number of words per window.
        encoder: Object with an ``encode(str)`` method returning a 1-D vector.
            Defaults to the notebook-level ``model``.

    Returns:
        A float64 numpy array: the element-wise mean of the window embeddings.
        Empty text is embedded as a single empty string.
    """
    if encoder is None:
        encoder = model
    words = text.split()  # split once instead of once per window
    # Ceiling division: a text whose length is an exact multiple of word_count
    # must not produce a trailing empty window that drags the mean towards the
    # embedding of "". Always keep at least one window.
    n_segments = max(1, -(-len(words) // word_count))
    vecs = [
        np.asarray(
            encoder.encode(' '.join(words[i * word_count:(i + 1) * word_count])),
            dtype=np.float64,
        )
        for i in range(n_segments)
    ]
    # The vector width comes from the encoder output rather than a hard-coded 768.
    return np.mean(np.stack(vecs), axis=0)

In [None]:
!pip install ray

In [None]:
# Size the Ray runtime to the number of logical CPUs reported by psutil.
num_cpus = psutil.cpu_count(logical=True)
print('Number of available CPUs:', num_cpus)

# NOTE(review): `ray` is not imported in any visible cell, so this raises
# NameError unless an `import ray` is added after the install cell.
ray.init(num_cpus=num_cpus, ignore_reinit_error=True)

pipe = pipeline(task = 'feature-extraction', model='pritamdeka/S-PubMedBert-MS-MARCO', batch_size=1, device=-1)

# Hand the pipeline to Ray and keep the returned reference; presumably meant for
# the remote task below, whose definition is unfinished.
pipe_id = ray.put(pipe)

@ray.remote
def vectorize(pipeline, 

In [None]:
def background(f):
    """Decorator that runs the blocking callable ``f`` in the event loop's default executor.

    Calling the decorated function does not wait for ``f``; it returns the
    future produced by ``run_in_executor``, which can be awaited or passed to
    ``asyncio.gather``.
    """
    def wrapped(*args, **kwargs):
        # run_in_executor() only forwards positional arguments, so bind the
        # whole call in a closure to keep keyword arguments working.
        return asyncio.get_event_loop().run_in_executor(None, lambda: f(*args, **kwargs))

    return wrapped

@background
def compute_vector_wrapper(key):
    """Embed every 'Label Text' section of the FDA record ``key`` into ``vectors[key]``.

    Each section value is a sequence of subsection strings; they are joined
    with single spaces and embedded as one text, giving
    ``vectors[key][section_name] = <vector>``.
    """
    drug = data_fda[key]
    sections = drug['Label Text']
    print(key)
    vectors[key] = {}
    for k, v in sections.items():
        # { "4fdf3b7e-f6ff-4d78-a928-c9d47c5bc9d9" = { "spl_product_data_elements": <VECTOR> } }
        # " ".join(v) equals v[0] when there is a single subsection, and unlike
        # v[0] it does not raise IndexError when the list is empty.
        vectors[key][k] = compute_section_embedding(" ".join(v))

# Embed the sample of FDA labels on the executor and report the timings.
vectors = {}
start = datetime.now()
loop = asyncio.get_event_loop()
looper = asyncio.gather(*[compute_vector_wrapper(key) for key in sample_keys])
results = loop.run_until_complete(looper)
end = datetime.now()
elapsed = end - start
elapsed_seconds = elapsed.total_seconds()

total_sections = sum(len(data_fda[key]['Label Text']) for key in sample_keys)
print(f"fin ------------- { int(elapsed_seconds) } seconds")
# Guard the averages against an empty sample and divide the un-truncated duration.
if sample_keys:
    print(f"{len(sample_keys)} drug labels processed: { elapsed_seconds / len(sample_keys) } seconds per drug")
if total_sections:
    print(f"{total_sections} sections processed: { elapsed_seconds / total_sections } seconds per section")

In [None]:
# ASYNCIO Tests
# 100 drug labels processed: 2.85 seconds per drug
# 2223 sections processed: 0.1282051282051282 seconds per section

# 500 drug labels processed: 3.202 seconds per drug
# 11593 sections processed: 0.13810057793496075 seconds per section

In [None]:
# Display the whole vectors dict (one entry per processed key; output can be very large).
vectors

In [None]:
# Serial baseline: embed the sample of FDA labels one by one and report the timings.
vectors = {}
section_count = 0
start = datetime.now()
# `tqdm` is the class itself here (the notebook does `from tqdm import tqdm`),
# so it is called directly; `tqdm.tqdm` would raise AttributeError.
for key in tqdm(sample_keys):
    drug = data_fda[key]
    sections = drug['Label Text']
    # print(key)
    vectors[key] = {}
    for k, v in sections.items():
        # { "4fdf3b7e-f6ff-4d78-a928-c9d47c5bc9d9" = { "spl_product_data_elements": <VECTOR> } }
        # " ".join(v) equals v[0] for a single subsection and tolerates an empty list.
        vectors[key][k] = compute_section_embedding(" ".join(v))
        # Count every section, not only the single-subsection ones.
        section_count += 1

end = datetime.now()
elapsed = end - start
elapsed_seconds = elapsed.total_seconds()

# Guard the averages against an empty sample and divide the un-truncated duration.
if sample_keys:
    print(f"{len(sample_keys)} drug labels processed: { elapsed_seconds / len(sample_keys) } seconds per drug")
if section_count:
    print(f"{section_count} sections processed: { elapsed_seconds / section_count } seconds per section")

In [None]:
from multiprocessing import Pool
import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

def compute_vector_wrapper(key):
    """Embed every 'Label Text' section of the FDA record ``key``.

    The result is stored in ``vectors[key]`` of the current process and also
    returned as ``(key, {section_name: vector})``. Returning it matters under
    multiprocessing: a worker process mutates its own copy of ``vectors``, so
    the parent only sees results that are sent back.
    """
    drug = data_fda[key]
    sections = drug['Label Text']
    # { "4fdf3b7e-f6ff-4d78-a928-c9d47c5bc9d9" = { "spl_product_data_elements": <VECTOR> } }
    # " ".join(v) equals v[0] for a single subsection and tolerates an empty list.
    section_vectors = {k: compute_section_embedding(" ".join(v)) for k, v in sections.items()}
    vectors[key] = section_vectors
    print(f"{key} completed", flush=True)
    return key, section_vectors

# NOTE(review): with the "spawn" start method, worker processes may be unable to
# import functions defined inside a notebook — confirm this runs on the target platform.
with Pool(4) as pool:
    # imap() is lazy: iterate it inside the `with` block so the work is actually
    # done before the pool is torn down, and copy each result into the parent's dict.
    for done_key, done_vectors in tqdm(pool.imap(compute_vector_wrapper, sample_keys), total=len(sample_keys)):
        vectors[done_key] = done_vectors

In [None]:
# Show the section counter maintained by the serial embedding loop.
print(section_count)

In [None]:
# Length of one stored section vector, as a sanity check.
len(vectors['4fdf3b7e-f6ff-4d78-a928-c9d47c5bc9d9']['spl_product_data_elements'])

In [None]:
# Round-trip one vector through JSON: convert it with .tolist(), dump it to a string, load it back.
test_vector = vectors['4fdf3b7e-f6ff-4d78-a928-c9d47c5bc9d9']['spl_product_data_elements']
test_vector_list = test_vector.tolist()
json_vector = json.dumps(test_vector_list)
json.loads(json_vector)

In [None]:
# Inspect the 'warnings' section of one record: number of subsections, then the content itself.
print(f"num subsections: {len(data_fda['ca8bfc0a-d43b-1072-e053-2995a90a66f0']['Label Text']['warnings'])}")
data_fda['ca8bfc0a-d43b-1072-e053-2995a90a66f0']['Label Text']['warnings']

In [None]:
# Display the full record for the same key.
data_fda['ca8bfc0a-d43b-1072-e053-2995a90a66f0']

In [None]:
# Check the Python type of a stored section vector.
type(vectors['4fdf3b7e-f6ff-4d78-a928-c9d47c5bc9d9']['spl_product_data_elements'])

In [None]:
!pip list