In [1]:
# Alberto Bejarano (2025)
# 00_openFDA_UMAP_v01
# pip install nbqa flake8
# nbqa flake8 openFDA_UMAP_v06.ipynb --select=F401

In [2]:
# These Jupyter notebooks process FDA adverse event data for a set of drugs. They first retrieve data using an API and extract relevant details about the types of adverse events
# for each drug. The data is then aggregated, grouped by drug and event, and pivoted into a table format for further analysis. The notebook applies data scaling and dimensionality
# reduction techniques, such as Principal Component Analysis (PCA), to standardize and reduce the dimensionality of the dataset. It uses clustering (Leiden algorithm) to group 
# similar data points and explores the relationships between the drugs and their adverse events. Finally, the notebook visualizes the results using UMAP, providing insights into 
# the patterns and associations between the drugs and the severity of their associated adverse events.

In [3]:
# 00_openFDA_UMAP_v01
# This script processes FDA adverse event data for a selected set of cancer drugs by retrieving records from the openFDA API. It fetches reports per drug in batches, extracting information
# on adverse events and associated seriousness flags (e.g., death, hospitalization). The data is cleaned and standardized by capitalizing drug and event names, and a full list of unique
# adverse events is saved to a text file. The script then aggregates and summarizes the data, computing total counts and percentages of each seriousness category per drug-event pair.
# This normalized dataset allows for comparison of adverse event profiles across drugs. Finally, the processed data is saved as a CSV file.

In [4]:
#import os   # Record the name of the active Conda environment
#env_name = os.getenv('CONDA_DEFAULT_ENV'); print(f"Active Conda environment: {env_name}")
#!conda env export > {env_name}.yml

In [5]:
# Start a wall-clock timer for the run (read again by the final cell) and print the start timestamp.
import time
from datetime import datetime

start = time.time()
print(datetime.now().strftime("%B %d, %Y %H:%M:%S"))

April 21, 2025 19:05:29


In [6]:
# Silence anndata's ImplicitModificationWarning for the rest of the session.
# The class is taken from anndata's public namespace instead of the private
# `anndata._core.aligned_df` module, whose layout is an implementation detail.
import warnings
from anndata import ImplicitModificationWarning
warnings.simplefilter("ignore", ImplicitModificationWarning)

In [7]:
import requests, time, matplotlib, contextlib, os
from IPython.display import Audio, display
#         .         .         .         .         .         .         .         .
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from itertools import product
import pandas as pd
# Notebook-wide pandas display settings: two-decimal floats, column limit of 200.
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_columns", 200)

In [8]:
# NOTE(review): no other cell visible in this notebook references `sc`, and scanpy's stochastic
# functions are usually seeded through their own `random_state` argument — confirm that
# assigning `sc.settings.seed` has any effect in the installed scanpy version.
import scanpy as sc; sc.settings.seed = 42

In [9]:
# Paging configuration for the download loop.
total_records = 10_000   # exclusive upper bound of the `skip` offsets requested per drug
batch_size    = 1_000    # page size: sent as `limit` and used as the step between offsets

In [10]:
# Endpoint that the download loop passes to `safe_request` for every batch.
url = "https://api.fda.gov/drug/event.json"

In [11]:
#adv_event_filter = ['Febrile neutropenia', 'Diarrhoea', 'Myelosuppression', 'Myalgia']

In [12]:
# Broad drug panel.
# NOTE: `drug_names` is reassigned by the two cells that follow, so when the notebook is run
# top to bottom this list never reaches the download loop.
drug_names = [
    "5-FU", "Abemaciclib", "Abiraterone", "Alectinib", "Anastrozole", "Apalutamide", "Atezolizumab", "Bevacizumab", "Cabazitaxel", "Cabozantinib", "Capecitabine",
    "Carboplatin", "Cetuximab", "Cisplatin", "Dabrafenib", "Docetaxel", "Durvalumab", "Enfortumab Vedotin", "Enzalutamide", "Etoposide", "Exemestane", "Fulvestrant",
    "Gemcitabine", "Goserelin", "Irinotecan", "Kadcyla", "Lenvatinib", "Letrozole", "Leucovorin", "Levothyroxine", "Lorlatinib", "Nivolumab", "Osimertinib",
    "Paclitaxel", "Palbociclib", "Panitumumab", "Pembrolizumab", "Pertuzumab", "Ribociclib", "Regorafenib", "Sorafenib", "Tamoxifen", "T-DM1", "T-DXd", 
    "Trametinib", "Trastuzumab", "Tremelimumab", "Vandetanib"]

In [13]:
# Extended list of brand names, generic names and development codes.
# NOTE: `drug_names` is reassigned again by the next cell, so under top-to-bottom execution
# this list is not the one that gets downloaded.
# NOTE(review): "Enhertu" and "ENHERTU" become the same `Drug_Name` once the names are
# title-cased later in the notebook; if the API search is case-insensitive, the same reports
# would be fetched and counted twice — confirm before enabling this list.
drug_names = ["Kadcyla", "Adcetris", "Polivy", "Enhertu", "Trodelvy",
    "Gemtuzumab ozogamicin", "Brentuximab vedotin", "Ado-trastuzumab emtansine", "Inotuzumab ozogamicin", "Moxetumomab pasudotox", "Polatuzumab vedotin",
    "Enfortumab vedotin", "Trastuzumab deruxtecan", "Sacituzumab govitecan", "Belantamab mafodotin", "Loncastuximab tesirine", "Tisotumab vedotin", "Mirvetuximab soravtansine",
    "Datroway", "BNT323/DB-1303", "BNT323", "B-1303", "ZL-1310", "IMGN853", "ABBV-181", "MCLA-128", "SYD985", "BTK-ADC", "STRO-002", "ENHERTU (DS-8201)",
    "DS-8201", "ENHERTU", "YTX-7739", "Q2W-ADC"]

In [14]:
# Active selection: the products queried when the notebook is run top to bottom
# (this assignment replaces the longer `drug_names` lists defined in the cells above).
drug_names = [
    "Kadcyla",
    "Adcetris",
    "Polivy",
    "Enhertu",
    "Trodelvy",
]

In [15]:
def safe_request(url, params, retries=5, delay=3):
    """GET ``url`` with ``params``, retrying failures that may be transient.

    Parameters
    ----------
    url : str
        Endpoint to query.
    params : dict
        Query-string parameters forwarded to ``requests.get``.
    retries : int, default 5
        Maximum number of attempts.
    delay : int or float, default 3
        Seconds to wait between two consecutive attempts.

    Returns
    -------
    requests.Response or None
        The first response whose status code is 200, or ``None`` when no
        attempt succeeded. Errors are never raised to the caller.

    Notes
    -----
    * Timeouts, connection errors, 5xx responses and 408/429 responses are
      retried after sleeping ``delay`` seconds, so a struggling or
      rate-limiting server is not hit again immediately.
    * Any other 4xx response gives up immediately, because repeating an
      identical request cannot change a client error. This includes 404,
      which openFDA reportedly returns when a search matches nothing —
      TODO confirm against the openFDA API documentation.
    * Any other ``requests`` exception (e.g. a malformed URL) also gives up
      immediately.
    """
    retryable_client_errors = (408, 429)   # request timeout, too many requests

    for attempt in range(retries):
        try:
            response = requests.get(url, params=params, timeout=30)
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError):
            pass   # possibly transient: fall through to the wait-and-retry step
        except requests.exceptions.RequestException:
            return None   # not something a retry can fix
        else:
            status = response.status_code
            if status == 200:
                return response
            if 400 <= status < 500 and status not in retryable_client_errors:
                return None   # permanent client error: do not waste further attempts

        # Pause before the next attempt, but not after the last one.
        if attempt < retries - 1:
            time.sleep(delay)

    return None  # All retries failed

In [16]:
# Accumulator for the flattened records appended by the download loop.
all_records = list()

In [None]:
# Download adverse-event reports for every drug in `drug_names` and flatten them into
# one record per (report, reaction) pair, each carrying the report's seriousness flags.
#
# `all_records` is re-created here so that re-running this cell does not append a second
# copy of every record to the list left over from a previous run.
all_records = []

for drug_name in tqdm(drug_names):
    # The name is wrapped in double quotes so that a multi-word product name is searched as
    # one phrase in `medicinalproduct` instead of being split into separate search terms.
    search_term = f'patient.drug.medicinalproduct:"{drug_name}"'

    for skip in range(0, total_records, batch_size):
        params = {"search": search_term, "limit": batch_size, "skip": skip}
        response = safe_request(url, params)

        if response is None:
            continue    # Skip failed attempt

        results = response.json().get("results", [])

        if not results:
            print(f"No data returned for batch {skip}.")
            continue    # Skip if no results

        for result in results:

            # Seriousness flags are read from the report itself; a missing flag counts as 0.
            seriousness = {
                "Death":           int(result.get("seriousnessdeath",           0)),
                "Hospitalization": int(result.get("seriousnesshospitalization", 0)),
                "LifeThreatening": int(result.get("seriousnesslifethreatening", 0)),
                "Other":           int(result.get("seriousnessother",           0))}

            # Every reaction listed in the report becomes its own record with the same flags.
            for reaction in result.get("patient", {}).get("reaction", []):
                record = {"Drug_Name": drug_name, "Adverse_Event": reaction.get("reactionmeddrapt", "Unknown"), **seriousness}
                all_records.append(record)

  0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# One DataFrame row per downloaded (report, reaction) record.
df = pd.DataFrame(all_records)
print(len(df))
# Preview up to three random rows; a bare `sample(3)` raises ValueError when the frame
# holds fewer than three rows.
df.sample(min(3, len(df)))

In [None]:
# Standardize capitalization of drug and event names.
# Note that `str.title()` also lower-cases letters inside a word, e.g. "T-DM1" -> "T-Dm1".
df['Drug_Name']     = df['Drug_Name'].str.title()
df['Adverse_Event'] = df['Adverse_Event'].str.title()
print(len(df))
# Preview up to three random rows; a bare `sample(3)` raises ValueError when the frame
# holds fewer than three rows.
df.sample(min(3, len(df)))

In [None]:
# Compare the requested drugs with the drugs that actually produced records.
drug_names_list = df['Drug_Name'].unique().tolist()

list2_set = set(drug_names_list)
# `Drug_Name` was title-cased earlier, so each requested name is title-cased the same way
# before the lookup; otherwise names such as "T-DM1" or "Gemtuzumab ozogamicin" would always
# be reported as missing even when records were retrieved for them.
diff = [item for item in drug_names if item.title() not in list2_set]

# Printed in green: requested names, names with at least one record, requested names without records.
drug_names_word_list1  =  ", ".join(sorted(drug_names));       print('\n\033[32m', drug_names_word_list1, '\033[0m')
drug_names_word_list2  =  ", ".join(sorted(drug_names_list));  print('\n\033[32m', drug_names_word_list2, '\033[0m')
diff_word_list         =  ", ".join(sorted(diff));             print('\n\033[32m', diff_word_list,        '\033[0m\n')

In [None]:
# Distinct adverse-event terms, in order of first appearance in `df`.
adverse_event_list = df['Adverse_Event'].drop_duplicates().tolist()

print(len(adverse_event_list))
# Show the first ten collected terms, sorted alphabetically, in green.
first_ten_sorted = sorted(adverse_event_list[:10])
print('\n\033[32m', first_ten_sorted, '\033[0m\n')

In [None]:
# Save every distinct adverse-event term, one per line.
os.makedirs("./data", exist_ok=True)   # the output folder may not exist yet
# Explicit UTF-8 so the file does not depend on the platform's default encoding.
with open("./data/adverse_event_list.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{item}\n" for item in adverse_event_list)

In [None]:
# Sum every seriousness flag per (drug, adverse event) pair.
agg = df.groupby(["Drug_Name", "Adverse_Event"]).sum(numeric_only=True)
# 'Total' is the row-wise sum over all seriousness flags of the pair.
agg = agg.assign(Total=agg.sum(axis=1))
# Flatten the two-level index back into columns and list the largest totals first.
agg_reset = agg.reset_index().sort_values(by='Total', ascending=False)
print(len(agg_reset))
agg_reset.head(5)

In [None]:
# Optional filter (disabled): keep only the adverse events named in `adv_event_filter`.
#agg_reset = agg_reset[agg_reset['Adverse_Event'].isin(adv_event_filter)]
print(len(agg_reset))
agg_reset.head(5)

In [None]:
# Express every count as a percentage of its drug's column total.
# For each of Death, Hospitalization, LifeThreatening, Other and Total, a new '%<column>' column
# holds value * 100 / (sum of that column over all adverse events of the same drug), so within
# one drug each '%<column>' adds up to 100.
# When a drug's column total is 0 the division yields NaN, which is replaced by 0 below.
for col in ['Death', 'Hospitalization', 'LifeThreatening', 'Other', 'Total']:
    agg_reset[f'%{col}'] = agg_reset[col] * 100 / agg_reset.groupby('Drug_Name')[col].transform('sum')

agg_reset = agg_reset.fillna(0)    # Fill NaN values (if any)

print(len(agg_reset)); agg_reset.head(5)

In [None]:
# Write the aggregated table (counts plus percentage columns) to CSV, without the index.
agg_reset.to_csv("./data/drug_AEs.csv", index=False)
# Alternative output file name (disabled):
#agg_reset.to_csv("./data/standard_of_care.csv", index=False)

In [None]:
# Play a chime (auto-started) to signal that the notebook has reached this cell.
audio_file = "./data/clock-chime-88027.mp3"
chime = Audio(audio_file, autoplay=True)
display(chime)

In [None]:
# Report the wall-clock time elapsed since `start` was recorded at the top of the notebook.
elapsed_time = time.time() - start
minutes, seconds = divmod(elapsed_time, 60)
run_time_text = f"{int(minutes)} min {int(seconds)} sec."
print("'00_openFDA_UMAP_v01' script run time:", run_time_text)