In [1]:
# 00_openFDA_AEs_Data_v03
# Author: Alberto Bejarano (2025)
# This script retrieves adverse event data from the FDA Open API for a list of drugs

In [2]:
# This script queries the FDA Adverse Event Reporting System (FAERS) API to retrieve and analyze adverse events associated
# with a curated list of drugs, including synonyms defined in a dictionary. It handles batching, safe retries, and structured
# parsing of seriousness outcomes (e.g., death, hospitalization). The results are normalized, aggregated by drug and adverse
# event, and saved to a CSV file for downstream analysis. The script also supports testing via hardcoded drug inputs, outputs
# summary statistics, and optionally plays a notification chime upon completion.

In [3]:
# With no API key: 240 requests per minute, per IP address.  1,000 requests per day, per IP address.
# With an API key: 240 requests per minute, per key.       120,000 requests per day, per key.

In [4]:
import warnings
import requests, os, time, json
from datetime import datetime
from IPython.display import Audio, display
from tqdm import tqdm
import pandas as pd
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_columns = 200

In [5]:
# Capture the wall-clock start so the last cell can report the total runtime.
start = time.time()
print(f"{datetime.now():%B %d, %Y %H:%M:%S}")

May 28, 2025 22:15:59


In [6]:
# openFDA drug adverse-event endpoint and paging settings.
api_fda_url = "https://api.fda.gov/drug/event.json"
batch_size = 100          # sent as `limit` on every paged request
total_records = 50_000    # initial value only; the download loop reassigns it per drug from the API metadata

In [7]:
# Inputs: one drug name per line, and a tab-separated synonym dictionary.
input_drug_list_file_path = "./data/drugs_of_interest.txt"
drug_dict_file_path = "./data/drug_dictionary.csv"

# Outputs: the list of terms still to be queried, and the aggregated adverse-event table.
output_drug_list_file_path = "./data/drugs_of_interest_extended.txt"
aes_output_file = "./data/AEs_drugs_of_interest.csv"

In [8]:
def load_api_key(json_path="./data/fda_api_key.json"):
    """Return the value stored under ``"api_key"`` in the JSON file at ``json_path``.

    Errors from ``open``/``json.load`` (missing or malformed file) propagate
    unchanged, as does the ``KeyError`` raised when the file has no
    ``"api_key"`` entry.
    """
    with open(json_path) as key_file:
        return json.load(key_file)["api_key"]
# Read the key once from the default JSON path; later cells send it as the `api_key` query parameter.
api_key = load_api_key()    

In [9]:
# Read the drugs of interest, one name per line. Blank lines are skipped so they
# do not become empty-string "drugs" in the lists and counts built from this one.
with open(input_drug_list_file_path, 'r') as file:
    drugs_of_interest = [line.strip() for line in file if line.strip()]
#drugs_of_interest = ['Fluorouracil', 'Exemestane', 'Tamoxifen'] Just for testing
print(drugs_of_interest)
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .
# Build one synonym group per dictionary row: [Drug_Name, other name 1, other name 2, ...].
# Non-breaking spaces (\xa0) are removed from every name.
nested_list = []
dictionary_df = pd.read_csv(drug_dict_file_path, sep='\t')
for index, row in dictionary_df.iterrows():
    drug_name = row['Drug_Name'].strip().replace('\xa0', "")
    other_names = []
    if pd.notna(row['Other_Drug_Names']):
        other_names = [name.strip().replace('\xa0', "") for name in row['Other_Drug_Names'].split(', ')]
    nested_list.append([drug_name] + other_names)
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .
# For each drug of interest, take the first group that contains it (exact, case-sensitive
# match) and collect every name in that group; the result is de-duplicated and sorted.
drugs_of_interest_extended = []
for drug in drugs_of_interest:
    for sublist in nested_list:
        if drug in sublist:
            drugs_of_interest_extended.extend(sublist)
            break
drugs_of_interest_extended = sorted(set(drugs_of_interest_extended))

['Fluorouracil']


In [10]:
# Drug names already present in the output CSV; an empty list when the file does not exist yet.
try:
    aes_df = pd.read_csv(aes_output_file)
    drugs_to_remove = aes_df["Drug_Name"].dropna().unique().tolist()
    print(drugs_to_remove)
except FileNotFoundError:
    drugs_to_remove = []
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .
# Rebuild the synonym groups from the dictionary: [Drug_Name, other name 1, other name 2, ...]
dictionary_df = pd.read_csv(drug_dict_file_path, sep='\t')
nested_list = []
for index, row in dictionary_df.iterrows():
    drug_name = row['Drug_Name'].strip().replace('\xa0', "")
    raw_aliases = row['Other_Drug_Names']
    if pd.notna(raw_aliases):
        other_names = [alias.strip().replace('\xa0', "") for alias in raw_aliases.split(', ')]
    else:
        other_names = []
    nested_list.append([drug_name, *other_names])
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .
# Expand every already-downloaded drug to the full synonym group that first contains it.
matched_terms = set()
for drug in drugs_to_remove:
    group = next((names for names in nested_list if drug in names), None)
    if group is not None:
        matched_terms.update(group)
drugs_to_remove_extended = sorted(matched_terms)

['fluorouracil', 'tamoxifen']


In [11]:
# Keep only the terms whose synonym group is not already in the output file (order preserved).
already_downloaded = set(drugs_to_remove_extended)
filtered_list = [drug for drug in drugs_of_interest_extended if drug not in already_downloaded]

In [12]:
# Testing override: replaces the synonym list derived from the input files with a fixed one.
# Alternative test set: ['5-FU', '5-Fu', 'S-fu', 'Fluorouracil', '5-fluorouracil']
drugs_of_interest_extended = ["Fluorouracil"]

In [13]:
# For each list: its size and an alphabetically sorted, comma-separated rendering.
len_list1 = len(drugs_of_interest)
len_list2 = len(drugs_of_interest_extended)
len_list3 = len(drugs_to_remove)
len_list4 = len(drugs_to_remove_extended)
len_list5 = len(filtered_list)

word_list1 = ", ".join(sorted(drugs_of_interest))
word_list2 = ", ".join(sorted(drugs_of_interest_extended))
word_list3 = ", ".join(sorted(drugs_to_remove))
word_list4 = ", ".join(sorted(drugs_to_remove_extended))
word_list5 = ", ".join(sorted(filtered_list))

In [14]:
# Colour-coded summary of the five lists: the count is printed in red (ANSI 31), the
# heading in yellow (ANSI 33), and the comma-separated names in the default colour.
print(f'\n\033[31m   {len_list1}  \033[0m \033[33m Original Drugs of Interest: \n\033[0m{word_list1} \n')  
print(f'\033[31m   {len_list2}  \033[0m \033[33m Extended Drugs of Interest: \n\033[0m{word_list2}\n') 
print(f'\033[31m   {len_list3}  \033[0m \033[33m Drugs in the Output File: \n\033[0m{word_list3}\n')
print(f'\033[31m   {len_list4}  \033[0m \033[33m Drugs in the Output File Extended: \n\033[0m{word_list4}\n')
print(f'\033[31m   {len_list5}  \033[0m \033[33m Filtered List: \n\033[0m{word_list5}\n')


[31m   1  [0m [33m Original Drugs of Interest: 
[0mFluorouracil 

[31m   1  [0m [33m Extended Drugs of Interest: 
[0mFluorouracil

[31m   2  [0m [33m Drugs in the Output File: 
[0mfluorouracil, tamoxifen

[31m   15  [0m [33m Drugs in the Output File Extended: 
[0m5 FU, 5 Fluorouracil, 5-FU, 5-Fluorouracil, Actikerall, Efudex, Fluoroplex, Fluorouracil, ICI 47699, ICI-47699, Soltamox, Tamoxifen, Tolak, fluorouracil, tamoxifen

[31m   0  [0m [33m Filtered List: 
[0m



In [15]:
# Persist the terms that are still to be queried, one per line.
with open(output_drug_list_file_path, 'w') as file:
    file.writelines(f"{term}\n" for term in filtered_list)

In [16]:
manual_cache = {}   # request key -> response object; populated by safe_request


def dict_to_key(d):
    """Serialise ``d`` as ``key=value`` pairs joined by ``&``, with keys in sorted order."""
    pairs = []
    for name in sorted(d):
        pairs.append(f"{name}={d[name]}")
    return "&".join(pairs)

In [17]:
def safe_request(api_fda_url, params, retries=3, delay=1, backoff=2):
    """GET ``api_fda_url`` with ``params``, retrying transient failures with backoff.

    HTTP 200 responses are stored in ``manual_cache`` under a key built from the
    URL and the sorted parameters, so an identical request is answered from the
    cache instead of being sent again.

    Parameters
    ----------
    api_fda_url : str
        Endpoint to query.
    params : dict
        Query-string parameters passed to ``requests.get``.
    retries : int
        Maximum number of attempts.
    delay, backoff : number
        Attempt ``i`` (0-based) waits ``delay * backoff ** i`` seconds before the
        next attempt.

    Returns
    -------
    The response object on HTTP 200, otherwise ``None``.

    Behaviour per outcome
    ---------------------
    * 400 / 404: warn and give up immediately (retrying cannot help).
    * 429 and any other non-200 status (e.g. 5xx): warn, back off, retry.
    * Timeouts: back off, retry.
    * Any other ``requests`` error: warn and give up.
    """
    key = f"{api_fda_url}?{dict_to_key(params)}"
    if key in manual_cache:
        return manual_cache[key]  # Identical request already answered

    for attempt in range(retries):
        has_attempts_left = attempt < retries - 1
        wait = delay * (backoff ** attempt)
        try:
            response = requests.get(api_fda_url, params=params, timeout=30)
        except requests.exceptions.Timeout:
            if has_attempts_left:          # No point sleeping after the final attempt
                time.sleep(wait)
            continue
        except requests.exceptions.RequestException as e:
            # The exception text can embed the full request URL, which includes the
            # API key; mask it before it reaches the notebook output.
            message = str(e)
            secret = params.get("api_key")
            if secret:
                message = message.replace(str(secret), "***")
            warnings.warn(f"Request failed due to: {message}")
            break

        if response.status_code == 200:
            manual_cache[key] = response   # Cache successful response
            return response

        warnings.warn(f"Request failed with status {response.status_code}: {response.reason}")
        if response.status_code in (400, 404):
            break                          # Client errors are not retried
        if has_attempts_left:
            time.sleep(wait)               # Rate limit / server error: wait before retrying
    return None  # All attempts failed

In [18]:
# Testing override: the download loop queries only the names listed here.
# Alternative test set: ['Fluorouracil', 'Soltamox']
filtered_list = ["Soltamox"]

In [19]:
# Query the FAERS endpoint for every name in `filtered_list` and flatten the reports
# into `all_records`: one dict per (report, reaction) pair carrying the queried name,
# the reaction term and the report's four seriousness flags.
all_records = []
max_skip = 25000           # Largest `skip` value the openFDA API accepts
for drug_name in tqdm(filtered_list, leave=False):
    records_this_drug = 0  # Counter for current drug records

    # Quote the name so a multi-word synonym (e.g. "5 FU") is searched as one phrase
    # instead of being split into separate search terms.
    quoted_name = drug_name.replace('"', '\\"')
    search_term = f'patient.drug.medicinalproduct:"{quoted_name}"'

    # Ask for a single record just to read how many reports match this name
    meta_params = {"search": search_term, "api_key": api_key, "limit": 1}
    meta_response = safe_request(api_fda_url, meta_params)
    if not meta_response:
        warnings.warn(f"Metadata request failed for {drug_name}. Skipping.")       # Warn and skip if failed
        continue
    try:
        total_records = meta_response.json()["meta"]["results"]["total"]           # Extract total number of records
    except (KeyError, TypeError, ValueError) as e:                                 # ValueError: body was not valid JSON
        warnings.warn(f"Metadata parsing failed for {drug_name}: {e}. Skipping.")  # Warn and skip if missing
        continue
    if total_records > max_skip + batch_size:
        warnings.warn(f"{drug_name}: {total_records} reports match, but paging stops at skip={max_skip}; "
                      f"later reports are not retrieved.")

    consecutive_failures = 0  # Track number of failed batches in a row

    # Never request a `skip` beyond the API limit: those calls would only fail
    # and be misreported as "end of data".
    for skip in range(0, min(total_records, max_skip + 1), batch_size):
        params = {"search": search_term,                                   # Drug-specific search
                  "api_key": api_key, "limit": batch_size, "skip": skip}   # API key / Batch size for pagination / Pagination offset

        if (skip // batch_size) % 100 == 0:                                # Progress line every 100 batches
            print(f"Trying FDA API with search {drug_name} and skip {params['skip']}")  # Avoid printing API key
        response = safe_request(api_fda_url, params)
        if not response:
            consecutive_failures += 1
            if consecutive_failures >= 2:  # Adjust threshold for stopping if multiple failures
                print(f"Too many failed batches. Assuming end of data for {drug_name}.")
                break
            continue                                                       # Skip failed attempt
        consecutive_failures = 0  # Reset if success

        results = response.json().get("results", [])
        if not results:
            print(f"No data returned for batch {skip}.")
            break                                                          # Stop if no results returned
        for result in results:
            # Report-level flags; a missing flag is counted as 0
            seriousness = {
                "Death":           int(result.get("seriousnessdeath",           0)),
                "Hospitalization": int(result.get("seriousnesshospitalization", 0)),
                "LifeThreatening": int(result.get("seriousnesslifethreatening", 0)),
                "Other":           int(result.get("seriousnessother",           0))}
            # Every reaction on the report inherits the report's flags
            for reaction in result.get("patient", {}).get("reaction", []):
                record = {"Drug_Name": drug_name, "Adverse_Event": reaction.get("reactionmeddrapt", "Unknown"), **seriousness}
                all_records.append(record)
                records_this_drug += 1                                      # Count record for this drug
        if len(results) < batch_size:
            print(f"Reached end of data for {drug_name}.")
            break                                                           # Stop pagination if last batch has fewer records than batch size
    print(f"Total records retrieved for drug '{drug_name}': {records_this_drug}")

  0%|          | 0/1 [00:00<?, ?it/s]

Trying FDA API with search Soltamox and skip 0


                                             

Reached end of data for Soltamox.
Total records retrieved for drug 'Soltamox': 360




In [20]:
print(f"\nNumber of records: {len(all_records)}")
# Name the columns explicitly so the frame has the expected schema even when no
# records were retrieved (otherwise the column look-ups below raise KeyError).
record_columns = ["Drug_Name", "Adverse_Event", "Death", "Hospitalization", "LifeThreatening", "Other"]
df = pd.DataFrame(all_records, columns=record_columns)
# Keep the flag columns integer-typed in the empty case too, so numeric-only
# aggregation further down still sees them.
df = df.astype({flag: "int64" for flag in record_columns[2:]})
df['Drug_Name']     = df['Drug_Name'].str.title()
df['Adverse_Event'] = df['Adverse_Event'].str.title()


Number of records: 360


In [21]:
# Lower-cased synonym groups: [primary name, alias 1, alias 2, ...], one per dictionary row.
nested_list = []
for index, row in dictionary_df.iterrows():
    drug_name = row['Drug_Name'].strip().replace('\xa0', "").lower()     # Normalize to lowercase
    other_names = []                                                     # Reset per row so a drug without aliases
                                                                         # does not inherit the previous row's aliases
    if pd.notna(row['Other_Drug_Names']):
        other_names = [name.strip().replace('\xa0', "").lower() for name in row['Other_Drug_Names'].split(', ')]
    nested_list.append([drug_name] + other_names)

In [22]:
# Map every name in a synonym group (the primary name included) to the group's
# first entry. A name that appears in several groups ends up mapped to the last one.
alias_to_primary = {}
for names in nested_list:
    alias_to_primary.update(dict.fromkeys(names, names[0]))

In [23]:
# Lower-case the queried names, then replace each alias with its primary name;
# names missing from the mapping are left as they are.
lowercase_names = df['Drug_Name'].str.lower()
df['Drug_Name'] = lowercase_names.map(lambda name: alias_to_primary.get(name, name))
drug_names_list = df['Drug_Name'].unique().tolist()
list2_set = set(drug_names_list)
print(f'Unique drug names after alias replacement: {list2_set}')

Unique drug names after alias replacement: {'tamoxifen'}


In [24]:
# Terms of interest for which no data ended up in the table. The table holds
# lower-cased primary names, so each term is normalised the same way (lower-case,
# then alias -> primary) before the comparison; comparing the raw mixed-case
# synonyms would flag every term as missing.
diff = [item for item in drugs_of_interest_extended
        if alias_to_primary.get(item.lower(), item.lower()) not in list2_set]

In [25]:
# Requested terms, drug names present in the table, and requested terms without data.
names_wrd_list1 = ", ".join(sorted(drugs_of_interest_extended))
print('\n\033[32m', names_wrd_list1, '\033[0m')
names_wrd_list2 = ", ".join(sorted(drug_names_list))
print('\n\033[32m', names_wrd_list2, '\033[0m')
diff_wrd_list = ", ".join(sorted(diff))
print('\n\033[32m', diff_wrd_list, '\033[0m')
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .
# Distinct, non-blank adverse-event terms.
unique_events = df['Adverse_Event'].unique().tolist()
adverse_event_list = [event.strip() for event in unique_events if event and event.strip()]
# Note: these are the first ten terms in alphabetical order, not the ten most frequent.
top10_aes = sorted(adverse_event_list)[:10]
aes_wrd_list = ", ".join(top10_aes)
print(f'\nNumber of different AES found: \033[31m{len(adverse_event_list)}\033[0m')
print('\n\033[32m', aes_wrd_list, '\033[0m\n')


[32m Fluorouracil [0m

[32m tamoxifen [0m

[32m Fluorouracil [0m

Number of different AES found: [31m129[0m

[32m Abdominal Pain Upper, Ageusia, Agitation, Agranulocytosis, Alanine Aminotransferase Increased, Alopecia, Anxiety, Arthralgia, Ascites, Aspartate Aminotransferase Increased [0m



In [26]:
# Save the distinct adverse-event terms, one per line.
with open("./data/adverse_event_list.txt", "w") as f:
    f.writelines(f"{item}\n" for item in adverse_event_list)

In [27]:
# Sum the seriousness flags per (drug, adverse event), add their row-wise total,
# and order the table from the largest total to the smallest.
seriousness_counts = df.groupby(["Drug_Name", "Adverse_Event"]).sum(numeric_only=True)
agg = seriousness_counts.assign(Total=seriousness_counts.sum(axis=1))
agg_reset = agg.reset_index().sort_values(by="Total", ascending=False)
print(f"\nNumber of rows in aggregated table: {len(agg_reset)}")


Number of rows in aggregated table: 129


In [28]:
# Express each count column as a percentage of that column's total within the same
# drug: for every drug, each of %Death, %Hospitalization, %LifeThreatening, %Other
# and %Total sums to 100 across its adverse events (when the column total is non-zero).
share_columns = ['Death', 'Hospitalization', 'LifeThreatening', 'Other', 'Total']
for col in share_columns:
    agg_reset[f'%{col}'] = agg_reset[col] * 100 / agg_reset.groupby('Drug_Name')[col].transform('sum')
# A column whose per-drug total is 0 yields 0/0 = NaN; report those shares as 0.
agg_reset = agg_reset.fillna(0)
print(len(agg_reset))
agg_reset.head(5)

129


Unnamed: 0,Drug_Name,Adverse_Event,Death,Hospitalization,LifeThreatening,Other,Total,%Death,%Hospitalization,%LifeThreatening,%Other,%Total
49,tamoxifen,Fatigue,11,12,15,11,49,2.84,3.03,2.39,3.01,2.76
12,tamoxifen,Balance Disorder,10,10,16,9,45,2.58,2.53,2.55,2.46,2.53
126,tamoxifen,Vomiting,8,10,14,10,42,2.06,2.53,2.23,2.73,2.36
18,tamoxifen,Blood Phosphorus Increased,7,7,14,7,35,1.8,1.77,2.23,1.91,1.97
10,tamoxifen,Asthenia,8,8,12,7,35,2.06,2.02,1.91,1.91,1.97


In [29]:
# Keep, for every drug, at most the 100 adverse events with the highest Total.
max_events_per_drug = 100
df_sorted = agg_reset.sort_values(['Drug_Name', 'Total'], ascending=[True, False])
df_sorted["rank"] = df_sorted.groupby("Drug_Name").cumcount()   # 0-based position within each drug
is_top_event = df_sorted["rank"].lt(max_events_per_drug)
df_commonaes = df_sorted.loc[is_top_event].drop(columns="rank").reset_index(drop=True)
print(len(df_commonaes))
df_commonaes.head(5)

100


Unnamed: 0,Drug_Name,Adverse_Event,Death,Hospitalization,LifeThreatening,Other,Total,%Death,%Hospitalization,%LifeThreatening,%Other,%Total
0,tamoxifen,Fatigue,11,12,15,11,49,2.84,3.03,2.39,3.01,2.76
1,tamoxifen,Balance Disorder,10,10,16,9,45,2.58,2.53,2.55,2.46,2.53
2,tamoxifen,Vomiting,8,10,14,10,42,2.06,2.53,2.23,2.73,2.36
3,tamoxifen,Blood Phosphorus Increased,7,7,14,7,35,1.8,1.77,2.23,1.91,1.97
4,tamoxifen,Asthenia,8,8,12,7,35,2.06,2.02,1.91,1.91,1.97


In [30]:
# Report how many rows and which drugs remain after the per-drug cut.
drugs_commonaes_list = df_commonaes["Drug_Name"].dropna().unique().tolist()
len_list6 = len(drugs_commonaes_list)
word_list6 = ", ".join(sorted(drugs_commonaes_list))
print(f'\033[33m Rows in the df:\t \033[31m {len(df_commonaes)} ')
print(f'\033[33m Filtered List lenght:\t \033[31m {len_list6}  \033[0m \n\033[0m{word_list6}')

[33m Rows in the df:	 [31m 100 
[33m Filtered List lenght:	 [31m 1  [0m 
[0mtamoxifen


In [31]:
# Round the percentage columns of the aggregated table to two decimals.
columns_to_format = ['%Death', '%Hospitalization', '%LifeThreatening', '%Other', '%Total']
for column in columns_to_format:
    agg_reset[column] = agg_reset[column].map(lambda value: round(value, 2))

In [32]:
%%time
if os.path.exists(aes_output_file):
    existing = pd.read_csv(aes_output_file)
    updated  = pd.concat([existing, agg_reset], ignore_index=True)
    updated  = updated.drop_duplicates()  # Remove exact duplicate rows
    updated.to_csv(aes_output_file, index=False)
    print(f"\nUpdated AE data (duplicates removed) saved to: {aes_output_file}")
else:
    agg_reset.to_csv(aes_output_file, index=False)
    print(f"\nNew AE file created: {aes_output_file}")


Updated AE data (duplicates removed) saved to: ./data/AEs_drugs_of_interest.csv
CPU times: total: 109 ms
Wall time: 119 ms


In [33]:
# Play a completion chime, but only when the sound file is available.
audio_file = "./data/clock-chime-88027.mp3"
chime_available = os.path.exists(audio_file)
if chime_available:
    chime = Audio(audio_file, autoplay=True)
    display(chime)

In [34]:
# Report the wall-clock time elapsed since `start`, in whole minutes and seconds.
elapsed_time = time.time() - start
mins, secs = divmod(elapsed_time, 60)
whole_minutes, whole_seconds = int(mins), int(secs)
print(f"\nScript completed in: {whole_minutes} min {whole_seconds} sec.")


Script completed in: 0 min 3 sec.
