In [1]:
# Alberto Bejarano (2025)
# 00_openFDA_AES_Data_v06

In [2]:
# This script is designed to retrieve and process adverse event data from the FDA API for specified drugs. It operates
# in a modular manner, encapsulating functionality within distinct functions to enhance clarity and maintainability.
# Initially, the script loads an API key from a JSON file, which is used to authenticate and manage request limits
# with the FDA API. Next, it reads drug aliases from an Excel file to ensure consistent naming of drugs, accounting
# for multiple synonyms.
# For each drug specified in the drugs_of_interest list, the script fetches adverse event data in batches, handling
# potential errors such as rate limits using an exponential backoff strategy. The data is requested from the FDA API,
# focusing on serious adverse event categories like death, hospitalization, and life-threatening conditions.
# The fetched data is organized into a pandas DataFrame, which facilitates further data manipulation and analysis.
# Finally, the processed data for each drug is saved into individual CSV files, named dynamically to reflect the drug
# being analyzed. This ensures clear data organization and prevents overwriting of files. Throughout, the script
# employs logging and warnings for diagnostic purposes, helping users track the progress and identify any issues
# encountered during API interaction and data processing. The modular design makes it straightforward to adapt for
# other datasets or extend functionality based on evolving data requirements.

In [3]:
import warnings
class RequestWarning(UserWarning):   pass
class RetryWarning(UserWarning):     pass
class DrugFetchWarning(UserWarning): pass
class DataEndWarning(UserWarning):   pass

In [4]:
BATCH_SIZE     = 100                      # Number of records per batch during API requests
TOTAL_RECORDS  = 50000                    # Total number of records to fetch for each drug

In [5]:
import json, requests, time, os
import pandas as pd
import warnings
from tqdm import tqdm # Ensure tqdm is imported for progress bar

In [6]:
API_FDA_URL     = "https://api.fda.gov/drug/event.json"
JSON_FILE_PATH  = "./data/fda_api_key.json"
DRUG_DICT_FILE  = "./data/Drug_Dictionary.csv"

In [7]:
print(f"\nStandards Of Care")
INPUT_DRUG_LIST_FILE = './data/StandardsOfCare.txt'
AES_OUTPUT_FILE      = './data/AdverseEvents/AES_StandardsOfCare'
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .
#print(f"\nDrugs Of Interest")
#INPUT_DRUG_LIST_FILE = './data2/DrugsofInterest.txt'
#AES_OUTPUT_FILE      = './data2/AdverseEvents/AES_DrugsofInterest'


Standards Of Care


In [8]:
def load_api_key(json_file_path):          # Load API key from a JSON file
    with open(json_file_path, 'r') as json_file:
        data = json.load(json_file)
        api_key = data.get('api_key')
        print("API key loaded.")          # Confirmation message
        return api_key                    # Return the loaded API key

In [9]:
def check_file_exists(file_path):           # Check if a file exists.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Required file {file_path} not found.")
    print(f"File {file_path} exists.")    # Confirmation message

In [10]:
def send_safe_request(BASE_URL, params, api_key=None, retries=3, initial_delay=1, backoff=2):
    """ Send API request to FDA with error handling and exponential backoff for managing rate limits. """
    delay = initial_delay                # Set initial delay for backoff strategy
    if api_key:
        params['api_key'] = api_key      # Include API key in request parameters if available
    for i in range(retries):
        try:
            response = requests.get(API_FDA_URL, params=params, timeout=30) # Send a GET request
            if response.status_code == 200:     # Successful request
                return response                 # Return successful response
            elif response.status_code == 429:   # Rate limit exceeded, retry with delay
                warnings.warn("Rate limit exceeded. Retrying...")
                time.sleep(delay * (backoff ** i))
            elif response.status_code in [400, 404]:   # Bad request or not found
                warnings.warn(f"Request failed with status {response.status_code}: {response.reason}")
                break
        except requests.exceptions.ReadTimeout:         # Handle request timeout
            warnings.warn("Read timeout occurred. Retrying...")
            time.sleep(delay * (backoff ** i))
        except requests.exceptions.RequestException as e:  # Handle other request exceptions
            warnings.warn(f"Request failed due to: {e}")
            break
    warnings.warn("All retries failed.")               # Warn if all retries fail
    return None                                         # Return None if request fails

In [11]:
def get_drug_aliases(drug_dict_file):
    """ Retrieve drug aliases from an Excel file and organize them into a dictionary. """
    dictionary_df = pd.read_csv(drug_dict_file, sep='\t')         # Load CSV file into a DataFrame
    alias_dict = {}                                     # Initialize dictionary for aliases
    for _, row in dictionary_df.iterrows():             # Iterate over each row in the DataFrame
        drug_name = row['Drug_Name'].strip().replace('\xa0', '').lower() # Get primary drug name, cleaned and formatted
        alias_dict[drug_name] = drug_name               # Map primary name to itself
        if pd.notna(row['Other_Drug_Names']):           # Check for any aliases
            aliases = [name.strip().replace('\xa0', '').lower() for name in row['Other_Drug_Names'].split(',')]
            for alias in aliases:
                alias_dict[alias] = drug_name           # Map each alias to the primary name
    return alias_dict                                   # Return the dictionary of aliases

In [12]:
def fetch_adverse_events(drug_name, api_key, alias_dict):
    """ Fetch adverse event data for the drug and return a DataFrame. """
    all_records = []                                     # List to store all fetched records

    # Request metadata to estimate total number of records for current drug
    meta_params = {"search": f"patient.drug.medicinalproduct:{drug_name}", "api_key": api_key, "limit": 1}
    meta_response = send_safe_request(API_FDA_URL, meta_params)                         # Make metadata request
    if not meta_response:                                                          # Handle failed metadata fetch
        warnings.warn(f"Metadata request failed for '{drug_name}'. Skipping.")
        return pd.DataFrame()
    try:
        TOTAL_RECORDS = meta_response.json()["meta"]["results"]["total"]           # Extract total number of records
    except (KeyError, TypeError) as e:                                              # Handle parsing issues
        warnings.warn(f"Metadata parsing failed for '{drug_name}': {e}. Skipping.")
        return pd.DataFrame()

    for skip in tqdm(range(0, TOTAL_RECORDS, BATCH_SIZE), desc=f"Fetching {drug_name} AES", unit="batch"):
        # Set parameter for search query including medicinal product name, batch size, and skip count for pagination
        params = {"search": f"patient.drug.medicinalproduct:{drug_name}", "limit": BATCH_SIZE, "skip": skip}
        response = send_safe_request(API_FDA_URL, params, api_key) # Fetch data with proper request handling
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .
        if not response:                                 # Handle failed response
            warnings.warn(f"Failed to fetch data for {drug_name}. Skipping.")
            break
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .
        data = response.json(); results = data.get("results", [])             # Parse JSON response for results
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .
        if not results:                                                       # No results returned
            warnings.warn(f"No data returned for batch {skip}.")
            break
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .
        # Process each result, adding seriousness data to records list
        for result in results:
            seriousness = {
                "Death": int(result.get("seriousnessdeath",                     0)),
                "Hospitalization": int(result.get("seriousnesshospitalization", 0)),
                "LifeThreatening": int(result.get("seriousnesslifethreatening", 0)),
                "Other": int(result.get("seriousnessother",                     0))}
            
            for reaction in result.get("patient", {}).get("reaction", []):                               # Retrieve and format reaction data
                record = {"Drug_Name": alias_dict.get(drug_name.lower(), drug_name),
                          "Adverse_Event": reaction.get("reactionmeddrapt", "Unknown"), **seriousness}    # Include seriousness information             
                all_records.append(record)                              # Append the processed record to all records
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .
        # End of results indicated by fewer than expected batch size
        if len(results) < BATCH_SIZE:
            warnings.warn(f"Reached the end of data for {drug_name}.")
            break

    return pd.DataFrame(all_records)                                    # Convert accumulated records to a DataFrame

In [13]:
def save_data_to_csv(df, drug_name):
    """ Save the DataFrame to a CSV file. """
    if not df.empty:                                                    # Ensure DataFrame has data
        columns_to_format = ['%Death', '%Hospitalization', '%LifeThreatening', '%Other', '%Total']
        decimals = 2                                                    # Set decimal precision
        # Format decimal places of specific columns
        for column in columns_to_format:
            if column in df.columns:
                df[column] = df[column].apply(lambda x: round(x, decimals))

        filename = f"{AES_OUTPUT_FILE}_{drug_name}.csv"                # Define filename for output
        df.to_csv(filename, index=False)                               # Save DataFrame to CSV file
        print(f"Data saved to {filename}")                             # Confirmation message
    else:
        print(f"No data available for {drug_name}.")                   # Notify if there is no data to save

In [14]:
def main():
    # Check if essential input files exist to prevent runtime errors
    check_file_exists(JSON_FILE_PATH)
    check_file_exists(DRUG_DICT_FILE)
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .
    # Load API key and drug aliases for data retrieval
    api_key    = load_api_key(JSON_FILE_PATH)
    alias_dict = get_drug_aliases(DRUG_DICT_FILE)
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .
    # Load drugs of interest from the specified text file for processing
    with open(INPUT_DRUG_LIST_FILE, 'r') as file:
        drugs_of_interest = [line.strip() for line in file.readlines()]
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .
    # Process each drug independently
    for drug_name in drugs_of_interest:
        # Sanitize drug name for file name usage to eliminate problematic characters
        safe_drug_name = drug_name.replace(" ", "_").replace("/", "_").lower()
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .
        # Construct the filename using the sanitized drug name
        CSV_FILENAME = f"{AES_OUTPUT_FILE}_{safe_drug_name}.csv"
        print(f"\nCheck if file exists: {CSV_FILENAME}")
        if os.path.exists(CSV_FILENAME):                          # Check if output file already exists
            print(f"CSV file for {drug_name} already exists. Skipping..")
            continue
        print(f"Processing adverse events for {drug_name}...")
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .
        # Fetch data for both the original name and all aliases by iterating over alias list
        all_events_df = pd.DataFrame()
        for alias in [name for name, original in alias_dict.items() if original == drug_name.lower()]:
            temp_df = fetch_adverse_events(alias, api_key, alias_dict)
            all_events_df = pd.concat([all_events_df, temp_df], ignore_index=True) # Concatenate data for all aliases
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .
        # If data is present, group and sort adverse events, save top 100 events
        if not all_events_df.empty:
            # Grouping by Drug and Event, summing numerical columns to aggregate seriousness
            grouped_events = all_events_df.groupby(['Drug_Name', 'Adverse_Event']).sum().reset_index()
            # Create a new column for total severity
            grouped_events['Total_Severity'] = grouped_events['Death']+grouped_events['Hospitalization']+ \
                                               grouped_events['LifeThreatening']+grouped_events['Other']
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .
            TXT_FILENAME = f"{AES_OUTPUT_FILE}_{safe_drug_name}.txt"
            adverse_event_list = all_events_df['Adverse_Event'].unique().tolist()
            with open(TXT_FILENAME, 'w', encoding='utf-8') as f:
                for event in adverse_event_list:
                    f.write(f"{event}\n")
                print(f"Data saved to {TXT_FILENAME}") 
#         .        .         .         .        .         .         .        .         .         .        .         .         .        .         .            
            sorted_events = grouped_events.sort_values(by='Total_Severity', ascending=False).head(100)
            save_data_to_csv(sorted_events, drug_name)                  # Save top events to CSV
        else:
            print(f"No data available for {drug_name}.")                # Inform no data is available

In [None]:
if __name__ == "__main__":
    start_time = time.time()                                            # Record start time
    main()                                                              # Execute main function
    elapsed_time = time.time() - start_time                             # Calculate elapsed time
    minutes, seconds = divmod(elapsed_time, 60)                         # Convert elapsed time to minutes and seconds
    print(f"Completed in {int(minutes)} min {int(seconds)} sec.")       # Print completion message with elapsed time

File ./data/fda_api_key.json exists.
File ./data/Drug_Dictionary.csv exists.
API key loaded.

Check if file exists: ./data/AdverseEvents/AES_StandardsOfCare_fluorouracil.csv
CSV file for Fluorouracil already exists. Skipping..

Check if file exists: ./data/AdverseEvents/AES_StandardsOfCare_tamoxifen.csv
CSV file for Tamoxifen already exists. Skipping..

Check if file exists: ./data/AdverseEvents/AES_StandardsOfCare_letrozole.csv
CSV file for Letrozole already exists. Skipping..

Check if file exists: ./data/AdverseEvents/AES_StandardsOfCare_anastrozole.csv
CSV file for Anastrozole already exists. Skipping..

Check if file exists: ./data/AdverseEvents/AES_StandardsOfCare_exemestane.csv
CSV file for Exemestane already exists. Skipping..

Check if file exists: ./data/AdverseEvents/AES_StandardsOfCare_palbociclib.csv
Processing adverse events for Palbociclib...


Fetching palbociclib AES:  99%|█████████▉| 102/103 [03:13<00:01,  1.89s/batch]
Fetching ibrance AES:  31%|███       | 251/804 [06:55<15:16,  1.66s/batch]
Fetching pd 0332991 AES:  99%|█████████▉| 146/147 [03:36<00:01,  1.48s/batch]
Fetching pd 332991 AES:  99%|█████████▉| 146/147 [03:37<00:01,  1.49s/batch]
Fetching pd-0332991 AES:  37%|███▋      | 54/147 [01:17<02:34,  1.66s/batch]