In [1]:
# Alberto Bejarano (2025)
# 01_openFDA_UMAP_v01
# pip install nbqa flake8
# nbqa flake8 openFDA_UMAP_v06.ipynb --select=F401

In [2]:
# These Jupyter notebooks process FDA adverse event data for a set of drugs. They first retrieve data using an API and extract relevant details about the types of adverse events
# for each drug. The data is then aggregated, grouped by drug and event, and pivoted into a table format for further analysis. The notebooks apply data scaling and dimensionality
# reduction techniques, such as Principal Component Analysis (PCA), to standardize and reduce the dimensionality of the dataset. They use clustering (Leiden algorithm) to group
# similar data points and explore the relationships between the drugs and their adverse events. Finally, the notebooks visualize the results using UMAP, providing insights into
# the patterns and associations between the drugs and the severity of their associated adverse events.

In [3]:
# 01_openFDA_UMAP_v01
# This notebook processes FDA adverse event data by merging standard-of-care and drug-specific reports, focusing on serious outcomes like LifeThreatening events.
# It transforms the data into a drug-by-event matrix and applies PCA for dimensionality reduction, followed by Leiden clustering to identify groups of drugs with similar adverse event profiles. 
# UMAP is then computed across a grid of `min_dist` and `spread` values to visualize the structure of the data, and drug labels are overlaid on the final UMAP for interpretation.
# The visualization highlights distinct AE patterns across drugs, providing insights into safety-related clustering.

In [4]:
#import os   # Record the name of the active Conda environment
#env_name = os.getenv('CONDA_DEFAULT_ENV'); print(f"Active Conda environment: {env_name}")
#!conda env export > {env_name}.yml

In [5]:
# Record the wall-clock start so the total run time can be reported at the end of the notebook.
import time
from datetime import datetime

start = time.time()
print(datetime.now().strftime("%B %d, %Y %H:%M:%S"))   # human-readable timestamp of this run

April 21, 2025 19:05:33


In [6]:
# Silence anndata's ImplicitModificationWarning for the whole notebook.
# The class is taken from anndata's public namespace rather than from the private
# `anndata._core.aligned_df` module, whose location is an implementation detail.
import warnings
from anndata import ImplicitModificationWarning
warnings.simplefilter("ignore", ImplicitModificationWarning)

In [7]:
import requests, time, matplotlib, contextlib, os
from IPython.display import Audio, display
#         .         .         .         .         .         .         .         .
from adjustText import adjust_text
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from itertools import product
import pandas as pd
# Notebook-wide pandas display settings: floats shown with two decimals, up to 200 columns.
pd.set_option("display.float_format", '{:.2f}'.format)
pd.set_option("display.max_columns", 200)

In [8]:
import scanpy as sc

# NOTE(review): `seed` does not appear to be one of scanpy's documented settings, so this
# assignment may have no effect - confirm. scanpy's stochastic functions accept `random_state`.
sc.settings.seed = 42

In [9]:
# Outcome column used as the cell value of the drug-by-event matrix.
# Outcome columns present in the input tables:
#   'Death', 'Hospitalization', 'LifeThreatening', 'Other', 'Total',
#   '%Death', '%Hospitalization', '%LifeThreatening', '%Other', '%Total'
seriousness = "LifeThreatening"

In [10]:
# Settings passed to the Leiden clustering call that follows PCA
resolution, n_iterations = 1, 5

In [11]:
# Load the standard-of-care adverse-event table and tag every row with its origin.
SoC_AEs_df = pd.read_csv("./data/standard_of_care.csv").assign(data_source="SoC_AEs")
SoC_AEs_df.sample(3)   # random preview; the rows shown change between runs

Unnamed: 0,Drug_Name,Adverse_Event,Death,Hospitalization,LifeThreatening,Other,Total,%Death,%Hospitalization,%LifeThreatening,%Other,%Total,data_source
79500,5-Fu,Talipes,0,0,0,2,2,0.0,0.0,0.0,0.01,0.0,SoC_AEs
106437,5-Fu,Caudal Regression Syndrome,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,SoC_AEs
94416,Enfortumab Vedotin,Sticky Skin,0,0,0,1,1,0.0,0.0,0.0,0.0,0.0,SoC_AEs


In [12]:
# Load the drug-specific adverse-event table and tag every row with its origin.
drug_AEs_df = pd.read_csv("./data/drug_AEs.csv").assign(data_source="drug_AEs")
drug_AEs_df.sample(3)   # random preview; the rows shown change between runs

Unnamed: 0,Drug_Name,Adverse_Event,Death,Hospitalization,LifeThreatening,Other,Total,%Death,%Hospitalization,%LifeThreatening,%Other,%Total,data_source
2942,Adcetris,Nasopharyngitis,2,2,3,3,10,0.02,0.01,0.02,0.02,0.02,drug_AEs
3714,Kadcyla,Suspected Transmission Of An Infectious Agent ...,2,2,2,1,7,0.01,0.01,0.01,0.0,0.01,drug_AEs
2604,Kadcyla,Localised Infection,3,4,2,5,14,0.02,0.02,0.01,0.02,0.02,drug_AEs


In [13]:
# Stack both sources into one long table; the row index is renumbered from 0.
df_concat = pd.concat((SoC_AEs_df, drug_AEs_df), axis=0, ignore_index=True)
df_concat.sample(3)

Unnamed: 0,Drug_Name,Adverse_Event,Death,Hospitalization,LifeThreatening,Other,Total,%Death,%Hospitalization,%LifeThreatening,%Other,%Total,data_source
27209,5-Fu,Delusion,1,6,1,8,16,0.02,0.04,0.04,0.04,0.04,SoC_AEs
90951,Dabrafenib,Infected Neoplasm,1,0,0,0,1,0.01,0.0,0.0,0.0,0.0,SoC_AEs
116382,Polivy,Tongue Dry,0,1,0,0,1,0.0,0.01,0.0,0.0,0.0,drug_AEs


In [14]:
# Distinct drug names across both sources: print how many there are, then the
# alphabetically sorted names (wrapped in ANSI codes so they render in green).
drug_names_list      = pd.unique(df_concat['Drug_Name']).tolist()
drug_names_word_list = ", ".join(sorted(drug_names_list))
print("\n", len(drug_names_list))
print('\n\033[32m', drug_names_word_list, '\033[0m\n')


 52

[32m 5-Fu, Abemaciclib, Abiraterone, Adcetris, Alectinib, Anastrozole, Apalutamide, Atezolizumab, Bevacizumab, Cabazitaxel, Cabozantinib, Capecitabine, Carboplatin, Cetuximab, Cisplatin, Dabrafenib, Docetaxel, Durvalumab, Enfortumab Vedotin, Enhertu, Enzalutamide, Etoposide, Exemestane, Fulvestrant, Gemcitabine, Goserelin, Irinotecan, Kadcyla, Lenvatinib, Letrozole, Leucovorin, Levothyroxine, Lorlatinib, Nivolumab, Osimertinib, Paclitaxel, Palbociclib, Panitumumab, Pembrolizumab, Pertuzumab, Polivy, Regorafenib, Ribociclib, Sorafenib, T-Dm1, T-Dxd, Tamoxifen, Trametinib, Trastuzumab, Tremelimumab, Trodelvy, Vandetanib [0m



In [15]:
# Reorder the columns: provenance and identifiers first, then raw counts, then percentages.
id_cols    = ['data_source', 'Drug_Name', 'Adverse_Event']
count_cols = ['Death', 'Hospitalization', 'LifeThreatening', 'Other', 'Total']
pct_cols   = ['%' + col for col in count_cols]

df_concat_2 = df_concat[id_cols + count_cols + pct_cols]
df_concat_2.head(3)

Unnamed: 0,data_source,Drug_Name,Adverse_Event,Death,Hospitalization,LifeThreatening,Other,Total,%Death,%Hospitalization,%LifeThreatening,%Other,%Total
0,SoC_AEs,Pertuzumab,Diarrhoea,1750,1828,1820,1753,7151,4.06,4.05,4.01,4.03,4.04
1,SoC_AEs,Pertuzumab,Off Label Use,1648,1603,1833,1404,6488,3.82,3.55,4.03,3.23,3.66
2,SoC_AEs,Palbociclib,Neutropenia,1562,1530,1499,1710,6301,3.55,3.39,3.37,4.16,3.61


In [16]:
# Build the wide matrix: one row per (data_source, Drug_Name), one column per adverse event,
# each cell holding the summed `seriousness` count (0 where a pair was never reported).
pivot_df = (
    df_concat_2
    .pivot_table(index=['data_source', 'Drug_Name'], columns='Adverse_Event',
                 values=seriousness, aggfunc='sum', fill_value=0)
    .reset_index()
)
pivot_df.index.name = None

# Flatten any tuple column labels into single strings; plain labels pass through unchanged.
flat_columns = []
for col in pivot_df.columns:
    flat_columns.append('_'.join(col).strip() if isinstance(col, tuple) else col)
pivot_df.columns = flat_columns

print(len(pivot_df))
pivot_df.head(3)

53


Unnamed: 0,data_source,Drug_Name,5-Hydroxyindolacetic Acid In Urine,5-Hydroxyindolacetic Acid Increased,5Q Minus Syndrome,Abasia,Abdomen Crushing,Abdominal Abscess,Abdominal Adhesions,Abdominal Bruit,Abdominal Cavity Drainage,Abdominal Compartment Syndrome,Abdominal Discomfort,Abdominal Distension,Abdominal Fat Apron,Abdominal Hernia,Abdominal Hernia Perforation,Abdominal Hernia Repair,Abdominal Incarcerated Hernia,Abdominal Infection,Abdominal Injury,Abdominal Lymphadenopathy,Abdominal Mass,Abdominal Neoplasm,Abdominal Operation,Abdominal Pain,Abdominal Pain Lower,Abdominal Pain Upper,Abdominal Rebound Tenderness,Abdominal Rigidity,Abdominal Sepsis,Abdominal Strangulated Hernia,Abdominal Symptom,Abdominal Tenderness,Abdominal Wall Abscess,Abdominal Wall Cyst,Abdominal Wall Disorder,Abdominal Wall Haematoma,Abdominal Wall Haemorrhage,Abdominal Wall Infection,Abdominal Wall Mass,Abdominal Wall Neoplasm Malignant,Abdominal Wall Oedema,Abdominal Wall Pain,Abdominal Wall Sinus,Abdominal Wall Wound,Abdominal Wound Dehiscence,Aberrant Aortic Arch,Abnormal Behaviour,Abnormal Clotting Factor,Abnormal Dreams,Abnormal Faeces,Abnormal Labour,Abnormal Loss Of Weight,Abnormal Menstrual Clots,Abnormal Sensation In Eye,Abnormal Sleep-Related Event,Abnormal Uterine Bleeding,Abnormal Weight Gain,Abnormal Withdrawal Bleeding,Abortion,Abortion Incomplete,Abortion Induced,Abortion Induced Complete,Abortion Missed,Abortion Spontaneous,Abortion Spontaneous Incomplete,Abscess,Abscess Bacterial,Abscess Drainage,Abscess Fungal,Abscess Intestinal,Abscess Jaw,Abscess Limb,Abscess Neck,Abscess Oral,Abscess Rupture,Abscess Soft Tissue,Abscess Sterile,Absent Bowel Movement,Abulia,Acalculia,Acanthamoeba Infection,Acantholysis,Acanthosis,Acanthosis Nigricans,Acarodermatitis,Accelerated Hypertension,Accelerated Idioventricular Rhythm,Accessory Spleen,Accident,Accident At Home,Accident At Work,Accidental Death,Accidental Exposure To Product,Accidental Exposure To Product By Child,Accidental 
Exposure To Product Packaging,Accidental Exposure To Product Packaging By Child,Accidental Overdose,Accidental Poisoning,...,Vulvovaginal Pruritus,Vulvovaginal Rash,Vulvovaginal Swelling,Vulvovaginitis,Waist Circumference Decreased,Waist Circumference Increased,Waldenstrom^S Macroglobulinaemia,Walking Aid User,Walking Disability,Walking Distance Test Abnormal,Wall Motion Score Index Abnormal,Walled-Off Pancreatic Necrosis,Wallerian Degeneration,Warm Type Haemolytic Anaemia,Water Intoxication,Weaning Failure,Weight Abnormal,Weight Bearing Difficulty,Weight Control,Weight Decreased,Weight Fluctuation,Weight Gain Poor,Weight Increased,Weight Loss Poor,Welander Distal Myopathy,Werner^S Syndrome,Wernicke^S Encephalopathy,West Nile Viral Infection,West Nile Virus Test Positive,Wheelchair User,Wheezing,White Blood Cell Analysis Abnormal,White Blood Cell Analysis Decreased,White Blood Cell Count,White Blood Cell Count Abnormal,White Blood Cell Count Decreased,White Blood Cell Count Increased,White Blood Cell Disorder,White Blood Cells Urine,White Blood Cells Urine Positive,White Coat Hypertension,White Matter Lesion,Wisdom Teeth Removal,Withdrawal Hypertension,Withdrawal Of Life Support,Withdrawal Syndrome,Wolff-Parkinson-White Syndrome,Wound,Wound Abscess,Wound Complication,Wound Decomposition,Wound Dehiscence,Wound Drainage,Wound Evisceration,Wound Haematoma,Wound Haemorrhage,Wound Infection,Wound Infection Bacterial,Wound Infection Fungal,Wound Infection Pseudomonas,Wound Infection Staphylococcal,Wound Necrosis,Wound Secretion,Wound Sepsis,Wound Treatment,Wrist Deformity,Wrist Fracture,Wrist Surgery,Wrong Device Used,Wrong Dosage Form,Wrong Dosage Formulation,Wrong Dose,Wrong Drug Administered,Wrong Patient,Wrong Patient Received Medication,Wrong Patient Received Product,Wrong Product Administered,Wrong Schedule,Wrong Strength,Wrong Technique In Device Usage Process,Wrong Technique In Drug Usage Process,Wrong Technique In Product Usage Process,X-Linked Chromosomal 
Disorder,X-Ray,X-Ray Abnormal,X-Ray Gastrointestinal Tract Abnormal,X-Ray Hepatobiliary Abnormal,Xanthelasma,Xanthogranuloma,Xanthoma,Xanthopsia,Xeroderma,Xerophthalmia,Xerosis,Yawning,Yellow Skin,Young Mania Rating Scale,Zinc Deficiency,Zoonosis,Zygomycosis
0,SoC_AEs,5-Fu,0,0,0,1,0,2,1,0,0,0,2,2,0,0,0,0,0,0,0,0,1,0,0,6,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,6,0,0,0,0,0,0,0,5,0,0,0,0,15,2,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,SoC_AEs,Abemaciclib,0,0,0,0,0,0,0,0,0,0,96,100,0,0,0,0,0,1,0,4,0,0,0,275,6,124,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,196,0,4,52,0,0,0,0,0,0,0,8,0,0,0,6,230,6,0,0,0,0,0,0,0,0,2,0,8,0,0,0,2,0,0,0,6,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0
2,SoC_AEs,Abiraterone,0,0,0,0,0,4,0,0,0,0,46,31,2,0,0,0,0,2,0,4,0,0,0,72,6,46,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,4,3,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,2,0,0,0,2,0,0,0,4,0,...,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,96,2,0,44,0,0,0,0,0,0,6,2,0,0,0,0,18,8,0,0,0,0,0,0,0,0,2,0,10,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,2,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,2,0,2,0,0,0,0


In [17]:
# Set the identifier columns (source and drug name) aside; they are not part of the numeric matrix.
drug_names = pivot_df.loc[:, ['data_source', 'Drug_Name']]
drug_names.sample(3)

Unnamed: 0,data_source,Drug_Name
12,SoC_AEs,Cetuximab
16,SoC_AEs,Durvalumab
5,SoC_AEs,Apalutamide


In [18]:
# Event-count matrix only: remove the two identifier columns.
pivot_df2 = pivot_df.drop(['data_source', 'Drug_Name'], axis="columns")
print(len(pivot_df2))
pivot_df2.head(3)

53


Unnamed: 0,5-Hydroxyindolacetic Acid In Urine,5-Hydroxyindolacetic Acid Increased,5Q Minus Syndrome,Abasia,Abdomen Crushing,Abdominal Abscess,Abdominal Adhesions,Abdominal Bruit,Abdominal Cavity Drainage,Abdominal Compartment Syndrome,Abdominal Discomfort,Abdominal Distension,Abdominal Fat Apron,Abdominal Hernia,Abdominal Hernia Perforation,Abdominal Hernia Repair,Abdominal Incarcerated Hernia,Abdominal Infection,Abdominal Injury,Abdominal Lymphadenopathy,Abdominal Mass,Abdominal Neoplasm,Abdominal Operation,Abdominal Pain,Abdominal Pain Lower,Abdominal Pain Upper,Abdominal Rebound Tenderness,Abdominal Rigidity,Abdominal Sepsis,Abdominal Strangulated Hernia,Abdominal Symptom,Abdominal Tenderness,Abdominal Wall Abscess,Abdominal Wall Cyst,Abdominal Wall Disorder,Abdominal Wall Haematoma,Abdominal Wall Haemorrhage,Abdominal Wall Infection,Abdominal Wall Mass,Abdominal Wall Neoplasm Malignant,Abdominal Wall Oedema,Abdominal Wall Pain,Abdominal Wall Sinus,Abdominal Wall Wound,Abdominal Wound Dehiscence,Aberrant Aortic Arch,Abnormal Behaviour,Abnormal Clotting Factor,Abnormal Dreams,Abnormal Faeces,Abnormal Labour,Abnormal Loss Of Weight,Abnormal Menstrual Clots,Abnormal Sensation In Eye,Abnormal Sleep-Related Event,Abnormal Uterine Bleeding,Abnormal Weight Gain,Abnormal Withdrawal Bleeding,Abortion,Abortion Incomplete,Abortion Induced,Abortion Induced Complete,Abortion Missed,Abortion Spontaneous,Abortion Spontaneous Incomplete,Abscess,Abscess Bacterial,Abscess Drainage,Abscess Fungal,Abscess Intestinal,Abscess Jaw,Abscess Limb,Abscess Neck,Abscess Oral,Abscess Rupture,Abscess Soft Tissue,Abscess Sterile,Absent Bowel Movement,Abulia,Acalculia,Acanthamoeba Infection,Acantholysis,Acanthosis,Acanthosis Nigricans,Acarodermatitis,Accelerated Hypertension,Accelerated Idioventricular Rhythm,Accessory Spleen,Accident,Accident At Home,Accident At Work,Accidental Death,Accidental Exposure To Product,Accidental Exposure To Product By Child,Accidental Exposure To Product 
Packaging,Accidental Exposure To Product Packaging By Child,Accidental Overdose,Accidental Poisoning,Accidental Underdose,Accommodation Disorder,...,Vulvovaginal Pruritus,Vulvovaginal Rash,Vulvovaginal Swelling,Vulvovaginitis,Waist Circumference Decreased,Waist Circumference Increased,Waldenstrom^S Macroglobulinaemia,Walking Aid User,Walking Disability,Walking Distance Test Abnormal,Wall Motion Score Index Abnormal,Walled-Off Pancreatic Necrosis,Wallerian Degeneration,Warm Type Haemolytic Anaemia,Water Intoxication,Weaning Failure,Weight Abnormal,Weight Bearing Difficulty,Weight Control,Weight Decreased,Weight Fluctuation,Weight Gain Poor,Weight Increased,Weight Loss Poor,Welander Distal Myopathy,Werner^S Syndrome,Wernicke^S Encephalopathy,West Nile Viral Infection,West Nile Virus Test Positive,Wheelchair User,Wheezing,White Blood Cell Analysis Abnormal,White Blood Cell Analysis Decreased,White Blood Cell Count,White Blood Cell Count Abnormal,White Blood Cell Count Decreased,White Blood Cell Count Increased,White Blood Cell Disorder,White Blood Cells Urine,White Blood Cells Urine Positive,White Coat Hypertension,White Matter Lesion,Wisdom Teeth Removal,Withdrawal Hypertension,Withdrawal Of Life Support,Withdrawal Syndrome,Wolff-Parkinson-White Syndrome,Wound,Wound Abscess,Wound Complication,Wound Decomposition,Wound Dehiscence,Wound Drainage,Wound Evisceration,Wound Haematoma,Wound Haemorrhage,Wound Infection,Wound Infection Bacterial,Wound Infection Fungal,Wound Infection Pseudomonas,Wound Infection Staphylococcal,Wound Necrosis,Wound Secretion,Wound Sepsis,Wound Treatment,Wrist Deformity,Wrist Fracture,Wrist Surgery,Wrong Device Used,Wrong Dosage Form,Wrong Dosage Formulation,Wrong Dose,Wrong Drug Administered,Wrong Patient,Wrong Patient Received Medication,Wrong Patient Received Product,Wrong Product Administered,Wrong Schedule,Wrong Strength,Wrong Technique In Device Usage Process,Wrong Technique In Drug Usage Process,Wrong Technique In Product Usage 
Process,X-Linked Chromosomal Disorder,X-Ray,X-Ray Abnormal,X-Ray Gastrointestinal Tract Abnormal,X-Ray Hepatobiliary Abnormal,Xanthelasma,Xanthogranuloma,Xanthoma,Xanthopsia,Xeroderma,Xerophthalmia,Xerosis,Yawning,Yellow Skin,Young Mania Rating Scale,Zinc Deficiency,Zoonosis,Zygomycosis
0,0,0,0,1,0,2,1,0,0,0,2,2,0,0,0,0,0,0,0,0,1,0,0,6,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,6,0,0,0,0,0,0,0,5,0,0,0,0,15,2,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,96,100,0,0,0,0,0,1,0,4,0,0,0,275,6,124,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,196,0,4,52,0,0,0,0,0,0,0,8,0,0,0,6,230,6,0,0,0,0,0,0,0,0,2,0,8,0,0,0,2,0,0,0,6,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0
2,0,0,0,0,0,4,0,0,0,0,46,31,2,0,0,0,0,2,0,4,0,0,0,72,6,46,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,4,3,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,2,0,0,0,2,0,0,0,4,0,0,0,...,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,96,2,0,44,0,0,0,0,0,0,6,2,0,0,0,0,18,8,0,0,0,0,0,0,0,0,2,0,10,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,2,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,2,0,2,0,0,0,0


In [19]:
# Numeric-only copy of the matrix, converted column by column with float downcasting.
df_numeric = (
    pivot_df2
    .select_dtypes(include='number')
    .copy()
    .apply(pd.to_numeric, errors='coerce', downcast='float')
)

# Give both frames string row labels (pivot_df takes its labels from pivot_df2).
pivot_df.index   = pivot_df2.index.astype(str)
df_numeric.index = df_numeric.index.astype(str)

print(df_numeric.shape)
print(df_numeric.max().max())   # largest single count in the matrix

(53, 11286)
1833.0


In [20]:
%%time
# Build the AnnData object and run PCA -> neighbours -> Leiden clustering.
# All warnings and everything written to stdout/stderr are deliberately silenced for this step.
with warnings.catch_warnings():
    # The blanket "ignore" already covers anndata's ImplicitModificationWarning, so no
    # warning-specific import or filter is needed inside this block.
    warnings.simplefilter("ignore")

    with open(os.devnull, "w") as fnull, contextlib.redirect_stdout(fnull), contextlib.redirect_stderr(fnull):
        df_numeric = pivot_df2.select_dtypes(include='number').copy()     # Numeric columns only
        if isinstance(df_numeric.index, pd.MultiIndex):                   # Flatten a MultiIndex if present
            df_numeric = df_numeric.reset_index(drop=True)

        pivot_df = pivot_df.reset_index(drop=True)                        # Positional index, same row order as df_numeric

        adata_temp = sc.AnnData(df_numeric)                               # Rows = (source, drug) pairs, columns = adverse events
        adata_temp.obs['Drug_Name'] = pivot_df['Drug_Name'].values        # Assigned by position, not by label

        sc.pp.pca(adata_temp)
        sc.pp.neighbors(adata_temp)
        # Cluster labels are stored in .obs under the resolution rendered as a string.
        sc.tl.leiden(adata_temp, resolution=resolution, key_added=f"{resolution}", flavor="igraph",
                     n_iterations=n_iterations, directed=False)

CPU times: total: 26.9 s
Wall time: 27.4 s


In [21]:
# Store 'Drug_Name' as a categorical column (for discrete colors when plotting)
obs = adata_temp.obs
obs['Drug_Name'] = obs['Drug_Name'].astype('category')

In [22]:
# Candidate UMAP settings to sweep; the same nine values are tried for both parameters.
# NOTE(review): umap-learn describes min_dist as a value set relative to spread - confirm that
# the combinations in which min_dist exceeds spread are intended.
MIN_DISTS = [0.1, 1, 10, 100] + [10.0 ** exponent for exponent in range(3, 8)]   # 0.1 ... 1E7
SPREADS   = list(MIN_DISTS)                                                      # independent copy of the same values

In [23]:
drug_names   = adata_temp.obs["Drug_Name"]
unique_drugs = drug_names.unique()

# Base palette: alternate tab20b colours with tab20c colours taken in reverse order
cmap_tab20b = list(matplotlib.colormaps["tab20b"].colors)
cmap_tab20c = list(reversed(matplotlib.colormaps["tab20c"].colors))

base_palette = []
for color_b, color_c in zip(cmap_tab20b, cmap_tab20c):
    base_palette.extend((color_b, color_c))

# One colour per drug, cycling through the base palette again once it is exhausted
palette = [base_palette[idx % len(base_palette)] for idx in range(len(unique_drugs))]

color_dict_drug = dict(zip(unique_drugs, palette))   # drug name -> colour

In [None]:
%%time
#'''
# Grid of UMAP embeddings: one row per min_dist value, one column per spread value.
# squeeze=False keeps `axes` two-dimensional even when either list has a single entry,
# so the axes[i][j] indexing below always works.
fig, axes = plt.subplots(len(MIN_DISTS), len(SPREADS), figsize=(12, 12), squeeze=False)

# Marker styling shared by every panel (loop-invariant, so it is built once)
kws = {'s': 50, 'add_outline': True, 'outline_width': (0.2, 0.01), 'outline_color': ('#545955', 'white')}

# Loop through all combinations of MIN_DIST and SPREAD
for (i, min_dist), (j, spread) in tqdm(product(enumerate(MIN_DISTS), enumerate(SPREADS)), total=len(MIN_DISTS) * len(SPREADS)):

    ax = axes[i][j]                                          # Panel for this (min_dist, spread) pair

    with warnings.catch_warnings():                          # Recompute UMAP with new parameters, suppressing warnings/output
        warnings.simplefilter("ignore")
        with open(os.devnull, "w") as fnull, contextlib.redirect_stdout(fnull), contextlib.redirect_stderr(fnull):
            sc.tl.umap(adata_temp, min_dist=min_dist, spread=spread)

    sc.pl.umap(adata_temp, title='', color='Drug_Name', palette=color_dict_drug, legend_loc='right margin', show=False, sort_order=False, **kws, ax=ax)

    if i == 0:                                                # Column header only on the top row
        ax.set_title(f"spread = {spread}", fontsize=8)
    else:
        ax.set_title("")

    if j == 0:                                                # Row header only on the first column
        ax.set_ylabel(f"min_dist = {min_dist}", fontsize=8, rotation=90, labelpad=5)
    else:
        ax.set_ylabel("")

    ax.set_xlabel("")
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_color('gray')
    ax.spines['bottom'].set_color('gray')

# Strip the per-panel legends. A panel without a legend is skipped instead of raising.
for ax in axes.flatten():
    panel_legend = ax.get_legend()
    if panel_legend is not None:
        panel_legend.remove()
#plt.tight_layout(rect=[0.01, 0.01, 0.01, 0.01])              # Leave space for Y-axis labels
#plt.savefig("UMAP_Exploration_Discrete_Colors_No_Legend.png", format="png", bbox_inches="tight", dpi=300)
plt.show()
plt.close(fig)                                                # Release this specific figure
#''';

In [None]:
# Stand-alone legend figure: one round marker per drug, filled with that drug's colour
# from `color_dict_drug`.
fig_legend, ax_legend = plt.subplots(figsize=(6, 1))

marker_style = dict(marker='o', color='w', markersize=12, markeredgewidth=0.5, markeredgecolor="#c0c0c0")
handles_drug = [
    plt.Line2D([0], [0], markerfacecolor=color_dict_drug[name], label=name, **marker_style)
    for name in unique_drugs
]

ax_legend.legend(
    handles=handles_drug, title='', loc='center', markerscale=1.8, fontsize=10,
    ncol=8, labelspacing=0.75, handletextpad=.8, frameon=False,
)
ax_legend.set_axis_off()                                      # Only the legend is drawn
#plt.savefig("./plots/Legend_plot.png", format="png", bbox_inches="tight", dpi=300)
plt.show()

In [None]:
# Chosen UMAP settings (example values - replace with your final choice)
# NOTE(review): min_dist is larger than spread here - confirm this is intended.
min_dist, spread = 1E3, 1E2

# Recompute the embedding with these settings; warnings and console output are silenced
with warnings.catch_warnings(), open(os.devnull, "w") as fnull:
    warnings.simplefilter("ignore")
    with contextlib.redirect_stdout(fnull), contextlib.redirect_stderr(fnull):
        sc.tl.umap(adata_temp, min_dist=min_dist, spread=spread)

In [None]:
# Get UMAP coordinates and labels
umap_coords = adata_temp.obsm["X_umap"]
drug_names  = adata_temp.obs["Drug_Name"]

# Per-point colours for the manual scatter. The drug -> colour mapping built earlier for the
# UMAP grid and the legend figure (`color_dict_drug`) is reused, so a given drug keeps the same
# colour in every figure instead of being re-coloured from a separate 20-colour map.
unique_drugs = drug_names.unique()
color_dict   = {name: color_dict_drug[name] for name in unique_drugs}
colors       = [color_dict[name] for name in drug_names]

In [None]:
# Display the AnnData summary as the cell output
adata_temp

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))  # Big plot

ax.scatter(umap_coords[:, 0], umap_coords[:, 1], c=colors, s=250, edgecolor='#383838', linewidth=0.5, zorder=2)

# Add one text label per point. Only the first space becomes a line break, so multi-word
# names wrap after their first word and single-word names get no trailing empty line.
texts = []
for x, y, label in zip(umap_coords[:, 0], umap_coords[:, 1], drug_names):
    wrapped_label = label.replace(' ', '\n', 1)
    texts.append(ax.text(x - 0.01, y + 0.01, wrapped_label, fontsize=10, ha='center', va='center', zorder=3))

adjust_text(texts, ax=ax, expand=(1.05, 1.05))            # Nudge labels apart to reduce overlap

ax.set_xlabel("UMAP1")                                    # Axis labels and title
ax.set_ylabel("UMAP2")
ax.set_title("UMAP with Drug Labels", fontsize=14)

ax.spines['top'].set_visible(False)                       # Customize spines
ax.spines['right'].set_visible(False)
ax.spines['left'].set_color('gray')
ax.spines['bottom'].set_color('gray')

plt.tight_layout()
output_path = "./plots/UMAPwithDrugLabels.png"
os.makedirs(os.path.dirname(output_path), exist_ok=True)  # savefig does not create a missing folder
plt.savefig(output_path, format="png", bbox_inches="tight", dpi=300)
plt.show()

In [None]:
# Per-row table: drug name, Leiden cluster label (stored under the resolution as a string),
# and the two coordinates of the current UMAP embedding.
cluster_key = str(resolution)
embedding   = adata_temp.obsm['X_umap']

df = adata_temp.obs.loc[:, ['Drug_Name', cluster_key]].copy()
df['UMAP1'] = embedding[:, 0]
df['UMAP2'] = embedding[:, 1]
df.head()

In [None]:
# Optional end-of-run chime. Playback is skipped when the file is absent, so a missing
# sound file cannot stop the notebook before the run-time summary is printed.
audio_file = "./data/clock-chime-88027.mp3"
if os.path.isfile(audio_file):
    display(Audio(filename=audio_file, autoplay=True))
else:
    print(f"Audio file not found, skipping chime: {audio_file}")

In [None]:
# Report the wall-clock time elapsed since `start` was recorded, as whole minutes and seconds.
elapsed_time = time.time() - start
minutes = elapsed_time // 60
seconds = elapsed_time % 60
print("'01_openFDA_UMAP_v01' script run time:", f"{int(minutes)} min {int(seconds)} sec.")