In [1]:
# Alberto Bejarano [2025]
# FAERS_AdEvents_NLP_v001

In [2]:
# This code applies Natural Language Processing (NLP) and machine learning to categorize adverse medical events based on semantic similarity. Each event is embedded into a vector representation
# using a pre-trained DistilBERT model, capturing the underlying meaning of the text. The K-Means clustering algorithm (KMeans from scikit-learn) then groups these vectors into `num_categories` clusters (currently 25),
# revealing patterns of similar adverse events. A Pandas DataFrame is created to map each event to its assigned cluster, and the results are printed to display the events grouped into distinct categories.
# This process helps efficiently organize and analyze adverse events data for pattern recognition and deeper insights.

In [3]:
# !pip install sentence-transformers

In [4]:
# Start the wall-clock timer for the whole notebook and stamp the run with the current local time.
import time
from datetime import datetime

start = time.time()
run_started_at = datetime.now()
print(run_started_at.strftime("%B %d, %Y %H:%M:%S"))

April 14, 2025 22:39:19


In [5]:
import os

# Turn off the Hugging Face Hub symlink warning. This is set in its own cell, ahead of the
# cell that imports sentence_transformers, so the flag is already in the environment by then.
os.environ.update(HF_HUB_DISABLE_SYMLINKS_WARNING="1")

In [6]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

In [7]:
# Small illustrative sample of adverse-event terms (45 entries, alphabetical).
# NOTE: the following cell rebinds `adverse_events` to the list read from
# ./data/adverse_event_list.txt, so this sample is only used if that cell is skipped.
adverse_events = [
    'Abasia',
    'Abdominal Abscess', 'Abdominal Adhesions', 'Abdominal Compartment Syndrome',
    'Abdominal Discomfort', 'Abdominal Distension', 'Abdominal Incarcerated Hernia',
    'Abdominal Infection', 'Abdominal Injury', 'Abdominal Lymphadenopathy',
    'Abdominal Mass', 'Abdominal Neoplasm', 'Abdominal Operation',
    'Abdominal Pain', 'Abdominal Pain Lower', 'Abdominal Pain Upper',
    'Abdominal Rigidity', 'Abdominal Sepsis', 'Abdominal Symptom',
    'Abdominal Tenderness', 'Abdominal Wall Abscess',
    'Abnormal Behaviour', 'Abnormal Dreams', 'Abnormal Faeces',
    'Abnormal Loss Of Weight', 'Abnormal Sensation In Eye',
    'Abnormal Uterine Bleeding', 'Abnormal Weight Gain',
    'Abortion', 'Abortion Induced', 'Abortion Spontaneous',
    'Abscess', 'Abscess Fungal', 'Abscess Intestinal', 'Abscess Limb', 'Abscess Neck',
    'Abulia',
    'Acanthamoeba Infection', 'Acarodermatitis',
    'Accelerated Idioventricular Rhythm',
    'Accident', 'Accidental Exposure', 'Accidental Exposure To Product', 'Accidental Overdose',
    'Accommodation Disorder',
]

In [8]:
# Load the full adverse-event vocabulary from disk, one term per line.
#  - The encoding is pinned so the parsed terms do not depend on the OS locale default.
#  - Lines that are empty after stripping are skipped, so no empty string is embedded
#    and assigned to a cluster further down.
with open("./data/adverse_event_list.txt", "r", encoding="utf-8") as f:
    adverse_events = [term for term in (line.strip() for line in f) if term]
print(len(adverse_events))

# Preview the first ten terms as a comma-separated line.
word_list = ", ".join(adverse_events[:10])
print(word_list)

7498
Disease Progression, Nausea, Keratopathy, Interstitial Lung Disease, Visual Acuity Reduced, Inappropriate Schedule Of Product Administration, Off Label Use, Diarrhoea, Death, Fatigue


In [9]:
# Instantiate the pre-trained sentence-embedding checkpoint used to vectorize each term.
embedding_model_name = 'distilbert-base-nli-stsb-mean-tokens'
model = SentenceTransformer(embedding_model_name)

In [None]:
# For domain-specific medical text, pre-trained on clinical data
# Bio_ClinicalBERT: initialized from BioBERT and further pre-trained on clinical notes (MIMIC-III), not on clinical-trial or literature text
#model = SentenceTransformer('emilyalsentzer/Bio_ClinicalBERT')  # ClinicalBERT for clinical-note style text

# For biomedical texts, trained on PubMed and PMC articles
#model = SentenceTransformer('dmis-lab/biobert-large-cased-v1.1')  # BioBERT for biomedical text (PubMed/PMC); the Hub ID needs the 'dmis-lab/' organization prefix

# Another option trained on PubMed, focused on biomedical research literature
#model = SentenceTransformer('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')  # PubMed BERT for medical research-based text ('pubmed-bert-base' is not a Hub model ID)

# Lightweight and fast model, good for semantic similarity tasks, not domain-specific
#model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # Fast, general-purpose model (not specifically clinical)

# General-purpose model fine-tuned on the Semantic Textual Similarity dataset
#model = SentenceTransformer('stsb-roberta-large')  # RoBERTa for general text similarity (clinical possible)

In [10]:
# Generate embeddings for adverse events
# Encodes every term in `adverse_events` with the SentenceTransformer loaded earlier; the
# result is the input to KMeans.fit_predict in a later cell.
# Presumably one vector per term, i.e. an array of shape (n_events, dim) — TODO confirm.
embeddings = model.encode(adverse_events)

In [11]:
# Define number of categories (clusters)
# This value is passed to KMeans as `n_clusters` and also bounds the loop that prints each category.
num_categories = 25 # You can adjust this number as needed

In [12]:
# Partition the embedding vectors into `num_categories` groups with K-Means.
# n_init and random_state are fixed so repeated runs give the same labels.
kmeans = KMeans(
    n_clusters=num_categories,
    n_init=10,
    random_state=0,
)
# Fit, then read the per-sample cluster index assigned during fitting.
clusters = kmeans.fit(embeddings).labels_

In [13]:
# Pair each adverse-event term with the cluster index it was assigned (one row per term).
event_cluster_columns = {
    'Adverse Event': adverse_events,
    'Category': clusters,
}
df = pd.DataFrame(data=event_cluster_columns)

In [14]:
# Walk through the clusters in index order: print a green "Category N:" header,
# then that cluster's members as one alphabetically sorted, comma-separated line.
for category in range(num_categories):
    print(f"\n\033[32mCategory {category}:\033[0m")
    is_member = df['Category'] == category
    category_list = df.loc[is_member, 'Adverse Event'].tolist()
    category_word_list = ", ".join(sorted(category_list))
    print(category_word_list)


[32mCategory 0:[0m
Abdominal Wall Haematoma, Acquired Haemophilia, Acquired Von Willebrand^S Disease, Activated Partial Thromboplastin Time Shortened, Acute Leukaemia, Acute Lymphocytic Leukaemia, Acute Lymphocytic Leukaemia (In Remission), Acute Lymphocytic Leukaemia Recurrent, Acute Lymphocytic Leukaemia Refractory, Acute Monocytic Leukaemia, Acute Myeloid Leukaemia, Acute Myeloid Leukaemia Recurrent, Acute Myeloid Leukaemia Refractory, Acute Myelomonocytic Leukaemia, Acute Promyelocytic Leukaemia, Acute Promyelocytic Leukaemia Differentiation Syndrome, Adenocarcinoma Of Colon, Adenoma Benign, Administration Site Vasculitis, Adrenal Haematoma, Adrenal Neoplasm, Adult T-Cell Lymphoma/Leukaemia, Adult T-Cell Lymphoma/Leukaemia Recurrent, Adult T-Cell Lymphoma/Leukaemia Refractory, Agranulocytosis, Alpha Haemolytic Streptococcal Infection, Alveolar Proteinosis, Alveolitis, Amniorrhoea, Amniotic Cavity Infection, Amyotrophic Lateral Sclerosis, Anaemia Megaloblastic, Anaplastic Large C

In [15]:
# Display the event-to-cluster table as this cell's output
# (the rendering of a long frame is abbreviated with an ellipsis row).
df

Unnamed: 0,Adverse Event,Category
0,Disease Progression,15
1,Nausea,17
2,Keratopathy,12
3,Interstitial Lung Disease,1
4,Visual Acuity Reduced,3
...,...,...
7493,Throat Lesion,20
7494,Catheter Site Urticaria,12
7495,Body Surface Area Decreased,22
7496,Unable To Afford Prescribed Medication,18


In [16]:
# Report the wall-clock time elapsed since `start` was recorded in the first timing cell.
elapsed_time = time.time() - start
minutes = elapsed_time // 60   # whole minutes (still a float; truncated when printed)
seconds = elapsed_time % 60    # leftover seconds within the last minute
print("'FAERS_AdEvents_NLP_v001' script run time:", f"{int(minutes)} min {int(seconds)} sec.")

'FAERS_AdEvents_NLP_v001' script run time: 4 min 3 sec.
