In [1]:
# Alberto Bejarano [2025]
# FAERS_AdEvents_NLP_v001

In [2]:
# FAERS_AdEvents_NLP_v001
# This code applies Natural Language Processing (NLP) and machine learning to categorize adverse medical events based on semantic similarity. Each event is embedded into a vector representation
# using a pre-trained DistilBERT model, capturing the underlying meaning of the text. The K-Means clustering algorithm (KMeans from scikit-learn) then groups these vectors into `num_categories`
# clusters (30 in this notebook), revealing patterns of similar adverse events. A Pandas DataFrame is created to map each event to its assigned cluster, and the results are printed to display
# the events grouped into distinct categories. This process helps efficiently organize and analyze adverse-event data for pattern recognition and deeper insights.

In [3]:
# !pip install sentence-transformers

In [4]:
# Record the notebook start time (read again by the run-time report in the
# last cell) and print a human-readable start timestamp.
import time
from datetime import datetime

start = time.time()
print(datetime.now().strftime("%B %d, %Y %H:%M:%S"))

April 21, 2025 18:59:18


In [5]:
import os

# Set the Hugging Face Hub flag before any model is downloaded; per its name
# it silences the hub's symlink warning.
os.environ.update({"HF_HUB_DISABLE_SYMLINKS_WARNING": "1"})

In [6]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

In [7]:
# Small inline sample of adverse-event terms.
# NOTE: the next cell reassigns `adverse_events` from a text file, so this
# sample only takes effect if that cell is skipped.
adverse_events = [
    "Abasia",
    "Abdominal Abscess", "Abdominal Adhesions", "Abdominal Compartment Syndrome",
    "Abdominal Discomfort", "Abdominal Distension", "Abdominal Incarcerated Hernia",
    "Abdominal Infection", "Abdominal Injury", "Abdominal Lymphadenopathy",
    "Abdominal Mass", "Abdominal Neoplasm", "Abdominal Operation",
    "Abdominal Pain", "Abdominal Pain Lower", "Abdominal Pain Upper",
    "Abdominal Rigidity", "Abdominal Sepsis", "Abdominal Symptom",
    "Abdominal Tenderness", "Abdominal Wall Abscess",
    "Abnormal Behaviour", "Abnormal Dreams", "Abnormal Faeces",
    "Abnormal Loss Of Weight", "Abnormal Sensation In Eye",
    "Abnormal Uterine Bleeding", "Abnormal Weight Gain",
    "Abortion", "Abortion Induced", "Abortion Spontaneous",
    "Abscess", "Abscess Fungal", "Abscess Intestinal", "Abscess Limb", "Abscess Neck",
    "Abulia", "Acanthamoeba Infection", "Acarodermatitis",
    "Accelerated Idioventricular Rhythm",
    "Accident", "Accidental Exposure", "Accidental Exposure To Product",
    "Accidental Overdose", "Accommodation Disorder",
]

In [8]:
# Load the adverse-event terms from a text file, one term per line.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 (plain ASCII terms are a subset) — confirm.
with open("./data/adverse_event_list.txt", "r", encoding="utf-8") as f:
    # Strip surrounding whitespace and drop blank lines; otherwise empty
    # strings would be embedded and clustered as if they were real events.
    adverse_events = [term for term in (line.strip() for line in f) if term]

# Quick sanity check: number of terms loaded and a preview of the first ten.
print(len(adverse_events))
word_list = ", ".join(adverse_events[:10]); print(word_list)

3430
Febrile Neutropenia, Cardiotoxicity, Malaise, Mental Disorder, General Physical Health Deterioration, Anaemia, Delirium, Cerebral Haemorrhage, Eastern Cooperative Oncology Group Performance Status Worsened, Infusion Related Reaction


In [9]:
# Pre-trained sentence-embedding model used to vectorise the event terms.
# The identifier is kept in a named variable so the model choice is easy to spot and swap.
MODEL_NAME = 'distilbert-base-nli-stsb-mean-tokens'
model = SentenceTransformer(MODEL_NAME)

In [10]:
# For domain-specific medical text, pre-trained on clinical data
# ClinicalBERT, fine-tuned on clinical text (e.g., clinical trials, medical literature)
#model = SentenceTransformer('emilyalsentzer/Bio_ClinicalBERT')  # Clinical BERT for clinical trial data

# For biomedical texts, trained on PubMed and PMC articles
#model = SentenceTransformer('biobert-large-cased-v1.1')  # BioBERT for biomedical text (PubMed/PMC)

# Another option trained on PubMed, focused on biomedical research literature
#model = SentenceTransformer('pubmed-bert-base')  # PubMed BERT for medical research-based text

# Lightweight and fast model, good for semantic similarity tasks, not domain-specific
#model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # Fast, general-purpose model (not specifically clinical)

# General-purpose model fine-tuned on the Semantic Textual Similarity dataset
#model = SentenceTransformer('stsb-roberta-large')  # RoBERTa for general text similarity (clinical possible)

In [11]:
# Encode every adverse-event term into its embedding vector with the loaded model.
embeddings = model.encode(sentences=adverse_events)

In [12]:
# Number of clusters (categories) K-Means is asked to produce; tune as needed.
num_categories: int = 30

In [13]:
# Partition the embeddings into `num_categories` clusters with K-Means.
# `random_state` is fixed so repeated runs use the same seed.
kmeans = KMeans(n_clusters=num_categories, n_init=10, random_state=0)
kmeans.fit(embeddings)
# One cluster label per input row, in the same order as `embeddings`.
clusters = kmeans.labels_

In [14]:
# Pair each adverse-event term with the cluster label it was assigned.
df = pd.DataFrame({'Adverse Event': adverse_events}).assign(Category=clusters)

In [15]:
# For every category, print a green (ANSI-coloured) heading followed by its
# member events in alphabetical order, comma-separated.
for category in range(num_categories):
    print(f"\n\033[32mCategory {category}:\033[0m")
    in_category = df['Category'] == category
    category_list = df.loc[in_category, 'Adverse Event'].tolist()
    category_word_list = ", ".join(sorted(category_list))
    print(category_word_list)


[32mCategory 0:[0m
Abscess Limb, Acetabulum Fracture, Ankle Fracture, Atypical Femur Fracture, Cervical Vertebral Fracture, Clavicle Fracture, Compression Fracture, Contusion, Device Dislocation, Duodenal Perforation, Duodenitis, Femoral Neck Fracture, Femur Fracture, Fibula Fracture, Finger Amputation, Fracture, Fracture Displacement, Fractured Sacrum, Hand Fracture, Hip Fracture, Hip Surgery, Joint Ankylosis, Joint Dislocation, Joint Effusion, Joint Injury, Joint Range Of Motion Decreased, Joint Stiffness, Joint Swelling, Leg Amputation, Limb Injury, Limb Mass, Long Qt Syndrome, Lower Limb Fracture, Lumbar Vertebral Fracture, Metastases To Muscle, Metastases To Neck, Metastases To Spine, Multiple Fractures, Muscle Atrophy, Muscle Contracture, Muscle Disorder, Muscle Fatigue, Muscle Injury, Muscle Spasms, Muscle Spasticity, Muscle Strain, Muscle Tightness, Muscle Twitching, Muscular Weakness, Musculoskeletal Stiffness, Neck Injury, Neck Mass, Nerve Injury, Nerve Root Compression, P

In [16]:
# Show the event-to-category mapping as this cell's output.
df

Unnamed: 0,Adverse Event,Category
0,Febrile Neutropenia,10
1,Cardiotoxicity,28
2,Malaise,15
3,Mental Disorder,17
4,General Physical Health Deterioration,27
...,...,...
3425,Serotonin Syndrome,11
3426,Mydriasis,20
3427,Disorganised Speech,15
3428,Anal Inflammation,6


In [17]:
# Report the wall-clock time elapsed since `start` was recorded, as whole minutes and seconds.
elapsed_time = time.time() - start
minutes = elapsed_time // 60
seconds = elapsed_time % 60
print("'FAERS_AdEvents_NLP_v001' script run time:", f"{int(minutes)} min {int(seconds)} sec.")

'FAERS_AdEvents_NLP_v001' script run time: 3 min 1 sec.
