In [1]:
# FAERS ROR and PRR analysis with deduplication
# Version v1.0
# Djamilla Simoens

# =============================
# 1️⃣ Install Dependencies
# =============================

# In Jupyter Notebook, run this cell first:
!pip install pandas numpy scipy openpyxl tqdm




In [2]:
# =============================
# 2️⃣ Import Libraries
# =============================

import pandas as pd
import numpy as np
import os
import glob
from scipy.stats import fisher_exact, chi2_contingency
from tqdm import tqdm


In [3]:
# =============================
# 3️⃣ User Input
# =============================
# Type the name of the drug and adverse reactions of interest

drug_synonyms = ['LUXTURNA', 'VORETIGENE NEPARVOVEC']
event_synonyms = ['Retinal degeneration', 'RETINAL ATROPHY', 'FOVEAL ATROPHY', 'RETINAL DEPIGMENTATION', 'INJECTION SITE ATROPHY']
data_dir = './faers_data'  # Update with your FAERS TXT folder

In [4]:
# =============================
# 4️⃣ Load and Concatenate FAERS Data
# =============================

def load_files(folder, pattern):
    files = sorted(glob.glob(os.path.join(folder, pattern)))
    dfs = []
    for file in tqdm(files, desc=f"Loading {pattern.split('*')[0]}"):
        try:
            df = pd.read_csv(file, sep='$', dtype=str, encoding='utf-8', low_memory=False)
            dfs.append(df)
        except UnicodeDecodeError:
            try:
                df = pd.read_csv(file, sep='$', dtype=str, encoding='latin1', low_memory=False)
                dfs.append(df)
            except Exception as e:
                print(f"Failed to load {file}: {e}")
        except Exception as e:
            print(f"Failed to load {file}: {e}")
    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

df_drug = load_files(data_dir, 'DRUG*.txt')
df_reac = load_files(data_dir, 'REAC*.txt')
print(f"Total DRUG records: {len(df_drug)}")
print(f"Total REACTION records: {len(df_reac)}")


Loading DRUG: 100%|█████████████████████████████| 30/30 [04:23<00:00,  8.79s/it]
Loading REAC: 100%|█████████████████████████████| 30/30 [00:51<00:00,  1.73s/it]


Total DRUG records: 56301263
Total REACTION records: 42727091


In [5]:
# =============================
# 5️⃣ Preprocessing
# =============================

print("Filtering suspect drugs...")
df_drug_ps = df_drug[df_drug['role_cod'].str.upper() == 'PS'].copy()
df_drug_ps['drugname'] = df_drug_ps['drugname'].fillna('').str.upper()
df_reac['pt'] = df_reac['pt'].fillna('').str.upper()

# Normalize and deduplicate drug names using VigiMatch-style rules

def normalize_name(name):
    if not isinstance(name, str):
        return ''
    name = name.upper()
    for synonym in drug_synonyms:
        if synonym.upper() in name:
            return synonym.upper()
    return name

df_drug_ps['drugname_norm'] = df_drug_ps['drugname'].apply(normalize_name)

# Apply VigiMatch-style deduplication: keep only first entry per (primaryid, drugname_norm, route, dose_vbm)
dedup_columns = ['primaryid', 'drugname_norm', 'route', 'dose_vbm']
df_drug_ps_before = len(df_drug_ps)
df_drug_ps_unique = df_drug_ps.sort_values(by='primaryid').drop_duplicates(subset=dedup_columns)
df_drug_ps_after = len(df_drug_ps_unique)
duplicates_removed_drug = df_drug_ps_before - df_drug_ps_after
print(f" VigiMatch-style DRUG duplicates removed: {duplicates_removed_drug} (from {df_drug_ps_before} → {df_drug_ps_after})")


Filtering suspect drugs...
 VigiMatch-style DRUG duplicates removed: 23785 (from 12937001 → 12913216)


In [6]:
# =============================
# 6️⃣ Matching Logic
# =============================

isr_drug = set()
for name in drug_synonyms:
    name = name.upper()
    matches = df_drug_ps_unique[df_drug_ps_unique['drugname_norm'] == name]['primaryid'].unique()
    isr_drug.update(matches)

isr_event = set()
for name in event_synonyms:
    name = name.upper()
    matches = df_reac[df_reac['pt'].str.contains(name, na=False, regex=False)]['primaryid'].unique()
    isr_event.update(matches)


In [7]:
# =============================
# 7️⃣ Contingency Table
# =============================

A = len(isr_drug & isr_event)  # Drug + Event
B = len(isr_drug - isr_event)  # Drug + No Event
C = len(isr_event - isr_drug)  # Event + No Drug
all_isr = set(df_drug['primaryid'].unique())
D = len(all_isr - (isr_drug | isr_event))  # Neither

print(f"Contingency Table:\nA: {A}\nB: {B}\nC: {C}\nD: {D}")


Contingency Table:
A: 59
B: 150
C: 1718
D: 12876858


In [8]:
# =============================
# 8️⃣ Statistics
# =============================

contingency_table = np.array([[A, B], [C, D]])
ROR = (A / B) / (C / D) if B > 0 and C > 0 and D > 0 else np.nan

# PRR calculation
prr_numerator = A / (A + B) if (A + B) > 0 else np.nan
prr_denominator = (A + C) / (A + B + C + D) if (A + B + C + D) > 0 else np.nan
PRR = prr_numerator / prr_denominator if prr_denominator > 0 else np.nan

oddsr, p_fisher = fisher_exact(contingency_table)
chi2, p_chi2, dof, expected = chi2_contingency(contingency_table)
SE = np.sqrt(1/A + 1/B + 1/C + 1/D) if A*B*C*D > 0 else np.nan
lower_CI = np.exp(np.log(ROR) - 1.96 * SE) if not np.isnan(SE) else np.nan
upper_CI = np.exp(np.log(ROR) + 1.96 * SE) if not np.isnan(SE) else np.nan


In [9]:
# =============================
# 9️⃣ Output Results
# =============================

results = {
    'Database': 'FAERS',
    'Drug(s)': ", ".join(drug_synonyms),
    'Event(s)': ", ".join(event_synonyms),
    'A (Drug+Event)': A,
    'B (Drug+No Event)': B,
    'C (Event+No Drug)': C,
    'D (Neither)': D,
    'ROR': ROR,
    'ROR 95% CI Lower': lower_CI,
    'ROR 95% CI Upper': upper_CI,
    'PRR': PRR,
    'Fisher p-value': p_fisher,
    'Chi2 p-value': p_chi2,
    'DRUG Duplicates Removed': duplicates_removed_drug,
    'Original DRUG Records': df_drug_ps_before
}

result_df = pd.DataFrame([results])
print(result_df)

output_file = 'FAERS_ROR_PRR_Results.xlsx'
result_df.to_excel(output_file, index=False)
print(f"Results exported to {output_file}")

  Database                          Drug(s)  \
0    FAERS  LUXTURNA, VORETIGENE NEPARVOVEC   

                                            Event(s)  A (Drug+Event)  \
0  Retinal degeneration, RETINAL ATROPHY, FOVEAL ...              59   

   B (Drug+No Event)  C (Event+No Drug)  D (Neither)          ROR  \
0                150               1718     12876858  2948.135902   

   ROR 95% CI Lower  ROR 95% CI Upper          PRR  Fisher p-value  \
0       2173.375358       3999.081551  2045.941402   4.178955e-176   

   Chi2 p-value  DRUG Duplicates Removed  Original DRUG Records  
0           0.0                    23785               12937001  
Results exported to FAERS_ROR_PRR_Results.xlsx
