In [None]:
# EudraVigilance Line Listing Analysis with VigiMatch-style Deduplication
# Author: Djamilla Simoens (2025)
# Version: 1.0 

# =============================
# Install Dependencies
# =============================

# In Jupyter Notebook, run this cell first only when needed:
!pip install pandas numpy scipy openpyxl tqdm

In [None]:
# =============================
# Import Libraries
# =============================

import pandas as pd
import numpy as np
import os
from scipy.stats import fisher_exact, chi2_contingency
from tqdm import tqdm
from hashlib import sha256

In [None]:
# =============================
# Load Excel document
# =============================
# Location of the line-listing workbook to analyse.
file_path = "eudravigilance_line_listing.xlsx"  # Replace with your file

# Read the workbook into a DataFrame; no sheet is named, so pandas uses its default sheet.
df = pd.read_excel(file_path)
# Quick sanity check: report how many data rows were read.
print(f"Loaded {len(df)} rows")

In [None]:
# =============================
# 4. Preprocess Columns
# =============================
# Normalize column names: trim surrounding whitespace, lower-case, and turn
# spaces into underscores so columns can be addressed as e.g. 'reaction_term'.
trimmed_headers = df.columns.str.strip()
lowered_headers = trimmed_headers.str.lower()
df.columns = lowered_headers.str.replace(" ", "_")

# Replace every missing value with an empty string so that later string
# operations and the match key never see NaN.
df = df.fillna(value="")


# Normalize textual content

def normalize(text):
    """Return ``text`` converted to a string, trimmed of surrounding whitespace, in lower case.

    Non-string inputs are stringified with ``str()`` first, so ``None`` becomes ``"none"``.
    """
    as_string = str(text)
    trimmed = as_string.strip()
    return trimmed.lower()

# Apply the same text normalization to each free-text column used for matching.
# Reaction terms that are grouped with ';' stay grouped here; they are split later.
for text_column in ('reaction_term', 'drug_name', 'sex'):
    df[text_column] = df[text_column].apply(normalize)

# =============================
# 5. Create VigiMatch-style Deduplication Key
# =============================
def create_match_key(row):
    """Build the SHA-256 fingerprint used to detect duplicate reports.

    The fingerprint covers, in this order: age, sex, drug name, reaction term,
    reporter country and the first 10 characters of the reported date.

    Every component is stringified, stripped and lower-cased inside this function,
    so fields that were not normalized beforehand (e.g. ``reporter_country``) no
    longer defeat the match through case or whitespace differences. Components that
    are already trimmed and lower-cased hash exactly as before.

    Args:
        row: Record exposing ``.get(key, default)`` -- a pandas row Series or a
            plain dict. A missing field contributes an empty string.

    Returns:
        str: Hexadecimal SHA-256 digest of the ``'|'``-joined components.
    """
    def _canonical(field):
        # One canonical form per field: text, trimmed, lower-cased.
        return str(row.get(field, '')).strip().lower()

    components = [
        _canonical('age'),
        _canonical('sex'),
        _canonical('drug_name'),
        _canonical('reaction_term'),
        _canonical('reporter_country'),
        _canonical('reported_date')[:10],  # Date only (drops a trailing time part)
    ]
    key_string = "|".join(components)
    return sha256(key_string.encode()).hexdigest()

# Fingerprint every report row by row.
df['match_key'] = df.apply(create_match_key, axis=1)

# =============================
# 6. Deduplicate Records
# =============================
# Keep the first report seen for each fingerprint and discard later repeats.
df_dedup = df[~df.duplicated(subset=['match_key'], keep='first')]

print(f"\n Duplicates Removed {len(df) - len(df_dedup)} duplicate reports")
print(f" Deduplicated count: {len(df_dedup)}")

# =============================
# 7. Split reaction terms into individual rows
# =============================
# Turn each ';'-separated reaction list into one row per individual term.
reaction_split = (
    df_dedup.assign(reaction_term=df_dedup['reaction_term'].str.split(';'))
    .explode('reaction_term')
)
reaction_split['reaction_term'] = reaction_split['reaction_term'].str.strip()

# Blank cells and stray separators ("nausea;" or "a;;b") leave empty-string terms
# behind; drop them so they are not tallied as if they were a reaction.
reaction_split = reaction_split[reaction_split['reaction_term'].ne('')]

# 8. Basic Analysis
# Frequency of each individual reaction term across the expanded rows.
print("\nReaction term counts:")
all_reactions = reaction_split['reaction_term'].value_counts()
print(all_reactions)

# Seriousness breakdown, only when the input has a 'serious' column; otherwise an
# empty Series is kept so the export step still has something to write.
# NOTE(review): counts are taken on the expanded frame, i.e. one count per reaction
# row rather than per report -- confirm this is the intended unit.
if 'serious' not in reaction_split.columns:
    seriousness_counts = pd.Series(dtype=int)
else:
    print("\nSerious vs. Non-serious:")
    seriousness_counts = reaction_split['serious'].value_counts()
    print(seriousness_counts)

# 9. Export Deduplicated File
# One workbook, four sheets. The two record-level sheets are written without the
# index; the two count tables keep it, because the index holds the counted values.
with pd.ExcelWriter("deduplicated_output.xlsx") as writer:
    export_plan = (
        ("Deduplicated Data", df_dedup, False),
        ("Expanded Reactions", reaction_split, False),
        ("Reaction Counts", all_reactions.to_frame(name="count"), True),
        ("Seriousness Summary", seriousness_counts.to_frame(name="count"), True),
    )
    for sheet_title, sheet_frame, keep_index in export_plan:
        sheet_frame.to_excel(writer, sheet_name=sheet_title, index=keep_index)

print("\n Exported deduplicated data and analysis to 'deduplicated_output.xlsx'")
