# BC5CDR Biomedical NER EDA

Basic exploratory data analysis for the BigBio BC5CDR dataset (KB schema).

In [2]:
# Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset

pd.set_option("display.max_colwidth", 120)
plt.style.use("seaborn-v0_8")

DATASET_NAME = "bigbio/bc5cdr"
DATASET_CONFIG = "bc5cdr_bigbio_kb"

  from .autonotebook import tqdm as notebook_tqdm


## 1. Load Dataset into a DataFrame

Load the dataset and create a lightweight DataFrame for document-level inspection.

In [3]:
# Load dataset
raw = load_dataset(DATASET_NAME, DATASET_CONFIG)
raw

DatasetDict({
    train: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 500
    })
})

In [5]:
# Show one raw example (train split)
raw["train"][0]

{'id': '0',
 'document_id': '227508',
 'passages': [{'id': '1',
   'type': 'title',
   'text': ['Naloxone reverses the antihypertensive effect of clonidine.'],
   'offsets': [[0, 59]]},
  {'id': '2',
   'type': 'abstract',
   'text': ['In unanesthetized, spontaneously hypertensive rats the decrease in blood pressure and heart rate produced by intravenous clonidine, 5 to 20 micrograms/kg, was inhibited or reversed by nalozone, 0.2 to 2 mg/kg. The hypotensive effect of 100 mg/kg alpha-methyldopa was also partially reversed by naloxone. Naloxone alone did not affect either blood pressure or heart rate. In brain membranes from spontaneously hypertensive rats clonidine, 10(-8) to 10(-5) M, did not influence stereoselective binding of [3H]-naloxone (8 nM), and naloxone, 10(-8) to 10(-4) M, did not influence clonidine-suppressible binding of [3H]-dihydroergocryptine (1 nM). These findings indicate that in spontaneously hypertensive rats the effects of central alpha-adrenoceptor stimulation in

### Token-level labels example

This shows one document converted into tokens and BIO labels so you can see exactly how labeling works.

In [10]:
# Convert one example to tokens + BIO tags (using the same logic as training)
from src.data_utils import convert_kb_to_tokens_ner

ex0 = raw["train"][1]
converted = convert_kb_to_tokens_ner({"passages": [ex0["passages"]], "entities": [ex0["entities"]]})

tokens = converted["tokens"][0]
ner_tags = converted["ner_tags"][0]

# Show the first 40 token-label pairs
list(zip(tokens, ner_tags))[:40]

[('Lidocaine-induced', 'B-Chemical'),
 ('cardiac', 'B-Disease'),
 ('asystole.', 'I-Disease'),
 ('Intravenous', 'O'),
 ('administration', 'O'),
 ('of', 'O'),
 ('a', 'O'),
 ('single', 'O'),
 ('50-mg', 'O'),
 ('bolus', 'O'),
 ('of', 'O'),
 ('lidocaine', 'B-Chemical'),
 ('in', 'O'),
 ('a', 'O'),
 ('67-year-old', 'O'),
 ('man', 'O'),
 ('resulted', 'O'),
 ('in', 'O'),
 ('profound', 'O'),
 ('depression', 'B-Disease'),
 ('of', 'O'),
 ('the', 'O'),
 ('activity', 'O'),
 ('of', 'O'),
 ('the', 'O'),
 ('sinoatrial', 'O'),
 ('and', 'O'),
 ('atrioventricular', 'O'),
 ('nodal', 'O'),
 ('pacemakers.', 'O'),
 ('The', 'O'),
 ('patient', 'O'),
 ('had', 'O'),
 ('no', 'O'),
 ('apparent', 'O'),
 ('associated', 'O'),
 ('conditions', 'O'),
 ('which', 'O'),
 ('might', 'O'),
 ('have', 'O')]

In [6]:
from pprint import pprint
pprint(raw["train"][0])

{'coreferences': [],
 'document_id': '227508',
 'entities': [{'id': '3',
               'normalized': [{'db_id': 'D009270', 'db_name': 'MESH'}],
               'offsets': [[0, 8]],
               'text': ['Naloxone'],
               'type': 'Chemical'},
              {'id': '4',
               'normalized': [{'db_id': 'D003000', 'db_name': 'MESH'}],
               'offsets': [[49, 58]],
               'text': ['clonidine'],
               'type': 'Chemical'},
              {'id': '5',
               'normalized': [{'db_id': 'D006973', 'db_name': 'MESH'}],
               'offsets': [[93, 105]],
               'text': ['hypertensive'],
               'type': 'Disease'},
              {'id': '6',
               'normalized': [{'db_id': 'D003000', 'db_name': 'MESH'}],
               'offsets': [[181, 190]],
               'text': ['clonidine'],
               'type': 'Chemical'},
              {'id': '7',
               'normalized': [],
               'offsets': [[244, 252]],
            

In [4]:
# Build a simple document-level DataFrame (id + document text)
# The KB schema stores passages; we will concatenate passages per record for EDA.

def doc_text_from_passages(passages):
    return " ".join([p["text"] for p in passages])

rows = []
for split in raw.keys():
    for ex in raw[split]:
        rows.append({
            "split": split,
            "id": ex.get("id"),
            "document_id": ex.get("document_id"),
            "text": doc_text_from_passages(ex.get("passages", [])),
            "entities": ex.get("entities", []),
        })

df = pd.DataFrame(rows)
df.head(3)

TypeError: sequence item 0: expected str instance, list found

## 2. Initial Data Inspection

Check shape, column names, and sample rows.

In [8]:
df.shape, df.columns

NameError: name 'df' is not defined

In [None]:
df.head(2)

In [None]:
df.tail(2)

df.dtypes

## 3. Data Cleaning and Type Casting

Standardize column names and compute derived lengths.

In [7]:
# Standardize column names
clean_df = df.copy()
clean_df.columns = [c.strip().lower() for c in clean_df.columns]

# Derived features
clean_df["text_len"] = clean_df["text"].str.len()
clean_df["entity_count"] = clean_df["entities"].apply(lambda x: len(x) if isinstance(x, list) else 0)

clean_df.head(3)

NameError: name 'df' is not defined

## 4. Missing Values Analysis

Check missingness per column.

In [None]:
missing = clean_df.isna().mean().sort_values(ascending=False)
missing

## 5. Descriptive Statistics

Summaries for numeric features and split counts.

In [None]:
clean_df[["text_len", "entity_count"]].describe()

clean_df["split"].value_counts()

## 6. Univariate Distributions

Histograms for text length and entity counts.

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

sns.histplot(clean_df["text_len"], bins=30, ax=axes[0])
axes[0].set_title("Text Length (chars)")

sns.histplot(clean_df["entity_count"], bins=30, ax=axes[1])
axes[1].set_title("Entities per Document")

plt.tight_layout()
plt.show()

## 7. Correlation Analysis and Heatmap

Compute correlations among numeric features.

In [None]:
corr = clean_df[["text_len", "entity_count"]].corr()
plt.figure(figsize=(4, 3))
sns.heatmap(corr, annot=True, cmap="Blues", vmin=-1, vmax=1)
plt.title("Correlation Heatmap")
plt.show()

## 8. Group-by Aggregations

Compare statistics by split.

In [None]:
clean_df.groupby("split")[["text_len", "entity_count"]].agg(["mean", "median", "max"])

## 9. Outlier Detection

Use IQR to flag unusually long documents or entity-rich documents.

In [None]:
def iqr_bounds(series):
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    return q1 - 1.5 * iqr, q3 + 1.5 * iqr

text_low, text_high = iqr_bounds(clean_df["text_len"])
ent_low, ent_high = iqr_bounds(clean_df["entity_count"])

outliers = clean_df[(clean_df["text_len"] > text_high) | (clean_df["entity_count"] > ent_high)]
outliers[["id", "split", "text_len", "entity_count"]].head(10)

## 10. Feature Engineering Preview

Add a simple density feature: entities per 1k characters.

In [None]:
clean_df["entities_per_1k_chars"] = clean_df["entity_count"] / (clean_df["text_len"] / 1000).replace(0, np.nan)
clean_df[["entities_per_1k_chars"]].describe()

## 11. Export Cleaned Data

Save the derived features to a CSV for downstream analysis.

In [None]:
output_path = "results/eda_bc5cdr_summary.csv"
clean_df[["id", "document_id", "split", "text_len", "entity_count", "entities_per_1k_chars"]].to_csv(output_path, index=False)
print(f"Exported to {output_path}")

In [None]:
clean_df.groupby("split")[["text_len", "entity_count"]].agg(["mean", "median", "max"])