In [1]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.3/3.3 MB[0m [31m146.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m81.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [2]:
entity_queries = {
    "disease": [
        "diabetes", "asthma", "hypertension", "tuberculosis", "lung cancer", "breast cancer",
        "colorectal cancer", "alzheimer", "parkinson", "hepatitis", "arthritis", "anemia",
        "myocardial infarction", "stroke", "epilepsy", "glaucoma", "thyroid disorder", "gingivitis",
        "endometriosis", "IBS", "osteoporosis", "leukemia", "melanoma", "sickle cell", "meningitis",
        "pancreatitis", "psoriasis", "chronic kidney disease", "dental caries", "migraine"
    ],
    "symptom": [
        "fatigue", "headache", "abdominal pain", "fever", "blurred vision", "cough", "shortness of breath",
        "chest pain", "joint pain", "dizziness", "nausea", "vomiting", "diarrhea", "weight loss",
        "back pain", "loss of appetite", "itching", "hearing loss", "tremors", "palpitations",
        "urinary incontinence", "constipation", "difficulty swallowing", "dry eyes", "muscle cramps",
        "night sweats", "memory loss", "frequent urination", "skin rash", "tingling sensation"
    ],
    "gene": [
        "TP53", "EGFR", "BRCA1", "BRCA2", "KRAS", "APOE", "TNF", "VEGFA", "CFTR", "MTHFR",
        "IL6", "ESR1", "AKT1", "HER2", "GATA3", "PTEN", "NTRK1", "PIK3CA", "ALK", "CDKN2A",
        "FMR1", "HBB", "MYC", "SMAD4", "FOXP2", "NRAS", "KIT", "NOTCH1", "BRAF", "ERBB2"
    ],
    "protein": [
        "insulin", "hemoglobin", "albumin", "collagen", "cytokine", "troponin", "amyloid beta",
        "elastin", "fibrinogen", "trypsin", "keratin", "myosin", "glucagon", "calcitonin",
        "angiotensin", "histone", "thrombin", "interleukin 6", "alpha fetoprotein", "renin",
        "chymotrypsin", "transferrin", "lactoferrin", "prolactin", "somatostatin",
        "neuropeptide Y", "C reactive protein", "growth hormone", "interferon gamma", "ubiquitin"
    ]
}

In [3]:
from Bio import Entrez
import time

# Use your actual email here (required by NCBI)
Entrez.email = "abouhanezahra@gmail.com"

def fetch_abstracts(query, max_count=50):
    try:
        handle = Entrez.esearch(db="pubmed", term=query, retmax=max_count)
        record = Entrez.read(handle)
        ids = record["IdList"]
        handle.close()

        if not ids:
            return ""

        handle = Entrez.efetch(db="pubmed", id=",".join(ids), rettype="abstract", retmode="text")
        abstracts = handle.read()
        handle.close()
        return abstracts
    except Exception as e:
        print(f"Error with query '{query}': {e}")
        return ""

output_file = "pubmed_abstracts.txt"
with open(output_file, "w", encoding="utf-8") as f:
    for category, queries in entity_queries.items():
        for query in queries:
            print(f"[{category}] Fetching abstracts for: {query}")
            abstracts = fetch_abstracts(query, max_count=50)
            if abstracts.strip():
                f.write(f"### {category.upper()}: {query} ###\n{abstracts}\n\n")
            time.sleep(0.5)  # avoid rate-limiting

print(f"Saved all abstracts to {output_file}")


[disease] Fetching abstracts for: diabetes
[disease] Fetching abstracts for: asthma
[disease] Fetching abstracts for: hypertension
[disease] Fetching abstracts for: tuberculosis
[disease] Fetching abstracts for: lung cancer
[disease] Fetching abstracts for: breast cancer
[disease] Fetching abstracts for: colorectal cancer
[disease] Fetching abstracts for: alzheimer
[disease] Fetching abstracts for: parkinson
[disease] Fetching abstracts for: hepatitis
[disease] Fetching abstracts for: arthritis
[disease] Fetching abstracts for: anemia
[disease] Fetching abstracts for: myocardial infarction
[disease] Fetching abstracts for: stroke
[disease] Fetching abstracts for: epilepsy
[disease] Fetching abstracts for: glaucoma
[disease] Fetching abstracts for: thyroid disorder
[disease] Fetching abstracts for: gingivitis
[disease] Fetching abstracts for: endometriosis
[disease] Fetching abstracts for: IBS
[disease] Fetching abstracts for: osteoporosis
[disease] Fetching abstracts for: leukemia
[dis

In [2]:
ambiguous_gene_symbols = [
    "CAT", "ACE", "MAP", "SET", "ARM", "MAX",
    "YES", "MET", "MEN1", "GAS", "WAS"
]

 extract human gene names + synonyms

In [5]:
disease_dict = {
    "diabetes": ["diabetes mellitus", "DM", "type 1 diabetes", "type 2 diabetes"],
    "hypertension": ["high blood pressure", "HTN"],
    "asthma": ["bronchial asthma", "reactive airway disease"],
    "lung cancer": ["pulmonary carcinoma", "bronchogenic carcinoma"],
    "breast cancer": ["mammary carcinoma"],
    "colorectal cancer": ["colon cancer", "bowel cancer"],
    "alzheimer's disease": ["AD", "alzheimer"],
    "parkinson's disease": ["PD", "parkinson"],
    "hepatitis": ["liver inflammation", "hepatic infection"],
    "arthritis": ["joint inflammation", "rheumatoid arthritis", "osteoarthritis"],
    "anemia": ["iron deficiency", "low hemoglobin"],
    "stroke": ["cerebrovascular accident", "brain attack"],
    "myocardial infarction": ["heart attack", "MI"],
    "epilepsy": ["seizure disorder"],
    "psoriasis": ["psoriatic disease"],
    "leukemia": ["blood cancer", "white blood cell cancer"],
    "melanoma": ["skin cancer", "cutaneous melanoma"],
    "thyroid disorder": ["hyperthyroidism", "hypothyroidism", "goiter"],
    "IBS": ["irritable bowel syndrome"],
    "CKD": ["chronic kidney disease", "renal failure"]
}
symptom_dict = {
    "fever": ["pyrexia", "elevated temperature"],
    "headache": ["cephalalgia", "migraine"],
    "abdominal pain": ["stomach ache", "belly pain"],
    "fatigue": ["tiredness", "exhaustion"],
    "nausea": ["queasiness", "sickness"],
    "vomiting": ["emesis", "throwing up"],
    "diarrhea": ["loose stools", "frequent bowel movements"],
    "shortness of breath": ["dyspnea", "breathlessness"],
    "cough": ["dry cough", "productive cough"],
    "chest pain": ["angina", "thoracic pain"],
    "joint pain": ["arthralgia"],
    "back pain": ["lumbago"],
    "dizziness": ["vertigo", "lightheadedness"],
    "itching": ["pruritus"],
    "rash": ["skin eruption", "dermatitis"],
    "loss of appetite": ["anorexia"],
    "weight loss": ["involuntary weight loss"],
    "memory loss": ["amnesia"],
    "night sweats": ["nocturnal sweating"],
    "tingling sensation": ["paresthesia"]
}
gene_dict = {
    "TP53": ["P53", "BCC7", "LFS1"],
    "BRCA1": ["BRCC1", "RNF53"],
    "BRCA2": ["FAD1", "FAD", "FANCD1"],
    "EGFR": ["ERBB", "ERBB1"],
    "KRAS": ["C-K-RAS", "RASK2"],
    "APOE": ["Apolipoprotein E"],
    "MTHFR": ["methylenetetrahydrofolate reductase"],
    "IL6": ["interleukin 6"],
    "TNF": ["tumor necrosis factor", "TNFA"],
    "VEGFA": ["vascular endothelial growth factor A"],
    "PIK3CA": ["phosphatidylinositol-4,5-bisphosphate 3-kinase catalytic subunit alpha"],
    "ALK": ["anaplastic lymphoma kinase"],
    "HER2": ["ERBB2", "neu", "CD340"],
    "BRAF": ["proto-oncogene B-Raf"],
    "MYC": ["c-Myc", "avian myelocytomatosis viral oncogene homolog"]
}
protein_dict = {
    "insulin": ["insulin hormone"],
    "hemoglobin": ["Hb", "oxygen transport protein"],
    "albumin": ["serum albumin", "ALB"],
    "collagen": ["connective tissue protein"],
    "cytokine": ["inflammatory mediator", "signaling protein"],
    "amyloid beta": ["beta amyloid", "Aβ peptide"],
    "calcitonin": ["CT", "thyrocalcitonin"],
    "angiotensin": ["angiotensin I", "angiotensin II"],
    "interleukin 6": ["IL6"],
    "alpha fetoprotein": ["AFP"],
    "C reactive protein": ["CRP"],
    "growth hormone": ["GH", "somatotropin"],
    "interferon gamma": ["IFN-gamma", "IFNG"],
    "thrombin": ["coagulation factor II"],
    "elastin": ["connective tissue elastin"]
}


# Required Preprocessing Before Distant Supervision

In [3]:
!pip install -U spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Step 4: Flatten Dictionaries + Annotate Function

In [6]:
from itertools import chain

def flatten_dict(entity_dict, label, ambiguous_filter=None):
    term_map = {}
    for main, synonyms in entity_dict.items():
        for term in [main] + synonyms:
            term_lc = term.lower()
            if not (ambiguous_filter and term.upper() in ambiguous_filter and label == "GENE"):
                term_map[term_lc] = label
    return term_map

entity_terms = {}
entity_terms.update(flatten_dict(disease_dict, "DISEASE", ambiguous_gene_symbols))
entity_terms.update(flatten_dict(symptom_dict, "SYMPTOM", ambiguous_gene_symbols))
entity_terms.update(flatten_dict(gene_dict, "GENE", ambiguous_gene_symbols))
entity_terms.update(flatten_dict(protein_dict, "PROTEIN", ambiguous_gene_symbols))

def annotate_with_bio(text, entity_terms):
    doc = nlp(text)
    labels = ["O"] * len(doc)

    for i in range(len(doc)):
        for j in range(i + 1, min(i + 10, len(doc)) + 1):
            span = doc[i:j]
            span_text = span.text.lower()
            if span_text in entity_terms and all(labels[k] == "O" for k in range(i, j)):
                label = entity_terms[span_text]
                labels[i] = f"B-{label}"
                for k in range(i + 1, j):
                    labels[k] = f"I-{label}"
                break

    return [(token.text, label) for token, label in zip(doc, labels)]


🔹 Step 5: Process the File and Save Output in CoNLL Format
python
Copier
Modifier


In [8]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [9]:
# Read file
with open("/content/drive/MyDrive/pubmed_abstracts.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Clean and process
bio_corpus = []
for line in lines:
    line = line.strip()
    if not line or line.startswith("###"):  # skip empty and headers
        continue
    tokens = annotate_with_bio(line, entity_terms)
    for word, label in tokens:
        bio_corpus.append(f"{word}\t{label}")
    bio_corpus.append("")  # sentence break

# Save as CoNLL file
with open("/content/drive/MyDrive/pubmed_abstracts_labled.txt", "w", encoding="utf-8") as out:
    out.write("\n".join(bio_corpus))


TSV format conversion for BioBERT

In [12]:
from google.colab import files
uploaded = files.upload()


Saving pubmed_abstracts.txt to pubmed_abstracts.txt


In [10]:
from itertools import chain

def flatten_dict(entity_dict, label, ambiguous_filter=None):
    term_map = {}
    for main, synonyms in entity_dict.items():
        for term in [main] + synonyms:
            term_lc = term.lower()
            if not (ambiguous_filter and term.upper() in ambiguous_filter and label == "GENE"):
                term_map[term_lc] = label
    return term_map

entity_terms = {}
entity_terms.update(flatten_dict(disease_dict, "DISEASE", ambiguous_gene_symbols))
entity_terms.update(flatten_dict(symptom_dict, "SYMPTOM", ambiguous_gene_symbols))
entity_terms.update(flatten_dict(gene_dict, "GENE", ambiguous_gene_symbols))
entity_terms.update(flatten_dict(protein_dict, "PROTEIN", ambiguous_gene_symbols))

def annotate_with_bio(text, entity_terms):
    doc = nlp(text)
    labels = ["O"] * len(doc)

    for i in range(len(doc)):
        for j in range(i + 1, min(i + 10, len(doc)) + 1):
            span = doc[i:j]
            span_text = span.text.lower()
            if span_text in entity_terms and all(labels[k] == "O" for k in range(i, j)):
                label = entity_terms[span_text]
                labels[i] = f"B-{label}"
                for k in range(i + 1, j):
                    labels[k] = f"I-{label}"
                break

    return [(token.text, label) for token, label in zip(doc, labels)]


In [13]:
input_filename = next(iter("/content/drive/MyDrive/pubmed_abstracts.txt"))

with open(input_filename, "r", encoding="utf-8") as f:
    lines = f.readlines()

tsv_lines = ["Token\tLabel"]
for line in lines:
    line = line.strip()
    if not line or line.startswith("###"):
        continue
    annotated = annotate_with_bio(line, entity_terms)
    for word, tag in annotated:
        tsv_lines.append(f"{word}\t{tag}")
    tsv_lines.append("")  # Optional: sentence separator

with open("biobert_ner_data.tsv", "w", encoding="utf-8") as f_out:
    f_out.write("\n".join(tsv_lines))


Explore data volume

* 4 entity types: disease, symptom, gene, protein

* 30 queries per type

* 50 abstracts per query

* 30 * 50 = 1500 per entity type

* 4 * 1500 = 6000 absracts in total

In [16]:
import os

# List of your target files
file_names = [
    "/content/drive/MyDrive/pubmed_abstracts.txt",
    "/content/drive/MyDrive/pubmed_abstracts_labled.txt",
    "/content/drive/MyDrive/biobert_ner_data.tsv",
    # "train.conll",
    # "dev.conll",
    # "test.conll"
]

# Print size for each file in MB
for file in file_names:
    if os.path.exists(file):
        size_mb = os.path.getsize(file) / (1024 * 1024)
        print(f"{file}: {size_mb:.2f} MB")
    else:
        print(f"{file}: File not found")


/content/drive/MyDrive/pubmed_abstracts.txt: 17.44 MB
/content/drive/MyDrive/pubmed_abstracts_labled.txt: 24.72 MB
/content/drive/MyDrive/biobert_ner_data.tsv: 24.72 MB


# Training Models

In [3]:
!pip install seqeval scikit-learn



# ✅ Full BiLSTM Model Training Code (no CRF yet)

In [6]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report

# ✅ Step 1: Read CoNLL-formatted file safely
def read_conll(path):
    sentences, tags = [], []
    words, labels = [], []

    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()

            # Empty line = sentence boundary
            if not line:
                if words:
                    sentences.append(words)
                    tags.append(labels)
                    words, labels = [], []
                continue

            # Skip header or malformed lines
            if line.startswith("Token") or len(line.split()) != 2:
                continue

            word, label = line.split()
            words.append(word)
            labels.append(label)

    if words:
        sentences.append(words)
        tags.append(labels)

    return sentences, tags

# ✅ Step 2: Build vocab
def build_vocab(sequences):
    vocab = {"<PAD>": 0, "<UNK>": 1}
    for sent in sequences:
        for token in sent:
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab

def encode_tokens(seq, vocab):
    return [vocab.get(token, vocab["<UNK>"]) for token in seq]

def encode_labels(seq, tag2idx):
    return [tag2idx[label] for label in seq]


# ✅ Step 3: Dataset
class NERDataset(Dataset):
    def __init__(self, X, Y, word2idx, tag2idx):
        self.X = [encode_tokens(x, word2idx) for x in X]
        self.Y = [encode_labels(y, tag2idx) for y in Y]

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx]), torch.tensor(self.Y[idx])

# ✅ Step 4: BiLSTM model (without CRF)
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim=100, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, tagset_size)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        return self.fc(x)

# ✅ Step 5: Load and encode data
sentences, tags = read_conll("/content/drive/MyDrive/pubmed_abstracts_labled.txt")
tag2idx = {t: i for i, t in enumerate(set(t for tag_seq in tags for t in tag_seq))}
word2idx = build_vocab(sentences)

X_train, X_test, y_train, y_test = train_test_split(sentences, tags, test_size=0.1, random_state=42)

train_dataset = NERDataset(X_train, y_train, word2idx, tag2idx)
test_dataset = NERDataset(X_test, y_test, word2idx, tag2idx)

from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    xs, ys = zip(*batch)
    xs_padded = pad_sequence(xs, batch_first=True, padding_value=0)
    ys_padded = pad_sequence(ys, batch_first=True, padding_value=-100)  # -100 = ignore index for loss
    return xs_padded, ys_padded

# train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=lambda x: x[0])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)


In [7]:
import torch
from tqdm import tqdm  # <-- New import

# ✅ Detect GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ✅ Model + optimizer + loss
model = BiLSTM(len(word2idx), len(tag2idx)).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# ✅ Training loop with tqdm and debug steps
for epoch in range(5):
    model.train()
    total_loss = 0
    print(f"\nEpoch {epoch + 1}")

    for step, (x, y) in enumerate(tqdm(train_loader)):
        x, y = x.to(device), y.to(device)

        y_pred = model(x)
        y_pred = y_pred.view(-1, y_pred.shape[-1])
        y = y.view(-1)

        loss = loss_fn(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # ✅ DEBUG: Print current step
       # print(f"  Step {step} - Batch loss: {loss.item():.4f}")

        # Optional early break (for testing)
        # if step > 5:
        #     break

    print(f"Epoch {epoch + 1} - Total Loss: {total_loss:.4f}")


Using device: cuda

Epoch 1


100%|██████████| 7885/7885 [01:06<00:00, 119.37it/s]


Epoch 1 - Total Loss: 94.1264

Epoch 2


100%|██████████| 7885/7885 [01:08<00:00, 115.57it/s]


Epoch 2 - Total Loss: 5.5261

Epoch 3


100%|██████████| 7885/7885 [01:05<00:00, 120.69it/s]


Epoch 3 - Total Loss: 1.8800

Epoch 4


100%|██████████| 7885/7885 [01:05<00:00, 120.81it/s]


Epoch 4 - Total Loss: 0.6919

Epoch 5


100%|██████████| 7885/7885 [01:05<00:00, 121.01it/s]

Epoch 5 - Total Loss: 0.6565





In [31]:
for x, y in train_loader:
    print(x.shape, y.shape)
    break

torch.Size([11]) torch.Size([11])


In [32]:
y_pred = model(x)
print("Pred shape:", y_pred.shape)
print("Target shape:", y.view(-1).shape)


Pred shape: torch.Size([11, 9])
Target shape: torch.Size([11])


In [8]:
# ✅ Save the trained model
torch.save(model.state_dict(), "/content/drive/MyDrive/NER_models/bilstm_baseline.pt")


In [9]:
# Recreate the model architecture first
model = BiLSTM(len(word2idx), len(tag2idx))
model.load_state_dict(torch.load("/content/drive/MyDrive/NER_models/bilstm_baseline.pt"))
model.eval()


BiLSTM(
  (embedding): Embedding(135162, 100, padding_idx=0)
  (lstm): LSTM(100, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=9, bias=True)
)

🔁 Rebuild and Load the Model

In [19]:
# Rebuild architecture
model = BiLSTM(len(word2idx), len(tag2idx))
model.load_state_dict(torch.load("/content/drive/MyDrive/NER_models/bilstm_baseline.pt"))
model.eval().to(device)


BiLSTM(
  (embedding): Embedding(135162, 100, padding_idx=0)
  (lstm): LSTM(100, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=9, bias=True)
)

🧪 Evaluation Loop for BiLSTM (No CRF)


In [22]:
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for x_batch, y_batch, mask in test_loader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        mask = mask.to(device)

        y_pred = model(x_batch)                # shape: [batch, seq_len, num_tags]
        y_pred = y_pred.argmax(dim=-1)         # take predicted class

        for pred, true, m in zip(y_pred, y_batch, mask):
            pred = pred[m].tolist()            # keep only non-pad predictions
            true = true[m].tolist()
            all_preds.append([idx2tag[i] for i in pred])
            all_labels.append([idx2tag[i] for i in true])

from seqeval.metrics import classification_report
print(classification_report(all_labels, all_preds))


              precision    recall  f1-score   support

     DISEASE       0.00      0.00      0.00       878
        GENE       0.00      0.00      0.00       376
     PROTEIN       0.00      0.00      0.00       320
     SYMPTOM       0.00      0.00      0.00       357

   micro avg       0.00      0.00      0.00      1931
   macro avg       0.00      0.00      0.00      1931
weighted avg       0.00      0.00      0.00      1931



Now Let's Add the CRF Layer

✅ 1. 📦 Install CRF Library

In [1]:
!pip install torchcrf



# ✅ 2. 🧠 Updated BiLSTM + CRF Model

In [None]:
import os
os.kill(os.getpid(), 9)


In [3]:
!pip uninstall -y torchcrf
!pip install git+https://github.com/kmkurn/pytorch-crf.git

Found existing installation: TorchCRF 1.1.0
Uninstalling TorchCRF-1.1.0:
  Successfully uninstalled TorchCRF-1.1.0
Collecting git+https://github.com/kmkurn/pytorch-crf.git
  Cloning https://github.com/kmkurn/pytorch-crf.git to /tmp/pip-req-build-j15k_84v
  Running command git clone --filter=blob:none --quiet https://github.com/kmkurn/pytorch-crf.git /tmp/pip-req-build-j15k_84v
  Resolved https://github.com/kmkurn/pytorch-crf.git to commit 623e3402d00a2728e99d6e8486010d67c754267b
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytorch-crf
  Building wheel for pytorch-crf (setup.py) ... [?25l[?25hdone
  Created wheel for pytorch-crf: filename=pytorch_crf-0.7.2-py3-none-any.whl size=6410 sha256=29dd993c75a44fe3e3aa3d9849f6d1483c359decd6d8f25a78e4173acfe6350d
  Stored in directory: /tmp/pip-ephem-wheel-cache-qkzvaob4/wheels/fd/83/cc/f11543939f8911b8dcff86e2bd54423e21f779d0938958cc7f
Successfully built pytorch-crf
Installing collected packages:

In [4]:
from torchcrf import CRF

In [7]:
# from torchcrf import CRF

class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim=100, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)

    def forward(self, x, tags=None, mask=None):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        emissions = self.fc(x)

        if tags is not None:
            loss = -self.crf(emissions, tags, mask=mask, reduction='mean')
            return loss
        else:
            return self.crf.decode(emissions, mask=mask)


✅ 3. 🧪 Training Loop With CRF

In [9]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    xs, ys = zip(*batch)
    xs_pad = pad_sequence(xs, batch_first=True, padding_value=0)
    ys_pad = pad_sequence(ys, batch_first=True, padding_value=0)
    mask = xs_pad != 0
    return xs_pad, ys_pad, mask


train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

import torch
from tqdm import tqdm

# ✅ Detect GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ✅ Move model to device
model = BiLSTM_CRF(len(word2idx), len(tag2idx)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# ✅ Train loop with tqdm + optional early break
for epoch in range(5):
    model.train()
    total_loss = 0
    print(f"\nEpoch {epoch + 1}")

    for step, (x_batch, y_batch, mask) in enumerate(tqdm(train_loader)):
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        mask = mask.to(device)

        loss = model(x_batch, tags=y_batch, mask=mask)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Optional early stopping for debugging
        # if step > 50: break

    print(f"Epoch {epoch + 1} - Total Loss: {total_loss:.4f}")


Using device: cuda

Epoch 1


100%|██████████| 31538/31538 [14:07<00:00, 37.19it/s]


Epoch 1 - Total Loss: 1938.6700

Epoch 2


100%|██████████| 31538/31538 [14:02<00:00, 37.44it/s]


Epoch 2 - Total Loss: 109.2750

Epoch 3


100%|██████████| 31538/31538 [13:55<00:00, 37.75it/s]


Epoch 3 - Total Loss: 38.4717

Epoch 4


100%|██████████| 31538/31538 [14:05<00:00, 37.31it/s]


Epoch 4 - Total Loss: 15.7936

Epoch 5


100%|██████████| 31538/31538 [15:20<00:00, 34.28it/s]

Epoch 5 - Total Loss: 23.1326





In [10]:
# ✅ Create a save path
save_path = "/content/drive/MyDrive/NER_models/bilstm_crf.pt"

# ✅ Save model weights
torch.save(model.state_dict(), save_path)

print(f"Model saved to {save_path}")


Model saved to /content/drive/MyDrive/NER_models/bilstm_crf.pt


In [11]:
# Rebuild the model architecture first
model = BiLSTM_CRF(len(word2idx), len(tag2idx)).to(device)

# Load the trained weights
model.load_state_dict(torch.load("/content/drive/MyDrive/NER_models/bilstm_crf.pt"))
model.eval()  # set to evaluation mode


BiLSTM_CRF(
  (embedding): Embedding(135162, 100, padding_idx=0)
  (lstm): LSTM(100, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=9, bias=True)
  (crf): CRF(num_tags=9)
)

In [12]:
torch.save({
    'model_state_dict': model.state_dict(),
    'word2idx': word2idx,
    'tag2idx': tag2idx
}, "/content/drive/MyDrive/NER_models/bilstm_crf_full.pth")


In [13]:
checkpoint = torch.load("/content/drive/MyDrive/NER_models/bilstm_crf_full.pth")

# Rebuild the same model shape
model = BiLSTM_CRF(len(checkpoint['word2idx']), len(checkpoint['tag2idx']))
model.load_state_dict(checkpoint['model_state_dict'])

# Restore vocab + label mappings
word2idx = checkpoint['word2idx']
tag2idx = checkpoint['tag2idx']


In [15]:
idx2tag = {i: tag for tag, i in tag2idx.items()}

In [18]:
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for x_batch, y_batch, mask in test_loader:
        preds = model(x_batch, mask=mask)
        for pred, true, m in zip(preds, y_batch, mask):
            true = true[m].tolist()  # filter out padding
            all_preds.append([idx2tag[i] for i in pred])
            all_labels.append([idx2tag[i] for i in true])

from seqeval.metrics import classification_report
print(classification_report(all_labels, all_preds))


              precision    recall  f1-score   support

     DISEASE       1.00      1.00      1.00       878
        GENE       1.00      1.00      1.00       376
     PROTEIN       1.00      0.99      1.00       320
     SYMPTOM       0.99      1.00      1.00       357

   micro avg       1.00      1.00      1.00      1931
   macro avg       1.00      1.00      1.00      1931
weighted avg       1.00      1.00      1.00      1931



----------------------BioBERT Training---------------------

# ✅ Step-by-Step Training Pipeline for BioBERT

In [5]:
!pip install transformers datasets seqeval

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Using cached huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m79.9 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached huggingface_hub-0.32.4-py3-none-any.whl (512 kB)
Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m81.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface-hub, tokenizers, transformers
Successfully installed huggingface-hub-0.32.4 tokenizers-0.21.1 transformers-

📂 Step 1: Load Your TSV File in BIO Format

✅ Step 2: Convert to HuggingFace Dataset

In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

def read_bio_tsv(file_path):
    sentences, labels = [], []  # <-- FIXED HERE
    words, tags = [], []

    with open(file_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                if words:
                    sentences.append(words)
                    labels.append(tags)
                    words, tags = [], []
                continue
            if line.startswith("Token"):  # skip header if exists
                continue
            parts = line.split()
            if len(parts) != 2:
                continue  # skip malformed lines
            token, label = parts
            words.append(token)
            tags.append(label)

    if words:
        sentences.append(words)
        labels.append(tags)

    return sentences, labels


# Load your data
tokens, tags = read_bio_tsv("/content/drive/MyDrive/biobert_ner_data.tsv")

# Build label list
unique_tags = sorted(set(tag for seq in tags for tag in seq))
label2id = {l: i for i, l in enumerate(unique_tags)}
id2label = {i: l for l, i in label2id.items()}

# Encode labels
tag_ids = [[label2id[t] for t in seq] for seq in tags]

# Split and create HuggingFace datasets
train_tokens, test_tokens, train_tags, test_tags = train_test_split(tokens, tag_ids, test_size=0.1, random_state=42)

train_dataset = Dataset.from_dict({'tokens': train_tokens, 'labels': train_tags})
test_dataset = Dataset.from_dict({'tokens': test_tokens, 'labels': test_tags})
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

✅ Step 3: Tokenize for BioBERT

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding=True  # Ensure consistent length
    )

    labels = []
    for i, label in enumerate(examples["labels"]):  # FIXED HERE
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Map:   0%|          | 0/252300 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/28034 [00:00<?, ? examples/s]

✅ Step 4: Load BioBERT + Token Classification Head

In [5]:
!pip uninstall -y transformers tokenizers huggingface_hub


[0m

In [6]:
!pip install transformers==4.35.2 tokenizers==0.15.0 datasets==2.14.6 seqeval


Collecting transformers==4.35.2
  Using cached transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
Collecting tokenizers==0.15.0
  Using cached tokenizers-0.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers==4.35.2)
  Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Using cached transformers-4.35.2-py3-none-any.whl (7.9 MB)
Using cached tokenizers-0.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
Downloading huggingface_hub-0.32.4-py3-none-any.whl (512 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.1/512.1 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface-hub, tokenizers, transformers
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformer

In [1]:
import transformers
print(transformers.__version__)


4.35.2


  _torch_pytree._register_pytree_node(


In [5]:
!rm -r ~/.cache/huggingface


In [4]:
!pip uninstall -y transformers tokenizers huggingface_hub
!rm -rf /usr/local/lib/python3.11/dist-packages/transformers*
!rm -rf /usr/local/lib/python3.11/dist-packages/tokenizers*
!rm -rf /usr/local/lib/python3.11/dist-packages/huggingface_hub*


Found existing installation: transformers 4.35.2
Uninstalling transformers-4.35.2:
  Successfully uninstalled transformers-4.35.2
Found existing installation: tokenizers 0.15.0
Uninstalling tokenizers-0.15.0:
  Successfully uninstalled tokenizers-0.15.0
Found existing installation: huggingface-hub 0.32.4
Uninstalling huggingface-hub-0.32.4:
  Successfully uninstalled huggingface-hub-0.32.4


In [3]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)


pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Step 5: Train the Model

In [34]:
!pip install -U transformers datasets seqeval

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [5]:
import transformers
print(transformers.__version__)

4.52.4


In [6]:
!pip install -U transformers==4.35.2 datasets==2.14.6 seqeval

Collecting transformers==4.35.2
  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/123.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.5/123.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.14.6
  Downloading datasets-2.14.6-py3-none-any.whl.metadata (19 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.35.2)
  Downloading tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets==2.14.6)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-2.14.6-py3-none-any.whl (4

In [1]:
!pip install -U tokenizers==0.15.0

Collecting tokenizers==0.15.0
  Downloading tokenizers-0.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tokenizers-0.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
    Uninstalling tokenizers-0.15.2:
      Successfully uninstalled tokenizers-0.15.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 4.1.0 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.35.2 which is incompatible.[0m[31m
[0mSuccessfully installed tokenizers-0.15.0


In [4]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer, padding=True)

In [5]:
args = TrainingArguments(
    output_dir="./biobert_clinicalner",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,  # (optional, warning is okay)
    data_collator=data_collator  # ✅ This is CRITICAL
)

trainer.train()

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabouhanezahra[0m ([33mabouhanezahra-uca[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


RuntimeError: The size of tensor a (1130) must match the size of tensor b (512) at non-singleton dimension 1

✅ Step 6: Evaluate the Model

In [None]:
from seqeval.metrics import classification_report

def align_predictions(predictions, label_ids):
    preds = predictions.argmax(-1)
    batch_size, seq_len = preds.shape
    out_preds, out_labels = [], []

    for i in range(batch_size):
        pred_seq, label_seq = [], []
        for j in range(seq_len):
            if label_ids[i][j] != -100:
                pred_seq.append(id2label[preds[i][j]])
                label_seq.append(id2label[label_ids[i][j]])
        out_preds.append(pred_seq)
        out_labels.append(label_seq)

    return out_preds, out_labels

predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
preds, refs = align_predictions(predictions, labels)

print(classification_report(refs, preds))


In [1]:
!pip uninstall -y transformers tokenizers huggingface_hub peft
!pip install transformers==4.35.2 tokenizers==0.15.0 datasets==2.14.6 seqeval peft

Found existing installation: transformers 4.35.2
Uninstalling transformers-4.35.2:
  Successfully uninstalled transformers-4.35.2
Found existing installation: tokenizers 0.15.0
Uninstalling tokenizers-0.15.0:
  Successfully uninstalled tokenizers-0.15.0
Found existing installation: huggingface-hub 0.32.4
Uninstalling huggingface-hub-0.32.4:
  Successfully uninstalled huggingface-hub-0.32.4
Found existing installation: peft 0.15.2
Uninstalling peft-0.15.2:
  Successfully uninstalled peft-0.15.2
Collecting transformers==4.35.2
  Using cached transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
Collecting tokenizers==0.15.0
  Using cached tokenizers-0.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting peft
  Using cached peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers==4.35.2)
  Using cached huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Using cached transformers-4.35.2-py3-none-

In [15]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens get a -100 label
            if word_idx is None:
                label_ids.append(-100)
            # Only label the first token of a multi-token word
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer, padding=True) # Add padding=True here

# The rest of the code for defining the model and training will go in the next cell.

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/252300 [00:00<?, ? examples/s]

KeyError: 'ner_tags'

In [14]:
model_checkpoint = "dmis-lab/biobert-v1.1"