In [1]:
!pip install transformers datasets seqeval -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import seqeval.metrics
import warnings

warnings.filterwarnings('ignore')

print("--- Step 0: All libraries installed successfully! ---")

--- Step 0: All libraries installed successfully! ---


In [3]:
# -----------------------------------------------------------------
# STEP 1: LOAD DATA
# -----------------------------------------------------------------
# We will download the pre-processed IOB files directly from the GitHub repo.
!wget https://raw.githubusercontent.com/cambridgeltl/MTL-Bioinformatics-2016/master/data/BC5CDR-IOB/train.tsv -O train.tsv -q
!wget https://raw.githubusercontent.com/cambridgeltl/MTL-Bioinformatics-2016/master/data/BC5CDR-IOB/devel.tsv -O devel.tsv -q
!wget https://raw.githubusercontent.com/cambridgeltl/MTL-Bioinformatics-2016/master/data/BC5CDR-IOB/test.tsv -O test.tsv -q

print("\n--- Step 1: All data files downloaded successfully! ---")
print("Files: train.tsv, devel.tsv, test.tsv")


--- Step 1: All data files downloaded successfully! ---
Files: train.tsv, devel.tsv, test.tsv


In [2]:
# STEP 2: PARSE & LOAD DATA INTO 'datasets' FORMAT
# -----------------------------------------------------------------
# We need to convert the .tsv files (word \t label) into a
# format that the 'datasets' library understands (a list of sentences,
# where each sentence is a dict of 'tokens' and 'ner_tags').

def read_ner_file(file_path):
    """Reads a .tsv file and converts it to the 'datasets' format."""
    sentences = []
    tokens = []
    ner_tags = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # This is a blank line, which signifies the end of a sentence.
                if tokens:
                    sentences.append({'tokens': tokens, 'ner_tags': ner_tags})
                    tokens = []
                    ner_tags = []
            else:
                # This is a word and its tag
                try:
                    word, label = line.split('\t')
                    tokens.append(word)
                    ner_tags.append(label)
                except ValueError:
                    # Handle potential malformed lines
                    pass

        # Add the last sentence if the file doesn't end with a blank line
        if tokens:
            sentences.append({'tokens': tokens, 'ner_tags': ner_tags})

    return pd.DataFrame(sentences)

# Read the files into pandas DataFrames
df_train = read_ner_file('train.tsv')
df_devel = read_ner_file('devel.tsv')
df_test = read_ner_file('test.tsv')

# Convert the DataFrames into a Hugging Face DatasetDict
raw_datasets = DatasetDict({
    'train': Dataset.from_pandas(df_train),
    'validation': Dataset.from_pandas(df_devel),
    'test': Dataset.from_pandas(df_test)
})

print(f"\n--- Step 2: Data loaded into 'datasets' format ---")
print(raw_datasets)
print("\nExample from training set:")
print(raw_datasets['train'][0])


--- Step 2: Data loaded into 'datasets' format ---
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 4560
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 4581
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 4797
    })
})

Example from training set:
{'tokens': ['Selegiline', '-', 'induced', 'postural', 'hypotension', 'in', 'Parkinson', "'", 's', 'disease', ':', 'a', 'longitudinal', 'study', 'on', 'the', 'effects', 'of', 'drug', 'withdrawal', '.'], 'ner_tags': ['B-Chemical', 'O', 'O', 'B-Disease', 'I-Disease', 'O', 'B-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}


In [3]:
# STEP 3: DEFINE MODEL, TOKENIZER & LABEL MAPPINGS
# -----------------------------------------------------------------
model_checkpoint = "dmis-lab/biobert-base-cased-v1.1"

# We get the label list from our .tsv files
label_list = ['O', 'B-Chemical', 'I-Chemical', 'B-Disease', 'I-Disease']
num_labels = len(label_list)

# Create the label <-> ID mappings
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for i, label in enumerate(label_list)}

# Load the BioBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

print(f"\n--- Step 3: Model, Tokenizer, and Labels defined ---")
print(f"Model Checkpoint: {model_checkpoint}")
print(f"Label to ID Map: {label_to_id}")


--- Step 3: Model, Tokenizer, and Labels defined ---
Model Checkpoint: dmis-lab/biobert-base-cased-v1.1
Label to ID Map: {'O': 0, 'B-Chemical': 1, 'I-Chemical': 2, 'B-Disease': 3, 'I-Disease': 4}


In [4]:
# -----------------------------------------------------------------
# STEP 4: PREPROCESSING & TOKENIZATION (THE "GOTCHA")
# -----------------------------------------------------------------
# This is the most important function. It tokenizes words into sub-words
# and aligns the labels correctly (using the -100 "ignore" index).

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens like [CLS] and [SEP] get -100
            if word_idx is None:
                label_ids.append(-100)
            # If it's the first sub-word of a new word...
            elif word_idx != previous_word_idx:
                # ...get the original label
                label_ids.append(label_to_id[label[word_idx]])
            else:
                # ...otherwise (it's a subsequent sub-word), set to -100
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply this function to all splits of our dataset
tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True)

print(f"\n--- Step 4: Preprocessing complete ---")
print("Example of tokenized and aligned labels:")
print("Original Tokens:", raw_datasets['train'][0]['tokens'])
print("New Tokens:", tokenizer.convert_ids_to_tokens(tokenized_datasets['train'][0]['input_ids']))
print("New Labels:", tokenized_datasets['train'][0]['labels'])

Map:   0%|          | 0/4560 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/4581 [00:00<?, ? examples/s]

Map:   0%|          | 0/4797 [00:00<?, ? examples/s]


--- Step 4: Preprocessing complete ---
Example of tokenized and aligned labels:
Original Tokens: ['Selegiline', '-', 'induced', 'postural', 'hypotension', 'in', 'Parkinson', "'", 's', 'disease', ':', 'a', 'longitudinal', 'study', 'on', 'the', 'effects', 'of', 'drug', 'withdrawal', '.']
New Tokens: ['[CLS]', 'se', '##leg', '##ili', '##ne', '-', 'induced', 'post', '##ural', 'h', '##y', '##pot', '##ens', '##ion', 'in', 'park', '##ins', '##on', "'", 's', 'disease', ':', 'a', 'longitudinal', 'study', 'on', 'the', 'effects', 'of', 'drug', 'withdrawal', '.', '[SEP]']
New Labels: [-100, 1, -100, -100, -100, 0, 0, 3, -100, 4, -100, -100, -100, -100, 0, 3, -100, -100, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [6]:
# FORCE UPGRADE CELL
# Run this cell once to make sure you have the latest libraries
!pip install --upgrade transformers datasets seqeval

print("\n--- Libraries have been upgraded! ---")
print("Please RE-RUN the cell for STEP 5 below.")


--- Libraries have been upgraded! ---
Please RE-RUN the cell for STEP 5 below.


In [13]:
# -----------------------------------------------------------------
# STEP 5: MODEL FINE-TUNING (Final Fix: Disabling wandb)
# -----------------------------------------------------------------
# This block adds 'report_to="none"' to disable the wandb API key prompt.

# 1. Load the pre-trained BioBERT model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id
)

# 2. Define Training Arguments
model_name = "biobert-ner-bc5cdr"

# --- THE FIX (Calculating steps) ---
batch_size = 8
train_dataset_size = len(tokenized_datasets["train"])
steps_per_epoch = (train_dataset_size // batch_size) + (1 if train_dataset_size % batch_size > 0 else 0)

print(f"--- Applying Final Code Fix ---")
print(f"Calculated steps per epoch: {steps_per_epoch}")
print("Disabling 'wandb' reporting.")
# --- END OF FIX ---


training_args = TrainingArguments(
    output_dir=model_name,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,

    do_train=True,
    do_eval=True,

    eval_steps = steps_per_epoch,
    save_steps = steps_per_epoch,

    load_best_model_at_end=False,

    logging_steps=100,
    fp16=True,

    # --- HERE IS THE NEW LINE ---
    # This tells the Trainer NOT to log to Weights & Biases
    report_to="none",
    # --- END OF NEW LINE ---
)

# 3. Define the Data Collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 4. Define our Evaluation Metric (F1-Score)
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id_to_label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id_to_label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    f1 = seqeval.metrics.f1_score(true_labels, true_predictions, average="macro")

    return {
        "f1": f1,
    }

# 5. Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print(f"\n--- Step 5: Trainer initialized. Starting training... ---")
print("This will take 10-20 minutes. Please wait.")

# 6. TRAIN THE MODEL!
trainer.train()

print(f"\n--- Training Complete! ---")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Applying Final Code Fix ---
Calculated steps per epoch: 570
Disabling 'wandb' reporting.

--- Step 5: Trainer initialized. Starting training... ---
This will take 10-20 minutes. Please wait.


Step,Training Loss
100,0.3938
200,0.1216
300,0.0935
400,0.0821
500,0.0889
600,0.0625
700,0.0455
800,0.044
900,0.0433
1000,0.0439



--- Training Complete! ---


In [10]:
pip freeze cur.txt

absl-py==1.4.0
absolufy-imports==0.3.1
accelerate==1.11.0
aiofiles==24.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.13.1
aiosignal==1.4.0
alabaster==1.0.0
albucore==0.0.24
albumentations==2.0.8
ale-py==0.11.2
alembic==1.17.0
altair==5.5.0
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.11.0
anywidget==0.9.18
argon2-cffi==25.1.0
argon2-cffi-bindings==25.1.0
array_record==0.8.1
arrow==1.4.0
arviz==0.22.0
astropy==7.1.1
astropy-iers-data==0.2025.10.20.0.39.8
astunparse==1.6.3
atpublic==5.1
attrs==25.4.0
audioread==3.0.1
Authlib==1.6.5
autograd==1.8.0
babel==2.17.0
backcall==0.2.0
beartype==0.22.2
beautifulsoup4==4.13.5
betterproto==2.0.0b6
bigframes==2.26.0
bigquery-magics==0.10.3
bleach==6.2.0
blinker==1.9.0
blis==1.3.0
blobfile==3.1.0
blosc2==3.10.2
bokeh==3.7.3
Bottleneck==1.4.2
bqplot==0.12.45
branca==0.8.2
Brotli==1.1.0
build==1.3.0
CacheControl==0.14.3
cachetools==5.5.2
catalogue==2.0.10
certifi==2025.10.5
cffi==2.0.0
chardet==5.2.0
charset-normalizer==3.4.4
chex==0.1.90
cla

In [14]:
# STEP 6: EVALUATION
# -----------------------------------------------------------------
# This step runs the trained model against the unseen 'test.tsv'
# file to see how well it performs in the real world.
# The 'eval_f1' score here is the most important number in your project.

print(f"\n--- Step 6: Evaluating on the test set... ---")
print("This runs your trained model against the unseen 'test.tsv' file.")

# The 'trainer.evaluate()' function will use our fine-tuned model
# on the test dataset.
test_results = trainer.evaluate(tokenized_datasets["test"])

print(f"\n--- Test Set Evaluation Results ---")
print(f"  Test F1-Score: {test_results['eval_f1']:.4f}")
print(f"  Test Loss: {test_results['eval_loss']:.4f}")

print("\nThis F1-Score is the key metric for your project.")
print("A score over 0.85 is considered state-of-the-art!")


--- Step 6: Evaluating on the test set... ---
This runs your trained model against the unseen 'test.tsv' file.



--- Test Set Evaluation Results ---
  Test F1-Score: 0.8723
  Test Loss: 0.0990

This F1-Score is the key metric for your project.
A score over 0.85 is considered state-of-the-art!


In [15]:
# STEP 7: INFERENCE (HOW TO USE THE MODEL)
# -----------------------------------------------------------------
# This is the demo part. We'll use the 'pipeline' function
# to make it easy to test our model on new sentences.
from transformers import pipeline

print(f"\n--- Step 7: Inference Demo (Using the Model) ---")

# 'trainer.model' refers to the model we just finished training
# 'device=0' tells it to use the GPU for fast predictions
ner_pipeline = pipeline(
    "ner",
    model=trainer.model,
    tokenizer=tokenizer,
    device=0
)

# --- Try any sentence you want here! ---
text = "The patient was prescribed Selegiline for Parkinson's disease, but developed severe hypotension."

print(f"\nInput Text: {text}")

# Run the text through our trained pipeline
results = ner_pipeline(text)

# The default pipeline output is messy (it splits sub-words).
# This helper function cleans it up and groups the entities.
def post_process_results(results):
    entities = []
    current_entity = None

    for res in results:
        entity_tag = res['entity'].split('-')[-1] # Get 'Chemical' or 'Disease'

        if res['entity'].startswith('B-'):
            # If we are starting a new entity, save the old one
            if current_entity:
                entities.append(current_entity)
            current_entity = {
                "entity": entity_tag,
                "word": res['word'],
                "score": res['score']
            }
        elif res['entity'].startswith('I-') and current_entity:
            # This is an "Inside" token, so we append it
            if res['word'].startswith('##'):
                # Append a sub-word (e.g., '##tension' to 'hypo')
                current_entity['word'] += res['word'].replace('##', '')
            else:
                # Append a new word (e.g., '2' to 'type')
                current_entity['word'] += ' ' + res['word']
        else:
            # This is an 'O' token, so we save the previous entity (if any)
            if current_entity:
                entities.append(current_entity)
            current_entity = None

    # Add the very last entity
    if current_entity:
        entities.append(current_entity)

    return entities

# Process and print the clean results
clean_results = post_process_results(results)
print(f"\nClean Extracted Entities:")
for entity in clean_results:
    print(f"  - Entity: {entity['entity']}")
    print(f"    Word: {entity['word']}")
    print(f"    Score: {entity['score']:.4f}")

Device set to use cuda:0



--- Step 7: Inference Demo (Using the Model) ---

Input Text: The patient was prescribed Selegiline for Parkinson's disease, but developed severe hypotension.

Clean Extracted Entities:
  - Entity: Chemical
    Word: selegiline
    Score: 0.9992
  - Entity: Disease
    Word: parkinson ' s disease
    Score: 0.9980
  - Entity: Disease
    Word: hypotension
    Score: 0.9990


In [16]:
# -----------------------------------------------------------------
# STEP 8: CREATE INTERACTIVE WEB DEMO WITH GRADIO
# -----------------------------------------------------------------
!pip install gradio -q

import gradio as gr
from transformers import pipeline

print("\n--- Step 8: Building the Gradio Demo ---")

# 1. Load your pipeline (it's already in memory, but we redefine it for this cell)
ner_pipeline = pipeline(
    "ner",
    model=trainer.model,
    tokenizer=tokenizer,
    device=0
)

# 2. Define the helper function (must be in the same cell)
def post_process_results(results):
    entities = []
    current_entity = None
    for res in results:
        entity_tag = res['entity'].split('-')[-1]
        if res['entity'].startswith('B-'):
            if current_entity: entities.append(current_entity)
            current_entity = {"entity": entity_tag, "word": res['word'], "score": res['score']}
        elif res['entity'].startswith('I-') and current_entity:
            if res['word'].startswith('##'):
                current_entity['word'] += res['word'].replace('##', '')
            else:
                current_entity['word'] += ' ' + res['word']
        else:
            if current_entity: entities.append(current_entity)
            current_entity = None
    if current_entity: entities.append(current_entity)
    return entities

# 3. Define the main "predict" function that Gradio will call
def analyze_text(text):
    """
    This function takes raw text, runs the model,
    and returns a formatted HTML string for the demo.
    """
    # Run the model
    raw_results = ner_pipeline(text)
    # Clean up the results
    entities = post_process_results(raw_results)

    # Format the output for Gradio
    if not entities:
        return "(No entities found)"

    # We'll return a simple list of strings
    output_list = []
    for entity in entities:
        score_percent = f"{(entity['score'] * 100):.2f}%"
        output_list.append(
            f"Entity: {entity['entity']} | Word: {entity['word']} | Confidence: {score_percent}"
        )
    return "\n".join(output_list)

# 4. Launch the Gradio Interface
iface = gr.Interface(
    fn=analyze_text,
    inputs=gr.Textbox(lines=5, label="Enter Clinical Text", placeholder="e.g., 'Paracetamol ingestion led to hepatic failure...'"),
    outputs=gr.Textbox(label="Extracted Entities"),
    title="CREA: Clinical Record Entity Analyzer (BioBERT)",
    description="This is a demo of a BioBERT model fine-tuned on the BC5CDR dataset to extract Chemical and Disease entities. (F1-Score: 0.8723)",
    article="Enter a sentence and click 'Submit'.",
    allow_flagging="never"
)

# This will create a local link AND a public link (e.g., "Running on public URL: ...")
# The public link is shareable!
iface.launch(debug=True, share=True)

Device set to use cuda:0



--- Step 8: Building the Gradio Demo ---
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://3c44b23647dc01037e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://3c44b23647dc01037e.gradio.live




In [17]:
# -----------------------------------------------------------------
# STEP 9: SAVE THE MODEL & TOKENIZER PERMANENTLY
# -----------------------------------------------------------------

# Define the directory name where the model is saved
model_directory = "biobert-ner-bc5cdr"

# 1. Save the tokenizer
# This will save 'vocab.txt', 'tokenizer_config.json', etc.
# inside the same directory as your model.
tokenizer.save_pretrained(model_directory)
print(f"Tokenizer files saved to '{model_directory}'")

Tokenizer files saved to 'biobert-ner-bc5cdr'


In [18]:
# 2. Zip the entire model directory
zip_filename = "my_ner_model.zip"
!zip -r {zip_filename} {model_directory}

print(f"Model and tokenizer zipped into '{zip_filename}'")

  adding: biobert-ner-bc5cdr/ (stored 0%)
  adding: biobert-ner-bc5cdr/checkpoint-570/ (stored 0%)
  adding: biobert-ner-bc5cdr/checkpoint-570/special_tokens_map.json (deflated 42%)
  adding: biobert-ner-bc5cdr/checkpoint-570/vocab.txt (deflated 49%)
  adding: biobert-ner-bc5cdr/checkpoint-570/model.safetensors (deflated 7%)
  adding: biobert-ner-bc5cdr/checkpoint-570/trainer_state.json (deflated 64%)
  adding: biobert-ner-bc5cdr/checkpoint-570/scheduler.pt (deflated 61%)
  adding: biobert-ner-bc5cdr/checkpoint-570/tokenizer.json (deflated 70%)
  adding: biobert-ner-bc5cdr/checkpoint-570/rng_state.pth (deflated 26%)
  adding: biobert-ner-bc5cdr/checkpoint-570/optimizer.pt (deflated 23%)
  adding: biobert-ner-bc5cdr/checkpoint-570/tokenizer_config.json (deflated 74%)
  adding: biobert-ner-bc5cdr/checkpoint-570/scaler.pt (deflated 64%)
  adding: biobert-ner-bc5cdr/checkpoint-570/training_args.bin (deflated 53%)
  adding: biobert-ner-bc5cdr/checkpoint-570/config.json (deflated 53%)
  addi

In [20]:
from google.colab import drive
import os

print("Mounting Google Drive...")
# 1. This will open a pop-up to ask for your permission.
#    You will need to click a link, sign in, and paste an authorization code.
drive.mount('/content/drive')

print("\nDrive mounted successfully!")

# Define the source and destination paths
source_file = "my_ner_model.zip"
destination_path = f"/content/drive/MyDrive/{source_file}"

# 2. Copy the file to your Google Drive's main "My Drive" folder
if os.path.exists(source_file):
    print(f"Copying '{source_file}' to your Google Drive...")
    !cp {source_file} {destination_path}
    print(f"\n✅ Successfully copied to: {destination_path}")

    # 3. Verify the file is in your Drive
    print("\nVerifying file in Google Drive:")
    !ls -lh {destination_path}
else:
    print(f"ERROR: The file '{source_file}' was not found. Please make sure you ran the zip command first.")

Mounting Google Drive...
Mounted at /content/drive

Drive mounted successfully!
Copying 'my_ner_model.zip' to your Google Drive...

✅ Successfully copied to: /content/drive/MyDrive/my_ner_model.zip

Verifying file in Google Drive:
-rw------- 1 root root 3.0G Oct 22 20:24 /content/drive/MyDrive/my_ner_model.zip


In [22]:
import os
from google.colab import drive
from transformers import pipeline
import glob # We'll use this to find the correct folder

# -----------------------------------------------------------------
# STEP 1: INSTALL LIBRARIES & MOUNT DRIVE
# -----------------------------------------------------------------
!pip install transformers -q

if not os.path.exists('/content/drive/MyDrive'):
    print("Mounting Google Drive...")
    drive.mount('/content/drive')
else:
    print("Google Drive is already mounted.")

# -----------------------------------------------------------------
# STEP 2: DEFINE PATHS & UNZIP YOUR MODEL
# -----------------------------------------------------------------
zip_file_path = "/content/drive/MyDrive/my_ner_model.zip"
unzip_base_dir = "biobert-ner-bc5cdr"

!rm -rf {unzip_base_dir} # Clean up old versions
!unzip -q {zip_file_path} # Unzip the model
print(f"✅ Model unzipped to '{unzip_base_dir}'")

# --- START OF FIX ---
# Find the correct path to the model
# The model is likely in the last checkpoint folder
try:
    # List all checkpoint folders and get the one with the highest number
    checkpoint_folders = glob.glob(f"{unzip_base_dir}/checkpoint-*")
    # Get the last one by sorting them numerically
    last_checkpoint = sorted(checkpoint_folders, key=lambda x: int(x.split('-')[-1]))[-1]

    local_model_path = last_checkpoint
    print(f"Found model checkpoint at: {local_model_path}")

except IndexError:
    # If no checkpoint folders, the model *might* be in the base.
    # But based on your error, it's in a checkpoint.
    print(f"WARNING: No checkpoint folder found. Trying base directory '{unzip_base_dir}'...")
    local_model_path = unzip_base_dir
# --- END OF FIX ---


# -----------------------------------------------------------------
# STEP 3: LOAD PIPELINE & RUN INFERENCE
# -----------------------------------------------------------------

# Helper function to clean the output
def post_process_results(results):
    entities = []
    current_entity = None
    for res in results:
        entity_tag = res['entity'].split('-')[-1]
        if res['entity'].startswith('B-'):
            if current_entity: entities.append(current_entity)
            current_entity = {"entity": entity_tag, "word": res['word'], "score": res['score']}
        elif res['entity'].startswith('I-') and current_entity:
            if res['word'].startswith('##'):
                current_entity['word'] += res['word'].replace('##', '')
            else:
                current_entity['word'] += ' ' + res['word']
        else:
            if current_entity: entities.append(current_entity)
            current_entity = None
    if current_entity: entities.append(current_entity)
    return entities

# Check if the model directory exists before loading
if os.path.exists(local_model_path):
    print(f"\nLoading model from corrected path: '{local_model_path}'...")

    # Load the pipeline from the CORRECT subfolder
    ner_pipeline = pipeline(
        "ner",
        model=local_model_path,
        tokenizer=local_model_path,
        device=0 # Use 0 for GPU, -1 for CPU
    )

    print("🚀 Pipeline loaded successfully!")

    # --- Run Inference ---
    text = "A case of metabolic acidosis and acute renal failure following paracetamol ingestion is presented."

    print(f"\nInput Text: {text}")
    raw_results = ner_pipeline(text)
    clean_results = post_process_results(raw_results)

    print(f"\nClean Extracted Entities:")
    if not clean_results:
        print("  (No entities found)")
    for entity in clean_results:
        print(f"  - Entity: {entity['entity']}")
        print(f"    Word: {entity['word']}")
        print(f"    Score: {entity['score']:.4f}")

else:
    print(f"ERROR: Model directory '{local_model_path}' not found.")
    print("Unzipping or path finding may have failed.")

Google Drive is already mounted.
✅ Model unzipped to 'biobert-ner-bc5cdr'
Found model checkpoint at: biobert-ner-bc5cdr/checkpoint-1710

Loading model from corrected path: 'biobert-ner-bc5cdr/checkpoint-1710'...


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


🚀 Pipeline loaded successfully!

Input Text: A case of metabolic acidosis and acute renal failure following paracetamol ingestion is presented.

Clean Extracted Entities:
  - Entity: Disease
    Word: metabolic acidosis
    Score: 0.9960
  - Entity: Disease
    Word: acute renal failure
    Score: 0.9977
  - Entity: Chemical
    Word: paracetamol
    Score: 0.9994
