In [1]:
!pip install tensorflow datasets transformers

Collecting datasets
  Using cached datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-20.0.0-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.12.4-cp311-cp311-win_amd64.whl.metadata (7.9 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Using cached aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.1.2 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Using cac

In [4]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer, pipeline
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


Check CUDA

In [2]:
import torch

print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))
else:
    print("CUDA not available — still CPU only")

print(torch.__version__)  # should be 2.6.0 or newer

     

CUDA Available: True
Device: NVIDIA GeForce GTX 1080
2.6.0+cu126


Load QA Dataset

In [5]:
bio_asq = load_dataset("nehal69/bioAsq_Extractive_QA", field ="data", split="train[:3000]")
bio_asq = bio_asq.train_test_split(test_size=0.2)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 3266 examples [00:00, 15650.08 examples/s]


In [6]:
bio_asq

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answers', 'context'],
        num_rows: 2400
    })
    test: Dataset({
        features: ['id', 'question', 'answers', 'context'],
        num_rows: 600
    })
})

Convert to Dataframe

In [8]:
import pandas as pd 
# Convert the dataset to a dictionary
data_dict = bio_asq["train"].to_dict()
# Create a DataFrame from the dictionary
df = pd.DataFrame.from_dict(data_dict)
df.head()

Unnamed: 0,id,question,answers,context
0,5324bdba9b2d7acc7e00001a_003,How is bladder wall thickness measured?,"[{'answer_start': 670, 'text': 'Ultrasound'}]",Ultrasound estimated bladder weight in asympto...
1,52ed795098d0239505000032_037,Is the transcriptional regulator BACH1 an acti...,"[{'answer_start': 112, 'text': 'repressor'}]",Heme regulates gene expression by triggering C...
2,5353aedb288f4dae47000006_015,Which is the transcript responsible for X-chro...,"[{'answer_start': 708, 'text': 'Xist'}]",Histone acetylation controls the inactive X ch...
3,553c9f96f32186855800000c_006,How are ultraconserved elements called when th...,"[{'answer_start': 488, 'text': 'gene regulator...",Genomic context analysis reveals dense interac...
4,55200c606b348bb82c000013_266,Which clotting factor is inhibited by betrixaban?,"[{'answer_start': 37, 'text': 'Xa'}]",Evaluation of the oral direct factor Xa inhibi...


List all questions and answers for inputs

In [9]:
questions = [q.strip() for q in df["question"]]
context = [q.strip() for q in df["context"]]

Initialize Inputs

In [11]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")

inputs = tokenizer(
        questions,
        context,
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

Test Dataset

In [None]:
offset_mapping = inputs.pop("offset_mapping")

start_positions = []
end_positions = []
answers = df['answers'] # Target label
for i, offset in enumerate(offset_mapping):
    answer = answers[i]
    start_char = answer[0]["answer_start"]
    end_char = answer[0]["answer_start"] + len(answer[0]["text"])
    sequence_ids = inputs.sequence_ids(i)

    # Find the start and end of the context
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1
    
    # If the answer is not fully inside the context, label it (0, 0)
    if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
        start_positions.append(0)
        end_positions.append(0)
    else:
        # Otherwise it's the start and end token positions
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

df["start_positions"] = start_positions
df["end_positions"] = end_positions

import pandas as pd
from datasets import Dataset
data = {'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'start_positions':start_positions,
        'end_positions': end_positions,
       }
df = pd.DataFrame(data)
df.to_csv('encoding_train.csv',index=False)
train = Dataset.from_pandas(df)

Initialize Test Dataset

In [19]:
import pandas as pd 
# Convert the dataset to a dictionary
data_dict = bio_asq["test"].to_dict()
# Create a DataFrame from the dictionary
df = pd.DataFrame.from_dict(data_dict)

questions = [q.strip() for q in df["question"]]
context = [q.strip() for q in df["context"]]
inputs = tokenizer(
        questions,
        context,
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

offset_mapping = inputs.pop("offset_mapping")

start_positions = []
end_positions = []
answers = df['answers']
for i, offset in enumerate(offset_mapping):
    answer = answers[i]
    start_char = answer[0]["answer_start"]
    end_char = answer[0]["answer_start"] + len(answer[0]["text"])
    sequence_ids = inputs.sequence_ids(i)

    # Find the start and end of the context
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # If the answer is not fully inside the context, label it (0, 0)
    if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
        start_positions.append(0)
        end_positions.append(0)
    else:
        # Otherwise it's the start and end token positions
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

df["start_positions"] = start_positions
df["end_positions"] = end_positions

data = {'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'start_positions':start_positions,
        'end_positions': end_positions,
       }
df = pd.DataFrame(data)
df.to_csv('encoding_test.csv',index=False)
test = Dataset.from_pandas(df)

Initialize Model and Training Args (QA Model is used, we shoudl use a encoder-decoder model)

In [14]:
!pip install evaluate

Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Using cached evaluate-0.4.3-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3


In [16]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import DefaultDataCollator
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

model = AutoModelForQuestionAnswering.from_pretrained("dmis-lab/biobert-v1.1")

data_collator = DefaultDataCollator()

# Your existing code to load model, data collator, etc.
training_args = TrainingArguments(
    output_dir="qa_model",
    eval_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to=[],
    logging_dir="./logs",
)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Compute Metrics

In [17]:
def compute_metrics(p):
    predictions, labels = p.predictions, p.label_ids

    # Convert tuples to numpy arrays
    predictions = np.array(predictions)
    labels = np.array(labels)

    # Assuming your model outputs logits and you want to get predictions
    predictions = np.argmax(predictions, axis=2)
    labels = labels

    # Flatten the predictions and labels
    predictions = predictions.flatten()
    labels = labels.flatten()

    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average='macro')
    recall = recall_score(labels, predictions, average='macro')
    f1 = f1_score(labels, predictions, average='macro')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

Initialize Trainer

In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,2.312181,0.395,0.2833,0.308488,0.267232
2,No log,1.748984,0.4525,0.381322,0.376882,0.347317
3,No log,1.651917,0.479167,0.406126,0.40467,0.373443


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=450, training_loss=2.503788791232639, metrics={'train_runtime': 547.903, 'train_samples_per_second': 13.141, 'train_steps_per_second': 0.821, 'total_flos': 1411002486374400.0, 'train_loss': 2.503788791232639, 'epoch': 3.0})

Evaluate Results

In [23]:
results = trainer.evaluate()
results

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.6519168615341187,
 'eval_accuracy': 0.4791666666666667,
 'eval_precision': 0.4061263742274839,
 'eval_recall': 0.40467048536355466,
 'eval_f1': 0.3734429931481171,
 'eval_runtime': 16.3682,
 'eval_samples_per_second': 36.656,
 'eval_steps_per_second': 2.322,
 'epoch': 3.0}