In [2]:
pip install transformers datasets scikit-learn


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd


df = pd.read_csv("alz_disease_pairs_cleaned.csv")

In [4]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["relation_label"])

# Force keys to be Python ints
label2id = {str(label): int(idx) for label, idx in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))}
id2label = {int(idx): str(label) for label, idx in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))}


In [5]:
print(df["relation_label"].value_counts())


relation_label
ambiguous          89
associated_with    63
equivalent         41
unrelated           8
Name: count, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split

# First, split into train (70%) and temp (30%)
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df["relation_label"])

# Then split temp into dev (10%) and test (20%) — so dev is 1/3 of temp, test is 2/3
dev_df, test_df = train_test_split(temp_df, test_size=2/3, random_state=42, stratify=temp_df["relation_label"])


In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

# Convert to Hugging Face datasets
train_ds = Dataset.from_pandas(train_df[["input_text", "label_id"]])
dev_ds = Dataset.from_pandas(dev_df[["input_text", "label_id"]])
test_ds = Dataset.from_pandas(test_df[["input_text", "label_id"]])

In [8]:
# Load BioBERT tokenizer and add special entity tokens
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
special_tokens = {"additional_special_tokens": ["[E1]", "[/E1]", "[E2]", "[/E2]"]}
tokenizer.add_special_tokens(special_tokens)

4

In [18]:
train_ds = Dataset.from_pandas(train_df[["input_text", "label_id"]].rename(columns={"label_id": "labels"}))
dev_ds   = Dataset.from_pandas(dev_df[["input_text", "label_id"]].rename(columns={"label_id": "labels"}))
test_ds  = Dataset.from_pandas(test_df[["input_text", "label_id"]].rename(columns={"label_id": "labels"}))

In [20]:
def tokenize(batch):
    return tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=128)

train_ds = train_ds.map(tokenize, batched=True)
dev_ds   = dev_ds.map(tokenize, batched=True)
test_ds  = test_ds.map(tokenize, batched=True)


Map:   0%|          | 0/140 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/41 [00:00<?, ? examples/s]

## Load Model (BioBERT)
- Uses BioBERT with special entity markers

- Trains for 3 epochs with stratified splits

- Tracks accuracy, precision, recall, and F1

- Evaluates final model on a held-out test set

In [16]:
# Load BioBERT model for classification
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)
model.resize_token_embeddings(len(tokenizer))  # Resize for new special tokens

# Define training arguments
training_args = TrainingArguments(
    output_dir="./alz_relation_biobert_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
import torch
print("CUDA available?", torch.cuda.is_available())
print("Using device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))


CUDA available? True
Using device: cuda


In [22]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Define evaluation metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Final test set evaluation
final_metrics = trainer.evaluate(eval_dataset=test_ds)
print("📊 Final Test Metrics:", final_metrics)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.120053,0.45,0.118421,0.25,0.160714
2,No log,1.09206,0.65,0.35,0.416667,0.369318
3,No log,0.988435,0.65,0.35,0.416667,0.369318


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


📊 Final Test Metrics: {'eval_loss': 0.9484491944313049, 'eval_accuracy': 0.6341463414634146, 'eval_precision': 0.31456043956043955, 'eval_recall': 0.40384615384615385, 'eval_f1': 0.3494983277591973, 'eval_runtime': 0.3552, 'eval_samples_per_second': 115.442, 'eval_steps_per_second': 8.447, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
preds = trainer.predict(test_ds)

from sklearn.metrics import classification_report
import numpy as np

y_true = preds.label_ids
y_pred = np.argmax(preds.predictions, axis=1)

target_names = list(label2id.keys())
print(classification_report(y_true, y_pred, target_names=target_names))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                 precision    recall  f1-score   support

      ambiguous       0.64      1.00      0.78        18
associated_with       0.62      0.62      0.62        13
     equivalent       0.00      0.00      0.00         9
      unrelated       0.00      0.00      0.00         1

       accuracy                           0.63        41
      macro avg       0.31      0.40      0.35        41
   weighted avg       0.48      0.63      0.54        41



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# PubMedBERT

In [24]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Step 1: Load and clean dataset
df = pd.read_csv("alz_disease_pairs_cleaned.csv").dropna(subset=["input_text", "relation_label"])

# Step 2: Encode relation labels
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["relation_label"])

# Step 3: Stratified split into train (70%), dev (10%), test (20%)
train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df["relation_label"], random_state=42)
dev_df, test_df = train_test_split(temp_df, test_size=2/3, stratify=temp_df["relation_label"], random_state=42)

# Step 4: Convert to Hugging Face Datasets and rename label column to 'labels'
train_ds = Dataset.from_pandas(train_df[["input_text", "label_id"]].rename(columns={"label_id": "labels"}))
dev_ds = Dataset.from_pandas(dev_df[["input_text", "label_id"]].rename(columns={"label_id": "labels"}))
test_ds = Dataset.from_pandas(test_df[["input_text", "label_id"]].rename(columns={"label_id": "labels"}))

# Step 5: Load tokenizer and model
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 6: Tokenization function
def tokenize(batch):
    return tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=128)

train_ds = train_ds.map(tokenize, batched=True)
dev_ds = dev_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

# Step 7: Load classification model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_encoder.classes_),
    id2label={i: label for i, label in enumerate(label_encoder.classes_)},
    label2id={label: i for i, label in enumerate(label_encoder.classes_)}
)

# Step 8: Define training arguments
training_args = TrainingArguments(
    output_dir="./pubmedbert_relation_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

# Step 9: Metric computation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Step 10: Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Step 11: Train
trainer.train()

# Step 12: Final evaluation on test set
metrics = trainer.evaluate(test_ds)
print("📊 Test Performance:", metrics)


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/41 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.033479,0.6,0.309524,0.388889,0.34058
2,No log,1.046388,0.55,0.277473,0.361111,0.312937
3,No log,1.106625,0.6,0.53869,0.423611,0.420513


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


📊 Test Performance: {'eval_loss': 0.5212944746017456, 'eval_accuracy': 0.8536585365853658, 'eval_precision': 0.6875, 'eval_recall': 0.6282051282051282, 'eval_f1': 0.6434523809523809, 'eval_runtime': 0.3773, 'eval_samples_per_second': 108.664, 'eval_steps_per_second': 7.951, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
import numpy as np
from sklearn.metrics import classification_report

# Step 13: Predict on test set
predictions = trainer.predict(test_ds)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

# Step 14: Per-class metrics
target_names = label_encoder.classes_  # original class names
report = classification_report(y_true, y_pred, target_names=target_names)
print("📊 Per-Class Classification Report:\n")
print(report)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


📊 Per-Class Classification Report:

                 precision    recall  f1-score   support

      ambiguous       0.75      1.00      0.86        18
associated_with       1.00      0.85      0.92        13
     equivalent       1.00      0.67      0.80         9
      unrelated       0.00      0.00      0.00         1

       accuracy                           0.85        41
      macro avg       0.69      0.63      0.64        41
   weighted avg       0.87      0.85      0.84        41



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
