<a href="https://colab.research.google.com/github/Arckalyss/Arckalyss/blob/main/Untitled8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers datasets accelerate safetensors

In [None]:
# Mount Google Drive at /content/drive. Other cells in this notebook read the
# dataset from, and write experiment artifacts to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Copy the dataset CSV from the mounted Drive into /content/, which is the
# location DATA_PATH points at when the file is read with pandas.
!cp /content/drive/MyDrive/diabetes_clinical_notes.csv /content/

In [None]:
import os
import random
import numpy as np
import pandas as pd
import torch

from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)

from torch.optim import AdamW

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    average_precision_score,
    confusion_matrix
)


from scipy.stats import ttest_rel, wilcoxon
from tqdm import tqdm

# Prefer the GPU when PyTorch reports one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Root folder on the mounted Drive under which trained models and predictions
# are stored; created up front so later saves cannot fail on a missing directory.
SAVE_ROOT = "/content/drive/MyDrive/bert_experiments"
os.makedirs(SAVE_ROOT, exist_ok=True)

Using device: cuda


In [None]:
def set_seed(seed=42):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), and additionally asks cuDNN for deterministic kernels so
    that repeated GPU runs with the same seed can produce the same results.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Seeding the generators is not sufficient on GPU: cuDNN may autotune and
    # pick non-deterministic kernels. Trade some speed for repeatability.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)

In [None]:
import pandas as pd

DATA_PATH = "/content/diabetes_clinical_notes.csv"

df = pd.read_csv(DATA_PATH)

# Fail fast with a readable message: schema or content problems would otherwise
# only show up later, during tokenization, fold splitting or the loss computation.
assert {"TEXT", "label"} <= set(df.columns), (
    f"{DATA_PATH} must contain the columns 'TEXT' and 'label'; found {list(df.columns)}"
)
assert df[["TEXT", "label"]].notna().all().all(), (
    "'TEXT' and 'label' must not contain missing values"
)
# The classifiers in this notebook are created with num_labels=2 and the
# positive-class probability is read from logit column 1, so labels must be 0/1.
assert set(df["label"].unique()) <= {0, 1}, "'label' must only contain 0 and 1"

texts = df["TEXT"]
labels = df["label"]

In [None]:
class TextDataset(Dataset):
    """Dataset that tokenizes all texts once and serves them as tensors.

    NOTE: a later cell in this notebook defines ``TextDataset`` again with
    ``max_length=128``; when the cells run in order, that later definition
    replaces this one.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        """Tokenize ``texts`` eagerly and keep ``labels`` as a plain list.

        Args:
            texts: Object exposing ``.tolist()`` (e.g. a pandas Series) with the
                input texts.
            labels: Object exposing ``.tolist()`` with one label per text.
            tokenizer: Callable invoked with the list of texts plus
                ``truncation=True``, ``padding=True`` and ``max_length``; its
                result must provide ``.items()`` yielding per-example sequences.
            max_length: Value forwarded to the tokenizer. Defaults to 256.
        """
        raw_texts = texts.tolist()
        self.encodings = tokenizer(
            raw_texts, truncation=True, padding=True, max_length=max_length
        )
        self.labels = labels.tolist()

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under ``"labels"``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

In [None]:
class TextDataset(Dataset):
    """Pre-tokenized text classification dataset (128-token default limit)."""

    def __init__(self, texts, labels, tokenizer, max_length=128):
        """Encode every text immediately and store the labels alongside.

        Args:
            texts: Object with a ``.tolist()`` method (e.g. a pandas Series)
                holding the input texts.
            labels: Object with a ``.tolist()`` method holding one label per text.
            tokenizer: Callable receiving the text list together with
                ``truncation=True``, ``padding=True`` and ``max_length``; the
                returned object must support ``.items()``.
            max_length: Forwarded to the tokenizer. Defaults to 128.
        """
        tokenizer_options = {
            "truncation": True,
            "padding": True,
            "max_length": max_length,
        }
        self.encodings = tokenizer(texts.tolist(), **tokenizer_options)
        self.labels = labels.tolist()

    def __len__(self):
        """Number of examples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the tensor dict for example ``idx``; the label is stored under ``"labels"``."""
        item = dict(
            (name, torch.tensor(column[idx]))
            for name, column in self.encodings.items()
        )
        item["labels"] = torch.tensor(self.labels[idx])
        return item

In [None]:
def train_model(model, dataloader, optimizer, scheduler):
    """Train ``model`` for one pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module put into training mode; called with the batch as keyword
            arguments and expected to return an object with a ``.loss`` attribute.
        dataloader: Sized iterable of dicts mapping argument names to tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, right after
            the optimizer.

    Returns:
        Sum of the per-batch loss values divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0

    for raw_batch in tqdm(dataloader, leave=False):
        # Move every tensor of the batch to the globally selected device.
        inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}

        loss = model(**inputs).loss
        loss.backward()

        optimizer.step()
        scheduler.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()

        running_loss += loss.item()

    return running_loss / len(dataloader)

In [None]:
def evaluate_model(model, dataloader):
    """Score ``model`` on ``dataloader`` and compute binary classification metrics.

    The model is switched to eval mode and run without gradient tracking. For
    each batch the predicted class is the argmax over the logits and the score
    is the softmax probability of class index 1.

    Args:
        model: Module called with the batch as keyword arguments; its output
            must expose ``.logits``.
        dataloader: Iterable of dicts of tensors that include a ``"labels"`` entry.

    Returns:
        Tuple ``(metrics, all_labels, all_preds)``: ``metrics`` is a dict with the
        keys ``accuracy``, ``macro_f1``, ``micro_f1``, ``precision``, ``recall``,
        ``roc_auc`` and ``pr_auc``; the other two are flat lists of the true
        labels and predicted classes in iteration order.
    """
    model.eval()

    pred_chunks, prob_chunks, label_chunks = [], [], []

    with torch.no_grad():
        for raw_batch in dataloader:
            inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            logits = model(**inputs).logits

            positive_probs = torch.softmax(logits, dim=1)[:, 1]
            predicted = torch.argmax(logits, dim=1)

            pred_chunks.append(predicted.cpu().numpy())
            prob_chunks.append(positive_probs.cpu().numpy())
            label_chunks.append(inputs["labels"].cpu().numpy())

    # Flatten the per-batch arrays into flat lists of scalars.
    all_preds = [value for chunk in pred_chunks for value in chunk]
    all_probs = [value for chunk in prob_chunks for value in chunk]
    all_labels = [value for chunk in label_chunks for value in chunk]

    metrics = {}
    # Threshold-based metrics use the hard predictions ...
    metrics["accuracy"] = accuracy_score(all_labels, all_preds)
    metrics["macro_f1"] = f1_score(all_labels, all_preds, average="macro")
    metrics["micro_f1"] = f1_score(all_labels, all_preds, average="micro")
    metrics["precision"] = precision_score(all_labels, all_preds)
    metrics["recall"] = recall_score(all_labels, all_preds)
    # ... while the ranking metrics use the class-1 probabilities.
    metrics["roc_auc"] = roc_auc_score(all_labels, all_probs)
    metrics["pr_auc"] = average_precision_score(all_labels, all_probs)

    return metrics, all_labels, all_preds

In [None]:
def cross_validate_model(model_name, texts, labels, n_splits=3, epochs=2, batch_size=8,
                         lr=2e-5, seed=42):
    """Fine-tune ``model_name`` with stratified k-fold CV and collect per-fold metrics.

    For every fold a fresh 2-label sequence-classification model is loaded,
    trained for ``epochs`` passes with AdamW and a linear schedule without
    warmup, and evaluated on the held-out split. The model, the tokenizer and a
    ``predictions.csv`` (columns ``true`` and ``pred``) are written to
    ``SAVE_ROOT/<last path component of model_name>/fold_<k>``.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        texts: Input texts; must support ``.iloc`` indexing (e.g. a pandas Series).
        labels: Class labels aligned with ``texts``; must support ``.iloc``.
        n_splits: Number of stratified folds.
        epochs: Training passes per fold.
        batch_size: Batch size for both the training and validation loaders.
        lr: Learning rate given to AdamW.
        seed: Base seed. It fixes the fold shuffling, and ``seed + fold index``
            re-seeds all RNGs at the start of each fold.

    Returns:
        List with one metrics dict per fold, in fold order.
    """
    import gc  # local import: only needed for the per-fold cleanup below

    print(f"\n========== {model_name} ==========")

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fold_metrics = []

    safe_model_name = model_name.split("/")[-1]
    model_save_path = os.path.join(SAVE_ROOT, safe_model_name)
    os.makedirs(model_save_path, exist_ok=True)

    # The tokenizer is identical for every fold, so load it once up front.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):

        print(f"\n--- Fold {fold+1} ---")

        # Re-seed per fold so classifier-head initialisation and batch shuffling
        # do not depend on how much randomness earlier folds or models consumed.
        set_seed(seed + fold)

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2
        ).to(device)

        train_dataset = TextDataset(texts.iloc[train_idx], labels.iloc[train_idx], tokenizer)
        val_dataset = TextDataset(texts.iloc[val_idx], labels.iloc[val_idx], tokenizer)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        optimizer = AdamW(model.parameters(), lr=lr)

        # The scheduler is stepped once per batch, so it needs the total batch count.
        total_steps = len(train_loader) * epochs

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

        for epoch in range(epochs):
            train_loss = train_model(model, train_loader, optimizer, scheduler)
            print(f"Epoch {epoch+1} | Loss: {train_loss:.4f}")

        metrics, y_true, y_pred = evaluate_model(model, val_loader)
        fold_metrics.append(metrics)

        print(metrics)

        # Save model
        fold_path = os.path.join(model_save_path, f"fold_{fold+1}")
        os.makedirs(fold_path, exist_ok=True)

        model.save_pretrained(fold_path, safe_serialization=True)
        tokenizer.save_pretrained(fold_path)

        # Save predictions
        pd.DataFrame({
            "true": y_true,
            "pred": y_pred
        }).to_csv(os.path.join(fold_path, "predictions.csv"), index=False)

        print("Saved to:", fold_path)

        # The optimizer (and, through it, the scheduler) still references the
        # model parameters and holds its own per-parameter state. Deleting only
        # `model` would keep that memory alive while the next fold's model is
        # loaded, so drop all three before asking CUDA to release cached blocks.
        del model, optimizer, scheduler
        gc.collect()
        torch.cuda.empty_cache()

    return fold_metrics

In [None]:
# Pretrained checkpoints to compare; each identifier is handed to
# cross_validate_model, which loads the tokenizer and model from it.
models = [
    "emilyalsentzer/Bio_ClinicalBERT",
    "dmis-lab/biobert-base-cased-v1.1",
    "distilbert-base-uncased"
]

# Maps each model identifier to the list of per-fold metric dicts it produced.
results = {}

# Models are cross-validated one after another on the same texts/labels, using
# cross_validate_model's default settings. `results` is filled incrementally,
# so entries for finished models survive if a later model fails.
for model_name in models:
    fold_results = cross_validate_model(model_name, texts, labels)
    results[model_name] = fold_results



--- Fold 1 ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: emilyalsentzer/Bio_ClinicalBERT
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.decoder.weight             | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Conside