In [None]:
!pip download transformers==4.41.0 
!pip download tokenizers==0.11.0
!pip download torch==2.4.0

In [2]:
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")

tokenizers.__version__: 0.21.1
transformers.__version__: 4.51.3


In [3]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, get_linear_schedule_with_warmup
from datasets import Dataset
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

# 資料讀取與處理
df = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/patient_notes.csv')
texts = df['pn_history'].dropna().unique().tolist()
dataset = Dataset.from_dict({'text': texts})

model_path = "/kaggle/input/deberta-v3-base/deberta-v3-base/"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
model = AutoModelForMaskedLM.from_pretrained(model_path)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

tokenized_dataset = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
train_dataloader = DataLoader(tokenized_dataset, batch_size=4, shuffle=True, collate_fn=data_collator)

# 訓練設定
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
gradient_accumulation_steps = 4
total_steps = len(train_dataloader) // gradient_accumulation_steps * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=total_steps)

scaler = torch.cuda.amp.GradScaler()
model.train()

global_step = 0
for epoch in range(num_epochs):
    loop = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")
    for step, batch in enumerate(loop):
        batch = {k: v.to(device) for k, v in batch.items()}

        with torch.cuda.amp.autocast():
            outputs = model(**batch)
            loss = outputs.loss / gradient_accumulation_steps

        scaler.scale(loss).backward()

        if (step + 1) % gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()
            global_step += 1

        loop.set_postfix(loss=loss.item())

# 儲存模型
model.save_pretrained("/kaggle/working/taptdb", safe_serialization=False)
tokenizer.save_pretrained("/kaggle/working/taptdb", legacy_format=False)


2025-06-01 07:06:26.683913: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748761586.847880      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748761586.895987      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Some weights of DebertaV2ForMaskedLM were not initialized from the model checkpoint at /kaggle/input/deberta-v3-base/deberta-v3-base/ and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-

Map (num_proc=4):   0%|          | 0/42146 [00:00<?, ? examples/s]

  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Epoch 1: 100%|██████████| 10537/10537 [34:50<00:00,  5.04it/s, loss=0.485]
Epoch 2: 100%|██████████| 10537/10537 [34:48<00:00,  5.05it/s, loss=0.221]
Epoch 3: 100%|██████████| 10537/10537 [34:46<00:00,  5.05it/s, loss=0.188]


('/kaggle/working/tadtdb/tokenizer_config.json',
 '/kaggle/working/tadtdb/special_tokens_map.json',
 '/kaggle/working/tadtdb/tokenizer.json')