<a href="https://colab.research.google.com/github/BDH-teacher/Deep_Learning_Audit_code/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install transformers datasets evaluate scikit-learn
!pip install -U transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting transformers
  Downloading transformers-4.57.6-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.57.6-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.57.3
    Uninstalling transformers-4.57.3:
      Successfully uninstalled transformers-4.57.3
Successfully installed transformers-4.57.6


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

import random
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split

from transformers import AutoTokenizer, AutoModel
from torch.optim import AdamW
import torch.nn as nn

from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score

def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer handed to Python's ``random``, NumPy's legacy global RNG,
            PyTorch's default generator and all CUDA generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    # Same order as listed above; each library keeps its own independent state.
    for apply_seed in seeders:
        apply_seed(seed)

# Seed the RNGs before any cell that draws random numbers runs.
set_seed(42)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [None]:
# Toy sentiment corpus, filled in only so that this notebook is runnable.
# NOTE: there are just 10 distinct sentences (5 positive, 5 negative), so the
# 240 samples built below are heavy duplicates of each other; any random split
# of them will share sentences across train/dev/test.
pos_templates = [
    "I love this movie.",
    "This is fantastic!",
    "What a great experience.",
    "Absolutely wonderful.",
    "I am very happy with this.",
]
neg_templates = [
    "I hate this movie.",
    "This is terrible.",
    "What a bad experience.",
    "Absolutely awful.",
    "I am very disappointed with this.",
]

# (text, label) pairs: label 1 = positive, label 0 = negative.
data = []
for _ in range(120):
    # One positive draw followed by one negative draw per iteration, so the
    # list alternates labels and the classes are exactly balanced.
    data.extend(
        [
            (random.choice(pos_templates), 1),
            (random.choice(neg_templates), 0),
        ]
    )

# Define a custom dataset
class TextDataset(Dataset):
    """Map-style dataset that tokenizes ``(text, label)`` pairs on access.

    Args:
        data: Indexable collection of ``(text, label)`` pairs.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors="pt")``
            whose result is indexed by ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length every sample is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors."""
        text, label = self.data[idx]
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # The tokenizer is asked for tensors with a leading batch axis;
        # squeeze it away so the DataLoader can add its own batch axis.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(label, dtype=torch.long)
        return sample

# Load the tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Train/Dev/Test split: 60% / 20% / whatever remains after integer rounding.
dataset = TextDataset(data, tokenizer, max_length=128)
train_size = int(0.6 * len(dataset))
dev_size = int(0.2 * len(dataset))
test_size = len(dataset) - train_size - dev_size

# Use a dedicated, explicitly seeded generator for the split. Without it,
# random_split draws from PyTorch's global RNG, so re-running this cell (or
# running cells out of order) would silently move samples between the train,
# dev and test sets.
split_generator = torch.Generator().manual_seed(42)
train_dataset, dev_dataset, test_dataset = random_split(
    dataset,
    [train_size, dev_size, test_size],
    generator=split_generator,
)

# Only the training loader shuffles; dev/test iterate in a fixed order.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
dev_loader   = DataLoader(dev_dataset, batch_size=4)
test_loader  = DataLoader(test_dataset, batch_size=4)

len(train_dataset), len(dev_dataset), len(test_dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

(144, 48, 48)

In [None]:
class BertClassifier(nn.Module):
    """Pretrained "bert-base-uncased" encoder followed by one linear layer.

    Args:
        num_classes: Number of output logits produced per input sequence.
    """

    def __init__(self, num_classes=2):
        super().__init__()
        encoder = AutoModel.from_pretrained("bert-base-uncased")
        self.bert = encoder
        # The linear head maps the encoder's hidden size to the class logits.
        self.fc = nn.Linear(encoder.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of token ids."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, position 0 of each sequence:
        # the [CLS] token representation.
        cls_vector = encoded[0][:, 0]
        return self.fc(cls_vector)

# Two output classes, matching the 0/1 labels of the toy corpus.
num_classes = 2
model = BertClassifier(num_classes=num_classes)
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def train(model, data_loader, optimizer, criterion, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Sized iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(model_output, labels)``.
        device: Device each batch tensor is moved to.

    Returns:
        The sum of per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0.0

    for batch in tqdm(data_loader, desc="Training"):
        features = {
            name: batch[name].to(device)
            for name in ("input_ids", "attention_mask")
        }
        targets = batch["labels"].to(device)

        # Clear the gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        batch_loss = criterion(model(**features), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(data_loader)


def test(model, data_loader, device):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs, dim=1)

            all_preds.extend(preds.cpu().numpy().tolist())
            all_labels.extend(labels.cpu().numpy().tolist())

    acc = accuracy_score(all_labels, all_preds)
    return acc


# Hyperparameters for the hand-written fine-tuning loop.
lr = 2e-5
epochs = 3
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=lr)

# Track the best dev accuracy and a CPU copy of the weights that achieved it.
best_acc, best_state = float("-inf"), None

for epoch in range(1, epochs + 1):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    print(f"{epoch} epoch Training Loss: {train_loss:.4f}")

    dev_acc = test(model, dev_loader, device)
    print(f"{epoch} epoch, Dev Accuracy: {dev_acc * 100:.2f}%")

    # Only a strict improvement replaces the snapshot, so ties keep the
    # earliest epoch.
    if not dev_acc > best_acc:
        continue
    best_acc = dev_acc
    best_state = {
        name: tensor.detach().cpu().clone()
        for name, tensor in model.state_dict().items()
    }

# Test: restore the best dev checkpoint (if any) and score the held-out set.
if best_state is not None:
    model.load_state_dict(best_state)

test_acc = test(model, test_loader, device)
print(f"Best Dev Acc: {best_acc*100:.2f}% | Test Accuracy: {test_acc*100:.2f}%")

Training:   0%|          | 0/36 [00:00<?, ?it/s]

1 epoch Training Loss: 0.2101


Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

1 epoch, Dev Accuracy: 100.00%


Training:   0%|          | 0/36 [00:00<?, ?it/s]

2 epoch Training Loss: 0.0036


Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

2 epoch, Dev Accuracy: 100.00%


Training:   0%|          | 0/36 [00:00<?, ?it/s]

3 epoch Training Loss: 0.0016


Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

3 epoch, Dev Accuracy: 100.00%


Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

Best Dev Acc: 100.00% | Test Accuracy: 100.00%


In [None]:
from transformers import BertForSequenceClassification

class BertClassifier2(nn.Module):
    """Thin wrapper around ``BertForSequenceClassification`` that returns logits.

    Exposes the same ``forward(input_ids, attention_mask)`` signature as
    ``BertClassifier`` so the ``train`` / ``test`` helpers work with either.

    Args:
        num_classes: Passed to the pretrained model as ``num_labels``.
    """

    def __init__(self, num_classes=2):
        super().__init__()
        self.model = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased", num_labels=num_classes
        )

    def forward(self, input_ids, attention_mask):
        """Return only the ``logits`` field of the wrapped model's output."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).logits

# Same experiment with the ready-made sequence-classification head.
model2 = BertClassifier2(num_classes=2)
model2 = model2.to(device)

criterion2 = nn.CrossEntropyLoss()
optimizer2 = AdamW(model2.parameters(), lr=2e-5)

# Train for a single epoch, then score on the dev split.
train_loss = train(model2, train_loader, optimizer2, criterion2, device)
dev_acc = test(model2, dev_loader, device)
print(f"[BertForSequenceClassification] train_loss={train_loss:.4f}, dev_acc={dev_acc*100:.2f}%")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training:   0%|          | 0/36 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

[BertForSequenceClassification] train_loss=0.4746, dev_acc=100.00%


In [None]:
from transformers import Trainer, TrainingArguments, BertForSequenceClassification
import evaluate

# Accuracy metric object from the `evaluate` library; compute_metrics calls its .compute().
metric_acc = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into the accuracy metric dict.

    Args:
        eval_pred: Two-element iterable unpacked as ``(logits, labels)``.

    Returns:
        Whatever ``metric_acc.compute`` returns for the argmax predictions.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    return metric_acc.compute(
        predictions=np.argmax(scores, axis=-1),
        references=references,
    )

# Fresh sequence-classification model for the Trainer-based run.
trainer_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_classes
).to(device)

training_args = TrainingArguments(
    output_dir="./bert_cls_out",
    eval_strategy="epoch",
    # Log the training loss once per epoch. With the default step-based
    # logging interval, this short run (144 training samples, batch size 8,
    # 3 epochs) never reaches a logging step, so the results table shows
    # "No log" in the Training Loss column.
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.0,
    logging_dir="./bert_cls_logs",
    report_to=[]  # no external experiment trackers
)

trainer = Trainer(
    model=trainer_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune (evaluating on the dev split each epoch), then score the test split.
trainer.train()
eval_results = trainer.evaluate(test_dataset)
print("Test Set Evaluation:", eval_results)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.283921,1.0
2,No log,0.131041,1.0
3,No log,0.088824,1.0


Test Set Evaluation: {'eval_loss': 0.09282582998275757, 'eval_accuracy': 1.0, 'eval_runtime': 0.3245, 'eval_samples_per_second': 147.914, 'eval_steps_per_second': 18.489, 'epoch': 3.0}
