In [1]:
import os
import torch
import torch.nn as nn
import random
import numpy as np

np.random.seed(0)
random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x7b8771bcfc50>

In [2]:
from requests import get

# download files for sentiment classification
def download(url, filename):
    with open(filename, "wb") as file:
        response = get(url)
        file.write(response.content)

download("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", "ratings_train.txt")
download("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", "ratings_test.txt")

with open("ratings_train.txt", "r") as file:
    for i in range(5):
        print(file.readline())

with open("ratings_train.txt", "r", encoding="utf-8") as file:
    contents = file.read()
    lines = contents.split("\n")[1:]
    train_data = [line.split("\t") for line in lines if len(line) > 0]

train_data = train_data[:10000]

with open("ratings_test.txt", "r", encoding="utf-8") as file:
    contents = file.read()
    lines = contents.split("\n")[1:]
    test_data = [line.split("\t") for line in lines if len(line) > 0]

id	document	label

9976970	아 더빙.. 진짜 짜증나네요 목소리	0

3819312	흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나	1

10265843	너무재밓었다그래서보는것을추천한다	0

9045019	교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정	0



In [3]:
# https://huggingface.co/docs/transformers/model_doc/bert
# https://huggingface.co/beomi/kcbert-base
# KCBERT official NSMC tutorial : https://colab.research.google.com/drive/1dFC0FL-521m7CL_PSd8RLKq67jgTJVhL?usp=sharing#scrollTo=8ewSOpY1Kwg6
plm_name = "beomi/kcbert-base"

In [4]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained(plm_name)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
tokenizer.tokenize("안녕하세요, 반갑습니다.")

['안녕', '##하세요', ',', '반', '##갑', '##습니다', '.']

In [6]:
tokenizer("안녕하세요, 반갑습니다.")

{'input_ids': [2, 19017, 8482, 15, 1483, 4981, 8046, 17, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
## BERT 토크나이저에서 자동으로 Special Token을 추가해준다.
tokenizer("안녕하세요, 반갑습니다.", add_special_tokens=False)

{'input_ids': [19017, 8482, 15, 1483, 4981, 8046, 17], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [8]:
print(tokenizer.decode(tokenizer("안녕하세요, 반갑습니다.")['input_ids']))
print(tokenizer.decode(tokenizer("안녕하세요, 반갑습니다.")['input_ids'], skip_special_tokens=True))

[CLS] 안녕하세요, 반갑습니다. [SEP]
안녕하세요, 반갑습니다.


In [9]:
tokenizer("안녕하세요, 반갑습니다.", return_tensors="pt")

{'input_ids': tensor([[    2, 19017,  8482,    15,  1483,  4981,  8046,    17,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [10]:
batch_sentences = ["안녕하세요, 반갑습니다.", "잘가요."]

In [11]:
tokenizer(batch_sentences, padding=True, return_tensors="pt")

{'input_ids': tensor([[    2, 19017,  8482,    15,  1483,  4981,  8046,    17,     3],
        [    2, 23423,  4040,    17,     3,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 0]])}

In [12]:
tokenizer("안녕하세요, 반갑습니다.", "이 문장은 이어지는 문장입니다.", return_tensors="pt")

{'input_ids': tensor([[    2, 19017,  8482,    15,  1483,  4981,  8046,    17,     3,  2451,
          1414, 10101, 10704,  8195,  1414,  4099,  8074,    17,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [13]:
very_long_sentence = "안녕하세요, 반갑습니다. " * 1000
print(len(tokenizer(very_long_sentence)['input_ids']))

Token indices sequence length is longer than the specified maximum sequence length for this model (7002 > 300). Running this sequence through the model will result in indexing errors


7002


In [14]:
# 너무 긴 Sequence 길이를 자동으로 max_length 만큼 잘라주는 trigger => truncation
print(len(tokenizer(very_long_sentence, truncation=True, max_length=300)['input_ids']))

300


In [15]:
from torch.utils.data import Dataset, DataLoader

# define dataset class
class SentimentDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        label = int(self.data[index][2])
        text = self.data[index][1]
        inputs = self.tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
        inputs = {key: inputs[key].squeeze() for key in inputs}

        return inputs|{'label':torch.tensor(label)}

In [16]:
bert_model = AutoModel.from_pretrained(plm_name)

In [17]:
output = bert_model(**tokenizer(batch_sentences, padding=True, return_tensors="pt"))

In [18]:
output.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [21]:
print(output.last_hidden_state.shape)
print(output.pooler_output.shape)

# CLS에 대한 embedding을 가져오는 pooler_output

torch.Size([2, 9, 768])
torch.Size([2, 768])


In [22]:
class sentiment_classifier(torch.nn.Module):
    def __init__(self, bert_model):
        super(sentiment_classifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        self.linear = torch.nn.Linear(768, 2)

    def forward(self, input_ids,attention_mask,token_type_ids):
        output = self.bert(input_ids = input_ids,attention_mask = attention_mask,token_type_ids = token_type_ids)
        pooled_output = output.pooler_output
        pooled_output = self.dropout(pooled_output)
        output = self.linear(pooled_output)
        return output

In [23]:
model = sentiment_classifier(bert_model)
batch_input = tokenizer(batch_sentences, padding=True, return_tensors="pt")
print(batch_input)
model(**batch_input)

{'input_ids': tensor([[    2, 19017,  8482,    15,  1483,  4981,  8046,    17,     3],
        [    2, 23423,  4040,    17,     3,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 0]])}


tensor([[-0.0820,  0.1499],
        [-0.3179,  0.0340]], grad_fn=<AddmmBackward0>)

In [24]:
train_dataset = SentimentDataset(train_data,tokenizer)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

for batch in train_loader:
    label = batch.pop('label')
    model(**batch)
    break

In [25]:
import lightning as pl

class SentimentClassifierPL(pl.LightningModule):
    def __init__(self, sentiment_classifier):
        super(SentimentClassifierPL, self).__init__()
        self.model = sentiment_classifier
        self.loss = nn.CrossEntropyLoss()

        self.validation_step_outputs = []
        self.test_step_outputs = []
        self.save_hyperparameters()

    def forward(self, inputs):
        return self.model(inputs)

    def training_step(self, batch, batch_idx):
        labels = batch.pop('label')
        outputs = model(**batch)
        loss = self.loss(outputs, labels)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        labels = batch.pop('label')
        outputs = model(**batch)
        loss = self.loss(outputs, labels)
        self.log("val_loss", loss)
        self.validation_step_outputs.append((loss, outputs, labels))
        return loss, outputs, labels

    def on_validation_epoch_end(self):
        outputs = self.validation_step_outputs
        avg_loss = torch.stack([x[0] for x in outputs]).mean()
        self.log("avg_val_loss", avg_loss)

        all_outputs = torch.cat([x[1] for x in outputs])
        all_labels = torch.cat([x[2] for x in outputs])
        all_preds = all_outputs.argmax(dim=1)
        accuracy = (all_preds == all_labels).float().mean()
        self.log("val_accuracy", accuracy)
        self.validation_step_outputs.clear()

    def test_step(self, batch, batch_idx):
        labels = batch.pop('label')
        outputs = model(**batch)
        loss = self.loss(outputs, labels)
        self.log("test_loss", loss)
        self.test_step_outputs.append((loss, outputs, labels))
        return loss, outputs, labels

    def on_test_epoch_end(self):
        outputs = self.test_step_outputs
        avg_loss = torch.stack([x[0] for x in outputs]).mean()
        self.log("avg_test_loss", avg_loss)

        all_outputs = torch.cat([x[1] for x in outputs])
        all_labels = torch.cat([x[2] for x in outputs])
        all_preds = all_outputs.argmax(dim=1)
        accuracy = (all_preds == all_labels).float().mean()
        self.log("test_accuracy", accuracy)
        self.test_step_outputs.clear()

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=5e-6)
        return optimizer

In [26]:
import wandb
from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch.callbacks import ModelSummary

wandb.login()

def check_performance(model,tokenizer, train_data, test_data, wandb_log_name):
    wandb_logger = WandbLogger(project="NLP", name=wandb_log_name, group="Lec07")

    pl_model = SentimentClassifierPL(model)

    train_dataset = SentimentDataset(train_data,tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_dataset = SentimentDataset(test_data,tokenizer)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
    test_dataset = SentimentDataset(test_data,tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    trainer = pl.Trainer(
        max_epochs=1,
        accelerator="gpu",
        logger=wandb_logger,
        callbacks=[ModelSummary(max_depth=2)],
        precision=16
    )

    trainer.fit(
        model=pl_model,
        train_dataloaders=train_loader,
        val_dataloaders=val_loader
    )

    trainer.test(dataloaders=test_loader)

    wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnoeyhesx[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [27]:
pl_model = SentimentClassifierPL(model)

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'sentiment_classifier' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['sentiment_classifier'])`.


In [28]:
check_performance(pl_model, tokenizer, train_data, test_data, "BERT_base")

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)
Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                  | Params
------------------------------------------------------
0 | model       | SentimentClassifierPL | 108 M 
1 | model.model | sentiment_classifier  | 108 M 
2 | model.loss  | CrossEntropyLoss      | 0     
3 | loss        | CrossEntropyLoss      | 0     
------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
435.680   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


                                                                           

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 313/313 [01:59<00:00,  2.61it/s, v_num=33o9, train_loss=0.238]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 313/313 [02:01<00:00,  2.57it/s, v_num=33o9, train_loss=0.238]


Restoring states from the checkpoint path at ./NLP/ezp733o9/checkpoints/epoch=0-step=313.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at ./NLP/ezp733o9/checkpoints/epoch=0-step=313.ckpt
/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 1563/1563 [01:20<00:00, 19.32it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      avg_test_loss         0.34909066557884216
      test_accuracy         0.8457599878311157
        test_loss           0.3491342067718506
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


0,1
avg_test_loss,▁
avg_val_loss,▁
epoch,▁▁▁▁▁▁▁█
test_accuracy,▁
test_loss,▁
train_loss,█▂▁▄▄▃
trainer/global_step,▁▂▄▅▆███
val_accuracy,▁
val_loss,▁

0,1
avg_test_loss,0.34909
avg_val_loss,0.34909
epoch,1.0
test_accuracy,0.84576
test_loss,0.34913
train_loss,0.3625
trainer/global_step,313.0
val_accuracy,0.84576
val_loss,0.34913


In [36]:
from transformers import AutoModelForSequenceClassification

# https://huggingface.co/docs/transformers/model_doc/electra#transformers.ElectraForSequenceClassification
# https://huggingface.co/beomi/KcELECTRA-base
# ELECTRA paper : https://openreview.net/pdf?id=r1xMH1BtvB

# plm_name = "beomi/kcbert-base"
plm_name = "beomi/KcELECTRA-base"

tokenizer = AutoTokenizer.from_pretrained(plm_name)
model = AutoModelForSequenceClassification.from_pretrained(plm_name, num_labels=2)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
tokenizer(batch_sentences, padding=True, return_tensors="pt")

{'input_ids': tensor([[14712,  1918,    16,   680,  2059,   834,    18],
        [  500,  2345,    18,     3,     3,     3,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 0, 0, 0, 0]])}

In [38]:
model(**batch_input)

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0031, -0.1221],
        [-0.0669, -0.1017]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [39]:
model(**batch_input, labels=torch.tensor([1,0]))

SequenceClassifierOutput(loss=tensor(0.7168, grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0031, -0.1221],
        [-0.0669, -0.1017]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [33]:
import lightning as pl

class SentimentClassifierPL(pl.LightningModule):
    def __init__(self, sentiment_classifier):
        super(SentimentClassifierPL, self).__init__()
        self.model = sentiment_classifier

        self.validation_step_outputs = []
        self.test_step_outputs = []
        self.save_hyperparameters()

    def forward(self, inputs):
        return self.model(inputs)

    def training_step(self, batch, batch_idx):
        labels = batch.pop('label')
        outputs = model(**batch, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        labels = batch.pop('label')
        outputs = model(**batch, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        self.log("val_loss", loss)
        self.validation_step_outputs.append((loss, logits, labels))
        return loss, outputs, labels

    def on_validation_epoch_end(self):
        outputs = self.validation_step_outputs
        avg_loss = torch.stack([x[0] for x in outputs]).mean()
        self.log("avg_val_loss", avg_loss)

        all_outputs = torch.cat([x[1] for x in outputs])
        all_labels = torch.cat([x[2] for x in outputs])
        all_preds = all_outputs.argmax(dim=1)
        accuracy = (all_preds == all_labels).float().mean()
        self.log("val_accuracy", accuracy)
        self.validation_step_outputs.clear()

    def test_step(self, batch, batch_idx):
        labels = batch.pop('label')
        outputs = model(**batch, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        self.log("test_loss", loss)
        self.test_step_outputs.append((loss, logits, labels))
        return loss, outputs, labels

    def on_test_epoch_end(self):
        outputs = self.test_step_outputs
        avg_loss = torch.stack([x[0] for x in outputs]).mean()
        self.log("avg_test_loss", avg_loss)

        all_outputs = torch.cat([x[1] for x in outputs])
        all_labels = torch.cat([x[2] for x in outputs])
        all_preds = all_outputs.argmax(dim=1)
        accuracy = (all_preds == all_labels).float().mean()
        self.log("test_accuracy", accuracy)
        self.test_step_outputs.clear()

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=5e-6)
        return optimizer

In [34]:
pl_model = SentimentClassifierPL(model)

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'sentiment_classifier' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['sentiment_classifier'])`.


In [35]:
check_performance(pl_model, tokenizer, train_data, test_data, "KCELECTRA_base")

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)
Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                          | Params
--------------------------------------------------------------
0 | model       | SentimentClassifierPL         | 108 M 
1 | model.model | BertForSequenceClassification | 108 M 
--------------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
435.680   Total estimated model params size (MB)


Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00, 148.25it/s]

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


                                                                            

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 313/313 [02:04<00:00,  2.52it/s, v_num=ggyh]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 313/313 [02:05<00:00,  2.49it/s, v_num=ggyh]


Restoring states from the checkpoint path at ./NLP/9f6mggyh/checkpoints/epoch=0-step=313.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at ./NLP/9f6mggyh/checkpoints/epoch=0-step=313.ckpt
/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 1563/1563 [01:20<00:00, 19.43it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      avg_test_loss         0.35549911856651306
      test_accuracy         0.8419399857521057
        test_loss           0.3555421233177185
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


0,1
avg_test_loss,▁
avg_val_loss,▁
epoch,▁▁▁▁▁▁▁█
test_accuracy,▁
test_loss,▁
train_loss,█▃▅▃▄▁
trainer/global_step,▁▂▄▅▆███
val_accuracy,▁
val_loss,▁

0,1
avg_test_loss,0.3555
avg_val_loss,0.3555
epoch,1.0
test_accuracy,0.84194
test_loss,0.35554
train_loss,0.20336
trainer/global_step,313.0
val_accuracy,0.84194
val_loss,0.35554


In [40]:
# https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2ForSequenceClassification
# https://huggingface.co/openai-community/gpt2
# GPT-2 paper : https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf
plm_name = "openai-community/gpt2"

tokenizer = AutoTokenizer.from_pretrained(plm_name)
model = AutoModelForSequenceClassification.from_pretrained(plm_name, num_labels=2)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
batch_input = tokenizer(batch_sentences, padding=True, return_tensors="pt")
print(batch_input)

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [42]:
# 위 에러를 해결하는 코드
tokenizer.pad_token = tokenizer.eos_token
batch_input = tokenizer(batch_sentences, padding=True, return_tensors="pt")
print(batch_input)

{'input_ids': tensor([[  168,   243,   230,   167,   227,   243, 47991,   246,   168,   226,
           116,   168,   248,   242,    11, 31619,   108,   246,   166,   108,
           239,   168,   232,   113, 46695,   230, 46695,    97,    13],
        [  168,   252,   246,   166,   108,   222,   168,   248,   242,    13,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0]])}


In [43]:
model(**batch_input)

AssertionError: Cannot handle batch sizes > 1 if no padding token is defined.

In [45]:
# 위 코드 에러 해결방안
model.config.pad_token_id = model.config.eos_token_id
model(**batch_input)

SequenceClassifierOutputWithPast(loss=None, logits=tensor([[ 7.8357, -5.2693],
        [ 8.1264, -4.9857]], grad_fn=<IndexBackward0>), past_key_values=((tensor([[[[-1.6443,  2.2470,  0.8178,  ..., -1.1460, -0.6263,  1.6173],
          [-1.9198,  2.7449,  2.0702,  ..., -0.9316, -1.2618,  2.3956],
          [-1.8486,  2.1526,  1.4614,  ..., -0.0381, -1.1282,  3.4411],
          ...,
          [-3.7432,  1.9648,  0.4874,  ..., -1.9650, -0.4989,  0.7696],
          [-3.2369,  2.7403,  2.4780,  ..., -0.6613, -1.1638,  2.0010],
          [-2.5097,  2.2639,  2.3698,  ..., -0.2306, -2.1063,  1.4775]],

         [[-0.0223, -0.3916, -0.7470,  ...,  0.1744,  1.8223,  0.6671],
          [-0.2276, -1.6538, -2.8879,  ..., -1.4444,  4.5373,  0.8356],
          [-1.8750,  0.8292, -0.7699,  ..., -0.1184,  3.6074,  0.7003],
          ...,
          [-0.5876, -0.3830, -1.4554,  ..., -0.3461,  3.3416,  4.0786],
          [-0.3578, -0.9361, -1.5585,  ...,  2.0954,  4.1857,  3.2990],
          [-0.9587, -0.

In [46]:
pl_model = SentimentClassifierPL(model)

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'sentiment_classifier' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['sentiment_classifier'])`.


In [47]:
check_performance(pl_model, tokenizer, train_data, test_data, "OpenAIGPT2")

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)
Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                          | Params
--------------------------------------------------------------
0 | model       | SentimentClassifierPL         | 124 M 
1 | model.model | GPT2ForSequenceClassification | 124 M 
--------------------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
497.765   Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


                                                                           

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 313/313 [02:54<00:00,  1.80it/s, v_num=zaos]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 313/313 [02:56<00:00,  1.78it/s, v_num=zaos]


Restoring states from the checkpoint path at ./NLP/mvtkzaos/checkpoints/epoch=0-step=313.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at ./NLP/mvtkzaos/checkpoints/epoch=0-step=313.ckpt
/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 1563/1563 [01:45<00:00, 14.79it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      avg_test_loss         0.6979955434799194
      test_accuracy         0.5857399702072144
        test_loss           0.6980095505714417
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


0,1
avg_test_loss,▁
avg_val_loss,▁
epoch,▁▁▁▁▁▁▁█
test_accuracy,▁
test_loss,▁
train_loss,█▁▂▂▁▂
trainer/global_step,▁▂▄▅▆███
val_accuracy,▁
val_loss,▁

0,1
avg_test_loss,0.698
avg_val_loss,0.698
epoch,1.0
test_accuracy,0.58574
test_loss,0.69801
train_loss,0.74659
trainer/global_step,313.0
val_accuracy,0.58574
val_loss,0.69801
