In [1]:
import warnings
warnings.filterwarnings('ignore')

import nltk
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from transformers import  AutoTokenizer,  AutoModelForSequenceClassification
import datasets

In [2]:
# 데이터 다운로드
# 라벨은 pos:1  neg:0
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews
reviews = [movie_reviews.raw(id) for id in movie_reviews.fileids()]
categoris = [ movie_reviews.categories(id)[0] for id in movie_reviews.fileids() ]
labels = [  1 if label == 'pos' else 0 for label in categoris   ]

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [3]:
from sklearn.model_selection import train_test_split
train_texts, test_texts, train_labels, test_labels \
  = train_test_split(reviews, labels, stratify=labels, test_size=0.2, random_state=42)

In [4]:
# 토크나이져
BERT_MODEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
# 훈련/테스트 데이터 토근화
train_encodings = tokenizer(train_texts, truncation=True, padding=True,return_tensors='pt',max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True,return_tensors='pt',max_length=512)
train_encodings['input_ids'].shape,  test_encodings['input_ids'].shape


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

(torch.Size([1600, 512]), torch.Size([400, 512]))

In [5]:
# torch dataset 구성
class MovieReviewDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels
  def __len__(self):
    return len(self.labels)
  def __getitem__(self, idx):
    item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item
train_dataset =   MovieReviewDataset(train_encodings, train_labels)
test_dataset =  MovieReviewDataset(test_encodings,test_labels)
print(f'훈련 샘플수 : {len(train_dataset)}')
print(f'테스트 샘플수 : {len(test_dataset)}')

훈련 샘플수 : 1600
테스트 샘플수 : 400


In [6]:
next(iter(train_dataset)).keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [7]:
# 모델 로드
model = AutoModelForSequenceClassification.from_pretrained(BERT_MODEL_NAME, num_labels=2)
print(f'파라메터수 : {sum( p.numel() for p in model.parameters() )}')
print(f'학습 가능한 파라메터 : {sum( p.numel() for p in model.parameters() if p.requires_grad)} ')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


파라메터수 : 109483778
학습 가능한 파라메터 : 109483778 


In [8]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [9]:
# 평가 매트릭스
import evaluate
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script: 0.00B [00:00, ?B/s]

In [10]:
from transformers import TrainingArguments, Trainer

In [11]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy='epoch',
    load_best_model_at_end=True,
    report_to = 'none'  # W&B TensorBoard 자동 로딩 모두 끔
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
print(f'에포크 : {training_args.num_train_epochs}')
print(f'배치크기 : {training_args.per_device_train_batch_size}')
print(f'학습률 : {training_args.learning_rate}')


에포크 : 2
배치크기 : 8
학습률 : 5e-05


In [12]:
# 모델 학습
train_result = trainer.train()
print(f'총 학습시간 : {train_result.metrics["train_runtime"]}')
print(f'최종손실 : {train_result.metrics["train_loss"]}')

Epoch,Training Loss,Validation Loss,Accuracy
1,0.5852,0.402166,0.84
2,0.2886,0.438711,0.8725


총 학습시간 : 382.6912
최종손실 : 0.43688119888305665


In [13]:
eval_result = trainer.evaluate()
print(f'테스트 정확도 : {eval_result["eval_accuracy"]}')
# 예측수행
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=-1)
# 분류리포트
print(classification_report(test_labels, pred_labels))

테스트 정확도 : 0.84
              precision    recall  f1-score   support

           0       0.77      0.96      0.86       200
           1       0.95      0.72      0.82       200

    accuracy                           0.84       400
   macro avg       0.86      0.84      0.84       400
weighted avg       0.86      0.84      0.84       400



In [14]:
test_reviews = [
    "This movie is absolutely fantastic! The plot is engaging and the acting is superb.",
    "Terrible film. Waste of time and money. Would not recommend to anyone.",
    "It's an okay movie. Nothing special but not terrible either.",
    "Brilliant masterpiece! One of the best films I've ever seen in my life.",
    "Boring and predictable. I fell asleep halfway through."
]


In [15]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [16]:
model.eval()
for i,review in enumerate(test_reviews):
  inputs = tokenizer(review,
                     return_tensors="pt",
                     truncation=True,
                     padding=True,
                     max_length=512
                     )
  inputs   = {k:v.to(device) for k,v in inputs.items()}
  with torch.no_grad():
    outputs = model(**inputs)
    logits =  outputs.logits
    probs = torch.softmax(logits, dim=-1)[0]
    pred_class = torch.argmax(probs).item()
  confidence = probs[pred_class].item()
  print(f'문장 : {review}')
  print(f'예측 : {pred_class}')
  print(f'긍정 : {probs[1].item():.4f}')
  print(f'부정 : {probs[0].item():.4f}\n')

문장 : This movie is absolutely fantastic! The plot is engaging and the acting is superb.
예측 : 1
긍정 : 0.9270
부정 : 0.0730

문장 : Terrible film. Waste of time and money. Would not recommend to anyone.
예측 : 0
긍정 : 0.0638
부정 : 0.9362

문장 : It's an okay movie. Nothing special but not terrible either.
예측 : 0
긍정 : 0.0815
부정 : 0.9185

문장 : Brilliant masterpiece! One of the best films I've ever seen in my life.
예측 : 1
긍정 : 0.9206
부정 : 0.0794

문장 : Boring and predictable. I fell asleep halfway through.
예측 : 0
긍정 : 0.0585
부정 : 0.9415

