<a href="https://colab.research.google.com/github/hanghae-plus-AI/AI-1-jhtwiz/blob/main/4%EC%A3%BC%EC%B0%A8_%EA%B8%B0%EB%B3%B8%EA%B3%BC%EC%A0%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# HuggingFace로 뉴스 기사 분류하기

In [1]:
!pip install transformers datasets evaluate accelerate scikit-learn



In [2]:
import random
import evaluate
import numpy as np

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [3]:
db = load_dataset("fancyzhx/ag_news")
db

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [4]:
db['train'][0]

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'label': 2}

In [5]:
db['train'].features['label'].names

['World', 'Sports', 'Business', 'Sci/Tech']

In [6]:
len_classes = len(db['train'].features['label'].names)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def preprocess_function(data):
    return tokenizer(data["text"])

db_tokenized = db.map(preprocess_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [7]:
db_tokenized['train'][0].keys()

dict_keys(['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

In [8]:
db_split = db_tokenized['train'].train_test_split(test_size=0.2)
db_train, db_val = db_split['train'], db_split['test']
db_test = db_tokenized['test']

In [9]:
len(db_train), len(db_val), len(db_test)

(96000, 24000, 7600)

In [10]:
from transformers import BertConfig

config = BertConfig()

config.hidden_size = 128  # BERT layer의 기본 hidden dimension
config.intermediate_size = 256  # FFN layer의 중간 hidden dimension
config.num_hidden_layers = 5  # BERT layer의 개수
config.num_attention_heads = 4  # Multi-head attention에서 사용하는 head 개수
config.num_labels = len_classes  # 마지막에 예측해야 하는 분류 문제의 class 개수

model = AutoModelForSequenceClassification.from_config(config)

In [11]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='hf_transformer',  # 모델, log 등을 저장할 directory
    num_train_epochs=10,  # epoch 수
    per_device_train_batch_size=64,  # training data의 batch size
    per_device_eval_batch_size=64,  # validation data의 batch size
    logging_strategy="epoch",  # Epoch가 끝날 때마다 training loss 등을 log하라는 의미
    do_train=True,  # 학습을 진행하겠다는 의미
    do_eval=True,  # 학습 중간에 validation data에 대한 평가를 수행하겠다는 의미
    eval_strategy="epoch",  # 매 epoch가 끝날 때마다 validation data에 대한 평가를 수행한다는 의미
    save_strategy="epoch",  # 매 epoch가 끝날 때마다 모델을 저장하겠다는 의미
    learning_rate=1e-3,  # optimizer에 사용할 learning rate
    load_best_model_at_end=True  # 학습이 끝난 후, validation data에 대한 성능이 가장 좋은 모델을 채택하겠다는 의미
)

In [12]:
import evaluate

accuracy = evaluate.load("accuracy")


def compute_metrics(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [13]:
from transformers import EarlyStoppingCallback


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=db_train,
    eval_dataset=db_val,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

In [19]:
trainer.train()

  0%|          | 0/15000 [00:00<?, ?it/s]

{'loss': 1.3869, 'grad_norm': 0.18240629136562347, 'learning_rate': 0.0009000000000000001, 'epoch': 1.0}


  0%|          | 0/375 [00:00<?, ?it/s]

{'eval_loss': 1.386410117149353, 'eval_accuracy': 0.252125, 'eval_runtime': 6.4637, 'eval_samples_per_second': 3713.029, 'eval_steps_per_second': 58.016, 'epoch': 1.0}
{'loss': 1.3865, 'grad_norm': 0.08970958739519119, 'learning_rate': 0.0008, 'epoch': 2.0}


  0%|          | 0/375 [00:00<?, ?it/s]

{'eval_loss': 1.3866215944290161, 'eval_accuracy': 0.250125, 'eval_runtime': 6.3834, 'eval_samples_per_second': 3759.766, 'eval_steps_per_second': 58.746, 'epoch': 2.0}
{'loss': 1.3865, 'grad_norm': 0.1003548726439476, 'learning_rate': 0.0007, 'epoch': 3.0}


  0%|          | 0/375 [00:00<?, ?it/s]

{'eval_loss': 1.386256456375122, 'eval_accuracy': 0.252125, 'eval_runtime': 6.4448, 'eval_samples_per_second': 3723.91, 'eval_steps_per_second': 58.186, 'epoch': 3.0}
{'loss': 1.3865, 'grad_norm': 0.10675675421953201, 'learning_rate': 0.0006, 'epoch': 4.0}


  0%|          | 0/375 [00:00<?, ?it/s]

{'eval_loss': 1.3863070011138916, 'eval_accuracy': 0.24495833333333333, 'eval_runtime': 6.4814, 'eval_samples_per_second': 3702.905, 'eval_steps_per_second': 57.858, 'epoch': 4.0}
{'loss': 1.3865, 'grad_norm': 0.11762792617082596, 'learning_rate': 0.0005, 'epoch': 5.0}


  0%|          | 0/375 [00:00<?, ?it/s]

{'eval_loss': 1.3862969875335693, 'eval_accuracy': 0.252125, 'eval_runtime': 6.4881, 'eval_samples_per_second': 3699.099, 'eval_steps_per_second': 57.798, 'epoch': 5.0}
{'loss': 1.3864, 'grad_norm': 0.11160490661859512, 'learning_rate': 0.0004, 'epoch': 6.0}


  0%|          | 0/375 [00:00<?, ?it/s]

{'eval_loss': 1.386339545249939, 'eval_accuracy': 0.24495833333333333, 'eval_runtime': 6.5057, 'eval_samples_per_second': 3689.08, 'eval_steps_per_second': 57.642, 'epoch': 6.0}
{'train_runtime': 334.2082, 'train_samples_per_second': 2872.461, 'train_steps_per_second': 44.882, 'train_loss': 1.386553955078125, 'epoch': 6.0}


TrainOutput(global_step=9000, training_loss=1.386553955078125, metrics={'train_runtime': 334.2082, 'train_samples_per_second': 2872.461, 'train_steps_per_second': 44.882, 'total_flos': 354945644224512.0, 'train_loss': 1.386553955078125, 'epoch': 6.0})

In [17]:
#결과 데이터 확인해보기
trainer.state.log_history

[{'loss': 1.3282,
  'grad_norm': 0.5434260368347168,
  'learning_rate': 0.0009000000000000001,
  'epoch': 1.0,
  'step': 1500},
 {'eval_loss': 1.3871588706970215,
  'eval_accuracy': 0.252125,
  'eval_runtime': 6.2861,
  'eval_samples_per_second': 3817.935,
  'eval_steps_per_second': 59.655,
  'epoch': 1.0,
  'step': 1500},
 {'loss': 1.387,
  'grad_norm': 0.15216679871082306,
  'learning_rate': 0.0008,
  'epoch': 2.0,
  'step': 3000},
 {'eval_loss': 1.3871299028396606,
  'eval_accuracy': 0.24495833333333333,
  'eval_runtime': 6.4908,
  'eval_samples_per_second': 3697.542,
  'eval_steps_per_second': 57.774,
  'epoch': 2.0,
  'step': 3000},
 {'loss': 1.3866,
  'grad_norm': 0.12614478170871735,
  'learning_rate': 0.0007,
  'epoch': 3.0,
  'step': 4500},
 {'eval_loss': 1.3862649202346802,
  'eval_accuracy': 0.250125,
  'eval_runtime': 6.3027,
  'eval_samples_per_second': 3807.903,
  'eval_steps_per_second': 59.498,
  'epoch': 3.0,
  'step': 4500},
 {'loss': 1.3865,
  'grad_norm': 0.11442901

In [18]:
import matplotlib.pyplot as plt

train_logs = trainer.state.log_history
train_acc = [log["eval_accuracy"] for log in train_logs if "eval_accuracy" in log]

plt.plot(train_acc)
plt.xlabel("Epoch")
plt.ylabel("Accuracy")

plt.show()

ModuleNotFoundError: No module named 'matplotlib'

In [18]:
#테스트 데이터로 정확도 평가
trainer.evaluate(db_test)

{'eval_loss': 0.2757314443588257,
 'eval_accuracy': 0.9130263157894737,
 'eval_runtime': 6.9431,
 'eval_samples_per_second': 1094.605,
 'eval_steps_per_second': 17.139,
 'epoch': 6.0}

In [19]:
trainer.save_model()

In [22]:
from transformers import pipeline

#주어진 예시로 예측 결과 출력
classifier = pipeline("sentiment-analysis", model="./hf_transformer/", device='cuda')
test_text = "UK charges 8 in terror plot linked to alert in US LONDON, AUGUST 17: Britain charged eight terror suspects on Tuesday with conspiracy to commit murder and said one had plans that could be used in striking US buildings that were the focus of security scares this month."
print(classifier(test_text))

[{'label': 'LABEL_0', 'score': 0.9822717905044556}]
