In [1]:
%pip install datasets

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import warnings
warnings.filterwarnings('ignore')

# 필수 라이브러리 임포트
import os
import torch
import numpy as np
from datetime import datetime

# Transformers 라이브러리
from transformers import (
    pipeline,                              # 고수준 API - 가장 쉬운 방법
    AutoTokenizer,                         # 자동 토크나이저
    AutoModelForQuestionAnswering,         # QA 모델 자동 로더
    DistilBertTokenizerFast,              # DistilBERT 고속 토크나이저
    DistilBertForQuestionAnswering,        # DistilBERT QA 모델
    ElectraTokenizer,                      # ELECTRA 토크나이저 (한글)
    ElectraForQuestionAnswering,           # ELECTRA QA 모델 (한글)
    DefaultDataCollator,                   # 기본 데이터 콜레이터
    TrainingArguments,                     # 학습 하이퍼파라미터
    Trainer,                               # 범용 트레이너
)
from datasets import load_dataset




In [3]:
question_answer = pipeline("question-answering",model = 'distilbert-base-cased-distilled-squad')

context = """Text mining, also referred to as text data mining (abbr.: TDM), similar to text analytics,
is the process of deriving high-quality information from text. It involves
"the discovery by computer of new, previously unknown information,
by automatically extracting information from different written resources."
Written resources may include websites, books, emails, reviews, and articles.
High-quality information is typically obtained by devising patterns and trends
by means such as statistical pattern learning. According to Hotho et al. (2005)
we can distinguish between three different perspectives of text mining:
information extraction, data mining, and a KDD (Knowledge Discovery in Databases) process."""

question1 = "What is text mining?"
question2 = "What are the perspectives of text mining?"

# 질의 응답 수행
answer1 = question_answer(context=context, question=question1)
answer2 = question_answer(context=context, question=question2)
if answer1['score'] < 0.1:
  print(f'answer1 : 답변 없음')
else:
  print(f"answer1 : {answer1['answer']}")
if answer2['score'] < 0.1:
  print(f'answer2 : 답변 없음')
else:
  print(f"answer2 : {answer2['answer']}")

# AutoModel
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

inputs = tokenizer(question1, context, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)
start_score = outputs.start_logits
end_score  = outputs.end_logits
answer_start = torch.argmax(start_score)
answer_end = torch.argmax(end_score)
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end+1]))
print(f"answer1 : {answer}")


Device set to use cpu


answer1 : the process of deriving high-quality information from text
answer2 : 답변 없음
answer1 : the process of deriving high - quality information from text


In [4]:
# SQuAD 데이터셋 로드 분석
# 스탠포드 대학에서 공개한 질의응답 벤치마크 - Extractive QA 표준
squad = load_dataset('squad', split='train[:5000]')
squad = squad.train_test_split(test_size=0.2,seed=42)
squad

Generating train split: 100%|██████████| 87599/87599 [00:00<00:00, 467997.44 examples/s]
Generating validation split: 100%|██████████| 10570/10570 [00:00<00:00, 703342.59 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1000
    })
})

In [5]:
squad['train']

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 4000
})

In [6]:
print(squad['train'][0]['context'][:10])
print(squad['train'][0]['question'][:10])
print(squad['train'][0]['answers']['text'])
print(squad['train'][0]['answers']['answer_start'])
print(squad['train'][0]['context'][98:98 + len('Neo-Confucian establishment')])

With the d
What estab
['Neo-Confucian establishment']
[98]
Neo-Confucian establishment


In [8]:
# Fine turnnig
# 사전학습만 된모델(QA 헤드는 초기화)  distilbert-base-uncased
# 한국어 학습이 가능하지만 성능 보장 못하고 비효율적
# distilbert-base-uncased 영어전용 한국어를 전처리할때 어간 및 품사등이 달라서 심하게 왜곡
# 한국어면 한국어전용 base에 모델에 파인튜닝을 또는 다국어모델에
# mBERT  bert-base-multilingual-cased
# klue/bert-base 등등

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# QA헤드는 아직 학습되지 않음(랜덤 가중치가 적용)
test_context = """The city is the birthplace of many cultural movements, including the Harlem 
Renaissance in literature and visual art; abstract expressionism 
(also known as the New York School) in painting; and hip hop, punk, salsa, disco, 
freestyle, Tin Pan Alley, and Jazz in music. New York City has been considered 
the dance capital of the world. The city is also widely celebrated in popular lore, 
frequently the setting for books, movies, and television programs."""
    
test_question = "The dance capital of the world is what city in the US?"

# Fine-tuning이 되지않은 모델
inputs = tokenizer(test_question, test_context, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)
start = torch.argmax(outputs.start_logits)
end = torch.argmax(outputs.end_logits)
input_ids = inputs['input_ids'][0]
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[start:end+1]))


In [None]:
# Fine-tuning Hugging Face Trainer API를 이용해서 미세조정학습(추가학습)
training_args = TrainingArguments(
        output_dir="./qa_model",                # 모델 저장 경로
        eval_strategy="epoch",                  # 매 에포크마다 평가
        learning_rate=2e-5,                     # 학습률
        per_device_train_batch_size=16,         # 학습 배치 크기
        per_device_eval_batch_size=16,          # 평가 배치 크기
        num_train_epochs=3,                     # 에포크 수
        weight_decay=0.01,                      # 가중치 감쇠
        logging_steps=100,                      # 100스텝마다 로그
        save_strategy="epoch",                  # 에포크마다 저장
        save_total_limit=2,                     # 최근 2개만 유지
        load_best_model_at_end=True,            # 최고 모델 로드
        metric_for_best_model="eval_loss",      # 평가 손실 기준
        fp16=True,                              # Mixed Precision (GPU만)
        push_to_hub=False,                      # Hub 업로드 안 함
        report_to="none",                       # 외부 로깅 비활성화
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=squad['train'],
    eval_dataset=squad['test'],
    tokenizer=tokenizer,
    data_collator=DefaultDataCollator(),
)