In [None]:
%pip install torch transformers
%pip install transformers datasets torch accelerate peft
%pip install -U bitsandbytes
%pip install newspaper3k
%pip install evaluate


^C
Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset
import json
from typing import Dict
import json


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# 데이터셋 로드
def load_jsonl_dataset(file_path):
    dataset = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line)
            dataset.append({
                'input': data['input'],
                'output': data['output']
            })
    return dataset

# 데이터셋 로드
raw_dataset = load_jsonl_dataset('r\E:\Desktop\lee\3g\aisoftware\term\문서요약 텍스트\processed_dataset.jsonl')

# 데이터셋을 Hugging Face datasets 형식으로 변환
dataset = Dataset.from_list(raw_dataset)

# 학습/검증 데이터 분리 (90% 학습, 10% 검증)
dataset = dataset.train_test_split(test_size=0.1)

# KoGPT-2 모델과 토크나이저 로드
model_name = "skt/kogpt2-base-v2"  # SKT에서 제공하는 KoGPT-2 모델
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# 데이터 전처리 함수
def preprocess_function(examples):
    inputs = examples['input']
    targets = examples['output']

    # 입력과 출력을 하나의 문자열로 결합 (GPT 계열 모델은 단일 텍스트 입력 기반으로 학습)
    prompt = [f"요약: {inp}\n답변: {tgt}" for inp, tgt in zip(inputs, targets)]
    model_inputs = tokenizer(
        prompt,
        max_length=768,
        truncation=True,
        padding="max_length"
    )

    # GPT 모델에서는 labels도 동일한 input_ids를 사용 (shifted for causal language modeling)
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    return model_inputs

# 데이터셋 전처리
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset['train'].column_names
)

# 학습 하이퍼파라미터 설정
training_args = TrainingArguments(
    output_dir='./kogpt2_news_summary_model',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy='steps',
    eval_steps=500,
    save_total_limit=3,
    save_steps=500,
    fp16=True  # GPU 메모리 최적화
)

# Trainer 초기화
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer
)

# 모델 학습
trainer.train()

# Google Drive에 모델 저장 경로 설정
google_drive_path = '/content/drive/My Drive/aisw/kogpt2_news_summary_model'

# 모델 저장
trainer.save_model(google_drive_path)
tokenizer.save_pretrained(google_drive_path)

# 모델 평가
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# 추론 예시
def summarize_text(text):
    prompt = f"요약: {text}\n답변:"
    inputs = tokenizer(
        prompt,
        max_length=768,
        return_tensors='pt',
        truncation=True
    ).to(device)

    output_ids = model.generate(
        inputs['input_ids'],
        max_length=128,
        num_beams=4,
        early_stopping=True
    )

    summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return summary

# 모델 사용 예시
test_article = raw_dataset[0]['input']
generated_summary = summarize_text(test_article)
print("Original Article:", test_article[:500] + "...")
print("Generated Summary:", generated_summary)


Using device: cuda


OSError: [Errno 22] Invalid argument: 'r\\E:\\Desktop\\lee\x03g\x07isoftware\term\\문서요약 텍스트\\processed_dataset.jsonl'