In [1]:
! pip install evaluate -qq
! pip install datasets -qq
! pip install bert_score -qq
! pip install rouge-score -qq
! pip install --upgrade nltk
! pip install korouge_score




In [2]:
import gc

In [3]:
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import json
import os
import zipfile
from typing import List, Dict

def flatten_json_data(data_list):
    return pd.json_normalize(
        data_list,
        sep='_',
        record_path=None,
        meta=[
            'category',
            'job_field',
            'gender',
            'experience',
            ['version'],
            ['dataSet', 'answer', 'raw', 'text'],
            ['dataSet', 'answer', 'summary', 'text'],
            ['dataSet', 'question', 'raw', 'text'],
            ['dataSet', 'info', 'ageRange'],
            ['dataSet', 'info', 'channel'],
            ['dataSet', 'info', 'date'],
            ['dataSet', 'info', 'experience'],
            ['dataSet', 'info', 'gender'],
            ['dataSet', 'info', 'occupation'],
            ['dataSet', 'info', 'place'],
            ['rawDataInfo', 'answer', 'audioPath'],
            ['rawDataInfo', 'question', 'audioPath']
        ]
    )

def download_and_extract(urls: List[str], base_path: str) -> None:
    os.makedirs(base_path, exist_ok=True)

    for i, url in enumerate(urls):
        zip_file_name = url.split("/")[-1]
        zip_file_path = os.path.join("/kaggle/working", zip_file_name)
        print(f'처리중... {i+1}: {zip_file_name}')

        !wget -q {url} -O {zip_file_path}

        subfolder_name = zip_file_name.replace('.zip', '')
        subfolder_path = os.path.join(base_path, subfolder_name)
        os.makedirs(subfolder_path, exist_ok=True)

        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(subfolder_path)

        os.remove(zip_file_path)

def create_dataset_from_json_files(base_path: str) -> Dataset:
    all_data = []

    for root, _, files in os.walk(base_path):
        for file in files:
            if file.endswith('.json'):
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        data = json.load(f)
                        folder_name = os.path.basename(os.path.dirname(file_path))
                        parts = folder_name.split('_')
                        data.update({
                            'category': parts[0],  # TL or VL
                            'job_field': '_'.join(parts[1:-2]),
                            'gender': parts[-2],
                            'experience': parts[-1]
                        })
                        all_data.append(data)
                except json.JSONDecodeError as e:
                    print(f"Error reading {file_path}: {e}")

    df = flatten_json_data(all_data)

    return Dataset.from_pandas(df)

def create_interview_dataset():
    train_urls = [
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_01.Management_Female_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_01.Management_Female_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_01.Management_Male_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_01.Management_Male_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_02.SalesMarketing_Female_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_02.SalesMarketing_Female_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_02.SalesMarketing_Male_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_02.SalesMarketing_Male_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_03.PublicService_Female_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_03.PublicService_Female_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_03.PublicService_Male_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_03.PublicService_Male_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_04.RND_Female_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_04.RND_Female_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_04.RND_Male_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_04.RND_Male_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_05.ICT_Female_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_05.ICT_Female_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_05.ICT_Male_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_05.ICT_Male_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_06.Design_Female_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_06.Design_Female_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_06.Design_Male_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_06.Design_Male_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_07.ProductionManufacturing_Female_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_07.ProductionManufacturing_Female_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_07.ProductionManufacturing_Male_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Training/TL_07.ProductionManufacturing_Male_New.zip"
    ]

    valid_urls = [
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_01.Management_Female_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_01.Management_Female_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_01.Management_Male_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_01.Management_Male_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_02.SalesMarketing_Female_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_02.SalesMarketing_Female_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_02.SalesMarketing_Male_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_02.SalesMarketing_Male_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_03.PublicService_Female_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_03.PublicService_Female_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_03.PublicService_Male_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_03.PublicService_Male_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_04.RND_Female_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_04.RND_Female_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_04.RND_Male_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_04.RND_Male_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_05.ICT_Female_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_05.ICT_Female_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_05.ICT_Male_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_05.ICT_Male_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_06.Design_Female_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_06.Design_Female_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_06.Design_Male_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_06.Design_Male_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_07.ProductionManufacturing_Female_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_07.ProductionManufacturing_Female_New.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_07.ProductionManufacturing_Male_Experienced.zip",
        "https://github.com/1000century/multi_lion/raw/main/interview/Validation/VL_07.ProductionManufacturing_Male_New.zip"
    ]

    train_path = '/kaggle/working/train_unzipped'
    valid_path = '/kaggle/working/validation_unzipped'

    download_and_extract(train_urls, train_path)
    download_and_extract(valid_urls, valid_path)

    train_dataset = create_dataset_from_json_files(train_path)
    valid_dataset = create_dataset_from_json_files(valid_path)

    dataset_dict = DatasetDict({
        'train': train_dataset,
        'validation': valid_dataset
    })

    return dataset_dict

In [4]:
# 데이터셋 생성
dataset = create_interview_dataset()

# 데이터셋 정보 출력
print("\nDataset Info:")
print(dataset)

# 훈련 데이터 샘플 확인
print("\nTraining Data Sample:")
print(dataset['train'][0])

# 검증 데이터 샘플 확인
print("\nValidation Data Sample:")
print(dataset['validation'][0])

# 데이터셋을 로컬이나 HuggingFace Hub에 저장하기
# dataset.save_to_disk("path/to/local/directory")  # 로컬에 저장
# dataset.push_to_hub("username/dataset-name")     # HuggingFace Hub에 업로드

처리중... 1: TL_01.Management_Female_Experienced.zip
처리중... 2: TL_01.Management_Female_New.zip
처리중... 3: TL_01.Management_Male_Experienced.zip
처리중... 4: TL_01.Management_Male_New.zip
처리중... 5: TL_02.SalesMarketing_Female_Experienced.zip
처리중... 6: TL_02.SalesMarketing_Female_New.zip
처리중... 7: TL_02.SalesMarketing_Male_Experienced.zip
처리중... 8: TL_02.SalesMarketing_Male_New.zip
처리중... 9: TL_03.PublicService_Female_Experienced.zip
처리중... 10: TL_03.PublicService_Female_New.zip
처리중... 11: TL_03.PublicService_Male_Experienced.zip
처리중... 12: TL_03.PublicService_Male_New.zip
처리중... 13: TL_04.RND_Female_Experienced.zip
처리중... 14: TL_04.RND_Female_New.zip
처리중... 15: TL_04.RND_Male_Experienced.zip
처리중... 16: TL_04.RND_Male_New.zip
처리중... 17: TL_05.ICT_Female_Experienced.zip
처리중... 18: TL_05.ICT_Female_New.zip
처리중... 19: TL_05.ICT_Male_Experienced.zip
처리중... 20: TL_05.ICT_Male_New.zip
처리중... 21: TL_06.Design_Female_Experienced.zip
처리중... 22: TL_06.Design_Female_New.zip
처리중... 23: TL_06.Design_Male_Ex

In [5]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import os

from transformers import PreTrainedTokenizerFast
import torch
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast
from transformers import Trainer
import evaluate
import numpy as np
import torch
from korouge_score import rouge_scorer

from datasets import load_dataset
import numpy as np

In [6]:
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token='</s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>'
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [7]:
dataset['train'][0]

{'version': '1.0',
 'category': 'TL',
 'job_field': '02.SalesMarketing',
 'gender': 'Male',
 'experience': 'New',
 'dataSet_info_date': '20230116',
 'dataSet_info_occupation': 'SM',
 'dataSet_info_channel': 'MOCK',
 'dataSet_info_place': 'ONLINE',
 'dataSet_info_gender': 'MALE',
 'dataSet_info_ageRange': '-34',
 'dataSet_info_experience': 'NEW',
 'dataSet_question_raw_text': '지원원님께서 직장을 선택할 때 중요하게 여기시는 세 가지가 있다면 각각 무엇인지 말씀해 주세요',
 'dataSet_question_raw_wordCount': 13,
 'dataSet_question_emotion': [],
 'dataSet_question_intent': [],
 'dataSet_answer_raw_text': '네 직장을 선택할 때 제가 가장 중요하게 여기는 요소가 있다면 어 첫 번째는 바로 이제 월급인데요. 저는 월급을 좀 제 때 지급해 주는 회사가 이제 정말 좋은 회사라고 저는 생각을 합니다. 왜냐하면 이제 월급은 직장인에게 있어서 회사가 줄 수 있는 가장 최고의 복지이자 또 자산인데요. 어 저는 보상인데요. 저는 따라서 이거 이렇게 이걸 기본적으로 사 지급해주는 회사가 당연히 좋을 수 밖에 없는 회사라고 저는 일단 생각을 합니다. 그리고 두 번째는 바로 워라벨입니다. 저는 이렇게 월급이 아무리 세더라도 마 밤 열 두 시까지 일한다거나 하는 그런 폭발적인 어 업무가 있다면 저는 솔직히 말씀드려서 그 업무를 좀 꺼리게 될 것 같습니다. 아무리 돈을 많이 준대두요. 따라서 저 같은 경우는 이제 이런 이런 워라벨 또 저는 중심으로 또 많이 보고 있구요. 마지막으로는 이제 직

In [8]:
def preprocess_examples(examples, is_train=True):
    max_length = 256
    questions = [f'</s> 질문: {q} 응답: ' for q in examples['dataSet_question_raw_text']]
    answers = [f'{a} </s>' for a in examples['dataSet_answer_raw_text']]

    if is_train:
        full_sequences = [f'{q} {a}' for q, a in zip(questions, answers)]
        model_inputs = tokenizer(
            full_sequences,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        # 학습 시에도 질문 부분은 -100으로 마스킹
        model_inputs['labels'] = model_inputs['input_ids'].clone()  # labels 설정
        for idx, q in enumerate(questions):
            # 질문 부분의 토큰 길이 계산
            question_tokens = tokenizer(q, add_special_tokens=False)
            question_length = len(question_tokens['input_ids'])
            # 질문 부분을 -100으로 마스킹
            model_inputs['labels'][idx, :question_length] = -100
    else:
        model_inputs = tokenizer(
            questions,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        labels = tokenizer(
            answers,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )['input_ids']
        model_inputs['labels'] = labels

    model_inputs['labels'][model_inputs['labels'] == tokenizer.pad_token_id] = -100

    # Convert tensors to lists
    for key in model_inputs:
        model_inputs[key] = model_inputs[key].tolist()
    return model_inputs

# 적용
train_dataset = dataset['train'].map(
    lambda x: preprocess_examples(x, is_train=True),
    batched=True,
    remove_columns=dataset['train'].column_names,
    load_from_cache_file=False
)

val_dataset = dataset['validation'].map(
    lambda x: preprocess_examples(x, is_train=False),
    batched=True,
    remove_columns=dataset['validation'].column_names,
    load_from_cache_file=False
)

Map:   0%|          | 0/68074 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

In [9]:
for i in val_dataset[0].keys():
    print('토큰개수',i, len(val_dataset[0][i]), '개')
print('train input_id, label 예시')
print(tokenizer.decode(train_dataset[0]['input_ids']))
print(tokenizer.decode(train_dataset[0]['input_ids']))
print('valid input_id, label 예시')
print(tokenizer.decode(val_dataset[0]['input_ids']))
print(tokenizer.decode([token for token in val_dataset[0]['labels'] if token != -100], skip_special_tokens=True))

토큰개수 input_ids 256 개
토큰개수 token_type_ids 256 개
토큰개수 attention_mask 256 개
토큰개수 labels 256 개
train input_id, label 예시
</s> 질문: 지원원님께서 직장을 선택할 때 중요하게 여기시는 세 가지가 있다면 각각 무엇인지 말씀해 주세요 응답:  네 직장을 선택할 때 제가 가장 중요하게 여기는 요소가 있다면 어 첫 번째는 바로 이제 월급인데요. 저는 월급을 좀 제 때 지급해 주는 회사가 이제 정말 좋은 회사라고 저는 생각을 합니다. 왜냐하면 이제 월급은 직장인에게 있어서 회사가 줄 수 있는 가장 최고의 복지이자 또 자산인데요. 어 저는 보상인데요. 저는 따라서 이거 이렇게 이걸 기본적으로 사 지급해주는 회사가 당연히 좋을 수 밖에 없는 회사라고 저는 일단 생각을 합니다. 그리고 두 번째는 바로 워라벨입니다. 저는 이렇게 월급이 아무리 세더라도 마 밤 열 두 시까지 일한다거나 하는 그런 폭발적인 어 업무가 있다면 저는 솔직히 말씀드려서 그 업무를 좀 꺼리게 될 것 같습니다. 아무리 돈을 많이 준대두요. 따라서 저 같은 경우는 이제 이런 이런 워라벨 또 저는 중심으로 또 많이 보고 있구요. 마지막으로는 이제 직장 동료입니다. 아무리 이제 두 개가 좋다고 하더라도 같이 일하는 동료 가서 썼던 말로 그지 같으면 제대로 업무가 하고 싶은 마음이 날까요. 아니요 저는 안 날 것 같습니다. 따라서 제가 방금 말씀드린 이제 월급 워라벨 직장 동료 이렇게 세 가지가 좋지 않으면은 저는 솔직
</s> 질문: 지원원님께서 직장을 선택할 때 중요하게 여기시는 세 가지가 있다면 각각 무엇인지 말씀해 주세요 응답:  네 직장을 선택할 때 제가 가장 중요하게 여기는 요소가 있다면 어 첫 번째는 바로 이제 월급인데요. 저는 월급을 좀 제 때 지급해 주는 회사가 이제 정말 좋은 회사라고 저는 생각을 합니다. 왜냐하면 이제 월급은 직장인에게 있어서 회사가 줄 수 있는 가장 최고의 복지이자 또 자산인데요. 어 저는

In [10]:
print(len(train_dataset))
# 코드 잘 돌아가는지 확인 위해 train_dataset 임의로 매우 적은 개수로 샘플링
# from torch.utils.data import Subset
# train_dataset = Subset(train_dataset, indices=range(4000))
# val_dataset = Subset(val_dataset, indices=range(800))
print(len(train_dataset))
print(len(val_dataset))

68074
68074
8026


# Model

In [11]:
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=51200, bias=False)
)

In [12]:
import psutil
import os

def log_memory_usage():
    process = psutil.Process(os.getpid())
    print(f"Current memory usage: {process.memory_info().rss / 1e6} MB")


In [13]:

meteor = evaluate.load('meteor')
bertscore = evaluate.load('bertscore')
bleu = evaluate.load('bleu')

def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        logits = logits[0]
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Predictions 튜플 처리
    print("\n", '='*50,"\nPredictions type:", type(predictions))
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # Predictions와 Labels를 numpy 배열로 변환
    if isinstance(predictions, torch.Tensor):
        print("Converting Predictions to numpy...")
        predictions = predictions.cpu().numpy()
    if isinstance(labels, torch.Tensor):
        print("Converting Labels to numpy...")
        labels = labels.cpu().numpy()

    # Decode predictions and labels
    print("Decoding predictions and labels...")
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]
    print("Decoded Predictions Sample:\n", decoded_preds[:3])
    print("Decoded Labels Sample:\n", decoded_labels[:3])

    # Calculate ROUGE metrics using rouge-score
    print("Calculating ROUGE...")
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {
        'rouge1': [],
        'rouge2': [],
        'rougeL': []
    }

    for pred, label in zip(decoded_preds, decoded_labels):
        if pred and label:  # 둘 다 비어있지 않은 경우만 계산
            scores = scorer.score(label, pred)
            rouge_scores['rouge1'].append(scores['rouge1'].fmeasure)
            rouge_scores['rouge2'].append(scores['rouge2'].fmeasure)
            rouge_scores['rougeL'].append(scores['rougeL'].fmeasure)

    print("\nROUGE scores details:")
    print(f"Number of valid pairs: {len(rouge_scores['rouge1'])}")
    if rouge_scores['rouge1']:
        print(f"Sample ROUGE-1 scores(raw): {rouge_scores['rouge1'][:5]}")

    rouge_result = {
        'rouge1': np.mean(rouge_scores['rouge1']) * 100 if rouge_scores['rouge1'] else 0,
        'rouge2': np.mean(rouge_scores['rouge2']) * 100 if rouge_scores['rouge2'] else 0,
        'rougeL': np.mean(rouge_scores['rougeL']) * 100 if rouge_scores['rougeL'] else 0
    }

    # Calculate other metrics
    print("Calculating METEOR...")
    meteor_result = meteor.compute(predictions=decoded_preds, references=decoded_labels)
    print("METEOR result:", meteor_result)

    print("Calculating BERTScore...")
    bertscore_result = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="ko", device='cuda' if torch.cuda.is_available() else 'cpu')
    print("BERTScore finished")

    print("Calculating BLEU...")
    bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    print("BLEU result:", bleu_result)

    # Combine results
    result = {
        **rouge_result,
        "meteor": round(meteor_result["meteor"] * 100, 4),
        "bertscore_f1": round(np.mean(bertscore_result["f1"]) * 100, 4),
        "bleu": round(bleu_result["bleu"] * 100, 4)
    }

    print("Calculating generated length...")
    result['gen_len'] = np.mean([len(pred.split()) for pred in decoded_preds])
    print("Final Metrics:", result)

    return result

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [14]:
from transformers import Trainer, TrainingArguments

# Training arguments 설정
training_args = TrainingArguments(
    output_dir='./results',          # 모델이 저장될 경로
    evaluation_strategy="epoch",    # 매 에포크마다 평가
    save_strategy="epoch",          # 매 에포크마다 저장
    learning_rate=5e-5,             # 기본 학습률
    lr_scheduler_type="linear",     # 선형적으로 감소
    per_device_train_batch_size=32,  # 각 디바이스에서의 배치 사이즈
    per_device_eval_batch_size=32,   # 평가 시의 배치 사이즈
    eval_accumulation_steps=8,      # 평가 시 gradient accumulation steps
    num_train_epochs=6,             # 에포크 수
    weight_decay=0.01,              # 가중치 감쇠
    save_total_limit=2,             # 저장할 체크포인트 개수 제한
    logging_dir='./logs',           # 로깅 파일 저장 위치
    logging_steps=500,              # 로깅 빈도
    load_best_model_at_end=True,    # 가장 좋은 모델을 끝에서 로드
    eval_steps=1000,                # 평가 빈도
    fp16=True,                      # Mixed Precision (FP16) 사용
    dataloader_num_workers=4,       # DataLoader에서 사용할 워커 수
    deepspeed=None,                 # DeepSpeed는 사용하지 않음
    report_to="none",               # 로깅 서비스 (TensorBoard 등 사용하지 않음)
)



In [15]:
from transformers import Trainer
import numpy as np

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
)

# DataLoader 확인 및 메모리 상태 출력
print(f"Total train batches: {len(trainer.get_train_dataloader())}")
print(f"Total eval batches: {len(trainer.get_eval_dataloader())}")
print(f"Batch size train: {training_args.per_device_train_batch_size}")
print(f"Batch size eval: {training_args.per_device_eval_batch_size}")
print(f"Eval accumulation steps: {training_args.eval_accumulation_steps}")

Total train batches: 2128
Total eval batches: 251
Batch size train: 32
Batch size eval: 32
Eval accumulation steps: 8


  trainer = Trainer(


In [16]:
import warnings
import os

# fork 관련 경고 제거
warnings.filterwarnings('ignore', message="os.fork()")

# tokenizer 경고 제거
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# autocast 경고 제거
warnings.filterwarnings('ignore', message="`torch.cuda.amp.autocast")

# gather 경고 제거
warnings.filterwarnings('ignore', message="Was asked to gather along dimension 0")

In [17]:
# 학습 시작
trainer.train()

# 모델 가중치와 설정 저장
trainer.save_model("./results/best_model")
trainer.save_state()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Meteor,Bertscore F1,Bleu,Gen Len
1,3.4722,10.12552,1.872581,0.153014,1.462914,1.8302,53.5305,0.0906,238.54934
2,3.3009,10.298685,1.700267,0.152188,1.337577,1.6154,52.2682,0.0849,242.634936
3,3.1723,10.469122,1.953752,0.163799,1.460614,1.8661,53.0832,0.0966,241.974583
4,3.0735,10.519403,1.966553,0.168639,1.47949,1.8038,52.5605,0.1002,241.830551
5,2.9978,10.698956,1.978536,0.173258,1.479264,1.7911,52.8001,0.1014,242.604161
6,2.9352,10.818208,2.011715,0.177997,1.505052,1.8172,52.9126,0.1024,242.52629



Predictions type: <class 'tuple'>
Decoding predictions and labels...
Decoded Predictions Sample:
 ['등해어 제가 하면서하다 보면은 제가 가지 일이들이 때문에 문제가 발생할 수 있는데 것데요. 저 때 저 스스로 해결 대처 수 있는 방법이 해결 방법이 있다면 생각합니다.요. 라고시 있을 해결할 있는 문제 방법이이나 해결 있다면 어기를해 주세요 라고해업무 업무 업무를 어 업무를 어 업무를 어 업무를 업무를 업무를 업무를 업무를 어 어 업무를 업무를 어 어 어 어 어 어 어 어 어 어 어 저 어 어 어 업무를 저 어 어 업무를 어 어 어 어 어 어 어 어 저 저 어 어 어 어 저 어 어 어 어 어 어 어 어 저 어 어 저 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 저 어 어 어 저 어 어 어 저 어 어 어 어 어 어 저 어 어 어 어 어 어 어 어 어 어 저 어 어 어 어 저 어 어 어 어 어 어 어 어 어 어 어 어 저 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 어 저 어 어 어 저 저 어 어 어 어 어 어 어 어 어 어 저 어 어 업무를 어 어 어 어 어 어 어 어 어 어 어 저 저 업무를 어 저 어 저 업무를 업무를 저 어 어 업무를 업무를 어 저', '등해어 취할다 때 주로 휴식을 하냐까요 왜해 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을 휴식을

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

BERTScore finished
Calculating BLEU...
BLEU result: {'bleu': 0.0009059760229306067, 'precisions': [0.020540805499885212, 0.0016921273244034979, 0.0002825930993059263, 6.858904481116022e-05], 'brevity_penalty': 1.0, 'length_ratio': 2.207341277767189, 'translation_length': 1934004, 'reference_length': 876169}
Calculating generated length...
Final Metrics: {'rouge1': 1.87258102834216, 'rouge2': 0.15301407449683924, 'rougeL': 1.4629135784905254, 'meteor': 1.8302, 'bertscore_f1': 53.5305, 'bleu': 0.0906, 'gen_len': 238.54933964615}

Predictions type: <class 'tuple'>
Decoding predictions and labels...
Decoded Predictions Sample:
 ['과 주어 제가 진행하다 보면은 저 가지 일이들이 때문에 어 발생할 수 있는데 것데요. 저 때 저 스스로 대처 대처 수 있는 해결 해결 해결 있는지 생각합니다.요. 저시 저 생각 있는 해결책이이라면 노 있 어기해 주세요. 라고해업무 업무 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 저 

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


In [18]:
import torch

# GPU 상태 초기화 및 남은 메모리 확인
for i in range(torch.cuda.device_count()):
    device = torch.device(f"cuda:{i}")

    # 메모리 초기화
    torch.cuda.empty_cache()  # GPU 캐시 비우기
    torch.cuda.ipc_collect()  # GPU 메모리에서 누수된 객체 수집

    # 메모리 통계 초기화
    torch.cuda.reset_max_memory_allocated(device)
    torch.cuda.reset_peak_memory_stats(device)

    # GPU 메모리 상태 확인
    total_memory = torch.cuda.get_device_properties(device).total_memory  # 전체 메모리
    reserved_memory = torch.cuda.memory_reserved(device)  # 예약된 메모리
    allocated_memory = torch.cuda.memory_allocated(device)  # 사용 중인 메모리
    free_memory = reserved_memory - allocated_memory  # 예약된 메모리 중 사용 가능 메모리

    print(f"GPU {i} 메모리 리셋 완료!")
    print(f"  전체 메모리: {total_memory / 1024 ** 2:.2f} MB")
    print(f"  예약된 메모리: {reserved_memory / 1024 ** 2:.2f} MB")
    print(f"  사용 중인 메모리: {allocated_memory / 1024 ** 2:.2f} MB")
    print(f"  사용 가능 메모리: {free_memory / 1024 ** 2:.2f} MB")


GPU 0 메모리 리셋 완료!
  전체 메모리: 22699.88 MB
  예약된 메모리: 3130.00 MB
  사용 중인 메모리: 2068.25 MB
  사용 가능 메모리: 1061.75 MB




In [19]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Best Model 로드
best_model = AutoModelForCausalLM.from_pretrained("./results/best_model")

# GPU 설정
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 입력 프롬프트
prompt ="본인의 강점이 무엇이라고 생각하시나요"  # 텍스트 생성 시작 문장
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

# 텍스트 생성
output = model.generate(
    input_ids,
    max_length=250,            # 생성할 최대 길이
    num_beams=5,               # 빔 서치 크기
    no_repeat_ngram_size=2,    # 반복 방지
    early_stopping=True,       # 적절히 멈춤
    temperature=0.7            # 생성 다양성 제어 (0.7~1.0 추천)
)

# 결과 디코딩
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Text:")
print(generated_text)



Generated Text:
본인의 강점이 무엇이라고 생각하시나요 라고 질문하셨습니다. 그 질문에 대한 저의 답변을 말씀드리겠습니다. 저는 성격이 내성적이고 내성적인 편입니다. 그렇기 때문에 다른 사람들과 어울릴 때 어려움을 많이 겪었습니다. 그래서 제가 가진 장점을 살리기 위해 많은 노력을 해 왔는데요. 제가 가지고 있는 단점은 성격이 급하다는 것입니다. 그래서 다른 사람과 어울리기 위해서 노력을 많이 했었는데요. 하지만 이러한 단점을 극복하기 위해서 다른 사람에게 먼저 다가가서 친절하게 대해야 한다고 생각했습니다. 제가 먼저 다가가는 것이 중요하다고 생각했기 때문입니다. 


In [20]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Best Model 로드
best_model = AutoModelForCausalLM.from_pretrained("./results/best_model")

# GPU 설정
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 입력 프롬프트
prompt ="저희 회사가 당신을 뽑아야 하는 이유를 설명하세요"  # 텍스트 생성 시작 문장
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

# 텍스트 생성
output = model.generate(
    input_ids,
    max_length=250,             # 생성할 최대 길이
    num_beams=5,               # 빔 서치 크기
    no_repeat_ngram_size=2,    # 반복 방지
    early_stopping=True,       # 적절히 멈춤
    temperature=0.7            # 생성 다양성 제어 (0.7~1.0 추천)
)

# 결과 디코딩
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Text:")
print(generated_text)

Generated Text:
저희 회사가 당신을 뽑아야 하는 이유를 설명하세요 라고 이렇게 질문해 주셨는데요. 어 일단은 제가 이 회사에 입사를 하게 된다면 어 저는 이 회사가 저를 뽑아주신 이유에 대해서 말씀드리겠습니다. 어 제가 지원한 이 직무가 저에게 굉장히 잘 맞을 거라고 생각을 했고요. 그렇기 때문에 이 회사에서 저의 역량을 펼칠 수 있는 기회를 주신 거라 생각을 하고요. 그리고 이 회사에서도 제가 잘 할 수 있을 것이라고 생각을 했기 때문에 지원을 하게 되었습니다. 


In [21]:
trainer

<transformers.trainer.Trainer at 0x7d5be3787160>

In [22]:
# import pandas as pd

# def text_to_df(text):
#     # 문자열을 StringIO로 변환하여 read_csv로 읽기
#     from io import StringIO
#     df = pd.read_csv(StringIO(text), sep='\t', index_col='Epoch')
#     return df

# # 사용 예시
# text = """Epoch	Training Loss	Validation Loss	Rouge1	Rouge2	Rougel	Meteor	Bertscore F1	Bleu	Gen Len
# 1	3.044300	3.222267	3.894383	0.100244	3.891268	11.421300	68.278400	2.294900	150.748692
# 2	2.880200	3.163328	3.893261	0.103941	3.890146	11.646900	67.931000	2.762400	173.994019
# 3	2.751900	3.132268	3.925590	0.114037	3.922475	11.819300	68.100800	2.672200	165.322577
# 4	2.652100	3.126122	3.895874	0.098253	3.892759	11.927000	68.208600	2.841900	170.993023
# 5	2.570200	3.119663	3.907963	0.105179	3.904848	12.031600	68.393200	2.761000	164.119362
# 6	2.504900	3.122530	3.908684	0.110336	3.905569	12.076300	68.326300	2.802400	167.528781
# """

# history_df = text_to_df(text)
# print(history_df)

In [23]:
# import matplotlib.pyplot as plt
# import numpy as np

# # nan 값을 제외하고 데이터 추출
# train_loss = history_df['Training Loss'].dropna().values
# eval_loss = history_df['Validation Loss'].dropna().values
# rouge1 = history_df['Rouge1'].dropna().values
# rouge2 = history_df['Rouge2'].dropna().values
# rougeL = history_df['Rougel'].dropna().values
# meteor = history_df['Meteor'].dropna().values
# bertscore = history_df['Bertscore F1'].dropna().values
# bleu = history_df['Bleu'].dropna().values
# gen_len = history_df['Gen Len'].dropna().values

# # 실제 데이터 개수에 맞춰 epochs 배열 생성
# epochs = np.arange(len(eval_loss))

# # 그래프 생성
# fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
# fig.suptitle('Training Metrics Overview', fontsize=16, y=1.02)

# # Loss plot
# ax1.plot(epochs, train_loss, 'b-', label='Training Loss', marker='o')
# ax1.plot(epochs, eval_loss, 'r-', label='Validation Loss', marker='o')
# ax1.set_title('Training and Validation Loss')
# ax1.set_xlabel('Epoch')
# ax1.set_ylabel('Loss')
# ax1.legend()
# ax1.grid(True, linestyle='--', alpha=0.7)

# # ROUGE scores over time
# ax2.plot(epochs, rouge1, 'b-', label='ROUGE-1', marker='o')
# ax2.plot(epochs, rouge2, 'r-', label='ROUGE-2', marker='o')
# ax2.plot(epochs, rougeL, 'g-', label='ROUGE-L', marker='o')
# ax2.set_title('ROUGE Scores')
# ax2.set_xlabel('Epoch')
# ax2.set_ylabel('Score')
# ax2.legend()
# ax2.grid(True, linestyle='--', alpha=0.7)

# # Other metrics
# ax3.plot(epochs, meteor, 'b-', label='METEOR', marker='o')
# ax3.plot(epochs, bertscore, 'r-', label='BERTScore F1', marker='o')
# ax3.plot(epochs, bleu, 'g-', label='BLEU', marker='o')
# ax3.set_title('Other Metrics')
# ax3.set_xlabel('Epoch')
# ax3.set_ylabel('Score')
# ax3.legend()
# ax3.grid(True, linestyle='--', alpha=0.7)

# # Generation length
# ax4.plot(epochs, gen_len, 'b-', label='Generation Length', marker='o')
# ax4.set_title('Generation Length')
# ax4.set_xlabel('Epoch')
# ax4.set_ylabel('Length')
# ax4.legend()
# ax4.grid(True, linestyle='--', alpha=0.7)

# plt.tight_layout()

# # 그래프 저장
# plt.savefig('training_metrics.png', dpi=300, bbox_inches='tight')
# plt.show()

# # 최종 결과값들 출력
# print("\nFinal Metrics:")
# print(f"Train Loss: {train_loss[-1]:.4f}")
# print(f"Validation Loss: {eval_loss[-1]:.4f}")
# print(f"ROUGE-1: {rouge1[-1]:.4f}")
# print(f"ROUGE-2: {rouge2[-1]:.4f}")
# print(f"ROUGE-L: {rougeL[-1]:.4f}")
# print(f"METEOR: {meteor[-1]:.4f}")
# print(f"BERTScore: {bertscore[-1]:.4f}")
# print(f"BLEU: {bleu[-1]:.4f}")
# print(f"Generation Length: {gen_len[-1]:.4f}")