In [2]:
import os
import json
from src.data import CustomDataset, OriginalDataset
from transformers import AutoTokenizer

"""
id | 원본 데이터 | 전처리 데이터 | label | 요약문
"""
# Paths and identifiers for the dev-set evaluation run.
JSON_PATH = "results/dev/dev_result_preprocess_yi.json"
DEV_DATA_PATH = "resource/data/일상대화요약_dev.json"
MODEL_ID = "hyeogi/Yi-6b-dpo-v0.2"
DETAIL = "no_ngram"

# Load the dev-set inference results.
# The encoding is given explicitly: without it open() falls back to the
# platform locale (e.g. cp949 on Korean Windows), which can fail or garble
# non-ASCII text in a UTF-8 JSON file.
with open(JSON_PATH, "r", encoding="utf-8") as f:
    result_data = json.load(f)

# Utils config file: config_yi.json


In [None]:
# Tokenizer for MODEL_ID; its EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token

# Build both views of the dev file with the same tokenizer:
# OriginalDataset first, then CustomDataset.
dataset, dataset_preprocessed = (
    dataset_cls(DEV_DATA_PATH, tokenizer)
    for dataset_cls in (OriginalDataset, CustomDataset)
)

In [None]:
# Sanity check: decode the first item of `dataset` (OriginalDataset) back to
# text, dropping special tokens.
# NOTE(review): assumes dataset[0] is a token-id sequence — confirm in src.data.
tokenizer.decode(dataset[0], skip_special_tokens=True)

In [None]:
# Sanity check: decode the first item of `dataset_preprocessed` (CustomDataset)
# back to text, dropping special tokens, for comparison with the original view.
# NOTE(review): assumes dataset_preprocessed[0] is a token-id sequence — confirm in src.data.
tokenizer.decode(dataset_preprocessed[0], skip_special_tokens=True)

In [None]:
import evaluate
# Load the three metrics that compute_metrics combines.
rouge, bert_score = (
    evaluate.load(metric_name) for metric_name in ('rouge', 'bertscore')
)
# BLEURT is loaded with an explicit checkpoint name.
bleurt = evaluate.load(
    'bleurt',
    'bleurt-large-512',
    module_type="metric",
)


def compute_metrics(label, pred, rouge_tokenizer=str.split):
    """Score one predicted summary against its reference.

    Args:
        label: Reference summary text.
        pred: Predicted summary text.
        rouge_tokenizer: Callable that maps a string to a list of tokens and
            is handed to the ROUGE metric. Defaults to whitespace splitting
            because the default rouge_score tokenizer keeps only ``[a-z0-9]``
            and therefore discards Hangul, which drives ROUGE-1 on Korean
            text towards 0. Pass a morphological tokenizer here if a
            different tokenization is required.

    Returns:
        dict with keys "total", "rouge1", "BERTScore" and "BLEURT", each
        rounded to 4 decimals. "total" is the unweighted mean of the other
        three.
    """
    # Simple postprocessing: strip surrounding whitespace from both texts.
    pred, label = postprocess_text(pred, label)

    rouge1 = rouge.compute(
        predictions=[pred],
        references=[label],
        rouge_types=["rouge1"],
        tokenizer=rouge_tokenizer,
    )['rouge1']
    # Each metric is called with a single pair, so take element 0 of the
    # per-example score lists.
    bertScore = bert_score.compute(predictions=[pred], references=[label], lang="ko")['f1'][0]
    bleurtScore = bleurt.compute(predictions=[pred], references=[label])['scores'][0]

    total = (bertScore + bleurtScore + rouge1) / 3

    return {
        "total": round(total, 4),
        "rouge1": round(rouge1, 4),
        "BERTScore": round(bertScore, 4),
        "BLEURT": round(bleurtScore, 4),
    }

def postprocess_text(pred, label):
    """Return ``(pred, label)`` with leading and trailing whitespace removed."""
    return tuple(text.strip() for text in (pred, label))

In [None]:
# Inspect the 'inference' field of the first result record.
result_data[0]['inference']

In [None]:
# Inspect the 'output' field of the first result record; the results table
# built later in this notebook stores this field under its 'label' column.
result_data[0]['output']

In [None]:
# Smoke-test the metrics on the first record. compute_metrics takes
# (label, pred), so keywords are used to pass 'output' as the reference and
# 'inference' as the prediction; BLEURT is not symmetric, so swapping them
# changes the score.
compute_metrics(label=result_data[0]['output'], pred=result_data[0]['inference'])

In [None]:
import pandas as pd
import tqdm

# Column order of the per-example report.
result_columns = ['id', 'original', 'preprocessed', 'inference', 'label', 'total', 'rouge1', 'BERTScore', 'BLEURT']

# Collect one record per example and build the frame once at the end, rather
# than growing it row by row with .loc, so numeric columns get a numeric dtype.
# NOTE(review): assumes result_data[idx] and dataset[idx] refer to the same
# example (same order as DEV_DATA_PATH) — confirm against the inference script.
records = []
for idx, item in enumerate(tqdm.tqdm(result_data)):
    # compute_metrics takes (label, pred): 'output' is the reference that goes
    # into the 'label' column, 'inference' is the model prediction.
    metrics = compute_metrics(label=item['output'], pred=item['inference'])
    records.append({
        'id': item['id'],
        'original': tokenizer.decode(dataset[idx], skip_special_tokens=True),
        'preprocessed': tokenizer.decode(dataset_preprocessed[idx], skip_special_tokens=True),
        'inference': item['inference'],
        'label': item['output'],
        'total': metrics['total'],
        'rouge1': metrics['rouge1'],
        'BERTScore': metrics['BERTScore'],
        'BLEURT': metrics['BLEURT'],
    })

result_df = pd.DataFrame(records, columns=result_columns)

In [None]:
# Display the per-example results table.
result_df

In [None]:
# Show the table ordered by 'total' (ascending by default, so the
# lowest-scoring examples come first). The result is not assigned, so
# result_df itself keeps its original order.
result_df.sort_values(by='total')

In [None]:
from datetime import datetime

# Write the results table to a timestamped CSV under results/dev/.
# Hour and minute are joined with '-' instead of ':' because ':' is not a
# valid character in Windows file names.
now = datetime.now()
csv_name = f"{MODEL_ID.split('/')[1]}_{DETAIL}_preprocessed_time_{now.strftime('%Y-%m-%d_%H-%M')}.csv"
result_df.to_csv(os.path.join("results/dev/", csv_name), index=False)