In [1]:
!pip install 'pandas==2.2.3'



In [2]:
import pandas
import json
from matplotlib import pyplot as plt

In [3]:
# set default variables
# Lookup table of model locations and evaluation-result file names used by this notebook.
# NOTE(review): despite the name, these are plain dict entries, not OS environment variables.
default_environment_variables = {
    # base model identifier (presumably a Hugging Face Hub id — TODO confirm)
    "llama-3-2-1b-base": "meta-llama/Llama-3.2-1B",
    # JSON file holding the evaluation results of the base model
    "output_eval_file_base": "llama-3-2-1b-basemodel-qa-instruction-eval.json",
    # relative path to a fine-tuning checkpoint (presumably the LoRA-tuned model — TODO confirm);
    # this entry is not read by any cell visible in this notebook
    "llama-3-2-1b-lora": "../fine-tuning/output/Llama-3.2-1B_Deepspeed_Zero2_SGBoLcwVAP_202507012312/check_point_output/checkpoint-124803",
    # JSON file holding the evaluation results of the LoRA model
    "output_eval_file_lora": "llama-3-2-1b-lora-qa-instruction-eval_1.json",
    # "llama-3-2-1b-alpaca-instruct-version1": "./output/llama-3-2-1b-alpaca-202506241720/save_model_41601",
    # "output_eval_file_alpaca-instruct-version1": "llama-3-2-1b-lora-qa-instruction-eval.json",
    # "llama-3-2-1b-alpaca-instruct-version2":"llama-3-2-1b-alpaca-202506261658/save_model_124803",
    # "output_eval_file_alpaca-instruct-version2": "llama-3-2-1b-alpaca-instruct-version2-evaluation.json",
}

In [4]:
# read json file
def load_alpaca_dataset(file_path):
  """Load an evaluation-result JSON file into a DataFrame with a fixed column set.

  The file is parsed with ``json.load`` and handed to ``pandas.DataFrame``;
  only the columns listed below are kept, in this order. Any other fields in
  the file are dropped.

  Args:
    file_path: Path of the UTF-8 encoded JSON file to read.

  Returns:
    pandas.DataFrame containing exactly the selected columns.

  Raises:
    KeyError: If any of the selected columns is missing from the data.
  """
  selected_columns = [
    'sample',
    'dataset',
    'instruction',
    'expected',
    'generated',
    'generator',
    'retry_count',
    'bleu_score',
    'bleu_counts',
    'bleu_totals',
    'bleu_precisions',
    'bleu_bp',
    'bleu_sys_len',
    'bleu_ref_len',
    'bertscore_precision',
    'bertscore_recall',
    'bertscore_f1',
  ]

  with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

  # The former bare `dataframe.head()` call was removed: inside a function its
  # result is discarded, so it never displayed anything.
  return pandas.DataFrame(data)[selected_columns]

In [5]:
# count of dataframe rows
def data_count(dataframe):
  """Return the number of rows in ``dataframe``.

  Args:
    dataframe: Any object supporting ``len()`` (a pandas DataFrame in this notebook).

  Returns:
    The result of ``len(dataframe)``.
  """
  row_total = len(dataframe)
  return row_total

In [6]:
# calculate avg bleu score
def avg_bleu_score(dataframe):
  """Return the mean of the ``bleu_score`` column.

  Args:
    dataframe: DataFrame that contains a ``bleu_score`` column.

  Returns:
    The value produced by calling ``.mean()`` on that column.
  """
  bleu_scores = dataframe['bleu_score']
  return bleu_scores.mean()

In [7]:
# calculate avg bertscore precision
def avg_bertscore_precision(dataframe):
  """Return the mean of the ``bertscore_precision`` column.

  Args:
    dataframe: DataFrame that contains a ``bertscore_precision`` column.

  Returns:
    The value produced by calling ``.mean()`` on that column.
  """
  precision_values = dataframe['bertscore_precision']
  return precision_values.mean()

In [8]:
# calculate avg bertscore recall
def avg_bertscore_recall(dataframe):
  """Return the mean of the ``bertscore_recall`` column.

  Args:
    dataframe: DataFrame that contains a ``bertscore_recall`` column.

  Returns:
    The value produced by calling ``.mean()`` on that column.
  """
  recall_values = dataframe['bertscore_recall']
  return recall_values.mean()

In [9]:
# calculate avg bertscore f1
def avg_bertscore_f1(dataframe):
  """Return the mean of the ``bertscore_f1`` column.

  Args:
    dataframe: DataFrame that contains a ``bertscore_f1`` column.

  Returns:
    The value produced by calling ``.mean()`` on that column.
  """
  f1_values = dataframe['bertscore_f1']
  return f1_values.mean()

In [10]:
def retry_count(dataframe):
    """Return the distribution of values in the ``retry_count`` column.

    Args:
        dataframe: DataFrame that contains a ``retry_count`` column.

    Returns:
        dict mapping each distinct ``retry_count`` value to the number of rows
        holding it, with keys inserted in ascending order.
    """
    # Tally the occurrences of each retry_count value, ordered by the value
    # itself (sort_index) rather than by frequency.
    distribution = dataframe['retry_count'].value_counts().sort_index()

    # to_dict() already yields a fresh dict, so no further copy is needed.
    return distribution.to_dict()

In [11]:
# print avg scores and the retry_count distribution
def print_scores(dataframe, dataframe_variable_name):
  """Print a labelled report of the aggregate metrics for one result set.

  A header line carrying ``dataframe_variable_name`` is printed first,
  followed by one line per metric: average BLEU, average BERTScore F1,
  precision and recall, and the ``retry_count`` distribution.

  Args:
    dataframe: DataFrame passed unchanged to each metric function.
    dataframe_variable_name: Label shown in the header line.

  Returns:
    None. The report is written to standard output.
  """
  print(f"---------------- {dataframe_variable_name} ----------------")

  # Labels carry their own trailing padding so the colons line up in the output.
  report_rows = (
    ('avg_bleu_score          ', avg_bleu_score),
    ('avg_bertscore_f1        ', avg_bertscore_f1),
    ('avg_bertscore_precision ', avg_bertscore_precision),
    ('avg_bertscore_recall    ', avg_bertscore_recall),
    ('retry_count             ', retry_count),
  )
  # Each metric is computed right before its line is printed.
  for label, metric in report_rows:
    print(f'{label}: {metric(dataframe)}')

In [12]:
# check variables
# Show the base model identifier, then load the base model's evaluation results.
# NOTE(review): this print shows the model id, whereas the LoRA print further down
# shows the evaluation file path; both use the same "default_environment_variables:" label.
print(f'default_environment_variables: {default_environment_variables["llama-3-2-1b-base"]}')
evalution_base = load_alpaca_dataset(default_environment_variables['output_eval_file_base'])

# Show the LoRA evaluation file name, then load the LoRA evaluation results.
print(f'default_environment_variables: {default_environment_variables["output_eval_file_lora"]}')
evalution_lora = load_alpaca_dataset(default_environment_variables["output_eval_file_lora"])

default_environment_variables: meta-llama/Llama-3.2-1B
default_environment_variables: llama-3-2-1b-lora-qa-instruction-eval_1.json


In [13]:
# data count
# Report how many rows each evaluation result set contains.
# NOTE(review): the recorded output of this cell shows 805 base rows vs 81 LoRA rows,
# so any averages compared between the two are computed over differently sized sets —
# confirm the LoRA evaluation file is complete.
evalution_base_data_count = data_count(evalution_base)
print(f'evalution_base_data_count: {evalution_base_data_count}')

evalution_lora_data_count = data_count(evalution_lora)
print(f'evalution_lora_data_count: {evalution_lora_data_count}')

evalution_base_data_count: 805
evalution_lora_data_count: 81


In [14]:
# print scores
# Print the averaged BLEU / BERTScore metrics and the retry_count distribution
# for the base and LoRA evaluation result sets, in that order.
print_scores(evalution_base, "evalution_base")
print_scores(evalution_lora, "evalution_lora")

---------------- evalution_base ----------------
avg_bleu_score          : 2.9852485053655395
avg_bertscore_f1        : 0.5978575586161998
avg_bertscore_precision : 0.6248276362137765
avg_bertscore_recall    : 0.5766361041468863
retry_count             : {0: 511, 1: 89, 2: 31, 3: 25, 4: 24, 5: 4, 6: 10, 7: 7, 8: 5, 9: 6, 10: 93}
---------------- evalution_lora ----------------
avg_bleu_score          : 1.0505924099489463
avg_bertscore_f1        : 0.5782257572368339
avg_bertscore_precision : 0.6081309222880705
avg_bertscore_recall    : 0.5537174700954814
retry_count             : {0: 57, 1: 4, 2: 4, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 9: 1, 10: 10}
