In [1]:
!pip install 'pandas==2.2.3'



In [2]:
import pandas
import json

In [3]:
# set default variables
# Each model variant has two entries: the model reference and the JSON file
# holding its evaluation results (the "output_eval_file_*" keys are the ones
# loaded further down in this notebook).
# NOTE(review): the model references look like a hub id / local checkpoint
# directories -- confirm against the training notebooks.
default_environment_variables = {
    # --- base model ---
    "llama-3-2-1b-base": "meta-llama/Llama-3.2-1B",
    "output_eval_file_base": "llama-3-2-1b-base-evaluation.json",

    # --- alpaca instruct, version 1 ---
    "llama-3-2-1b-alpaca-instruct-version1": "llama-3-2-1b-alpaca-202506241720/save_model_41601",
    "output_eval_file_alpaca-instruct-version1": "llama-3-2-1b-alpaca-instruct-version1-evaluation.json",

    # --- alpaca instruct, version 2 ---
    "llama-3-2-1b-alpaca-instruct-version2": "llama-3-2-1b-alpaca-202506261658/save_model_124803",
    "output_eval_file_alpaca-instruct-version2": "llama-3-2-1b-alpaca-instruct-version2-evaluation.json",
}

In [4]:
# read an evaluation-result JSON file into a DataFrame
def load_alpaca_dataset(file_path):
  """Load a JSON file and return it as a DataFrame with a fixed column set.

  The file is decoded as UTF-8, parsed with ``json.load`` and handed to
  ``pandas.DataFrame``. Only the columns listed below are kept, in that
  order; any other fields present in the file are dropped.

  Args:
    file_path: Path of the JSON file to read.
      (presumably a list of per-sample records -- confirm against the
      script that writes the evaluation files)

  Returns:
    pandas.DataFrame containing exactly the selected columns.

  Raises:
    OSError: if the file cannot be opened.
    json.JSONDecodeError: if the file is not valid JSON.
    KeyError: if any of the selected columns is missing from the data.
  """
  with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

  selected_columns = [
    # sample identification and texts
    'sample',
    'dataset',
    'instruction',
    'expected',
    'generated',
    'generator',
    'retry_count',
    # BLEU fields
    'bleu_score',
    'bleu_counts',
    'bleu_totals',
    'bleu_precisions',
    'bleu_bp',
    'bleu_sys_len',
    'bleu_ref_len',
    # BERTScore fields
    'bertscore_precision',
    'bertscore_recall',
    'bertscore_f1',
  ]

  # Indexing with a list returns a new frame restricted to these columns.
  # (The former bare `dataframe.head()` call was removed: inside a function
  # its result was discarded, so it displayed nothing.)
  return pandas.DataFrame(data)[selected_columns]

In [5]:
# number of rows in the dataframe
def data_count(dataframe):
  """Return ``len(dataframe)``, i.e. the row count for a DataFrame."""
  row_total = len(dataframe)
  return row_total

In [6]:
# average of the BLEU scores
def avg_bleu_score(dataframe):
  """Return the mean of the ``bleu_score`` column of ``dataframe``."""
  bleu_scores = dataframe['bleu_score']
  return bleu_scores.mean()

In [7]:
# average of the BERTScore precision values
def avg_bertscore_precision(dataframe):
  """Return the mean of the ``bertscore_precision`` column of ``dataframe``."""
  precision_values = dataframe['bertscore_precision']
  return precision_values.mean()

In [8]:
# average of the BERTScore recall values
def avg_bertscore_recall(dataframe):
  """Return the mean of the ``bertscore_recall`` column of ``dataframe``."""
  recall_values = dataframe['bertscore_recall']
  return recall_values.mean()

In [9]:
# average of the BERTScore F1 values
def avg_bertscore_f1(dataframe):
  """Return the mean of the ``bertscore_f1`` column of ``dataframe``."""
  f1_values = dataframe['bertscore_f1']
  return f1_values.mean()

In [10]:
# print the averaged metrics of one evaluation dataframe
def print_scores(dataframe, dataframe_variable_name):
  """Print a header followed by one line per averaged metric.

  Args:
    dataframe: Frame passed to each of the ``avg_*`` helper functions.
    dataframe_variable_name: Text shown in the header line to identify the frame.

  Returns:
    None; the report is written to stdout.
  """
  print(f"---------------- {dataframe_variable_name} ----------------")

  # Labels carry their own trailing padding so the colons line up in the output.
  report_rows = (
    ('avg_bleu_score          ', avg_bleu_score),
    ('avg_bertscore_f1        ', avg_bertscore_f1),
    ('avg_bertscore_precision ', avg_bertscore_precision),
    ('avg_bertscore_recall    ', avg_bertscore_recall),
  )
  for label, metric in report_rows:
    print(f'{label}: {metric(dataframe)}')

In [11]:
# check variables: echo a configured value, then load each evaluation result file
# NOTE(review): the first print shows the base *model id* ("llama-3-2-1b-base"),
# while the next two show the *result file name* -- confirm whether the base line
# was meant to print "output_eval_file_base" instead.
print(f"default_environment_variables: {default_environment_variables['llama-3-2-1b-base']}")
evalution_base = load_alpaca_dataset(
  default_environment_variables["output_eval_file_base"]
)

print(f"default_environment_variables: {default_environment_variables['output_eval_file_alpaca-instruct-version1']}")
evalution_alpaca_instruct_version1 = load_alpaca_dataset(
  default_environment_variables["output_eval_file_alpaca-instruct-version1"]
)

print(f"default_environment_variables: {default_environment_variables['output_eval_file_alpaca-instruct-version2']}")
evalution_alpaca_instruct_version2 = load_alpaca_dataset(
  default_environment_variables["output_eval_file_alpaca-instruct-version2"]
)

default_environment_variables: meta-llama/Llama-3.2-1B
default_environment_variables: llama-3-2-1b-alpaca-instruct-version1-evaluation.json
default_environment_variables: llama-3-2-1b-alpaca-instruct-version2-evaluation.json


In [12]:
# row count of each loaded evaluation frame
evalution_base_data_count = data_count(dataframe=evalution_base)
print(f"evalution_base_data_count: {evalution_base_data_count}")

evalution_alpaca_instruct_version1_data_count = data_count(
  dataframe=evalution_alpaca_instruct_version1
)
print(f"evalution_alpaca_instruct_version1_data_count: {evalution_alpaca_instruct_version1_data_count}")

evalution_alpaca_instruct_version2_data_count = data_count(
  dataframe=evalution_alpaca_instruct_version2
)
print(f"evalution_alpaca_instruct_version2_data_count: {evalution_alpaca_instruct_version2_data_count}")

evalution_base_data_count: 805
evalution_alpaca_instruct_version1_data_count: 805
evalution_alpaca_instruct_version2_data_count: 805


In [13]:
# print the averaged metrics for the base model and both instruct versions
print_scores(
  dataframe=evalution_base,
  dataframe_variable_name="evalution_base",
)
print_scores(
  dataframe=evalution_alpaca_instruct_version1,
  dataframe_variable_name="evalution_alpaca_instruct_version1",
)
print_scores(
  dataframe=evalution_alpaca_instruct_version2,
  dataframe_variable_name="evalution_alpaca_instruct_version2",
)

---------------- evalution_base ----------------
avg_bleu_score          : 1.286869616755353
avg_bertscore_f1        : 0.6219196315137496
avg_bertscore_precision : 0.6474156828770726
avg_bertscore_recall    : 0.6021068431575847
---------------- evalution_alpaca_instruct_version1 ----------------
avg_bleu_score          : 1.1637789656510389
avg_bertscore_f1        : 0.6212915219135166
avg_bertscore_precision : 0.6480916683718284
avg_bertscore_recall    : 0.5993603253586692
---------------- evalution_alpaca_instruct_version2 ----------------
avg_bleu_score          : 3.252109198293056
avg_bertscore_f1        : 0.6687175443824034
avg_bertscore_precision : 0.7020177060032483
avg_bertscore_recall    : 0.6425844928122455
