In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install 'transformers==4.51.3'
!pip install 'tqdm==4.67.1'
!pip install 'bert-score==0.3.13'

Looking in indexes: https://download.pytorch.org/whl/cu124


In [2]:
# Set Environment Variables
default_environment_variables = {
    # "llama-3-2-1b-base": "meta-llama/Llama-3.2-1B",
    # "output_eval_file_base": "llama-3-2-1b-base-evaluation.json",
    # "llama-3-2-1b-alpaca-instruct-version1": "./output/llama-3-2-1b-alpaca-202506241720/save_model_41601",
    # "output_eval_file_alpaca-instruct-version1": "llama-3-2-1b-alpaca-instruct-version1-evaluation.json",
    "llama-3-2-1b-alpaca-instruct-version2":"./output/llama-3-2-1b-alpaca-202506261658/save_model_124803",
    "output_eval_file_alpaca-instruct-version2": "llama-3-2-1b-alpaca-instruct-version2-evaluation.json",
    "dataset": "../dataset/alpaca_eval.json",
    "apply_dataset": 1.0, # dataset percentage
    "max_new_tokens": 512, # model generate tokens
    "temperature": 0.9, # model temperature
    "max_retries": 5, # model retry count
}

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# load model, tokenizer
tokenizer = AutoTokenizer.from_pretrained(default_environment_variables["llama-3-2-1b-alpaca-instruct-version2"])
model = AutoModelForCausalLM.from_pretrained(default_environment_variables["llama-3-2-1b-alpaca-instruct-version2"], device_map='cuda:0')

[2025-06-27 15:31:56,495] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [5]:
# change model mode to evaluation
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [6]:
# read dataset
import pandas
import json

def load_alpaca_dataset():
  with open(default_environment_variables["dataset"], 'r', encoding='utf-8') as f:
    data = json.load(f)

  dataframe = pandas.DataFrame(data)
  dataframe = dataframe[['dataset','instruction', 'output', 'generator']] # select dataset columns
  dataframe.head(100) # output first 100 rows
  return dataframe

In [7]:
dataframe = load_alpaca_dataset()
print(dataframe.head())
print(f"dataframe: {len(dataframe)}")

        dataset                                        instruction  \
0  helpful_base  What are the names of some famous actors that ...   
1  helpful_base                 How did US states get their names?   
2  helpful_base  Hi, my sister and her girlfriends want me to p...   
3  helpful_base            What is some cool music from the 1920s?   
4  helpful_base                    How do I wrap a present neatly?   

                                              output         generator  
0  Some famous actors that started their careers ...  text_davinci_003  
1  US states get their names from a variety of so...  text_davinci_003  
2  Kickball is a game similar to baseball, but wi...  text_davinci_003  
3  Some cool music from the 1920s includes jazz c...  text_davinci_003  
4  1. Start by gathering the supplies you will ne...  text_davinci_003  
dataframe: 805


In [8]:
# split dataset & reset index
num_dataset = len(dataframe) * default_environment_variables["apply_dataset"] # calculate apply dataset
splited_dataset = dataframe.loc[:num_dataset] # split dataset
convert_dataset = splited_dataset.reset_index(drop=True) # reset index
print(convert_dataset.head())
print(f"convert_dataset: {len(convert_dataset)}")

        dataset                                        instruction  \
0  helpful_base  What are the names of some famous actors that ...   
1  helpful_base                 How did US states get their names?   
2  helpful_base  Hi, my sister and her girlfriends want me to p...   
3  helpful_base            What is some cool music from the 1920s?   
4  helpful_base                    How do I wrap a present neatly?   

                                              output         generator  
0  Some famous actors that started their careers ...  text_davinci_003  
1  US states get their names from a variety of so...  text_davinci_003  
2  Kickball is a game similar to baseball, but wi...  text_davinci_003  
3  Some cool music from the 1920s includes jazz c...  text_davinci_003  
4  1. Start by gathering the supplies you will ne...  text_davinci_003  
convert_dataset: 805


In [9]:
# model inference
def generate(mdoel, tokenizer, prompt, max_new_token):
    input_ids = tokenizer(prompt, return_tensors="pt").to(device='cuda:0')
    output = model.generate(**input_ids, pad_token_id=tokenizer.eos_token_id, temperature=default_environment_variables["temperature"])
    return tokenizer.decode(output[0], skip_special_tokens=True, max_new_tokens=max_new_token)

prompt = """
###instrruction:
please answer

###input:
Hello.

###output:
"""
answer = generate(model, tokenizer, prompt, max_new_token=default_environment_variables["max_new_tokens"])
print(answer)


###instrruction:
please answer

###input:
Hello.

###output:
Hello! How are you feeling today? (Answer: Hello)  (Hello! How are you


In [10]:
from sacrebleu import corpus_bleu

In [11]:
# define blue
def compute_bleu(predictions: list[str], references: list[list[str]]) -> dict[str, int | float | list[float]]:
    """BLUEを算出"""
    # Calculate BLUE
    result = corpus_bleu(predictions, references)
    return {
        "score": result.score,
        "counts": result.counts,
        "totals": result.totals,
        "precisions": [round(p, 2) for p in result.precisions],
        "bp": result.bp,
        "sys_len": result.sys_len,
        "ref_len": result.ref_len,
    }

In [12]:
import bert_score

In [13]:
# define blue
def compute_bertscore(predictions: list[str], references: list[list[str]]) -> dict[str, int | float | list[float]]:
    """BERTScoreを算出"""
    # Calculate BERTScore
    scorer = bert_score.BERTScorer(model_type=bert_score.utils.lang2model["ja"])
    p, r, f = scorer.score(cands=predictions, refs=references)
    
    return {
        "precision": sum(p.tolist()) / len(p),
        "recall": sum(r.tolist()) / len(r),
        "f1": sum(f.tolist()) / len(f)
    }

In [14]:
from tqdm import tqdm
import json
from datetime import datetime

In [15]:
# simple JSON evaluation
def evaluate_generation_quality_simple_json(model, tokenizer, dataframe, num_samples=5, max_retries=5):
    # evaluation results
    evaluation_results = []
    
    successful_generations = 0
    
    for i in tqdm(range(min(num_samples, len(dataframe)))):
        row = dataframe.iloc[i]
        dataset = row['dataset']
        instruction = row['instruction']
        expected_output = row['output']
        generator = row['generator']
            
        # prompt creation
        prompt = f"###instruction:\n{instruction}\n###output:\n"
        
        # max retry count
        generated_only = ""
        retry_count = 0
        
        while retry_count < max_retries:
            try:
                # generation execution
                full_generated = generate(model, tokenizer, prompt, max_new_token=default_environment_variables["max_new_tokens"])
                # model output only
                generated_only = full_generated.replace(prompt, "").strip()
                # if generated_only is not empty, break the loop
                if generated_only:
                    break
                else:
                    retry_count += 1
                    if retry_count < max_retries:
                        print(f"Sample {i+1}: Empty generation detected, retrying... (attempt {retry_count + 1}/{max_retries})")
                    else:
                        print(f"Sample {i+1}: Failed to generate after {max_retries} attempts, using empty string")
                        
            except Exception as e:
                retry_count += 1
                if retry_count < max_retries:
                    print(f"Sample {i+1}: Generation error ({str(e)}), retrying... (attempt {retry_count + 1}/{max_retries})")
                else:
                    print(f"Sample {i+1}: Failed to generate after {max_retries} attempts due to errors")
                    generated_only = ""
                    break

        successful_generations += 1
        # if generated_only is not empty, calculate BLEU score, otherwise set BLEU score to 0
        # BLEU score calculation
        if generated_only.strip():
            bleu_result = compute_bleu([generated_only], [[expected_output]])
        else:
            bleu_result = {
                "score": 0.0,
                "counts": [0, 0, 0, 0],
                "totals": [0, 0, 0, 0],
                "precisions": [0.0, 0.0, 0.0, 0.0],
                "bp": 0.0,
                "sys_len": 0,
                "ref_len": 0
            }
            print(f"Sample {i+1}: Empty generated text, setting BLEU score to 0")

        # BERTScore calculation
        if generated_only.strip():
            bertscore_result = compute_bertscore([generated_only], [expected_output])
        else:
            bertscore_result = {
                "precision": 0.0,
                "recall": 0.0,
                "f1": 0.0
            }
            print(f"Sample {i+1}: Empty generated text, setting BERTScore to 0")
            
        # JSON record creation
        sample_result = {
            "sample": i + 1,
            "dataset": dataset,
            "instruction": instruction,
            "expected": expected_output,
            "generated": generated_only,
            "generator": generator,
            "retry_count": retry_count,
            "bleu_score": bleu_result["score"],
            "bleu_counts": bleu_result["counts"],
            "bleu_totals": bleu_result["totals"],
            "bleu_precisions": bleu_result["precisions"],
            "bleu_bp": bleu_result["bp"],
            "bleu_sys_len": bleu_result["sys_len"],
            "bleu_ref_len": bleu_result["ref_len"],
            "bertscore_precision": bertscore_result["precision"],
            "bertscore_recall": bertscore_result["recall"],
            "bertscore_f1": bertscore_result["f1"]
        }
        evaluation_results.append(sample_result)
    
    # JSON format file save
    with open(default_environment_variables["output_eval_file_alpaca-instruct-version2"], 'w', encoding='utf-8') as f:
        json.dump(evaluation_results, f, ensure_ascii=False, indent=2)
    
    print(f"evaluation completed JSON log saved to '{default_environment_variables['output_eval_file_alpaca-instruct-version2']}'")
    print(f"success rate: {successful_generations}/{num_samples} ({successful_generations/num_samples*100:.2f}%)")
    return evaluation_results

# execution
print("Simple JSON evaluation start...")
evaluation_results = evaluate_generation_quality_simple_json(model, tokenizer, dataframe, num_samples=len(convert_dataset), max_retries=default_environment_variables["max_retries"])

Simple JSON evaluation start...


100%|██████████| 805/805 [12:03<00:00,  1.11it/s]

evaluation completed JSON log saved to 'llama-3-2-1b-alpaca-instruct-version2-evaluation.json'
success rate: 805/805 (100.00%)





In [16]:
import json
import re

# json file save
json_str = json.dumps(evaluation_results, ensure_ascii=False, indent=2)

# Make a specific array into a single line
json_str = re.sub(r'("bleu_counts"): \[\s*([^\]]*)\s*\]', 
                  r'\1: [\2]', json_str, flags=re.MULTILINE | re.DOTALL)
json_str = re.sub(r'("bleu_totals"): \[\s*([^\]]*)\s*\]', 
                  r'\1: [\2]', json_str, flags=re.MULTILINE | re.DOTALL)
json_str = re.sub(r'("bleu_precisions"): \[\s*([^\]]*)\s*\]', 
                  r'\1: [\2]', json_str, flags=re.MULTILINE | re.DOTALL)

# Remove newlines and spaces in an array
json_str = re.sub(r'\[\s*(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\s*\]', 
                  r'[\1, \2, \3, \4]', json_str)
json_str = re.sub(r'\[\s*([0-9.]+),\s*([0-9.]+),\s*([0-9.]+),\s*([0-9.]+)\s*\]', 
                  r'[\1, \2, \3, \4]', json_str)

# file save
with open(default_environment_variables["output_eval_file_alpaca-instruct-version2"], 'w', encoding='utf-8') as f:
    f.write(json_str)

In [17]:
# 評価完了後のクリーンアップとカーネル終了
print("評価が完了しました。システムをクリーンアップしています...")

# メモリクリーンアップ
try:
    del model, tokenizer, evaluation_results
    import gc
    gc.collect()
    
    # CUDA メモリクリア
    import torch
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("GPU memory cleared.")
except:
    pass

print("クリーンアップ完了。カーネルを終了します...")

# IPythonカーネル終了
import os
from IPython import get_ipython

try:
    # Jupyter環境での正常終了
    get_ipython().ask_exit()
except:
    # 強制終了
    os._exit(0)

評価が完了しました。システムをクリーンアップしています...
GPU memory cleared.
クリーンアップ完了。カーネルを終了します...


: 