In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install 'transformers==4.51.3'
!pip install 'tqdm==4.67.1'

Looking in indexes: https://download.pytorch.org/whl/cu124


In [2]:
# Set Environment Variables
default_environment_variables = {
    "model_directory": "./output/llama-3-2-1b-alpaca-202506241720/save_model",
    "model_name": "meta-llama/Llama-3.2-1B",
    "dataset": "../dataset/alpaca_data.json",
    "apply_dataset": 100
}

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# load model, tokenizer
tokenizer = AutoTokenizer.from_pretrained(default_environment_variables["model_directory"])
model = AutoModelForCausalLM.from_pretrained(default_environment_variables["model_directory"], device_map='cuda:0')

[2025-06-26 22:55:30,738] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [5]:
# change model mode to evaluation
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [6]:
# read dataset
import pandas
import json

def load_alpaca_dataset():
  with open(default_environment_variables["dataset"], 'r', encoding='utf-8') as f:
    data = json.load(f)

  dataframe = pandas.DataFrame(data)
  dataframe = dataframe[['instruction', 'input', 'output']]
  dataframe.head(100)
  return dataframe

In [7]:
dataframe = load_alpaca_dataset()
print(dataframe.head())
print(f"dataframe: {len(dataframe)}")

                                         instruction input  \
0               Give three tips for staying healthy.         
1                 What are the three primary colors?         
2                 Describe the structure of an atom.         
3                   How can we reduce air pollution?         
4  Describe a time when you had to make a difficu...         

                                              output  
0  1.Eat a balanced diet and make sure to include...  
1  The three primary colors are red, blue, and ye...  
2  An atom is made up of a nucleus, which contain...  
3  There are a number of ways to reduce air pollu...  
4  I had to make a difficult decision when I was ...  
dataframe: 52002


In [8]:
# split dataset & reset index
splited_dataset = dataframe.loc[:default_environment_variables["apply_dataset"]]
convert_dataset = splited_dataset.reset_index(drop=True)
print(convert_dataset.head())
print(f"convert_dataset: {len(convert_dataset)}")

                                         instruction input  \
0               Give three tips for staying healthy.         
1                 What are the three primary colors?         
2                 Describe the structure of an atom.         
3                   How can we reduce air pollution?         
4  Describe a time when you had to make a difficu...         

                                              output  
0  1.Eat a balanced diet and make sure to include...  
1  The three primary colors are red, blue, and ye...  
2  An atom is made up of a nucleus, which contain...  
3  There are a number of ways to reduce air pollu...  
4  I had to make a difficult decision when I was ...  
convert_dataset: 101


In [9]:
# model inference
def generate(mdoel, tokenizer, prompt, max_new_token):
    input_ids = tokenizer(prompt, return_tensors="pt").to(device='cuda:0')
    output = model.generate(**input_ids, pad_token_id=tokenizer.eos_token_id, temperature=0.9)
    return tokenizer.decode(output[0], skip_special_tokens=True, max_new_tokens=max_new_token)

prompt = """
###instrruction:
please answer

###input:
Hello.

###output:
"""
answer = generate(model, tokenizer, prompt, max_new_token=512)
print(answer)


###instrruction:
please answer

###input:
Hello.

###output:
Hello. How do you call the most interesting situation? Tell me from your friend? Please have to


In [10]:
from sacrebleu import corpus_bleu
import pandas

In [11]:
# define blue
def compute_bleu(predictions: list[str], references: list[list[str]]) -> dict[str, int | float | list[float]]:
    """BLUEを算出"""
    # BLUEを算出する
    result = corpus_bleu(predictions, references)
    return {
        "score": result.score,
        "counts": result.counts,
        "totals": result.totals,
        "precisions": [round(p, 2) for p in result.precisions],
        "bp": result.bp,
        "sys_len": result.sys_len,
        "ref_len": result.ref_len,
    }

In [12]:
from tqdm import tqdm
import json
from datetime import datetime

In [13]:
# simple JSON evaluation

def evaluate_generation_quality_simple_json(model, tokenizer, dataframe, num_samples=5):
    # evaluation results
    evaluation_results = []
    
    successful_generations = 0
    
    for i in tqdm(range(min(num_samples, len(dataframe)))):
        try:
            row = dataframe.iloc[i]
            instruction = row['instruction']
            input_text = row['input'] if row['input'] else ""
            expected_output = row['output']
            
            # prompt creation
            if input_text:
                prompt = f"###instruction:\n{instruction}\n###input:\n{input_text}\n###output:\n"
            else:
                prompt = f"###instruction:\n{instruction}\n###output:\n"
            
            # generation execution
            full_generated = generate(model, tokenizer, prompt, max_new_token=512)
            
            # model output only
            generated_only = full_generated.replace(prompt, "").strip()
            
            successful_generations += 1

            bleu_result = compute_bleu([generated_only], [[expected_output]])
            
            # JSON record creation
            sample_result = {
                "sample": i + 1,
                "instruction": instruction,
                "expected": expected_output,
                "generated": generated_only,
                "bleu_score": bleu_result["score"],
                "bleu_counts": bleu_result["counts"],
                "bleu_totals": bleu_result["totals"],
                "bleu_precisions": bleu_result["precisions"],
                "bleu_bp": bleu_result["bp"],
                "bleu_sys_len": bleu_result["sys_len"],
                "bleu_ref_len": bleu_result["ref_len"]
            }
            
            evaluation_results.append(sample_result)
            
        except Exception as e:
            error_result = {
                "sample": i + 1,
                "instruction": instruction if 'instruction' in locals() else "N/A",
                "expected": "N/A",
                "generated": f"ERROR: {str(e)}",
                "bleu_score": "N/A",
                "bleu_counts": "N/A",
                "bleu_totals": "N/A",
                "bleu_precisions": "N/A",
                "bleu_bp": "N/A",
                "bleu_sys_len": "N/A",
                "bleu_ref_len": "N/A"
            }
            evaluation_results.append(error_result)
            continue
    
    # JSON format file save
    with open('qa-evaluation.json', 'w', encoding='utf-8') as f:
        json.dump(evaluation_results, f, ensure_ascii=False, indent=2)
    
    print(f"evaluation completed JSON log saved to 'qa-evaluation.json'")
    print(f"success rate: {successful_generations}/{num_samples} ({successful_generations/num_samples*100:.2f}%)")
    
    return evaluation_results

# execution
print("Simple JSON evaluation start...")
# evaluation_results = evaluate_generation_quality_simple_json(model, tokenizer, dataframe, num_samples=len(convert_dataset))
evaluation_results = evaluate_generation_quality_simple_json(model, tokenizer, dataframe, num_samples=len(convert_dataset))

Simple JSON evaluation start...


100%|██████████| 101/101 [00:23<00:00,  4.26it/s]

evaluation completed JSON log saved to 'qa-evaluation.json'
success rate: 101/101 (100.00%)





In [None]:
import json
import re

# 通常通りJSONを生成
json_str = json.dumps(evaluation_results, ensure_ascii=False, indent=2)

# 特定の配列を一行にする
json_str = re.sub(r'("bleu_counts"): \[\s*([^\]]*)\s*\]', 
                  r'\1: [\2]', json_str, flags=re.MULTILINE | re.DOTALL)
json_str = re.sub(r'("bleu_totals"): \[\s*([^\]]*)\s*\]', 
                  r'\1: [\2]', json_str, flags=re.MULTILINE | re.DOTALL)
json_str = re.sub(r'("bleu_precisions"): \[\s*([^\]]*)\s*\]', 
                  r'\1: [\2]', json_str, flags=re.MULTILINE | re.DOTALL)

# 配列内の改行とスペースを削除
json_str = re.sub(r'\[\s*(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\s*\]', 
                  r'[\1, \2, \3, \4]', json_str)
json_str = re.sub(r'\[\s*([0-9.]+),\s*([0-9.]+),\s*([0-9.]+),\s*([0-9.]+)\s*\]', 
                  r'[\1, \2, \3, \4]', json_str)

# ファイルに保存
with open('qa-evaluation.json', 'w', encoding='utf-8') as f:
    f.write(json_str)