In [1]:
import torch
from unsloth import FastLanguageModel
# from peft import PeftModel, PeftConfig
from datasets import load_dataset
import os
import json
from tqdm import tqdm

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


# 定义路径
checkpoint = "checkpoint-500"
model_dir = f"Llama-3-Taiwan-8B-Instruct/{checkpoint}"
output_dir = model_dir.split("/")[0]

# 加载最佳模型和 Tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_dir,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# # 调用 for_inference
FastLanguageModel.for_inference(model)



DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model.to(DEVICE)
print(DEVICE)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA GeForce RTX 4060 Ti. Max memory: 15.996 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.4.0+cu121. CUDA: 8.9. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


  self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
Loading checkpoint shards: 100%|██████████| 4/4 [00:13<00:00,  3.43s/it]
Unsloth 2025.1.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


cuda


In [2]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


# EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_test(examples):
    instructions = examples["instruction"]
    inputs       = examples["question"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) 
        texts.append(text)
    return { "text" : texts }



from datasets import load_dataset
test_datasets = load_dataset("datasets/test", split = "test")
test_datasets = test_datasets.map(
    formatting_prompts_test, 
    batched = True,
    remove_columns=["instruction", "question", "output"],
)

print(test_datasets)
# print(test_datasets[:5])

Dataset({
    features: ['id', 'text'],
    num_rows: 1000
})


In [3]:


generation_config = dict(
    temperature=0.0,
    top_k=40,
    top_p=0.9,
    do_sample=False,
    num_beams=1,
    repetition_penalty=1.1,
    max_new_tokens=50
)


with torch.no_grad():
    print("Start inference.")
    results = []
    for test_dataset in tqdm(test_datasets, desc="inference: "):
        
        id = int(test_dataset['id'])
        input_text = str(test_dataset['text']) 
        
        # 生成回答     
        inputs = tokenizer(input_text,return_tensors="pt").to("cuda")  
        generation_output = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
        
        s = generation_output[0]

        output = tokenizer.decode(s,skip_special_tokens=True)
        response = output.split("### Response:")[1].strip()

        answer = response.split(')')[0]

        # print(f"======={id}=======")
        # print(f"Input: {input_text}\n")
        # print(f"Response: {response}\n")

        results.append({"id":id,"answer":answer})
  
    print("End inference.")


Start inference.


inference: 100%|██████████| 1000/1000 [12:00<00:00,  1.39it/s]

End inference.





In [4]:
len(results)

1000

In [5]:
# 初始化計數器
excluded_count = 0

# 遍歷列表中的每個元素
for answer in results:
    output = int(answer['answer'])
    
    # 判斷元素是否不在範圍 1 到 4 內
    if output not in range(1, 5):
        # 若條件成立，計數器加 1
        excluded_count += 1
        
print(excluded_count)

0


In [6]:
predictions_path = f'./{output_dir}/predict'  # 目录路径
predictions_file = f"predictions_file_{checkpoint}.csv"


os.makedirs(predictions_path, exist_ok=True)

full_file_path = os.path.join(predictions_path, predictions_file)


print("Start Write.")

with open(full_file_path, "w", encoding="utf-8") as f:
    f.write("ID,Answer\n")
    for item in results:
        f.write(f"{item['id']},{item['answer']}\n")
        
print("End Write.")

Start Write.
End Write.
