<a href="https://colab.research.google.com/github/119020/NLP_2025_Spring_Materials/blob/main/25Spring_NLP_Assignment_3_TrainLLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 3: Train your own LLMs
### **Name:** Bowen Kuang
### **Student ID:** 119020237



This notebook guide provides a comprehensive overview of using the `transformers` Python package to efficiently train a custom model. It covers the following techniques:

1. Load Model, Tokenizer and Template for Chat Model.
2. Process Data for Training.
2. Train Model with Qlora.
4. Evaluate Model's performance.
5. Save and Deploy Trained Model.

## Preliminary Preparation

Before proceeding with model training, ensure your environment is properly configured by following these steps:

1. Install the necessary Python packages.
2. Import the required libraries.

In [None]:
# 3mins for installation
!pip install -q h5py typing-extensions wheel
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!nvidia-smi

Sun Apr 13 07:37:56 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   52C    P0             26W /   70W |    3444MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                


## Load Pre-trained model and tokenizer

In [None]:
# 3mins for loading
# Current Memory-Usage: 3422MiB/15360MiB
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
model_id = "Qwen/Qwen2.5-3B-Instruct"

# QLoRA Configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True, # Activate nested quantization for 4-bit base models (double quantization)
    bnb_4bit_quant_type="nf4", # Quantization type (fp4 or nf4), According to QLoRA paper, for training 4-bit base models (e.g. using LoRA adapters) one should use
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Initialize model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
    #device_map={"":0}
    )

tokenizer = AutoTokenizer.from_pretrained(model_id)


config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

## Preprocess the quantized model for training

In [None]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
from peft import LoraConfig, get_peft_model

# You can try differnt parameter-effient strategy for model trianing, for more info, please check https://github.com/huggingface/peft
config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)

## Chat Template Usage

In [None]:
from jinja2 import Template
template = Template(tokenizer.chat_template)
message = "Please introduce yourself"
print(f"message:\n{message}\n")
message_send_to_model=template.render(messages=[{"role": "user", "content": message}],bos_token=tokenizer.bos_token,add_generation_prompt=True)
print(f"message_send_to_model:\n{message_send_to_model}")

message:
Please introduce yourself

message_send_to_model:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Please introduce yourself<|im_end|>
<|im_start|>assistant



In [None]:
template = Template(tokenizer.chat_template)
@torch.no_grad()
def generate(prompt):
    modelInput=template.render(messages=[{"role": "user", "content": prompt}],bos_token= tokenizer.bos_token,add_generation_prompt=True)
    print("-"*80)
    print(f"model_input_string:\n{modelInput}")
    input_ids = tokenizer.encode(modelInput, add_special_tokens=False, return_tensors='pt').to("cuda:0")
    outputs = model.generate(input_ids, do_sample=False)
    model_return_string = tokenizer.decode(*outputs, skip_special_tokens=False)
    print("-"*80)
    print(f"model_return_string:\n{model_return_string}")
    generated_ids = outputs[:, input_ids.shape[1]:]
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=False)
    return generated_text

query = "Please introduce yourself"
print("-"*80)
print(f"query:\n{query}")
response = generate(query)
print("-"*80)
print(f"response:\n{response}")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


--------------------------------------------------------------------------------
query:
Please introduce yourself
--------------------------------------------------------------------------------
model_input_string:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Please introduce yourself<|im_end|>
<|im_start|>assistant

--------------------------------------------------------------------------------
model_return_string:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Please introduce yourself<|im_end|>
<|im_start|>assistant
Of course! I'm Qwen, a large language model created by Alibaba Cloud. My primary function
--------------------------------------------------------------------------------
response:
Of course! I'm Qwen, a large language model created by Alibaba Cloud. My primary function


## Data Preparation

Let's load a common dataset, english quotes, to fine tune our model on famous quotes.

In [None]:
# Load financial dataset
from datasets import load_dataset
dataset = load_dataset("gbharti/finance-alpaca")

# Convert to conversational format
def format_finance_data(sample):
    return {
        "conversations": [
            {"from": "human", "value": sample["instruction"]+'\n\n'+sample["input"]},
            {"from": "gpt", "value": sample["output"]}
        ]
    }
dataset = dataset['train'].map(format_finance_data)

# data = load_dataset("Abirate/english_quotes")
#dataset = load_dataset("FreedomIntelligence/Huatuo26M-Lite")
#dataset = dataset['train'].map(lambda sample: {"conversations": [{"from": "human", "value": sample['instruction']'\n'+sample["input"]}, {"from": "gpt", "value": sample['output']}]}, batched=False)

README.md:   0%|          | 0.00/831 [00:00<?, ?B/s]

Cleaned_date.json:   0%|          | 0.00/42.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/68912 [00:00<?, ? examples/s]

Map:   0%|          | 0/68912 [00:00<?, ? examples/s]

In [None]:
dataset[0]['conversations']
# Total sample size: 68,912

[{'from': 'human',
  'value': 'For a car, what scams can be plotted with 0% financing vs rebate?\n\n'},
 {'from': 'gpt',
  'value': "The car deal makes money 3 ways. If you pay in one lump payment. If the payment is greater than what they paid for the car, plus their expenses, they make a profit. They loan you the money. You make payments over months or years, if the total amount you pay is greater than what they paid for the car, plus their expenses, plus their finance expenses they make money. Of course the money takes years to come in, or they sell your loan to another business to get the money faster but in a smaller amount. You trade in a car and they sell it at a profit. Of course that new transaction could be a lump sum or a loan on the used car... They or course make money if you bring the car back for maintenance, or you buy lots of expensive dealer options. Some dealers wave two deals in front of you: get a 0% interest loan. These tend to be shorter 12 months vs 36,48,60 or e

In [None]:
from torch.utils.data import random_split
#train_dataset_size, val_dataset_size = int(len(dataset)*0.8), int(len(dataset)*0.2)
train_dataset_size, val_dataset_size = 100, 20
# Ideal way (but out-of-memory...)
# Training sample-size: 68,912*0.8 = 55,129
# Testing sample-size: 68,912*0.2 = 13,782
train_dataset, val_dataset, _ = random_split(dataset, [train_dataset_size, val_dataset_size, len(dataset)-train_dataset_size-val_dataset_size])
print(train_dataset[0]['conversations'])
print(val_dataset[0]['conversations'])

[{'from': 'human', 'value': "How to buy out one person's share of a jointly owned vehicle with the lowest taxes and fees\n\n"}, {'from': 'gpt', 'value': 'You should be able to refinance the vehicle and have the financing in just your name (assuming you can secure the financing). Since you are already on the vehicle registration, this would not constitute a sale, and thus would not incur additional sales tax.  To remove the other person from the vehicle registration, leaving you as the sole registered owner, in the state of New York, you only need to file an MV-82. It will cost you $3. https://dmv.ny.gov/registration/register-vehicle-more-one-owner-or-registrant'}]
[{'from': 'human', 'value': 'What are the three main differences between humans and animals?\n\n'}, {'from': 'gpt', 'value': 'The three main differences between humans and animals are:\n\n1. Cognitive Ability: Human beings have advanced cognitive abilities such as problem solving, abstract thought, and language. Animals posse

### Customized Dataset
Create a specialized dataset class named "InstructionDataset" designed to handle our custom dataset.

In [None]:
import transformers
from typing import Dict, Sequence, List
from torch.utils.data import Dataset
from dataclasses import dataclass

def preprocess(
    sources,
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    template = Template(tokenizer.chat_template)
    max_seq_len = tokenizer.model_max_length
    messages = []
    for i, source in enumerate(sources):
        if source[0]["from"] != "human":
            # Skip the first one if it is not from human
            source = source[1:]

        for j in range(0, len(source), 2):
            if j+1 >= len(source): continue
            q = source[j]["value"]
            a = source[j+1]["value"]
            assert q is not None and a is not None, f'q:{q} a:{a}'
            input =  template.render(messages=[{"role": "user", "content": q},{"role": "assistant", "content": a}],bos_token=tokenizer.bos_token,add_generation_prompt=False)
            input_ids = tokenizer.encode(input, add_special_tokens= False)

            query = template.render(messages=[{"role": "user", "content": q}],bos_token=tokenizer.bos_token,add_generation_prompt=True)
            query_ids = tokenizer.encode(query, add_special_tokens= False)

            labels = [-100]*len(query_ids) + input_ids[len(query_ids):]
            assert len(labels) == len(input_ids)
            if len(input_ids) == 0: continue
            messages.append({"input_ids": input_ids[-max_seq_len:], "labels": labels[-max_seq_len:]})

    input_ids = [item["input_ids"] for item in messages]
    labels = [item["labels"] for item in messages]

    max_len = max(len(x) for x in input_ids)

    max_len = min(max_len, max_seq_len)
    input_ids = [ item[:max_len] + [tokenizer.eos_token_id]*(max_len-len(item)) for item in input_ids]
    labels = [ item[:max_len] + [-100]*(max_len-len(item)) for item in labels]

    input_ids = torch.LongTensor(input_ids)
    labels = torch.LongTensor(labels)
    return {
        "input_ids": input_ids,
        "labels": labels
    }


class InstructDataset(Dataset):
    def __init__(self, data: Sequence, tokenizer: transformers.PreTrainedTokenizer) -> None:
        super().__init__()
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index) -> Dict[str, torch.Tensor]:
        sources = self.data[index]
        if isinstance(index, int):
            sources = [sources]
        data_dict = preprocess([e['conversations'] for e in sources], self.tokenizer)
        if isinstance(index, int):
            data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0])
        return data_dict


@dataclass
class DataCollatorForSupervisedDataset(object):
    tokenizer: transformers.PreTrainedTokenizer
    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids,
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id)
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )

In [None]:
train_dataset = InstructDataset(train_dataset, tokenizer)
val_dataset = InstructDataset(val_dataset, tokenizer)
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

In [None]:
sample_data = train_dataset[34]
IGNORE_INDEX=-100

print("=" * 80)
print("Debuging: ")
print(f"Input_ids\n{sample_data['input_ids']}")
print(f"Label_ids\n{sample_data['labels']}")
print("-" * 80)
print(f"Input:\n{tokenizer.decode(sample_data['input_ids'])}")
print("-" * 80)
N_id = tokenizer.encode("N", add_special_tokens= False)[0]
print(f"Label:\n{tokenizer.decode([N_id if x == -100 else x for x in sample_data['labels']])}")
print("=" * 80)


Debuging: 
Input_ids
tensor([151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
           553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
            13, 151645,    198, 151644,    872,    198,  10234,   5110,   4994,
           304,   3807,   5591,  28998,   1939, 151645,    198, 151644,  77091,
           198,   2679,    358,   3695,    279,    825,    504,   3043,     56,
            11,    374,    432,    279,    330,   7951,      1,    328,   3313,
           992,     30,   2308,    481,    498,    525,  11833,    458,   3693,
         77050,  10618,  72896,     13,  70754,   1045,   3693,   6073,    476,
          1008,   5387,   9982,    264,  15493,    315,    328,   3313,    992,
          5591,    323,   4714,  34437,    311,    279,   3693,   9289,    429,
          3693,  14823,    646,   6559,     13,    220,   1096,   1640,  23156,
           279,   3693,  14823,    504,    279,   2783,    315,   6489,  14131,
            13,    

## Training

### General Training Hyperparameters

In [None]:
import os
#os.environ['CUDA_VISIBLE_DEVICES']='0'
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:98"


# Set training parameters
training_arguments = transformers.TrainingArguments(
    output_dir="./checkpoints",
    num_train_epochs=1,#
    per_device_train_batch_size=2,#
    per_device_eval_batch_size=2,#
    gradient_accumulation_steps=2,
    optim='paged_adamw_32bit',
    save_steps=0,
    logging_steps=1,
    learning_rate=2e-5,#
    weight_decay=0.001,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    gradient_checkpointing=True,
    report_to="none"
)

In [None]:
model.train()
trainer = transformers.Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator
)
# 5mins for training
trainer.train()

  trainer = transformers.Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
1,2.4256
2,1.9502
3,2.6586
4,1.695
5,2.1304
6,3.2362
7,2.2962
8,2.5953
9,1.8817
10,2.2142


TrainOutput(global_step=25, training_loss=2.1804456901550293, metrics={'train_runtime': 114.2462, 'train_samples_per_second': 0.875, 'train_steps_per_second': 0.219, 'total_flos': 272919234232320.0, 'train_loss': 2.1804456901550293, 'epoch': 1.0})

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

model.print_trainable_parameters()
# trainable params: 1,843,200 (1M)
# all params: 3,087,781,888 (3B)
# trainable%: 0.0597

trainable params: 1,843,200 || all params: 3,087,781,888 || trainable%: 0.0597


Once the training is completed, we can evaluate our model and get its perplexity on the validation set like this:

In [None]:
import math
!pip install -q -U git+https://github.com/huggingface/accelerate.git
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


Perplexity: 8.35


## Save Trained LoRA

In [None]:
!pwd
output_path = "ilora"
trainer.save_model(output_path)

/content


### Test the trained model

In [None]:
template = Template(tokenizer.chat_template)
@torch.no_grad()
def generate(prompt):
    modelInput = template.render(messages=[{"role": "user", "content": prompt}],bos_token= tokenizer.bos_token,add_generation_prompt=True)
    input_ids = tokenizer.encode(modelInput, add_special_tokens=False, return_tensors='pt').to("cuda:0")
    outputs = model.generate(input_ids, temperature=1.0)
    model_return_string = tokenizer.decode(*outputs, skip_special_tokens=False)
    print("-"*80)
    print(f"model_return_string:\n{model_return_string}")
    generated_ids = outputs[:, input_ids.shape[1]:]
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=False)
    return generated_text

query = "I get hit"
print(f"query:\n{query}")
response = generate(query)
print("-"*80)
print(f"response:\n{response}")

query:
I get hit
--------------------------------------------------------------------------------
model_return_string:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
I get hit<|im_end|>
<|im_start|>assistant
It sounds like you might be experiencing some physical discomfort or pain. If this is the case, I
--------------------------------------------------------------------------------
response:
It sounds like you might be experiencing some physical discomfort or pain. If this is the case, I


# Clean GPU Memory

In [None]:
# Empty VRAM
# del model
# del trainer
import gc
import torch
torch.cuda.empty_cache()
gc.collect()
gc.collect()

0

In [None]:
!nvidia-smi

Sun Apr 13 07:54:54 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   77C    P0             30W /   70W |    6988MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Load the trained model back and integrate the trained LoRA within.

In [None]:
from peft import PeftModel

model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True, device_map={"":0})
model = PeftModel.from_pretrained(model, output_path)
model = model.merge_and_unload()
model.config.max_length = 512
model.eval()

tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, padding_side="left")
# tokenizer.pad_token = tokenizer.unk_token


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



## Answer generation

In [None]:
@torch.no_grad()
def generate(prompts):
    model_inputs = [template.render(messages=[{"role": "user", "content": prompt}], bos_token=tokenizer.bos_token, add_generation_prompt=True) for prompt in prompts]
    input_ids = tokenizer(model_inputs, add_special_tokens=False, return_tensors='pt', padding=True).to("cuda:0")

    outputs = model.generate(input_ids.input_ids, attention_mask=input_ids.attention_mask, max_new_tokens=100)

    generated_texts = []
    for i in range(len(prompts)):
        generated_ids = outputs[i, input_ids.input_ids.shape[1]:]
        generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        generated_texts.append(generated_text)

    return generated_texts

# test
print("\n\n".join(generate(["I get hit", "Who are you?"])))


I'm sorry to hear that you've been hit. This sounds like it could be quite painful or concerning. Can you please provide more details about what happened? How severe is the injury? Have you sought medical attention yet? If you're in immediate danger or need urgent help, please call emergency services immediately. Otherwise, I'll do my best to offer any support or advice that's appropriate within these guidelines.

I am Qwen, a large language model created by Alibaba Cloud. I'm here to assist you with various tasks and answer your questions to the best of my abilities. How can I help you today?


## Evaluate a trained model on a given test dataset

In [None]:
dataset = load_dataset("FinGPT/fingpt-fineval")

README.md:   0%|          | 0.00/612 [00:00<?, ?B/s]

(…)-00000-of-00001-1fa04e2cd84728f4.parquet:   0%|          | 0.00/209k [00:00<?, ?B/s]

(…)-00000-of-00001-9f770202d9b0d054.parquet:   0%|          | 0.00/60.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1056 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/265 [00:00<?, ? examples/s]

In [None]:
dataset['test'][250]
# training sample-size: 1056
# testing sample-size: 265

{'input': '根据我国增值税法的相关规定，下列不属于视同销售行为的是____。\nA. 将购进的货物无偿赠送给灾民\nB. 单位效益不好，将自产的电视机发给员工做福利\nC. 将半成品从一个车间转移到另一个车间继续加工\nD. 将自产的设备用于投资入股\n',
 'output': 'C. 将半成品从一个车间转移到另一个车间继续加工',
 'instruction': '以下是中国关于税法考试的单项选择题，请选出其中的正确答案。'}

In [None]:
!pip install -q sacrebleu rouge_score

import json
import re
import numpy as np
from tqdm import tqdm
from sacrebleu import corpus_bleu
from rouge_score import rouge_scorer
from multiprocessing import Pool, cpu_count
multiprocessing.set_start_method('spawn') # RuntimeError: Cannot re-initialize CUDA in forked subprocess

class FinancialQAEvaluator:
    def __init__(self, model, tokenizer, test_data=dataset['test']):
        self.model = model
        self.tokenizer = tokenizer
        self.test_data = [self.preprocess_item(item) for item in test_data]
        self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    def preprocess_item(self, item):
        #options = '\n'.join([f"{k}:{v}" for k, v in item['option'].items() if v])
        return {
            'question': item['input'],
            'instruction': item['instruction'],
            'answer': item['output'],
            'query': f"""请回答下面的多选题，请直接正确答案选项，不要输出其他内容。
            {item['instruction']}\n{item['input']}"""
        }

    def generate_answers(self, batch_size=4):
        queries = [item['query'] for item in self.test_data]
        with Pool(cpu_count()) as pool:
            results = list(tqdm(pool.imap(self._generate_single, queries, chunksize=batch_size),
                          total=len(queries),
                          desc="Generating Answers"))
        for item, ans in zip(self.test_data, results):
            item['model_answer'] = ans

    def _generate_single(self, query):
        inputs = self.tokenizer(query, return_tensors="pt").to(self.model.device)
        outputs = self.model.generate(**inputs, max_new_tokens=100)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def calculate_metrics(self):
        references = []
        predictions = []

        # Accuracy Calculation
        correct = 0
        error_types = {
            'terminology': 0,
            'calculation': 0,
            'regulation': 0,
            'context': 0,
            'other': 0
        }

        for item in self.test_data:
            # Extract ground truth and prediction
            gt = self.normalize_answer(item['answer'])
            pred = self.extract_prediction(item['model_answer'])

            # Store for BLEU/ROUGE
            references.append(gt)
            predictions.append(pred)

            # Accuracy
            if pred == gt:
                correct += 1
            else:
                self.classify_error(item, pred, gt, error_types)

        # Calculate Scores
        bleu = corpus_bleu(predictions, [references]).score
        rouge_scores = [self.scorer.score(ref, pred)['rougeL'].fmeasure
                       for ref, pred in zip(references, predictions)]

        return {
            'accuracy': correct / len(self.test_data),
            'bleu4': bleu,
            'rougeL': np.mean(rouge_scores),
            'error_distribution': error_types
        }

    @staticmethod
    def normalize_answer(text):
        """Standardize answer format"""
        return re.sub(r'[^A-E]', '', text.upper())

    def extract_prediction(self, text):
        """Extract answer options from model output"""
        match = re.findall(r'([A-E]+(?:[、, ]+[A-E]+)*)', text)
        return self.normalize_answer(match[-1]) if match else ''

    def classify_error(self, item, pred, gt, counter):
        """Error classification heuristic"""
        question = item['question']
        context = item['model_answer']

        # Terminology Check
        if any(term in question.lower() for term in ['利率', '通胀', 'GDP']):
            if not any(term in context for term in ['interest rate', 'inflation', 'GDP']):
                counter['terminology'] += 1
                return

        # Numerical Check
        if re.search(r'\d+\.?\d*', question):
            if not re.search(r'\b\d+\.?\d*\b', context):
                counter['calculation'] += 1
                return

        # Regulation Pattern
        if '根据' in question or '法规' in question:
            counter['regulation'] += 1
            return

        # Contextual Check
        if len(pred) < len(gt):
            counter['context'] += 1
            return

        counter['other'] += 1

    def save_results(self, output_path='results.json'):
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump({
                'metrics': self.metrics,
                'detailed_results': self.test_data
            }, f, ensure_ascii=False, indent=2)

In [None]:
# Usage Example
if __name__ == "__main__":
    evaluator = FinancialQAEvaluator(model, tokenizer)
    evaluator.generate_answers()
    metrics = evaluator.calculate_metrics()

    print(f"Accuracy: {metrics['accuracy']:.2%}")
    print(f"BLEU-4: {metrics['bleu4']:.2f}")
    print(f"ROUGE-L: {metrics['rougeL']:.2f}")
    print("Error Distribution:")
    for k, v in metrics['error_distribution'].items():
        print(f"- {k}: {v} ({v/len(evaluator.test_data):.1%})")

    evaluator.save_results()

Generating Answers:   0%|          | 0/265 [00:00<?, ?it/s]Process ForkPoolWorker-1:
Traceback (most recent call last):
  File "/usr/lib/python3.11/multiprocessing/pool.py", line 114, in worker
    task = get()
           ^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.11/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/multiprocessing/reductions.py", line 180, in rebuild_cuda_tensor
    torch.cuda._lazy_init()
  File "/usr/local/lib/python3.11/dist-packages/torch/cuda/__init__.py", line 305, in _lazy_init
    raise RuntimeError(
RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Proc

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

def visualize_results(metrics):
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))

    # Metric Scores
    ax[0].bar(['Accuracy', 'BLEU-4', 'ROUGE-L'],
             [metrics['accuracy'], metrics['bleu4'], metrics['rougeL']])
    ax[0].set_ylim(0, 1)
    ax[0].set_title('Performance Metrics')

    # Error Distribution
    labels = list(metrics['error_distribution'].keys())
    values = list(metrics['error_distribution'].values())
    ax[1].pie(values, labels=labels, autopct='%1.1f%%')
    ax[1].set_title('Error Type Distribution')

    plt.tight_layout()
    plt.savefig('evaluation_results.png')
    plt.show()