# 1. LLM训练

## 1.1 安装环境依赖

In [1]:
# !pip install /kaggle/input/bsd/other/default/1/bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl
!pip install /kaggle/input/bsd44/other/default/1/bitsandbytes-0.44.0-py3-none-manylinux_2_24_x86_64.whl

Processing /kaggle/input/bsd44/other/default/1/bitsandbytes-0.44.0-py3-none-manylinux_2_24_x86_64.whl
bitsandbytes is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llama3_8b/pytorch/v1/1/llama_3_finetuned_model.pth
/kaggle/input/llama-3/transformers/8b-hf/1/model.safetensors.index.json
/kaggle/input/llama-3/transformers/8b-hf/1/model-00003-of-00004.safetensors
/kaggle/input/llama-3/transformers/8b-hf/1/config.json
/kaggle/input/llama-3/transformers/8b-hf/1/LICENSE
/kaggle/input/llama-3/transformers/8b-hf/1/model-00001-of-00004.safetensors
/kaggle/input/llama-3/transformers/8b-hf/1/USE_POLICY.md
/kaggle/input/llama-3/transformers/8b-hf/1/tokenizer.json
/kaggle/input/llama-3/transformers/8b-hf/1/tokenizer_config.json
/kaggle/input/llama-3/transformers/8b-hf/1/example_text_completion.py
/kaggle/input/llama-3/transformers/8b-hf/1/requirements.txt
/kaggle/input/llama-3/transformers/8b-hf/1/model-00004-of-00004.safetensors
/kaggle/input/llama-3/transformers/8b-hf/1/eval_details.md
/kaggle/input/llama-3/transformers/8b-hf/1/special_tokens_map.json
/kaggle/input/llama-3/transformers/8b-hf/1/model-00002-of-00004.safetensors
/kaggle/input/lla

## 1.2 导入相关依赖包

In [3]:
import os
import gc
import re
from time import time
import random
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import torch
import transformers
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, LlamaModel, LlamaForSequenceClassification
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import torch.nn.functional as F

tqdm.pandas()

print(f'Torch Version: {torch.__version__}')

Torch Version: 2.5.1+cu121


## 1.3 配置训练参数

In [4]:
class CFG:
    NUM_EPOCHS = 1
    BATCH_SIZE = 8
    DROPOUT = 0.05
    MODEL_NAME = '/kaggle/input/llama-3/transformers/8b-hf/1'
    SEED = 2024
    MAX_LENGTH = 1024
    NUM_WARMUP_STEPS = 128
    LR_MAX = 5e-5
    NUM_LABELS = 3  # 分类的类别数量
    LORA_RANK = 4
    LORA_ALPHA = 8
    LORA_MODULES = ['o_proj', 'v_proj']

## 1.4 固定随机种子

In [5]:
def set_seeds(seed):
    """Set seeds for reproducibility"""
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
 
set_seeds(seed=CFG.SEED)

## 1.5 加载分词器

In [6]:
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'
tokenizer.add_eos_token = True

# save tokenizer to load offline during inference
tokenizer.save_pretrained('tokenizer')

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/tokenizer.json')

## 1.6 数据加载

In [7]:
def get_token_lengths(texts):
    # tokenize and receive input_ids for each text
    input_ids = tokenizer(texts.tolist(), return_tensors="pt")["input_ids"]
    # return length of inputs_ids for each text
    return [len(t) for t in input_ids]

train = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv')

In [8]:
def process(input_str):
    stripped_str = input_str.strip('[]')    # 去除左右两侧的[]
    sentences = [s.strip('"') for s in stripped_str.split('","')]   # 以","分割，去除分割后字符串前后的"
    return ' '.join(sentences)

train.loc[:, 'prompt'] = train['prompt'].apply(process)
train.loc[:, 'response_a'] = train['response_a'].apply(process)
train.loc[:, 'response_b'] = train['response_b'].apply(process)

# Drop "Null" for training
indexes = train[(train.response_a == 'null') & (train.response_b == 'null')].index
train.drop(indexes, inplace=True)
train.reset_index(inplace=True, drop=True)

print(f"Total {len(indexes)} Null response rows dropped") 
print('Total train samples: ', len(train))

train.head(5)

Total 19 Null response rows dropped
Total train samples:  57458


Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,Is it morally right to try to have a certain p...,The question of whether it is morally right to...,"As an AI, I don't have personal beliefs or opi...",1,0,0
1,53567,koala-13b,gpt-4-0613,What is the difference between marriage licens...,A marriage license is a legal document that al...,A marriage license and a marriage certificate ...,0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,explain function calling. how would you call a...,Function calling is the process of invoking or...,Function calling is the process of invoking a ...,0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,How can I create a test set for a very rare ca...,Creating a test set for a very rare category c...,When building a classifier for a very rare cat...,1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,What is the best way to travel from Tel-Aviv t...,The best way to travel from Tel Aviv to Jerusa...,The best way to travel from Tel-Aviv to Jerusa...,0,1,0


## 1.7 数据预处理

In [9]:
train['text'] = 'User prompt: ' + train['prompt'] + '\n\nModel A :\n' + \
train['response_a'] + '\n\n--------\n\nModel B:\n' + train['response_b']
print(type(train['text'][4]))
print(train['text'][4])

<class 'str'>
User prompt: What is the best way to travel from Tel-Aviv to Jerusalem? Car? Bus? Plane?

Model A :
The best way to travel from Tel Aviv to Jerusalem depends on your personal preference and the availability of transportation options. All the options you have mentioned are valid options, but here are some details to help you make your decision:\n\n*   By car: Traveling by car is the quickest way to get from Tel Aviv to Jerusalem, as the distance between the two cities is only about 60 kilometers (37 miles). It takes around 45 minutes to drive from Tel Aviv to Jerusalem by car, depending on the traffic.\n*   By bus: There are several bus lines that run from Tel Aviv to Jerusalem, and the journey takes around 1 hour and 30 minutes by bus. The buses are comfortable and reliable, and they offer a scenic view of the beautiful Israeli countryside.\n*   By plane: There are no direct flights from Tel Aviv to Jerusalem, so you need to take a flight from Tel Aviv's Ben Gurion Intern

## 1.8 模型训练

# 2. 推理和预测

## 2.1 导入依赖包

In [10]:
import torch
import sklearn
import numpy as np
import pandas as pd
import time

from transformers import AutoTokenizer, LlamaModel, LlamaForSequenceClassification, BitsAndBytesConfig
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
from torch.cuda.amp import autocast
from threading import Thread

torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

if (not torch.cuda.is_available()): print("Sorry - GPU required!")

## 4.2 设置全局参数

In [11]:
MODEL_NAME = '/kaggle/input/llama-3/transformers/8b-hf/1'
WEIGHTS_PATH = '/kaggle/input/llama3_8b/pytorch/v1/1/llama_3_finetuned_model.pth'
MAX_LENGTH = 1024
BATCH_SIZE = 8
DEVICE = torch.device("cuda")  

## 4.3 准备待预测数据

In [21]:
test = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')
sample_sub = pd.read_csv('/kaggle/input/llm-classification-finetuning/sample_submission.csv')

# concatenate strings in list
def process(input_str):
    stripped_str = input_str.strip('[]')
    sentences = [s.strip('"') for s in stripped_str.split('","')]
    return  ' '.join(sentences)

test.loc[:, 'prompt'] = test['prompt'].apply(process)
test.loc[:, 'response_a'] = test['response_a'].apply(process)
test.loc[:, 'response_b'] = test['response_b'].apply(process)

display(sample_sub)
display(test.head(5))

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.333333,0.333333,0.333333
1,211333,0.333333,0.333333,0.333333
2,1233961,0.333333,0.333333,0.333333


Unnamed: 0,id,prompt,response_a,response_b
0,136060,"I have three oranges today, I ate an orange ye...",You have two oranges today.,You still have three oranges. Eating an orange...
1,211333,You are a mediator in a heated political debat...,Thank you for sharing the details of the situa...,Mr Reddy and Ms Blue both have valid points in...
2,1233961,How to initialize the classification head when...,When you want to initialize the classification...,To initialize the classification head when per...


In [13]:
# Prepare text for model
test['text'] = 'User prompt: ' + test['prompt'] +  '\n\nModel A :\n' + test['response_a'] +'\n\n--------\n\nModel B:\n'  + test['response_b']
print(test['text'][0])

User prompt: I have three oranges today, I ate an orange yesterday. How many oranges do I have?

Model A :
You have two oranges today.

--------

Model B:
You still have three oranges. Eating an orange yesterday does not affect the number of oranges you have today.


## 4.4 分词

In [14]:
tokenizer = AutoTokenizer.from_pretrained('./tokenizer')

tokens = tokenizer(test['text'].tolist(), padding='max_length',
                   max_length=MAX_LENGTH, truncation=True, return_tensors='pt')

INPUT_IDS = tokens['input_ids'].to(DEVICE, dtype=torch.int32)
ATTENTION_MASKS = tokens['attention_mask'].to(DEVICE, dtype=torch.int32)

# Move tensors to CPU and convert them to lists
input_ids_cpu = [tensor.cpu().tolist() for tensor in INPUT_IDS]
attention_masks_cpu = [tensor.cpu().tolist() for tensor in ATTENTION_MASKS]

data = pd.DataFrame()
data['INPUT_IDS'] = input_ids_cpu
data['ATTENTION_MASKS'] = attention_masks_cpu
data[:2]

Unnamed: 0,INPUT_IDS,ATTENTION_MASKS
0,"[1502, 10137, 25, 358, 617, 2380, 85138, 3432,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[1502, 10137, 25, 1472, 527, 264, 69030, 304, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


## 4.5 加载模型

In [15]:
# BitsAndBytes configuration
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.float16,
    bnb_8bit_use_double_quant=False)

# 使用单个GPU加载基础模型
device = torch.device('cuda')  # 指定使用的GPU设备

# 加载基础模型
base_model = LlamaForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map=device)  # 指定模型加载到的设备
base_model.config.pad_token_id = tokenizer.pad_token_id  # 配置模型的填充标记ID

Unused kwargs: ['bnb_8bit_compute_dtype', 'bnb_8bit_use_double_quant']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/llama-3/transformers/8b-hf/1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 4.6 加载LoRA模型参数

In [16]:
# LoRa configuration
peft_config = LoraConfig(
    r=4,
    lora_alpha=8,
    lora_dropout=0.05,
    bias='none',
    inference_mode=True,
    task_type=TaskType.SEQ_CLS,
    target_modules=['o_proj', 'v_proj'])

# 获取peft模型并加载权重
model = get_peft_model(base_model, peft_config).to(device)
model.load_state_dict(torch.load(WEIGHTS_PATH), strict=False)
model.eval()
print(model)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
              (k_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
              (v_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=1024, bias=False)
             

  model.load_state_dict(torch.load(WEIGHTS_PATH), strict=False)


## 4.7 执行推理

In [17]:
def inference(df, model, device, batch_size=BATCH_SIZE):
    input_ids = torch.tensor(df['INPUT_IDS'].values.tolist(), dtype=torch.long)
    attention_mask = torch.tensor(df['ATTENTION_MASKS'].values.tolist(), dtype=torch.long)
    
    generated_class_a = []
    generated_class_b = []
    generated_class_c = []

    model.eval()
    
    for start_idx in range(0, len(df), batch_size):
        end_idx = min(start_idx + batch_size, len(df))
        batch_input_ids = input_ids[start_idx:end_idx].to(device)
        batch_attention_mask = attention_mask[start_idx:end_idx].to(device)
        
        with torch.no_grad():
            with autocast():
                outputs = model(
                    input_ids=batch_input_ids,
                    attention_mask=batch_attention_mask
                )
        
        probabilities = torch.softmax(outputs.logits, dim=-1).cpu().numpy()
        
        generated_class_a.extend(probabilities[:, 0])
        generated_class_b.extend(probabilities[:, 1])
        generated_class_c.extend(probabilities[:, 2])
    
    df['winner_model_a'] = generated_class_a
    df['winner_model_b'] = generated_class_b
    df['winner_tie'] = generated_class_c

    torch.cuda.empty_cache()  

    return df


# 记录开始时间
st = time.time()

# 对整个数据集进行推理
data = inference(data, model, device)

# 打印总耗时
print(f"Processing complete. Total time: {time.time() - st}")

  with autocast():


Processing complete. Total time: 7.270376682281494


In [18]:
data

Unnamed: 0,INPUT_IDS,ATTENTION_MASKS,winner_model_a,winner_model_b,winner_tie
0,"[1502, 10137, 25, 358, 617, 2380, 85138, 3432,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0.193481,0.272949,0.533691
1,"[1502, 10137, 25, 1472, 527, 264, 69030, 304, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0.465332,0.303223,0.231323
2,"[1502, 10137, 25, 2650, 311, 9656, 279, 24790,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0.33374,0.384521,0.281738


In [22]:
TARGETS = ['winner_model_a', 'winner_model_b', 'winner_tie']

sample_sub[TARGETS] = data[TARGETS]
display(sample_sub)

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.193481,0.272949,0.533691
1,211333,0.465332,0.303223,0.231323
2,1233961,0.33374,0.384521,0.281738


In [23]:
sample_sub.to_csv('submission.csv', header=True, index=False)