In [1]:

import torch
import torch.nn as nn
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments


# Hub identifier of the base checkpoint; shared by the tokenizer and the model.
model_name = "meta-llama/LLaMA-3.1-8B"

# Tokenizer setup. The EOS token is reused as the padding token so the
# tokenizer has a pad token available when preprocess_function pads sequences.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Causal-LM weights, loaded with the library's default settings.
model = AutoModelForCausalLM.from_pretrained(model_name)

# Local JSON files backing the 'train' and 'eval' splits.
data_files = dict(
    train='/data/home/bin_liang/documents/distributed-training-coding-everything/datasets/ocra/demo_1000.json',
    eval='/data/home/bin_liang/documents/distributed-training-coding-everything/datasets/ocra/demo_1000_test.json',
)

# Read both files with the generic 'json' loader; splits are keyed as above.
dataset = load_dataset('json', data_files=data_files)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
# Prompt used to probe the model
input_text = "你好, 我是"

# Encode the prompt as a tensor of token ids (one row per sequence)
input_ids = tokenizer.encode(input_text, return_tensors='pt')

print(input_ids)

# One forward pass with gradient tracking disabled.
# NOTE: this is a single call of the model, not autoregressive generation.
with torch.no_grad():
    outputs = model(input_ids=input_ids)

# For every input position, keep the vocabulary id with the highest logit
predicted_ids = outputs.logits.argmax(dim=-1)

print(predicted_ids)

# Decode the predicted ids of the first (and only) sequence back to text
output_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)

print("输入:", input_text)
print("输出:", output_text)

tensor([[128000,  57668,  53901,     11, 106026,  21043]])


tensor([[14924, 53901,  3922, 37046, 21043, 31809]])
输入: 你好, 我是
输出: Question好，我是小


In [2]:
def format_messages(messages):
    """Render a conversation as a single newline-terminated transcript string.

    Each message is a mapping with a ``"role"`` key; messages whose role is
    ``"user"`` are emitted as ``[HUMAN]: <content>`` and those whose role is
    ``"assistant"`` as ``[AI]: <content>``, one per line. Messages with any
    other role are skipped (their ``"content"`` is never read).

    Args:
        messages: Iterable of mappings with ``"role"`` and ``"content"`` keys.

    Returns:
        The concatenated transcript; an empty string when nothing matched.
    """
    rendered = []
    for message in messages:
        role = message["role"]
        if role == "user":
            rendered.append(f"[HUMAN]: {message['content']}\n")
        elif role == "assistant":
            rendered.append(f"[AI]: {message['content']}\n")
    # Join once at the end instead of growing a string inside the loop.
    return "".join(rendered)

def preprocess_function(examples, max_length=512):
    """Tokenize a batch of conversations; intended for ``dataset.map(batched=True)``.

    Every entry of ``examples['messages']`` is rendered to one string with
    ``format_messages`` and the resulting list is tokenized with the
    notebook-level ``tokenizer``. The tokenizer's output fields are merged into
    ``examples``, which is updated in place and returned.

    Args:
        examples: Mapping from column name to a list of values for the batch;
            must contain a ``'messages'`` column.
        max_length: Number of tokens every sequence is padded/truncated to.
            Tune this to the data and memory budget. Passing ``None`` restores
            the previous behaviour of deferring to the tokenizer's own maximum.

    Returns:
        The same ``examples`` object with the tokenizer output fields added.
    """
    # One formatted transcript per sample in the batch.
    contexts = [format_messages(messages) for messages in examples['messages']]

    # Pin the length explicitly. With padding="max_length" and no max_length,
    # the tokenizer pads to the model's maximum input length, which for a
    # long-context checkpoint makes every example enormous
    # (NOTE(review): check tokenizer.model_max_length for this checkpoint).
    tokenized_inputs = tokenizer(
        contexts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # Merge the tokenizer fields into the batch and hand the same object back.
    examples.update(tokenized_inputs)
    return examples

In [None]:
# Show the Python type of the first record in the 'train' split.
type(dataset['train'][0])

In [None]:
# Apply preprocess_function to `dataset` in batches: with batched=True the
# function is handed a mapping of column name -> list of values per call.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Peek at the token ids of the first two training examples. Rows are sliced
# first and the column picked afterwards, so only two rows are materialized
# rather than the entire 'input_ids' column.
tokenized_dataset['train'][:2]['input_ids']