In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import os

# Make only GPU 0 visible to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Model / tokenizer configuration.
MODEL_NAME = "meta-llama/Llama-3.2-3B"
BATCH_SIZE = 128  # In theory this could be as large as 100,000; in practice it is bounded by GPU memory.

def setup_model():
    """Load the tokenizer and causal language model named by ``MODEL_NAME``.

    The tokenizer is prepared for *batched* generation with a decoder-only
    model:

    * if it defines no pad token, the EOS token is reused as the pad token;
    * padding is applied on the left, so that every prompt ends directly
      where generation starts (with right padding the model would have to
      continue from pad tokens for the shorter prompts in a batch).

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Reuse eos_token as the padding token when the checkpoint ships none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Decoder-only generation needs left padding for correct batched output.
    tokenizer.padding_side = "left"

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.bfloat16,
        device_map="auto",  # let the library place the weights on the visible device(s)
    )
    return tokenizer, model

def infer_large_batch(prompts, tokenizer, model, max_length=256,
                      max_prompt_length=256, return_full_text=True):
    """Run batched generation for a list of prompts.

    Args:
        prompts: List of prompt strings that form one batch.
        tokenizer: Tokenizer with a pad token configured (it is called with
            ``padding=True``).
        model: Causal LM exposing ``.device`` and ``.generate``.
        max_length: Upper bound passed to ``model.generate(max_length=...)``;
            it counts prompt tokens plus generated tokens.
        max_prompt_length: Prompts are truncated to this many tokens when
            tokenizing. Defaults to 256, the value that used to be
            hard-coded.
        return_full_text: When True (default, original behaviour) the decoded
            text contains the prompt followed by the continuation. When
            False, only the newly generated tokens are decoded.

    Returns:
        list[str]: One decoded string per prompt; ``[]`` for an empty batch.
    """
    # An empty batch has nothing to tokenize or generate.
    if isinstance(prompts, (list, tuple)) and len(prompts) == 0:
        return []

    # Tokenize the prompts into one padded batch.
    encoded = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_prompt_length,
    )
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    # Batched generation; gradients are not needed for inference.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            max_length=max_length,
            # Passing the pad id explicitly avoids the per-call
            # "Setting `pad_token_id` to `eos_token_id`" warning.
            pad_token_id=tokenizer.pad_token_id,
        )

    if not return_full_text:
        # The returned sequences start with the (padded) prompt ids, so the
        # continuation begins at the padded prompt width.
        prompt_width = encoded["input_ids"].shape[1]
        outputs = outputs[:, prompt_width:]

    # Decode the generated token ids back to text.
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

if __name__ == "__main__":
    # Local import: this driver is the only user of json in the cell.
    import json

    # Example data standing in for the real prompt set (e.g. 100,000 prompts).
    prompts = ["The sound \'A wild assortment of birds are chirping and calling out in nature.\' is comming from the north. Rephrase the above sentence as a short English sentence describing the sound and all the details of its source."] * 512
    tokenizer, model = setup_model()

    # Run inference batch by batch and accumulate every batch's results.
    # (Assigning instead of extending would keep only the final batch.)
    results = []
    total = len(prompts)
    for start in range(0, total, BATCH_SIZE):
        batch = prompts[start:start + BATCH_SIZE]
        results.extend(infer_large_batch(batch, tokenizer, model))
        # Clamp so the counter never exceeds the total on a short last batch.
        print(f"Processed {min(start + BATCH_SIZE, total)}/{total} prompts.")

    # Save all results.
    with open("output.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    print("Inference complete. Results saved to output.json.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processed 128/512 prompts.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processed 256/512 prompts.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processed 384/512 prompts.
Processed 512/512 prompts.
Inference complete. Results saved to output.json.


In [1]:
import torch
from transformers import pipeline
import os

# Local import: json is only needed to persist the results of this cell.
import json

# Make only GPU 0 visible to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Text-generation pipeline; inputs are processed in batches of up to 160.
pipe = pipeline(
    "text-generation", 
    model=model_id, 
    torch_dtype=torch.bfloat16, 
    device_map="auto",
    batch_size=160,
)

# One chat conversation: the system turn carries the instruction and the
# user turn carries the sentence to rephrase.
messages = [
    {'role': 'system', 'content': 'Rephrase the above sentence as a short English sentence describing the sound and all the details of its source.'},
    {"role": "user", "content": "The sound \'A wild assortment of birds are chirping and calling out in nature.\' is comming from the north. "},
]

# Batching requires a pad token; reuse EOS. Decoder-only models must be
# padded on the left so each prompt ends where generation begins.
pipe.tokenizer.pad_token = pipe.tokenizer.eos_token
pipe.tokenizer.padding_side = "left"

# Identical prompts produce different texts below, i.e. decoding samples.
# Seed the RNG so a re-run is repeatable (bit-exact equality across
# different hardware / library versions is not guaranteed).
torch.manual_seed(0)

# return_full_text=False keeps only the newly generated text in the output.
outputs = pipe([messages] * 128, max_new_tokens=77, return_full_text=False)

# Persist the raw pipeline output.
with open("output.json", "w", encoding="utf-8") as f:
    json.dump(outputs, f, indent=4)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [2]:
# Preview the generated text for the first two inputs; each entry of
# `outputs` is indexed as [candidate][field], and candidate 0 is shown.
for sample_idx in (0, 1):
    print(outputs[sample_idx][0]['generated_text'])


A melodious chorus of birds, comprising various species, is emanating from the northern direction.
A melodic chorus of chirping birds, including a variety of species, is coming from the northern direction.
