In [None]:
import os
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


# Process-wide GPU configuration, applied in a single update:
#   * CUDA_VISIBLE_DEVICES limits which physical GPUs this process may see.
#   * PYTORCH_CUDA_ALLOC_CONF passes the max_split_size_mb option to PyTorch's
#     CUDA caching allocator.
# NOTE(review): these are expected to take effect only if set before CUDA is
# first initialised in this kernel -- run this cell before any GPU work.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "1, 2, 3",
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:64",
    }
)

In [None]:
import torch
# Quick CUDA sanity check: report the index of the currently selected device,
# the name of device 0, and the installed torch version (in that order).
active_device_index = torch.cuda.current_device()
print(active_device_index)

first_device_name = torch.cuda.get_device_name(0)
print(first_device_name)

print(torch.__version__)

In [None]:
# inference 
from openai import OpenAI


# Smoke test: send one chat turn to the OpenAI-compatible server on localhost
# and print the assistant's reply.
# NOTE(review): api_key="0" looks like a placeholder for a server that does not
# check credentials -- confirm against the serving setup.
client = OpenAI(
    api_key="0",
    base_url="http://127.0.0.1:8000/v1",
)

# Alternative request kept for reference: a system + user pair that asks the
# model to extract company names from a title and text.
# message = [
#     {
#     "role": "system",
#     "content": (
#     "You are a named-entity recognition assistant. Identify all company names in the following title and text, "
#     "and output an array of strings, each string being one company name."
#     )
#     },
#     {
#     "role": "user",
#     "content": f"Title: \"Microsoft Acquires GitHub\"\nText: \"Microsoft announced today that it has acquired GitHub, and will integrate GitHub Copilot into its Azure cloud platform.\""
#     }
# ]
message = [dict(role="user", content="who are you?")]

result = client.chat.completions.create(
    model="Qwen/Qwen2.5-7B-Instruct",
    messages=message,
)
first_choice = result.choices[0]
print(first_choice.message.content)


Batch inference version: run every record of the test set through the API and save the predictions.

In [None]:
import os
import json
import time
from tqdm import tqdm # 进度条
from openai import OpenAI
from openai import APIConnectionError, OpenAIError

# Batch inference: send every record of INPUT_FILE to an OpenAI-compatible chat
# endpoint, parse each reply as JSON, store it under "predicted_entities" and
# write all records to OUTPUT_FILE.

# Endpoint configuration (host, port and key can be overridden via environment).
API_HOST = os.environ.get("API_HOST", "127.0.0.1")
API_PORT = os.environ.get("API_PORT", "8520")
# NOTE: this client instance shadows the name of the `openai` package; the name
# is kept unchanged so that anything relying on it keeps working.
openai = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "0"), base_url=f"http://{API_HOST}:{API_PORT}/v1")

# Model served by the endpoint.
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"

# Input / output files.
INPUT_FILE = "data/clean_data/sft_test.json"
OUTPUT_FILE = "data/generated/responses.json"

# Retry policy and pacing.
MAX_RETRIES = 3
RETRY_DELAY = 1  # seconds to wait between retries after a connection error
SLEEP_BETWEEN = 0.2  # seconds to wait between consecutive records

# Create the output directory up front so that a missing folder fails fast
# instead of discarding every prediction when the results are finally written.
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Load all records.
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    records = json.load(f)

results = []

# Process the records one by one.
for record in tqdm(records, desc="Processing records", unit="rec"):
    system = record.get("system_prompt", "")
    text = record.get("prompt", "")
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": f"Text: \"{text}\""}
    ]

    # Call the API; only connection errors are retried. Any other API error or
    # an unparsable reply leaves `entities` as an empty list for this record.
    entities = []
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = openai.chat.completions.create(
                model=MODEL_NAME,
                messages=messages,
                max_tokens=256,
            )
            # The message content may be None; treat that as an empty reply so
            # it is reported as invalid JSON instead of crashing the whole run.
            content = (resp.choices[0].message.content or "").strip()
            entities = json.loads(content)
            break
        except APIConnectionError as e:
            print(f"Connection error on attempt {attempt}/{MAX_RETRIES}: {e}")
            # No point in waiting after the final attempt.
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_DELAY)
        except OpenAIError as e:
            print(f"API error: {e}")
            break
        except json.JSONDecodeError:
            print(f"Invalid JSON response: {content}")
            break

    # The record dict is annotated in place and collected for output.
    record["predicted_entities"] = entities
    results.append(record)
    time.sleep(SLEEP_BETWEEN)

# Write the results.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"Batch processing completed: {len(results)} records saved to {OUTPUT_FILE}")

In [None]:
from modelscope import snapshot_download
# model_dir = snapshot_download('Qwen/Qwen2.5-7B-Instruct', cache_dir='/data/mengao/models')
# model_dir = snapshot_download('AI-ModelScope/Fin-R1', cache_dir='/data/mengao/models')


In [None]:
import torch
import deepspeed
from transformers import AutoModelForCausalLM, AutoTokenizer


# Single-prompt generation test with the model wrapped by DeepSpeed inference.

model_dir = '/data/mengao/models/Qwen/Qwen2.5-7B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

# The model is loaded without `device_map` so that DeepSpeed alone decides the
# placement; init_inference moves the module to the current accelerator device.
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    torch_dtype=torch.float16,
    trust_remote_code=True
)

model.eval()

# Only documented inference-config fields are passed here: the previous
# `min_params_dtype` / `tensor_parallel_degree` keys are not part of the
# DeepSpeed inference config, and `mp_size` / `replace_method` are deprecated.
# tp_size=1 because a notebook kernel is a single process; tensor parallelism
# across several GPUs requires starting one process per GPU with the launcher.
# NOTE(review): confirm that kernel injection supports this architecture in the
# installed DeepSpeed version; otherwise set replace_with_kernel_inject=False.
model = deepspeed.init_inference(
    model,
    tensor_parallel={"tp_size": 1},
    dtype=torch.float16,
    replace_with_kernel_inject=True,
)

# Build the prompt with the tokenizer's own chat template instead of
# hand-written markup, so role markers match what the model was trained on and
# the prompt ends with an open assistant turn.
messages = [
    {
        "role": "system",
        "content": (
            "You are a named-entity recognition assistant. Identify all company names in the following title and text, "
            "and output an array of strings, each string being one company name."
        ),
    },
    {
        "role": "user",
        "content": (
            "Title: \"Microsoft Acquires GitHub\"\n"
            "Text: \"Microsoft announced today that it has acquired GitHub, and will integrate GitHub Copilot into its Azure cloud platform.\""
        ),
    },
]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# The templated text already contains its special tokens, so none are added.
# `model` is now the DeepSpeed engine; the wrapped module carries the device.
inputs = tokenizer(prompt, return_tensors='pt', add_special_tokens=False).to(model.module.device)
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        return_dict_in_generate=True,
    )

# Decode only the newly generated tokens; the returned sequence starts with the
# prompt tokens.
prompt_length = inputs["input_ids"].shape[1]
response = tokenizer.decode(outputs.sequences[0][prompt_length:], skip_special_tokens=True)
print(response)

In [None]:
import json
import openai
import time

# Batch NER over a JSON file through an OpenAI-compatible endpoint (for example
# a local TGI server). Uses the openai>=1.0 client interface, matching the rest
# of this notebook; the pre-1.0 `openai.ChatCompletion` API no longer exists
# in those versions.
import os  # local import: this cell's import lines do not bring `os` into scope

# An empty API key would produce a malformed Authorization header, so fall back
# to a dummy value when OPENAI_API_KEY is not set.
tgi_client = openai.OpenAI(
    base_url="http://localhost:8080/v1",  # local TGI service
    api_key=os.environ.get("OPENAI_API_KEY") or "0",
)

# Input and output file paths.
INPUT_FILE = "input_data.json"
OUTPUT_FILE = "output_data.json"

# Load the input records.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

results = []

for idx, record in enumerate(data):
    title = record.get("title", "")
    text = record.get("text", "")

    # Build the system and user messages.
    messages = [
        {
            "role": "system",
            "content": (
                "You are a named-entity recognition assistant. Identify all company names in the following title and text, "
                "and output an array of strings, each string being one company name."
            )
        },
        {
            "role": "user",
            "content": f"Title: \"{title}\"\nText: \"{text}\""
        }
    ]

    # Call the chat completion endpoint. Best effort by design: any failure
    # (request error, empty reply, reply that is not valid JSON) is reported
    # and the record gets an empty entity list.
    try:
        resp = tgi_client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=0,
            max_tokens=200
        )
        content = resp.choices[0].message.content.strip()
        # The reply is expected to be a JSON array of strings.
        entities = json.loads(content)
    except Exception as e:
        print(f"Error at record {idx}: {e}")
        entities = []

    # Attach the prediction to the record.
    record["predicted_entities"] = entities
    results.append(record)

    # Pause briefly between requests to avoid rate limiting.
    time.sleep(0.2)

# Save the results.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"Processed {len(results)} records, results saved to {OUTPUT_FILE}")

In [None]:
# Greedy single-prompt generation test with plain Transformers.
# Relies on `torch`, `AutoTokenizer` and `AutoModelForCausalLM` imported in an
# earlier cell.

# 1. Path of the local model / tokenizer checkpoint
model_name = "/data/mengao/models/Qwen/Qwen2.5-7B-Instruct"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

# 2. Load the model and move it to the selected device
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    pad_token_id=tokenizer.pad_token_id,
    trust_remote_code=True
).to(device)

model.eval()  # inference mode (disables dropout etc.)


# 3. Build the test prompt with the tokenizer's own chat template instead of
#    hand-written markup, so role markers match what the model was trained on
#    and the prompt ends with an open assistant turn.
messages = [
    {
        "role": "system",
        "content": (
            "You are a named-entity recognition assistant. Identify all company names in the following title and text, "
            "and output an array of strings, each string being one company name."
        ),
    },
    {
        "role": "user",
        "content": (
            "Title: \"Microsoft Acquires GitHub\"\n"
            "Text: \"Microsoft announced today that it has acquired GitHub, and will integrate GitHub Copilot into its Azure cloud platform.\""
        ),
    },
]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

with torch.no_grad():  # no gradients are needed for generation
    # 4. Encode the input; the templated text already contains its special
    #    tokens, so none are added here.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=False,
        truncation=True,
        max_length=1024,  # adjust to the model's maximum input length
    ).to(device)

    # 5. Generate
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,  # upper bound on newly generated tokens
        do_sample=False,    # greedy decoding for repeatable test output
        eos_token_id=tokenizer.eos_token_id,
    )


# Alternative kept for reference: the same test through a text-generation
# pipeline, cutting the prompt off the returned text.
# gen = pipeline(
#     "text-generation",
#     model=model.to('cuda'),
#     tokenizer=tokenizer,
# )
# outputs = gen(
#     prompt,
#     max_new_tokens=10,
#     do_sample=True,
#     eos_token_id=tokenizer.eos_token_id
# )
# generated = outputs[0]["generated_text"][len(prompt):].strip()

# 6. Report the output shape (prompt + generated tokens) and decode only the
#    newly generated part; the returned sequence starts with the prompt tokens.
prompt_length = inputs["input_ids"].shape[1]
print("Output tokens:", outputs.shape)
print("Output text:", tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True))
torch.cuda.empty_cache()  # release cached GPU memory
