### Your current environment
The output of `python collect_env.py`
```python
from fastapi import FastAPI
from transformers import AutoTokenizer
from pydantic import BaseModel
import torch
import asyncio
from vllm import LLM
import os
import uvicorn


class cls_request(BaseModel):
    text: str


os.environ["CUDA_VISIBLE_DEVICES"] = '0'

label_list = ['家用电器', '机械设备', '建筑材料', '电力设备', '商贸零售', '国防军工', '医药生物', '建筑装饰', '电子',
              '传媒', '基础化工', '美容护理', '社会服务', '计算机', '环保', '汽车', '食品饮料', '轻工制造', '有色金属',
              '交通运输', '农林牧渔', '公用事业', '房地产', '石油石化', '银行', '钢铁', '通信', '非银金融', '纺织服饰',
              '煤炭']

app = FastAPI()

merged_model_path = "/home/lwl/wordkspace/LLaMA-Factory/Qwen3Classification/checkpoint/merged_model"
cls_model = LLM(model=merged_model_path, trust_remote_code=True, dtype='bfloat16',
                task="classify", max_model_len=2048)
cls_tokenizer = AutoTokenizer.from_pretrained(merged_model_path)

# Queue and batching parameters
request_queue = []
BATCH_SIZE = 10
BATCH_TIMEOUT = 0.01  # seconds


@app.on_event("startup")
async def startup_event():
    asyncio.create_task(batch_worker())


async def batch_worker():
    # Periodically drain the queue and classify up to BATCH_SIZE texts at a time.
    while True:
        await asyncio.sleep(BATCH_TIMEOUT)
        if len(request_queue) == 0:
            continue
        batch = []
        while len(batch) < BATCH_SIZE and request_queue:
            batch.append(request_queue.pop(0))
        if batch:
            # Truncate each text to max_model_len before passing it to the model.
            texts_inputs = [
                cls_tokenizer.decode(
                    cls_tokenizer(item['text'], truncation=True, max_length=2048)['input_ids']
                )
                for item in batch
            ]
            with torch.no_grad():
                # Note: LLM.classify() is synchronous, so it blocks the event loop here.
                outputs = cls_model.classify(texts_inputs)
            predictions = [item.outputs.probs.index(max(item.outputs.probs)) for item in outputs]
            for item, pred in zip(batch, predictions):
                item['future'].set_result(label_list[pred])


@app.post("/cls")
async def get_cls(cls: cls_request):
    # Enqueue the request and wait for the batch worker to fill in the result.
    loop = asyncio.get_event_loop()
    future = loop.create_future()
    request_queue.append({'text': cls.text, 'future': future})
    label = await future
    return {"label": label}


if __name__ == '__main__':
    uvicorn.run(app, host="0.0.0.0", port=18010)
```
### How would you like to use vllm
Currently, I am deploying this classification service with the `LLM` class from vLLM, as shown above. Now I want to switch to `AsyncLLMEngine` for optimization. How should I modify the code?
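For reference, below is a minimal, untested sketch of the direction I am imagining. I am not sure whether `AsyncLLMEngine.encode()` is the correct async entrypoint for the `classify` task, whether `AsyncEngineArgs` accepts `task="classify"` the same way `LLM(...)` does, or whether the returned `outputs.data` really holds the class probabilities, so please correct me where this is wrong (the label list and model path are the same as above):

```python
import uuid

import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from vllm import AsyncEngineArgs, AsyncLLMEngine, PoolingParams

label_list = [...]  # the same 30 labels as in the script above


class cls_request(BaseModel):
    text: str


# Assumption: AsyncEngineArgs mirrors the LLM(...) arguments, including task="classify".
engine_args = AsyncEngineArgs(
    model="/home/lwl/wordkspace/LLaMA-Factory/Qwen3Classification/checkpoint/merged_model",
    task="classify",
    dtype="bfloat16",
    max_model_len=2048,
    trust_remote_code=True,
)
engine = AsyncLLMEngine.from_engine_args(engine_args)

app = FastAPI()


@app.post("/cls")
async def get_cls(cls: cls_request):
    request_id = str(uuid.uuid4())
    final_output = None
    # Assumption: encode() is the pooling/classification entrypoint; it is an async
    # generator, so keep only the last yielded output for this request.
    async for output in engine.encode(cls.text, PoolingParams(), request_id):
        final_output = output
    probs = final_output.outputs.data  # assumed to be the per-class probabilities
    pred = int(probs.argmax())
    return {"label": label_list[pred]}


if __name__ == '__main__':
    uvicorn.run(app, host="0.0.0.0", port=18010)
```

My understanding is that with the async engine I could drop the manual `request_queue`/`batch_worker` entirely, since vLLM performs continuous batching of concurrent requests internally, but please confirm whether that is the recommended pattern.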
### Before submitting a new issue...
- Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the documentation page, which can answer lots of frequently asked questions.