
[Usage]: Running vLLM on macOS via Docker fails  #19936

@yuluo-yx

Description


I searched the existing issues but found nothing relevant.

Logs

WARNING 06-21 08:32:55 arg_utils.py:1145] The model has a long context length (131072). This may cause OOM errors during the initial memory profiling phase, or result in low performance due to small KV cache space. Consider setting --max-model-len to a smaller value.
WARNING 06-21 08:32:55 config.py:1148] Possibly too large swap space. 4.00 GiB out of the 7.74 GiB total CPU memory is allocated for the swap space.
INFO 06-21 08:33:12 config.py:542] This model supports multiple tasks: {'embed', 'classify', 'score', 'generate', 'reward'}. Defaulting to 'generate'.

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 911, in <module>
    uvloop.run(run_server(args))
  File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 109, in run
    return __asyncio.run(
           ^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
    return runner.run(main)
           ^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
  File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 61, in wrapper
    return await main
           ^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 875, in run_server
    async with build_async_engine_client(args) as engine_client:
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
    return await anext(self.gen)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client
    async with build_async_engine_client_from_engine_args(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
    return await anext(self.gen)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 217, in build_async_engine_client_from_engine_args
    engine_config = engine_args.create_engine_config()
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1276, in create_engine_config
    config = VllmConfig(
             ^^^^^^^^^^^
  File "<string>", line 19, in __init__
  File "/usr/local/lib/python3.12/dist-packages/vllm/config.py", line 3206, in __post_init__
    self.model_config.verify_async_output_proc(self.parallel_config,
  File "/usr/local/lib/python3.12/dist-packages/vllm/config.py", line 677, in verify_async_output_proc
    if not current_platform.is_async_output_supported(self.enforce_eager):
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py", line 201, in is_async_output_supported
    raise NotImplementedError
NotImplementedError
INFO 06-21 08:31:12 config.py:542] This model supports multiple tasks: {'embed', 'classify', 'score', 'generate', 'reward'}. Defaulting to 'generate'.
WARNING 06-21 08:31:12 arg_utils.py:1145] The model has a long context length (131072). This may cause OOM errors during the initial memory profiling phase, or result in low performance due to small KV cache space. Consider setting --max-model-len to a smaller value.
ERROR 06-21 08:31:12 engine.py:389]
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 380, in run_mp_engine
    engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 118, in from_engine_args
    engine_config = engine_args.create_engine_config(usage_context)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1276, in create_engine_config
    config = VllmConfig(
             ^^^^^^^^^^^
  File "<string>", line 19, in __init__
  File "/usr/local/lib/python3.12/dist-packages/vllm/config.py", line 3206, in __post_init__
    self.model_config.verify_async_output_proc(self.parallel_config,
  File "/usr/local/lib/python3.12/dist-packages/vllm/config.py", line 677, in verify_async_output_proc
    if not current_platform.is_async_output_supported(self.enforce_eager):
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py", line 201, in is_async_output_supported
    raise NotImplementedError
NotImplementedError
Process SpawnProcess-1:
Traceback (most recent call last):
  File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 391, in run_mp_engine
    raise e
  File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 380, in run_mp_engine
    engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 118, in from_engine_args
    engine_config = engine_args.create_engine_config(usage_context)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1276, in create_engine_config
    config = VllmConfig(
             ^^^^^^^^^^^
  File "<string>", line 19, in __init__
  File "/usr/local/lib/python3.12/dist-packages/vllm/config.py", line 3206, in __post_init__
    self.model_config.verify_async_output_proc(self.parallel_config,
  File "/usr/local/lib/python3.12/dist-packages/vllm/config.py", line 677, in verify_async_output_proc
    if not current_platform.is_async_output_supported(self.enforce_eager):
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py", line 201, in is_async_output_supported
    raise NotImplementedError
NotImplementedError
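
The traceback shows the failure originates in vllm/platforms/interface.py: current_platform resolves to the generic Platform base class, whose is_async_output_supported() only raises NotImplementedError. A minimal sketch of that same check, run inside the container (assuming, as the error suggests, that no supported accelerator was detected because the CUDA-only image sees no GPU under Docker on Apple Silicon):

# Sketch of the failing check; assumes vLLM fell back to the generic
# platform because Docker on an M2 Mac exposes no CUDA device.
from vllm.platforms import current_platform

print(current_platform)  # which platform vLLM detected in this environment

try:
    # same call as vllm/config.py line 677 in the traceback above
    current_platform.is_async_output_supported(None)
except NotImplementedError:
    print("async output processing is not implemented on this platform")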

Environment

macOS (Apple M2), running vLLM in Docker with --device cpu.

Docker Compose file

services:
  vllm_deepseek_qwen15b:
    image: vllm/vllm-openai:v0.7.2
    container_name: vllm_deepseek_qwen1.5b
    volumes:
      - /home/ai/models/:/vllm-workspace/models
    ports:
      - "9000:8000"
    ipc: host
    command:
      - "--model"
      - "/vllm-workspace/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1___5B"
      - "--served-model-name"
      - "qwenchat"
      - "--tensor-parallel-size" 
      - "4" 
      - "--max_model_len" 
      - "6000" 
      - "--device"
      - "cpu"
      - "--disable_async_output_proc"
      # - "--enforce-eager"
      # - "--swap-space"
      # - "1"
    environment:
      - CUDA_VISIBLE_DEVICES=  
    deploy:
      resources:
        limits:
          memory: 4G 
    restart: on-failure
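
For reference, the compose file maps host port 9000 to the container's 8000, and --served-model-name registers the model as "qwenchat", so the OpenAI-compatible API would be reachable there once the startup error is resolved. A hedged usage sketch (it assumes the server actually comes up, which it does not in the logs above):

# Query the OpenAI-compatible completions endpoint exposed by the compose
# service above. Port 9000 and model name "qwenchat" come from the file;
# the request itself uses only the Python standard library.
import json
import urllib.request

payload = {"model": "qwenchat", "prompt": "Hello", "max_tokens": 16}
req = urllib.request.Request(
    "http://localhost:9000/v1/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["choices"][0]["text"])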
