diff --git a/README.md b/README.md
index 50379d6..57743e1 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,8 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E
 ## Table Content
 - [Supported Models](#supported-models-quick-start)
-  - [Onnxruntime Models](./docs/model/onnxruntime_models.md)
+  - [Onnxruntime DirectML Models](./docs/model/onnxruntime_directml_models.md)
+  - [Onnxruntime CPU Models](./docs/model/onnxruntime_cpu_models.md)
   - [Ipex-LLM Models](./docs/model/ipex_models.md)
 - [Getting Started](#getting-started)
   - [Installation From Source](#installation)
@@ -39,7 +40,7 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E
 | Gemma-2b-Instruct v1 | 2B | 8192 | [EmbeddedLLM/gemma-2b-it-onnx](https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx) |
 | Llama-2-7b-chat | 7B | 4096 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) |
 | Llama-2-13b-chat | 13B | 4096 | [EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml) |
-| Llama-3-8b-chat | 8B | 8192 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
+| Llama-3-8b-chat | 8B | 8192 | [luweigen/Llama-3-8B-Instruct-int4-onnx-directml](https://huggingface.co/luweigen/Llama-3-8B-Instruct-int4-onnx-directml) |
 | Mistral-7b-v0.3-instruct | 7B | 32768 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
 | Phi-3-mini-4k-instruct-062024 | 3.8B | 4096 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4) |
 | Phi3-mini-4k-instruct | 3.8B | 4096 | [microsoft/Phi-3-mini-4k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) |
diff --git a/docs/model/onnxruntime_cpu_models.md b/docs/model/onnxruntime_cpu_models.md
new file mode 100644
index 0000000..6951ac8
--- /dev/null
+++ b/docs/model/onnxruntime_cpu_models.md
@@ -0,0 +1,14 @@
+# Model Powered by Onnxruntime CPU GenAI
+
+## Supported Models
+
+| Model Name | Parameters | Context Length | Size (GB) | Link |
+|-------------------------------------------------------|------------|----------------|-----------|---------------------------------------------------------------------------------------------------------------------|
+| Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32 | 3.8B | 4096 | 2.538 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32/tree/main) |
+| Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4 | 3.8B | 4096 | 2.538 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) |
+| Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32 | 3.8B | 4096 | 2.585 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32/tree/main) |
+| Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4 | 3.8B | 4096 | 2.585 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) |
+| mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32 | 7B | 32768 | 4.66 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32/tree/main) |
+| mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4 | 7B | 32768 | 4.66 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) |
+| openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32 | 8B | 8192 | 6.339 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32/tree/main) |
+| openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4 | 8B | 8192 | 6.339 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) |
diff --git a/docs/model/onnxruntime_directml_models.md b/docs/model/onnxruntime_directml_models.md
new file mode 100644
index 0000000..0f6a3a3
--- /dev/null
+++ b/docs/model/onnxruntime_directml_models.md
@@ -0,0 +1,19 @@
+# Model Powered by Onnxruntime DirectML GenAI
+
+## Supported Models
+
+| Model Name | Parameters | Context Length | Size (GB) | Link |
+|--------------------------------------------|------------|----------------|-----------|---------------------------------------------------------------------------------------------------------------------|
+| Phi-3-mini-4k-instruct-onnx-directml | 3.8B | 4096 | 1.989 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml) |
+| Phi-3-mini-128k-instruct-onnx-directml | 3.8B | 131072 | 2.018 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml) |
+| Phi-3-medium-4k-instruct-onnx-directml | 17B | 4096 | 6.987 | [EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml) |
+| Phi-3-medium-128k-instruct-onnx-directml | 17B | 131072 | 7.025 | [EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml) |
+| Phi-3-mini-4k-instruct-062024-int4-onnx-directml | 3.8B | 4096 | 2.137 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml) |
+| mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml | 7B | 32768 | 3.988 | [EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml) |
+| gemma-2b-it-int4-onnx-directml | 2B | 8192 | 2.314 | [EmbeddedLLM/gemma-2b-it-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/gemma-2b-it-int4-onnx-directml) |
+| gemma-7b-it-int4-onnx-directml | 7B | 8192 | 5.958 | [EmbeddedLLM/gemma-7b-it-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/gemma-7b-it-int4-onnx-directml) |
+| llama-2-7b-chat-int4-onnx-directml | 7B | 4096 | 3.708 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) |
+| Starling-LM-7b-beta-int4-onnx-directml | 7B | 8192 | 3.974 | [EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml) |
+| openchat-3.6-8b-20240522-int4-onnx-directml | 8B | 8192 | 4.922 | [EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml) |
+| Yi-1.5-6B-Chat-int4-onnx-directml | 6B | 32768 | 3.532 | [EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml) |
+
diff --git a/docs/model/onnxruntime_models.md b/docs/model/onnxruntime_models.md
deleted file mode 100644
index 4d61ffe..0000000
--- a/docs/model/onnxruntime_models.md
+++ /dev/null
@@ -1,19 +0,0 @@
-# Model Powered by Onnxruntime GenAI
-
-## Supported Models
-
-| Models | Parameters | Context Length | Link |
-| --- | --- | --- | --- |
-| Gemma-2b-Instruct v1 | 2B | 8192 | [EmbeddedLLM/gemma-2b-it-onnx](https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx) |
-| Llama-2-7b-chat | 7B | 4096 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) |
-| Llama-2-13b-chat | 13B | 4096 | [EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml) |
-| Llama-3-8b-chat | 8B | 8192 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
-| Mistral-7b-v0.3-instruct | 7B | 32768 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
-| Phi-3-mini-4k-instruct-062024 | 3.8B | 4096 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4) |
-| Phi3-mini-4k-instruct | 3.8B | 4096 | [microsoft/Phi-3-mini-4k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) |
-| Phi3-mini-128k-instruct | 3.8B | 128k | [microsoft/Phi-3-mini-128k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx) |
-| Phi3-medium-4k-instruct | 17B | 4096 | [microsoft/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct-onnx-directml) |
-| Phi3-medium-128k-instruct | 17B | 128k | [microsoft/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct-onnx-directml) |
-| Openchat-3.6-8b | 8B | 8192 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx) |
-| Yi-1.5-6b-chat | 6B | 32k | [EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx](https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx) |
-| Phi-3-vision-128k-instruct | | 128k | [EmbeddedLLM/Phi-3-vision-128k-instruct-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-vision-128k-instruct-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4) |
diff --git a/src/embeddedllm/backend/onnxruntime_engine.py b/src/embeddedllm/backend/onnxruntime_engine.py
index 82b5dca..95d13c3 100644
--- a/src/embeddedllm/backend/onnxruntime_engine.py
+++ b/src/embeddedllm/backend/onnxruntime_engine.py
@@ -1,9 +1,11 @@
 # from embeddedllm.transformers_utils.image_processing_phi3v import Phi3VImageProcessor
 import contextlib
 import time
+import os
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import AsyncIterator, List, Optional
+from huggingface_hub import snapshot_download
 
 import onnxruntime_genai as og
 from loguru import logger
@@ -39,6 +41,15 @@ def onnx_generator_context(model, params):
 
 class OnnxruntimeEngine(BaseLLMEngine):
     def __init__(self, model_path: str, vision: bool, device: str = "cpu"):
         self.model_path = model_path
+
+        if not os.path.exists(model_path):
+            snapshot_path = snapshot_download(
+                repo_id=model_path,
+                allow_patterns=None,
+                repo_type="model",
+            )
+            model_path = snapshot_path
+
         self.model_config = AutoConfig.from_pretrained(self.model_path, trust_remote_code=True)
         self.device = device
diff --git a/src/embeddedllm/engine.py b/src/embeddedllm/engine.py
index 3eac11c..86f589c 100644
--- a/src/embeddedllm/engine.py
+++ b/src/embeddedllm/engine.py
@@ -80,7 +80,7 @@ def __init__(self, model_path: str, vision: bool, device: str = "xpu", backend:
 
         else:
             raise ValueError(
-                f"EmbeddedLLMEngine only supports `cpu`, `ipex`, `cuda` and `directml`."
+                f"EmbeddedLLMEngine only supports `cpu`, `ipex`, `cuda`, `openvino` and `directml`."
             )
 
         self.tokenizer = self.engine.tokenizer
diff --git a/src/embeddedllm/entrypoints/api_server.py b/src/embeddedllm/entrypoints/api_server.py
index 9385f24..efc2916 100644
--- a/src/embeddedllm/entrypoints/api_server.py
+++ b/src/embeddedllm/entrypoints/api_server.py
@@ -28,9 +28,9 @@ class Config(BaseSettings):
     )
     port: int = Field(default=6979, description="Server port.")
     host: str = Field(default="0.0.0.0", description="Server host.")
-    device: str = Field(default="cpu", description="Device type: `cpu`, `xpu`")
+    device: str = Field(default="cpu", description="Device type: `cpu`, `xpu`, `gpu`")
     backend: str = Field(
-        default="directml", description="Backend engine: `cpu`, `ipex` and `directml`"
+        default="directml", description="Backend engine: `cpu`, `ipex`, `openvino` and `directml`"
     )
     response_role: str = Field(default="assistant", description="Server response role.")
     uvicorn_log_level: str = Field(
diff --git a/src/embeddedllm/entrypoints/modelui.py b/src/embeddedllm/entrypoints/modelui.py
index ca1da44..eb7878a 100644
--- a/src/embeddedllm/entrypoints/modelui.py
+++ b/src/embeddedllm/entrypoints/modelui.py
@@ -65,44 +65,103 @@ class ModelCard(BaseModel):
     size: Optional[int] = 0
 
 
-dml_model_dict_list = {
+ipex_model_dict_list = {
     "microsoft/Phi-3-mini-4k-instruct": ModelCard(
-        hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx/tree/main/directml/directml-int4-awq-block-128",
-        repo_id="microsoft/Phi-3-mini-4k-instruct-onnx",
-        model_name="Phi-3-mini-4k-instruct-onnx",
-        subfolder="directml/directml-int4-awq-block-128",
+        hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/tree/main/",
+        repo_id="microsoft/Phi-3-mini-4k-instruct",
+        model_name="Phi-3-mini-4k-instruct",
+        subfolder=".",
         repo_type="model",
         context_length=4096,
     ),
-    "EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx": ModelCard(
-        hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4",
-        repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx",
-        model_name="Phi-3-mini-4k-instruct-062024-onnx",
-        subfolder="onnx/directml/Phi-3-mini-4k-instruct-062024-int4",
+    "microsoft/Phi-3-mini-128k-instruct": ModelCard(
+        hf_url="https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/tree/main",
+        repo_id="microsoft/Phi-3-mini-128k-instruct",
+        model_name="Phi-3-mini-128k-instruct",
subfolder=".", + repo_type="model", + context_length=131072, + ), + "microsoft/Phi-3-medium-4k-instruct": ModelCard( + hf_url="https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/tree/main", + repo_id="microsoft/Phi-3-medium-4k-instruct", + model_name="Phi-3-medium-4k-instruct", + subfolder=".", repo_type="model", context_length=4096, ), - "EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx/tree/main/onnx/directml/mistralai_Mistral-7B-Instruct-v0.3-int4", - repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", - model_name="mistral-7b-instruct-v0.3-onnx", - subfolder="onnx/directml/mistralai_Mistral-7B-Instruct-v0.3-int4", + "microsoft/Phi-3-medium-128k-instruct": ModelCard( + hf_url="https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/tree/main", + repo_id="microsoft/Phi-3-medium-128k-instruct", + model_name="Phi-3-medium-128k-instruct", + subfolder=".", + repo_type="model", + context_length=131072, + ), +} + +dml_model_dict_list = { + "EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml", + model_name="Phi-3-mini-4k-instruct-onnx-directml", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml", + model_name="Phi-3-mini-128k-instruct-onnx-directml", + subfolder=".", + repo_type="model", + context_length=131072, + ), + "EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml", + model_name="Phi-3-medium-4k-instruct-onnx-directml", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml", + model_name="Phi-3-medium-128k-instruct-onnx-directml", + subfolder=".", + repo_type="model", + context_length=131072, + ), + "EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml", + model_name="Phi-3-mini-4k-instruct-062024-int4-onnx-directml", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml", + model_name="mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=32768, ), - "EmbeddedLLM/gemma-2b-it-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx/tree/main/onnx/directml/gemma-2b-it-int4", - repo_id="EmbeddedLLM/gemma-2b-it-onnx", - model_name="gemma-2b-it-int4", - subfolder="onnx/directml/gemma-2b-it-int4", + 
"EmbeddedLLM/gemma-2b-it-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/gemma-2b-it-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/gemma-2b-it-int4-onnx-directml", + model_name="gemma-2b-it-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/gemma-7b-it-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/gemma-7b-it-onnx/tree/main/onnx/directml/gemma-7b-it-int4", - repo_id="EmbeddedLLM/gemma-7b-it-onnx", - model_name="gemma-7b-it-int4", - subfolder="onnx/directml/gemma-7b-it-int4", + "EmbeddedLLM/gemma-7b-it-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/gemma-7b-it-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/gemma-7b-it-int4-onnx-directml", + model_name="gemma-7b-it-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), @@ -114,70 +173,94 @@ class ModelCard(BaseModel): repo_type="model", context_length=4096, ), - "EmbeddedLLM/Starling-LM-7b-beta-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-onnx/tree/main/onnx/directml/Starling-LM-7b-beta-int4", - repo_id="EmbeddedLLM/Starling-LM-7b-beta-onnx", - model_name="Starling-LM-7b-beta-int4", - subfolder="onnx/directml/Starling-LM-7b-beta-int4", + "EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml", + model_name="Starling-LM-7b-beta-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/openchat-3.6-8b-20240522-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx/tree/main/onnx/directml/openchat-3.6-8b-20240522-int4", - repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx", - model_name="openchat-3.6-8b-20240522-int4", - subfolder="onnx/directml/openchat-3.6-8b-20240522-int4", + "EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml", + model_name="openchat-3.6-8b-20240522-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx/tree/main/onnx/directml/01-ai_Yi-1.5-6B-Chat-int4", - repo_id="EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx", - model_name="01-ai_Yi-1.5-6B-Chat-int4", - subfolder="onnx/directml/01-ai_Yi-1.5-6B-Chat-int4", + "EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml", + model_name="01-ai_Yi-1.5-6B-Chat-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=4096, ), } -cpu_model_dict_list = { - "microsoft/Phi-3-mini-4k-instruct": ModelCard( - hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx/tree/main/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", - repo_id="microsoft/Phi-3-mini-4k-instruct-onnx", - model_name="Phi-3-mini-4k-instruct-onnx", - subfolder="cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", +onnx_cpu_model_dict_list = { + "EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32": ModelCard( + 
hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32", + model_name="Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", repo_type="model", context_length=4096, ), - "EmbeddedLLM/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx/tree/main/onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4", - repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", - model_name="mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4", - subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4", + "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32", + model_name="Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32", + subfolder=".", + repo_type="model", + context_length=131072, + ), + "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", + repo_type="model", + context_length=131072, + ), + "EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", repo_type="model", context_length=32768, ), - "EmbeddedLLM/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx/tree/main/onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32", - repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", - model_name="mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32", - subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32", + "EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32", + model_name="mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32", + subfolder=".", repo_type="model", context_length=32768, ), - "EmbeddedLLM/openchat-3.6-8b-20240522-cpu-int4-rtn-block-32-acc-level-4": ModelCard( - 
hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", - repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx", - model_name="openchat-3.6-8b-20240522-cpu-int4-rtn-block-32-acc-level-4", - subfolder="onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", + "EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/openchat-3.6-8b-20240522-cpu-int4-rtn-block-32": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32", - repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx", - model_name="openchat-3.6-8b-20240522-cpu-int4-rtn-block-32", - subfolder="onnx/cpu_and_mobile/cpu-int4-rtn-block-32", + "EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32", + model_name="openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32", + subfolder=".", repo_type="model", context_length=8192, ), @@ -221,7 +304,7 @@ def compute_memory_size(repo_id, path_in_repo, repo_type: str = "model"): return bytes_to_gb(total_size_bytes) -for k, v in cpu_model_dict_list.items(): +for k, v in onnx_cpu_model_dict_list.items(): v.size = compute_memory_size( repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type ) @@ -231,8 +314,13 @@ def compute_memory_size(repo_id, path_in_repo, repo_type: str = "model"): repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type ) +for k, v in ipex_model_dict_list.items(): + v.size = compute_memory_size( + repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type + ) + -def convert_to_dataframe(dml_model_dict_list): +def convert_to_dataframe(model_dict_list): # Create lists to store the data model_names = [] hf_urls = [] @@ -244,7 +332,7 @@ def convert_to_dataframe(dml_model_dict_list): context_lengths = [] # Iterate through the dictionary and extract the data - for key, model_card in dml_model_dict_list.items(): + for key, model_card in model_dict_list.items(): model_names.append(key) hf_urls.append(model_card.hf_url) repo_ids.append(model_card.repo_id) @@ -318,9 +406,12 @@ def update_model_list(engine_type): if engine_type == "DirectML": models = sorted(list(dml_model_dict_list.keys())) models_pandas = convert_to_dataframe(dml_model_dict_list) + elif backend == "ipex": + models = sorted(list(ipex_model_dict_list.keys())) + models_pandas = convert_to_dataframe(ipex_model_dict_list) else: - models = sorted(list(cpu_model_dict_list.keys())) - models_pandas = convert_to_dataframe(cpu_model_dict_list) + models = sorted(list(onnx_cpu_model_dict_list.keys())) + models_pandas = convert_to_dataframe(onnx_cpu_model_dict_list) return gr.Dropdown(choices=models, value=models[0] if models else None), gr.Dataframe( value=models_pandas if len(models_pandas) > 0 else None, datatype="markdown" @@ -340,28 +431,48 @@ def deploy_model(engine_type, model_name, port_number): if engine_type == "DirectML": llm_model_card = 
+    elif backend == "ipex":
+        llm_model_card = ipex_model_dict_list[model_name]
     else:
-        llm_model_card = cpu_model_dict_list[model_name]
+        llm_model_card = onnx_cpu_model_dict_list[model_name]
 
     snapshot_path = snapshot_download(
         repo_id=llm_model_card.repo_id,
-        allow_patterns=f"{llm_model_card.subfolder}/*",
+        allow_patterns=(
+            f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." else None
+        ),
         repo_type="model",
     )
-    model_path = os.path.join(snapshot_path, llm_model_card.subfolder)
+    if llm_model_card.subfolder != ".":
+        model_path = os.path.join(snapshot_path, llm_model_card.subfolder)
+    else:
+        model_path = snapshot_path
+
+    print("Model path:", model_path)
+
+    if engine_type == "Ipex":
+        device = "xpu"
+
+    else:
+        device = "cpu"
 
     deployed_model.process = subprocess.Popen(
         [
             "ellm_server",
             "--model_path",
             model_path,
+            "--backend",
+            backend,
+            "--device",
+            device,
             "--port",
             f"{port_number}",
             "--served_model_name",
-            model_name,
+            model_name
         ]
     )
+
     deployed_model.model_name = model_name
 
     while True:
@@ -375,6 +486,7 @@ def deploy_model(engine_type, model_name, port_number):
 Model: {model_name}
 Engine: {engine_type}
 Port: {port_number}
+Model Path: {model_path}
""" @@ -402,8 +514,10 @@ def download_model(engine_type, model_name): if engine_type == "DirectML": llm_model_card = dml_model_dict_list[model_name] + elif backend == "ipex": + llm_model_card = ipex_model_dict_list[model_name] else: - llm_model_card = cpu_model_dict_list[model_name] + llm_model_card = onnx_cpu_model_dict_list[model_name] # Handle model_name if it's a list if isinstance(model_name, list): @@ -412,7 +526,9 @@ def download_model(engine_type, model_name): yield "Downloading ..." snapshot_path = snapshot_download( repo_id=llm_model_card.repo_id, - allow_patterns=f"{llm_model_card.subfolder}/*", + allow_patterns=( + f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." else None + ), repo_type="model", ) yield snapshot_path @@ -443,9 +559,20 @@ def main(): with gr.Accordion("See More Model Details", open=False): model_info_pandas_frame = gr.Dataframe(value=None) + # Default is CPU + default_value = "CPU" + default_choices = ["CPU"] + + if backend == "directml": + default_value = "DirectML" + elif backend == "ipex": + default_value = "Ipex" + + default_choices.append(default_value) + selected_engine_type = gr.Dropdown( - choices=["DirectML", "CPU"], - value="DirectML" if backend == "directml" else "CPU", + choices=default_choices, + value=default_value, multiselect=False, label="LLM Engine", show_label=True, diff --git a/src/embeddedllm/inputs.py b/src/embeddedllm/inputs.py index 9797d05..8f05498 100644 --- a/src/embeddedllm/inputs.py +++ b/src/embeddedllm/inputs.py @@ -23,13 +23,13 @@ class ImagePixelData(TypedDict): # https://github.com/vllm-project/vllm/pull/4028 @overload -def parse_and_batch_prompt(prompt: Union[str, List[str]]) -> Sequence[ParsedText]: - ... +def parse_and_batch_prompt(prompt: Union[str, List[str]]) -> Sequence[ParsedText]: ... @overload -def parse_and_batch_prompt(prompt: Union[List[int], List[List[int]]]) -> Sequence[ParsedTokens]: - ... +def parse_and_batch_prompt( + prompt: Union[List[int], List[List[int]]] +) -> Sequence[ParsedTokens]: ... def parse_and_batch_prompt(