5 changes: 3 additions & 2 deletions README.md
@@ -21,7 +21,8 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E
## Table Content

- [Supported Models](#supported-models-quick-start)
-  - [Onnxruntime Models](./docs/model/onnxruntime_models.md)
+  - [Onnxruntime DirectML Models](./docs/model/onnxruntime_directml_models.md)
+  - [Onnxruntime CPU Models](./docs/model/onnxruntime_cpu_models.md)
- [Ipex-LLM Models](./docs/model/ipex_models.md)
- [Getting Started](#getting-started)
- [Installation From Source](#installation)
@@ -39,7 +40,7 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E
| Gemma-2b-Instruct v1 | 2B | 8192 | [EmbeddedLLM/gemma-2b-it-onnx](https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx) |
| Llama-2-7b-chat | 7B | 4096 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) |
| Llama-2-13b-chat | 13B | 4096 | [EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml) |
-| Llama-3-8b-chat | 8B | 8192 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
+| Llama-3-8b-chat | 8B | 8192 | [luweigen/Llama-3-8B-Instruct-int4-onnx-directml](https://huggingface.co/luweigen/Llama-3-8B-Instruct-int4-onnx-directml) |
| Mistral-7b-v0.3-instruct | 7B | 32768 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
| Phi-3-mini-4k-instruct-062024 | 3.8B | 4096 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4) |
| Phi3-mini-4k-instruct | 3.8B | 4096 | [microsoft/Phi-3-mini-4k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) |
14 changes: 14 additions & 0 deletions docs/model/onnxruntime_cpu_models.md
@@ -0,0 +1,14 @@
# Models Powered by Onnxruntime CPU GenAI

## Supported Models

| Model Name | Parameters | Context Length | Size (GB) | Link |
|-------------------------------------------------------|------------|----------------|-----------|---------------------------------------------------------------------------------------------------------------------|
| Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32 | 3.8B | 4096 | 2.538 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32/tree/main) |
| Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4 | 3.8B | 4096 | 2.538 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) |
| Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32 | 3.8B | 131072 | 2.585 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32/tree/main) |
| Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4 | 3.8B | 131072 | 2.585 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) |
| mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32 | 7B | 32768 | 4.66 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32/tree/main) |
| mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4 | 7B | 32768 | 4.66 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) |
| openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32 | 8B | 8192 | 6.339 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32/tree/main) |
| openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4 | 8B | 8192 | 6.339 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) |
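
Each repo id in the table can be pulled ahead of time with `huggingface_hub`, the same helper the engine change below relies on. A minimal sketch, not part of the PR itself; the repo id is one row from the table:

```python
# Sketch: pre-download one of the CPU ONNX models listed above.
from huggingface_hub import snapshot_download

# snapshot_download returns the local cache directory containing the files.
local_dir = snapshot_download(
    repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32",
    repo_type="model",
)
print(local_dir)  # this path (or the repo id itself) can be passed as model_path
```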
19 changes: 19 additions & 0 deletions docs/model/onnxruntime_directml_models.md
@@ -0,0 +1,19 @@
# Models Powered by Onnxruntime DirectML GenAI

## Supported Models

| Model Name | Parameters | Context Length | Size (GB) | Link |
|--------------------------------------------|------------|----------------|-----------|---------------------------------------------------------------------------------------------------------------------|
| Phi-3-mini-4k-instruct-onnx-directml | 3.8B | 4096 | 1.989 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml) |
| Phi-3-mini-128k-instruct-onnx-directml | 3.8B | 131072 | 2.018 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml) |
| Phi-3-medium-4k-instruct-onnx-directml | 14B | 4096 | 6.987 | [EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml) |
| Phi-3-medium-128k-instruct-onnx-directml | 14B | 131072 | 7.025 | [EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml) |
| Phi-3-mini-4k-instruct-062024-int4-onnx-directml | 3.8B | 4096 | 2.137 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml) |
| mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml | 7B | 32768 | 3.988 | [EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml) |
| gemma-2b-it-int4-onnx-directml | 2B | 8192 | 2.314 | [EmbeddedLLM/gemma-2b-it-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/gemma-2b-it-int4-onnx-directml) |
| gemma-7b-it-int4-onnx-directml | 7B | 8192 | 5.958 | [EmbeddedLLM/gemma-7b-it-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/gemma-7b-it-int4-onnx-directml) |
| llama-2-7b-chat-int4-onnx-directml | 7B | 4096 | 3.708 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) |
| Starling-LM-7b-beta-int4-onnx-directml | 7B | 8192 | 3.974 | [EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml) |
| openchat-3.6-8b-20240522-int4-onnx-directml | 8B | 8192 | 4.922 | [EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml) |
| Yi-1.5-6B-Chat-int4-onnx-directml | 6B | 32768 | 3.532 | [EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml) |

19 changes: 0 additions & 19 deletions docs/model/onnxruntime_models.md

This file was deleted.

11 changes: 11 additions & 0 deletions src/embeddedllm/backend/onnxruntime_engine.py
@@ -1,9 +1,11 @@
 # from embeddedllm.transformers_utils.image_processing_phi3v import Phi3VImageProcessor
 import contextlib
 import time
+import os
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import AsyncIterator, List, Optional
+from huggingface_hub import snapshot_download

 import onnxruntime_genai as og
 from loguru import logger
@@ -39,6 +41,15 @@ def onnx_generator_context(model, params):
 class OnnxruntimeEngine(BaseLLMEngine):
     def __init__(self, model_path: str, vision: bool, device: str = "cpu"):
         self.model_path = model_path
+
+        if not os.path.exists(model_path):
+            snapshot_path = snapshot_download(
+                repo_id=model_path,
+                allow_patterns=None,
+                repo_type="model",
+            )
+            model_path = snapshot_path
+
         self.model_config = AutoConfig.from_pretrained(self.model_path, trust_remote_code=True)
         self.device = device

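With this change, `OnnxruntimeEngine` accepts a Hugging Face repo id in place of a local directory: when `model_path` does not exist on disk, the whole repository is fetched with `snapshot_download` before loading. A minimal usage sketch, assuming the constructor is called as in the signature above; the repo id is illustrative:

```python
from embeddedllm.backend.onnxruntime_engine import OnnxruntimeEngine

# "model_path" is not a local directory, so the engine now downloads the
# repository from the Hugging Face Hub before loading the ONNX model.
engine = OnnxruntimeEngine(
    model_path="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32",
    vision=False,
    device="cpu",
)
```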
2 changes: 1 addition & 1 deletion src/embeddedllm/engine.py
@@ -80,7 +80,7 @@ def __init__(self, model_path: str, vision: bool, device: str = "xpu", backend:

else:
raise ValueError(
f"EmbeddedLLMEngine only supports `cpu`, `ipex`, `cuda` and `directml`."
f"EmbeddedLLMEngine only supports `cpu`, `ipex`, `cuda`, `openvino` and `directml`."
)
self.tokenizer = self.engine.tokenizer

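The widened message reflects that `openvino` joins `cpu`, `ipex`, `cuda` and `directml` as an accepted backend; any other value should still raise the `ValueError` above. A hedged sketch of the call, assuming the `__init__` signature shown in the hunk header; the device value is an assumption:

```python
from embeddedllm.engine import EmbeddedLLMEngine

# Assumed to route to the OpenVINO backend; an unsupported value such as
# "metal" would raise the ValueError shown in the diff above.
engine = EmbeddedLLMEngine(
    model_path="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32",
    vision=False,
    device="gpu",
    backend="openvino",
)
```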
4 changes: 2 additions & 2 deletions src/embeddedllm/entrypoints/api_server.py
@@ -28,9 +28,9 @@ class Config(BaseSettings):
)
port: int = Field(default=6979, description="Server port.")
host: str = Field(default="0.0.0.0", description="Server host.")
device: str = Field(default="cpu", description="Device type: `cpu`, `xpu`")
device: str = Field(default="cpu", description="Device type: `cpu`, `xpu`, `gpu`")
backend: str = Field(
default="directml", description="Backend engine: `cpu`, `ipex` and `directml`"
default="directml", description="Backend engine: `cpu`, `ipex`, `openvino` and `directml`"
)
response_role: str = Field(default="assistant", description="Server response role.")
uvicorn_log_level: str = Field(
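`Config` is a pydantic `BaseSettings` subclass, so every field above can also be supplied through the environment. A standalone sketch of the same pattern; field names and defaults mirror the diff, and the env-var mapping is standard `BaseSettings` behaviour rather than code from this PR:

```python
import os

from pydantic import Field
from pydantic_settings import BaseSettings  # pydantic v1: from pydantic import BaseSettings

class Config(BaseSettings):
    port: int = Field(default=6979, description="Server port.")
    host: str = Field(default="0.0.0.0", description="Server host.")
    device: str = Field(default="cpu", description="Device type: `cpu`, `xpu`, `gpu`")
    backend: str = Field(
        default="directml", description="Backend engine: `cpu`, `ipex`, `openvino` and `directml`"
    )

# Environment variables override the defaults; matching is case-insensitive.
os.environ["BACKEND"] = "openvino"
print(Config().backend)  # -> "openvino"
```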