diff --git a/README.md b/README.md
index 50379d6..57743e1 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,8 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E
 ## Table Content
 - [Supported Models](#supported-models-quick-start)
-  - [Onnxruntime Models](./docs/model/onnxruntime_models.md)
+  - [Onnxruntime DirectML Models](./docs/model/onnxruntime_directml_models.md)
+  - [Onnxruntime CPU Models](./docs/model/onnxruntime_cpu_models.md)
   - [Ipex-LLM Models](./docs/model/ipex_models.md)
 - [Getting Started](#getting-started)
   - [Installation From Source](#installation)
@@ -39,7 +40,7 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E
 | Gemma-2b-Instruct v1 | 2B | 8192 | [EmbeddedLLM/gemma-2b-it-onnx](https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx) |
 | Llama-2-7b-chat | 7B | 4096 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) |
 | Llama-2-13b-chat | 13B | 4096 | [EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml) |
-| Llama-3-8b-chat | 8B | 8192 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
+| Llama-3-8b-chat | 8B | 8192 | [luweigen/Llama-3-8B-Instruct-int4-onnx-directml](https://huggingface.co/luweigen/Llama-3-8B-Instruct-int4-onnx-directml) |
 | Mistral-7b-v0.3-instruct | 7B | 32768 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
 | Phi-3-mini-4k-instruct-062024 | 3.8B | 4096 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4) |
 | Phi3-mini-4k-instruct | 3.8B | 4096 | [microsoft/Phi-3-mini-4k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) |
diff --git a/docs/model/onnxruntime_cpu_models.md b/docs/model/onnxruntime_cpu_models.md
new file mode 100644
index 0000000..6951ac8
--- /dev/null
+++ b/docs/model/onnxruntime_cpu_models.md
@@ -0,0 +1,14 @@
+# Model Powered by Onnxruntime CPU GenAI
+
+## Supported Models
+
+| Model Name | Parameters | Context Length | Size (GB) | Link |
+|-------------------------------------------------------|------------|----------------|-----------|---------------------------------------------------------------------------------------------------------------------|
+| Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32 | 3.8B | 4096 | 2.538 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32/tree/main) |
+| Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4 | 3.8B | 4096 | 2.538 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) |
+| Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32 | 3.8B | 4096 | 2.585 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32/tree/main) |
+| Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4 | 3.8B | 4096 | 2.585 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) |
+| mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32 | 7B | 32768 | 4.66 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32/tree/main) |
+| mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4 | 7B | 32768 | 4.66 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) |
+| openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32 | 8B | 8192 | 6.339 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32/tree/main) |
+| openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4 | 8B | 8192 | 6.339 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) |
diff --git a/docs/model/onnxruntime_directml_models.md b/docs/model/onnxruntime_directml_models.md
new file mode 100644
index 0000000..0f6a3a3
--- /dev/null
+++ b/docs/model/onnxruntime_directml_models.md
@@ -0,0 +1,19 @@
+# Model Powered by Onnxruntime DirectML GenAI
+
+## Supported Models
+
+| Model Name | Parameters | Context Length | Size (GB) | Link |
+|--------------------------------------------|------------|----------------|-----------|---------------------------------------------------------------------------------------------------------------------|
+| Phi-3-mini-4k-instruct-onnx-directml | 3.8B | 4096 | 1.989 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml) |
+| Phi-3-mini-128k-instruct-onnx-directml | 3.8B | 131072 | 2.018 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml) |
+| Phi-3-medium-4k-instruct-onnx-directml | 17B | 4096 | 6.987 | [EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml) |
+| Phi-3-medium-128k-instruct-onnx-directml | 17B | 131072 | 7.025 | [EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml) |
+| Phi-3-mini-4k-instruct-062024-int4-onnx-directml | 3.8B | 4096 | 2.137 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml) |
+| mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml | 7B | 32768 | 3.988 | [EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml) |
+| gemma-2b-it-int4-onnx-directml | 2B | 8192 | 2.314 | [EmbeddedLLM/gemma-2b-it-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/gemma-2b-it-int4-onnx-directml) |
+| gemma-7b-it-int4-onnx-directml | 7B | 8192 | 5.958 | [EmbeddedLLM/gemma-7b-it-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/gemma-7b-it-int4-onnx-directml) |
+| llama-2-7b-chat-int4-onnx-directml | 7B | 4096 | 3.708 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) |
+| Starling-LM-7b-beta-int4-onnx-directml | 7B | 8192 | 3.974 | [EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml) |
+| openchat-3.6-8b-20240522-int4-onnx-directml | 8B | 8192 | 4.922 | [EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml) |
+| Yi-1.5-6B-Chat-int4-onnx-directml | 6B | 32768 | 3.532 | [EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml) |
+
diff --git a/docs/model/onnxruntime_models.md b/docs/model/onnxruntime_models.md
deleted file mode 100644
index 4d61ffe..0000000
--- a/docs/model/onnxruntime_models.md
+++ /dev/null
@@ -1,19 +0,0 @@
-# Model Powered by Onnxruntime GenAI
-
-## Supported Models
-
-| Models | Parameters | Context Length | Link |
-| --- | --- | --- | --- |
-| Gemma-2b-Instruct v1 | 2B | 8192 | [EmbeddedLLM/gemma-2b-it-onnx](https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx) |
-| Llama-2-7b-chat | 7B | 4096 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) |
-| Llama-2-13b-chat | 13B | 4096 | [EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml) |
-| Llama-3-8b-chat | 8B | 8192 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
-| Mistral-7b-v0.3-instruct | 7B | 32768 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
-| Phi-3-mini-4k-instruct-062024 | 3.8B | 4096 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4) |
-| Phi3-mini-4k-instruct | 3.8B | 4096 | [microsoft/Phi-3-mini-4k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) |
-| Phi3-mini-128k-instruct | 3.8B | 128k | [microsoft/Phi-3-mini-128k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx) |
-| Phi3-medium-4k-instruct | 17B | 4096 | [microsoft/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct-onnx-directml) |
-| Phi3-medium-128k-instruct | 17B | 128k | [microsoft/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct-onnx-directml) |
-| Openchat-3.6-8b | 8B | 8192 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx) |
-| Yi-1.5-6b-chat | 6B | 32k | [EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx](https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx) |
-| Phi-3-vision-128k-instruct | | 128k | [EmbeddedLLM/Phi-3-vision-128k-instruct-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-vision-128k-instruct-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4) |
diff --git a/src/embeddedllm/backend/onnxruntime_engine.py b/src/embeddedllm/backend/onnxruntime_engine.py
index 82b5dca..95d13c3 100644
--- a/src/embeddedllm/backend/onnxruntime_engine.py
+++ b/src/embeddedllm/backend/onnxruntime_engine.py
@@ -1,9 +1,11 @@
 # from embeddedllm.transformers_utils.image_processing_phi3v import Phi3VImageProcessor
 import contextlib
 import time
+import os
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import AsyncIterator, List, Optional
+from huggingface_hub import snapshot_download
 
 import onnxruntime_genai as og
 from loguru import logger
@@ -39,6 +41,15 @@ def onnx_generator_context(model, params):
 
 class OnnxruntimeEngine(BaseLLMEngine):
     def __init__(self, model_path: str, vision: bool, device: str = "cpu"):
         self.model_path = model_path
+
+        if not os.path.exists(model_path):
+            snapshot_path = snapshot_download(
+                repo_id=model_path,
+                allow_patterns=None,
+                repo_type="model",
+            )
+            model_path = snapshot_path
+
         self.model_config = AutoConfig.from_pretrained(self.model_path, trust_remote_code=True)
         self.device = device
diff --git a/src/embeddedllm/engine.py b/src/embeddedllm/engine.py
index 3eac11c..86f589c 100644
--- a/src/embeddedllm/engine.py
+++ b/src/embeddedllm/engine.py
@@ -80,7 +80,7 @@ def __init__(self, model_path: str, vision: bool, device: str = "xpu", backend:
 
         else:
             raise ValueError(
-                f"EmbeddedLLMEngine only supports `cpu`, `ipex`, `cuda` and `directml`."
+                f"EmbeddedLLMEngine only supports `cpu`, `ipex`, `cuda`, `openvino` and `directml`."
             )
 
         self.tokenizer = self.engine.tokenizer
diff --git a/src/embeddedllm/entrypoints/api_server.py b/src/embeddedllm/entrypoints/api_server.py
index 9385f24..efc2916 100644
--- a/src/embeddedllm/entrypoints/api_server.py
+++ b/src/embeddedllm/entrypoints/api_server.py
@@ -28,9 +28,9 @@ class Config(BaseSettings):
     )
     port: int = Field(default=6979, description="Server port.")
     host: str = Field(default="0.0.0.0", description="Server host.")
-    device: str = Field(default="cpu", description="Device type: `cpu`, `xpu`")
+    device: str = Field(default="cpu", description="Device type: `cpu`, `xpu`, `gpu`")
     backend: str = Field(
-        default="directml", description="Backend engine: `cpu`, `ipex` and `directml`"
+        default="directml", description="Backend engine: `cpu`, `ipex`, `openvino` and `directml`"
     )
     response_role: str = Field(default="assistant", description="Server response role.")
     uvicorn_log_level: str = Field(
diff --git a/src/embeddedllm/entrypoints/modelui.py b/src/embeddedllm/entrypoints/modelui.py
index ca1da44..eb7878a 100644
--- a/src/embeddedllm/entrypoints/modelui.py
+++ b/src/embeddedllm/entrypoints/modelui.py
@@ -65,44 +65,103 @@ class ModelCard(BaseModel):
     size: Optional[int] = 0
 
 
-dml_model_dict_list = {
+ipex_model_dict_list = {
     "microsoft/Phi-3-mini-4k-instruct": ModelCard(
-        hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx/tree/main/directml/directml-int4-awq-block-128",
-        repo_id="microsoft/Phi-3-mini-4k-instruct-onnx",
-        model_name="Phi-3-mini-4k-instruct-onnx",
-        subfolder="directml/directml-int4-awq-block-128",
+        hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/tree/main/",
+        repo_id="microsoft/Phi-3-mini-4k-instruct",
+        model_name="Phi-3-mini-4k-instruct",
+        subfolder=".",
         repo_type="model",
         context_length=4096,
     ),
-    "EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx": ModelCard(
-        hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4",
-        repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx",
-        model_name="Phi-3-mini-4k-instruct-062024-onnx",
-        subfolder="onnx/directml/Phi-3-mini-4k-instruct-062024-int4",
+    "microsoft/Phi-3-mini-128k-instruct": ModelCard(
+        hf_url="https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/tree/main",
+        repo_id="microsoft/Phi-3-mini-128k-instruct",
+        model_name="Phi-3-mini-128k-instruct",
subfolder=".", + repo_type="model", + context_length=131072, + ), + "microsoft/Phi-3-medium-4k-instruct": ModelCard( + hf_url="https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/tree/main", + repo_id="microsoft/Phi-3-medium-4k-instruct", + model_name="Phi-3-medium-4k-instruct", + subfolder=".", repo_type="model", context_length=4096, ), - "EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx/tree/main/onnx/directml/mistralai_Mistral-7B-Instruct-v0.3-int4", - repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", - model_name="mistral-7b-instruct-v0.3-onnx", - subfolder="onnx/directml/mistralai_Mistral-7B-Instruct-v0.3-int4", + "microsoft/Phi-3-medium-128k-instruct": ModelCard( + hf_url="https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/tree/main", + repo_id="microsoft/Phi-3-medium-128k-instruct", + model_name="Phi-3-medium-128k-instruct", + subfolder=".", + repo_type="model", + context_length=131072, + ), +} + +dml_model_dict_list = { + "EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml", + model_name="Phi-3-mini-4k-instruct-onnx-directml", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml", + model_name="Phi-3-mini-128k-instruct-onnx-directml", + subfolder=".", + repo_type="model", + context_length=131072, + ), + "EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml", + model_name="Phi-3-medium-4k-instruct-onnx-directml", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml", + model_name="Phi-3-medium-128k-instruct-onnx-directml", + subfolder=".", + repo_type="model", + context_length=131072, + ), + "EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml", + model_name="Phi-3-mini-4k-instruct-062024-int4-onnx-directml", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml", + model_name="mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=32768, ), - "EmbeddedLLM/gemma-2b-it-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx/tree/main/onnx/directml/gemma-2b-it-int4", - repo_id="EmbeddedLLM/gemma-2b-it-onnx", - model_name="gemma-2b-it-int4", - subfolder="onnx/directml/gemma-2b-it-int4", + 
"EmbeddedLLM/gemma-2b-it-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/gemma-2b-it-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/gemma-2b-it-int4-onnx-directml", + model_name="gemma-2b-it-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/gemma-7b-it-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/gemma-7b-it-onnx/tree/main/onnx/directml/gemma-7b-it-int4", - repo_id="EmbeddedLLM/gemma-7b-it-onnx", - model_name="gemma-7b-it-int4", - subfolder="onnx/directml/gemma-7b-it-int4", + "EmbeddedLLM/gemma-7b-it-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/gemma-7b-it-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/gemma-7b-it-int4-onnx-directml", + model_name="gemma-7b-it-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), @@ -114,70 +173,94 @@ class ModelCard(BaseModel): repo_type="model", context_length=4096, ), - "EmbeddedLLM/Starling-LM-7b-beta-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-onnx/tree/main/onnx/directml/Starling-LM-7b-beta-int4", - repo_id="EmbeddedLLM/Starling-LM-7b-beta-onnx", - model_name="Starling-LM-7b-beta-int4", - subfolder="onnx/directml/Starling-LM-7b-beta-int4", + "EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml", + model_name="Starling-LM-7b-beta-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/openchat-3.6-8b-20240522-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx/tree/main/onnx/directml/openchat-3.6-8b-20240522-int4", - repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx", - model_name="openchat-3.6-8b-20240522-int4", - subfolder="onnx/directml/openchat-3.6-8b-20240522-int4", + "EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml", + model_name="openchat-3.6-8b-20240522-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx/tree/main/onnx/directml/01-ai_Yi-1.5-6B-Chat-int4", - repo_id="EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx", - model_name="01-ai_Yi-1.5-6B-Chat-int4", - subfolder="onnx/directml/01-ai_Yi-1.5-6B-Chat-int4", + "EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml", + model_name="01-ai_Yi-1.5-6B-Chat-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=4096, ), } -cpu_model_dict_list = { - "microsoft/Phi-3-mini-4k-instruct": ModelCard( - hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx/tree/main/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", - repo_id="microsoft/Phi-3-mini-4k-instruct-onnx", - model_name="Phi-3-mini-4k-instruct-onnx", - subfolder="cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", +onnx_cpu_model_dict_list = { + "EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32": ModelCard( + 
hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32", + model_name="Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", repo_type="model", context_length=4096, ), - "EmbeddedLLM/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx/tree/main/onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4", - repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", - model_name="mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4", - subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4", + "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32", + model_name="Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32", + subfolder=".", + repo_type="model", + context_length=131072, + ), + "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", + repo_type="model", + context_length=131072, + ), + "EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", repo_type="model", context_length=32768, ), - "EmbeddedLLM/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx/tree/main/onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32", - repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", - model_name="mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32", - subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32", + "EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32", + model_name="mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32", + subfolder=".", repo_type="model", context_length=32768, ), - "EmbeddedLLM/openchat-3.6-8b-20240522-cpu-int4-rtn-block-32-acc-level-4": ModelCard( - 
hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", - repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx", - model_name="openchat-3.6-8b-20240522-cpu-int4-rtn-block-32-acc-level-4", - subfolder="onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", + "EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/openchat-3.6-8b-20240522-cpu-int4-rtn-block-32": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32", - repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx", - model_name="openchat-3.6-8b-20240522-cpu-int4-rtn-block-32", - subfolder="onnx/cpu_and_mobile/cpu-int4-rtn-block-32", + "EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32", + model_name="openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32", + subfolder=".", repo_type="model", context_length=8192, ), @@ -221,7 +304,7 @@ def compute_memory_size(repo_id, path_in_repo, repo_type: str = "model"): return bytes_to_gb(total_size_bytes) -for k, v in cpu_model_dict_list.items(): +for k, v in onnx_cpu_model_dict_list.items(): v.size = compute_memory_size( repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type ) @@ -231,8 +314,13 @@ def compute_memory_size(repo_id, path_in_repo, repo_type: str = "model"): repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type ) +for k, v in ipex_model_dict_list.items(): + v.size = compute_memory_size( + repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type + ) + -def convert_to_dataframe(dml_model_dict_list): +def convert_to_dataframe(model_dict_list): # Create lists to store the data model_names = [] hf_urls = [] @@ -244,7 +332,7 @@ def convert_to_dataframe(dml_model_dict_list): context_lengths = [] # Iterate through the dictionary and extract the data - for key, model_card in dml_model_dict_list.items(): + for key, model_card in model_dict_list.items(): model_names.append(key) hf_urls.append(model_card.hf_url) repo_ids.append(model_card.repo_id) @@ -318,9 +406,12 @@ def update_model_list(engine_type): if engine_type == "DirectML": models = sorted(list(dml_model_dict_list.keys())) models_pandas = convert_to_dataframe(dml_model_dict_list) + elif backend == "ipex": + models = sorted(list(ipex_model_dict_list.keys())) + models_pandas = convert_to_dataframe(ipex_model_dict_list) else: - models = sorted(list(cpu_model_dict_list.keys())) - models_pandas = convert_to_dataframe(cpu_model_dict_list) + models = sorted(list(onnx_cpu_model_dict_list.keys())) + models_pandas = convert_to_dataframe(onnx_cpu_model_dict_list) return gr.Dropdown(choices=models, value=models[0] if models else None), gr.Dataframe( value=models_pandas if len(models_pandas) > 0 else None, datatype="markdown" @@ -340,28 +431,48 @@ def deploy_model(engine_type, model_name, port_number): if engine_type == "DirectML": llm_model_card = 
+    elif backend == "ipex":
+        llm_model_card = ipex_model_dict_list[model_name]
     else:
-        llm_model_card = cpu_model_dict_list[model_name]
+        llm_model_card = onnx_cpu_model_dict_list[model_name]
 
     snapshot_path = snapshot_download(
         repo_id=llm_model_card.repo_id,
-        allow_patterns=f"{llm_model_card.subfolder}/*",
+        allow_patterns=(
+            f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." else None
+        ),
         repo_type="model",
     )
-    model_path = os.path.join(snapshot_path, llm_model_card.subfolder)
+    if llm_model_card.subfolder != ".":
+        model_path = os.path.join(snapshot_path, llm_model_card.subfolder)
+    else:
+        model_path = snapshot_path
+
+    print("Model path:", model_path)
+
+    if engine_type == "Ipex":
+        device = "xpu"
+
+    else:
+        device = "cpu"
 
     deployed_model.process = subprocess.Popen(
         [
             "ellm_server",
             "--model_path",
             model_path,
+            "--backend",
+            backend,
+            "--device",
+            device,
             "--port",
             f"{port_number}",
             "--served_model_name",
-            model_name,
+            model_name
         ]
     )
+
     deployed_model.model_name = model_name
 
     while True:
@@ -375,6 +486,7 @@ def deploy_model(engine_type, model_name, port_number):
 Model: {model_name}
 Engine: {engine_type}
 Port: {port_number}
+Model Path: {model_path}
""" @@ -402,8 +514,10 @@ def download_model(engine_type, model_name): if engine_type == "DirectML": llm_model_card = dml_model_dict_list[model_name] + elif backend == "ipex": + llm_model_card = ipex_model_dict_list[model_name] else: - llm_model_card = cpu_model_dict_list[model_name] + llm_model_card = onnx_cpu_model_dict_list[model_name] # Handle model_name if it's a list if isinstance(model_name, list): @@ -412,7 +526,9 @@ def download_model(engine_type, model_name): yield "Downloading ..." snapshot_path = snapshot_download( repo_id=llm_model_card.repo_id, - allow_patterns=f"{llm_model_card.subfolder}/*", + allow_patterns=( + f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." else None + ), repo_type="model", ) yield snapshot_path @@ -443,9 +559,20 @@ def main(): with gr.Accordion("See More Model Details", open=False): model_info_pandas_frame = gr.Dataframe(value=None) + # Default is CPU + default_value = "CPU" + default_choices = ["CPU"] + + if backend == "directml": + default_value = "DirectML" + elif backend == "ipex": + default_value = "Ipex" + + default_choices.append(default_value) + selected_engine_type = gr.Dropdown( - choices=["DirectML", "CPU"], - value="DirectML" if backend == "directml" else "CPU", + choices=default_choices, + value=default_value, multiselect=False, label="LLM Engine", show_label=True, diff --git a/src/embeddedllm/inputs.py b/src/embeddedllm/inputs.py index 9797d05..8f05498 100644 --- a/src/embeddedllm/inputs.py +++ b/src/embeddedllm/inputs.py @@ -23,13 +23,13 @@ class ImagePixelData(TypedDict): # https://github.com/vllm-project/vllm/pull/4028 @overload -def parse_and_batch_prompt(prompt: Union[str, List[str]]) -> Sequence[ParsedText]: - ... +def parse_and_batch_prompt(prompt: Union[str, List[str]]) -> Sequence[ParsedText]: ... @overload -def parse_and_batch_prompt(prompt: Union[List[int], List[List[int]]]) -> Sequence[ParsedTokens]: - ... +def parse_and_batch_prompt( + prompt: Union[List[int], List[List[int]]] +) -> Sequence[ParsedTokens]: ... def parse_and_batch_prompt(