diff --git a/applications/embedding/.gitignore b/applications/embedding/.gitignore
new file mode 100644
index 0000000..4764510
--- /dev/null
+++ b/applications/embedding/.gitignore
@@ -0,0 +1,3 @@
+charts
+Chart.lock
+local-secrets
\ No newline at end of file
diff --git a/applications/embedding/Chart.yaml b/applications/embedding/Chart.yaml
new file mode 100644
index 0000000..247e223
--- /dev/null
+++ b/applications/embedding/Chart.yaml
@@ -0,0 +1,8 @@
+apiVersion: v2
+name: embedding
+version: 0.1.6
+dependencies:
+  - name: vllm-stack
+    alias: vllm
+    version: 0.1.8
+    repository: https://vllm-project.github.io/production-stack
diff --git a/applications/embedding/gpu-test.yml b/applications/embedding/gpu-test.yml
new file mode 100644
index 0000000..e61d2f8
--- /dev/null
+++ b/applications/embedding/gpu-test.yml
@@ -0,0 +1,20 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-test
+spec:
+  containers:
+    - args:
+        - /bin/bash
+        - -c
+        - nvidia-smi && sleep infinity
+      image: nvidia/cuda:12.2.0-runtime-ubuntu22.04
+      name: gpu-test
+      resources:
+        limits:
+          nvidia.com/gpu: 2
+  tolerations:
+    - key: "node-role.kubernetes.io/gpu"
+      operator: "Exists"
+  nodeSelector:
+    nvidia.com/vgpu.present: "true"
\ No newline at end of file
diff --git a/applications/embedding/templates/metrics-service.yml b/applications/embedding/templates/metrics-service.yml
new file mode 100644
index 0000000..0ca9b17
--- /dev/null
+++ b/applications/embedding/templates/metrics-service.yml
@@ -0,0 +1,20 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: vllm-router-metrics
+  namespace: vllm
+  labels:
+    prometheus: prometheus
+    release: prometheus-stack
+spec:
+  selector:
+    matchLabels:
+      environment: router
+      release: router
+  namespaceSelector:
+    matchNames:
+      - vllm
+  endpoints:
+    - port: router-sport
+      interval: 15s
+      path: /metrics
diff --git a/applications/embedding/values.yaml b/applications/embedding/values.yaml
new file mode 100644
index 0000000..f4de93b
--- /dev/null
+++ b/applications/embedding/values.yaml
@@ -0,0 +1,63 @@
+vllm:
+  servingEngineSpec:
+    strategy:
+      # We only have one GPU node, so we need to kill the existing deployment first.
+      type: Recreate
+
+    runtimeClassName: ""
+    modelSpec:
+      - name: "embeddings"
+        repository: "vllm/vllm-openai"
+        tag: "v0.11.2"
+# tag: "v0.10.2"
+# tag: "v0.10.1.1"
+
+        modelURL: "intfloat/multilingual-e5-large"
+
+        replicaCount: 1
+
+        requestCPU: 2
+        requestMemory: "16Gi"
+        requestGPU: 1
+
+        pvcStorage: "10Gi"
+
+        vllmConfig:
+          enableChunkedPrefill: false
+          enablePrefixCaching: false
+          maxModelLen: 512
+          extraArgs: [
+# "--dtype", "float16",
+            "--max-num-seqs", "512",
+            "--max-num-batched-tokens", "32768",
+            "--gpu-memory-utilization", "0.85",
+            "--disable-log-requests"
+          ]
+
+        lmcacheConfig:
+          enabled: false
+          cpuOffloadingBufferSize: "20"
+
+        hf_token:
+          secretName: "hf-secret"
+          secretKey: "TOKEN"
+
+        nodeSelectorTerms:
+          - matchExpressions:
+              - key: kubernetes.io/hostname
+                operator: "In"
+                values:
+                  - "gpu2"
+
+    vllmApiKey:
+      secretName: "vllm-secret"
+      secretKey: "KEY"
+
+    tolerations:
+      - key: "node-role.kubernetes.io/gpu"
+        operator: "Exists"
+
+  routerSpec:
+    repository: "lmcache/lmstack-router"
+    tag: "latest"
+    imagePullPolicy: "Always"
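
A minimal smoke-test sketch for the deployment this diff adds: it queries the OpenAI-compatible /v1/embeddings endpoint exposed by the vLLM serving engine behind the production-stack router. The port-forward target (localhost:30080), the router Service name, and the VLLM_API_KEY environment variable are assumptions for illustration and are not defined in this chart; the model name matches the modelURL in values.yaml.

# Assumes the router Service has been port-forwarded, e.g.
#   kubectl -n vllm port-forward svc/<router-service> 30080:80   (service name is hypothetical)
# and that the key stored in the "vllm-secret" Secret is exported as VLLM_API_KEY.
import os

import requests

response = requests.post(
    "http://localhost:30080/v1/embeddings",
    headers={"Authorization": f"Bearer {os.environ['VLLM_API_KEY']}"},
    json={
        "model": "intfloat/multilingual-e5-large",
        # e5 models expect a "query: " / "passage: " prefix on input text.
        "input": ["query: where is the nearest train station?"],
    },
    timeout=30,
)
response.raise_for_status()
embedding = response.json()["data"][0]["embedding"]
print(f"received embedding with {len(embedding)} dimensions")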