diff --git a/goldens/Basic_cluster_create.txt b/goldens/Basic_cluster_create.txt index 13f265ccf..fdd56c463 100644 --- a/goldens/Basic_cluster_create.txt +++ b/goldens/Basic_cluster_create.txt @@ -52,6 +52,23 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf [XPK] Pretending all the jobs succeeded [XPK] Create or delete node pool request complete. [XPK] Creating ConfigMap for cluster +[XPK] Temp file (0604d72ef175c94fc796d8f02cff009b4241e85d444d22d414a56a47764d7bbb) content: +kind: ConfigMap +apiVersion: v1 +metadata: + name: golden-cluster-resources-configmap +data: + tpu7x-8: "1" + +[XPK] Temp file (51bf42f3a2eb3734b89e650bc26bead709461fa30865893815a078a04f7d7444) content: +kind: ConfigMap +apiVersion: v1 +metadata: + name: golden-cluster-metadata-configmap +data: + xpk_version: v0.14.3 + capacity_type: SPOT + [XPK] Breaking up a total of 2 commands into 1 batches [XPK] Pretending all the jobs succeeded [XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available @@ -60,6 +77,88 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l +[XPK] Temp file (1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95) content: + +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: manager + app.kubernetes.io/created-by: jobset + app.kubernetes.io/instance: controller-manager + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: deployment + app.kubernetes.io/part-of: jobset + control-plane: controller-manager + name: jobset-controller-manager + namespace: jobset-system +spec: + replicas: 1 + selector: + matchLabels: + control-plane: controller-manager + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + control-plane: controller-manager + spec: + containers: + - args: + - --config=/controller_manager_config.yaml + - --zap-log-level=2 + command: + - /manager + image: registry.k8s.io/jobset/jobset:v0.8.0 + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: manager + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + memory: 4096Mi + requests: + cpu: 1000m + memory: 128Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumeMounts: + - mountPath: /controller_manager_config.yaml + name: manager-config + subPath: controller_manager_config.yaml + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + securityContext: + runAsNonRoot: true + serviceAccountName: jobset-controller-manager + terminationGracePeriodSeconds: 10 + volumes: + - configMap: + name: jobset-manager-config + name: manager-config + - name: cert + secret: + defaultMode: 420 + secretName: jobset-webhook-server-cert + [XPK] Try 1: Updating jobset Controller Manager resources [XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 @@ -75,7 +174,8 @@ kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.s kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.3/manifests.yaml [XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m -[XPK] Applying following Kueue resources: +[XPK] Temp file (ce52d2868b681f478f3f12e5696b1609e68b442a32f7f82603ba7064b825cf4f) content: + apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: @@ -185,10 +285,79 @@ kubectl kjob printcrds | kubectl apply --server-side -f - [XPK] Creating kjob CRDs succeeded [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Temp file (4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61) content: + + apiVersion: kjobctl.x-k8s.io/v1alpha1 + kind: JobTemplate + metadata: + name: xpk-def-batch + namespace: default + template: + spec: + parallelism: 1 + completions: 1 + completionMode: Indexed + template: + spec: + dnsPolicy: ClusterFirstWithHostNet + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + containers: + - name: xpk-batch-container + image: ubuntu:22.04 + workingDir: / + + + priorityClassName: medium + restartPolicy: OnFailure + serviceAccountName: + [XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Temp file (a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8) content: + +apiVersion: v1 +kind: PodTemplate +metadata: + name: xpk-def-pod + namespace: default +template: + spec: + tolerations: + - effect: NoSchedule + key: components.gke.io/gke-managed-components + operator: Equal + value: "true" + containers: + - name: xpk-interactive-container + image: busybox:1.28 + command: [/bin/sh] + workingDir: / + initContainers: + - name: init + image: busybox:1.28 + command: ['/bin/mkdir', '-p', '/'] + serviceAccountName: + [XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Temp file (1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486) content: + +apiVersion: kjobctl.x-k8s.io/v1alpha1 +kind: ApplicationProfile +metadata: + name: xpk-def-app-profile + namespace: default +spec: + supportedModes: + - name: Slurm + template: xpk-def-batch + requiredFlags: [] + - name: Interactive + template: xpk-def-pod + volumeBundles: [] + [XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 [XPK] GKE commands done! Resources are created. diff --git a/goldens/Cluster_create_for_multi-host_nodepool.txt b/goldens/Cluster_create_for_multi-host_nodepool.txt index bb4a750ea..b35e2d769 100644 --- a/goldens/Cluster_create_for_multi-host_nodepool.txt +++ b/goldens/Cluster_create_for_multi-host_nodepool.txt @@ -54,6 +54,23 @@ gcloud compute resource-policies describe tpu7x-16-2x2x2-placement-policy --proj [XPK] Pretending all the jobs succeeded [XPK] Create or delete node pool request complete. [XPK] Creating ConfigMap for cluster +[XPK] Temp file (ea18cffcead5f990c8f33d0b4bfb4279e5672bb21acb95618d855f4adf6342ca) content: +kind: ConfigMap +apiVersion: v1 +metadata: + name: golden-cluster-resources-configmap +data: + tpu7x-16: "2" + +[XPK] Temp file (51bf42f3a2eb3734b89e650bc26bead709461fa30865893815a078a04f7d7444) content: +kind: ConfigMap +apiVersion: v1 +metadata: + name: golden-cluster-metadata-configmap +data: + xpk_version: v0.14.3 + capacity_type: SPOT + [XPK] Breaking up a total of 2 commands into 1 batches [XPK] Pretending all the jobs succeeded [XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available @@ -62,6 +79,88 @@ gcloud compute resource-policies describe tpu7x-16-2x2x2-placement-policy --proj kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l +[XPK] Temp file (1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95) content: + +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: manager + app.kubernetes.io/created-by: jobset + app.kubernetes.io/instance: controller-manager + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: deployment + app.kubernetes.io/part-of: jobset + control-plane: controller-manager + name: jobset-controller-manager + namespace: jobset-system +spec: + replicas: 1 + selector: + matchLabels: + control-plane: controller-manager + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + control-plane: controller-manager + spec: + containers: + - args: + - --config=/controller_manager_config.yaml + - --zap-log-level=2 + command: + - /manager + image: registry.k8s.io/jobset/jobset:v0.8.0 + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: manager + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + memory: 4096Mi + requests: + cpu: 1000m + memory: 128Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumeMounts: + - mountPath: /controller_manager_config.yaml + name: manager-config + subPath: controller_manager_config.yaml + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + securityContext: + runAsNonRoot: true + serviceAccountName: jobset-controller-manager + terminationGracePeriodSeconds: 10 + volumes: + - configMap: + name: jobset-manager-config + name: manager-config + - name: cert + secret: + defaultMode: 420 + secretName: jobset-webhook-server-cert + [XPK] Try 1: Updating jobset Controller Manager resources [XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 @@ -77,7 +176,8 @@ kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.s kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.3/manifests.yaml [XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m -[XPK] Applying following Kueue resources: +[XPK] Temp file (f0a510ac08b9c6d8f549478c49836dca41a72a347c491acac1fa70272d531056) content: + apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: @@ -187,10 +287,79 @@ kubectl kjob printcrds | kubectl apply --server-side -f - [XPK] Creating kjob CRDs succeeded [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Temp file (4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61) content: + + apiVersion: kjobctl.x-k8s.io/v1alpha1 + kind: JobTemplate + metadata: + name: xpk-def-batch + namespace: default + template: + spec: + parallelism: 1 + completions: 1 + completionMode: Indexed + template: + spec: + dnsPolicy: ClusterFirstWithHostNet + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + containers: + - name: xpk-batch-container + image: ubuntu:22.04 + workingDir: / + + + priorityClassName: medium + restartPolicy: OnFailure + serviceAccountName: + [XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Temp file (a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8) content: + +apiVersion: v1 +kind: PodTemplate +metadata: + name: xpk-def-pod + namespace: default +template: + spec: + tolerations: + - effect: NoSchedule + key: components.gke.io/gke-managed-components + operator: Equal + value: "true" + containers: + - name: xpk-interactive-container + image: busybox:1.28 + command: [/bin/sh] + workingDir: / + initContainers: + - name: init + image: busybox:1.28 + command: ['/bin/mkdir', '-p', '/'] + serviceAccountName: + [XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Temp file (1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486) content: + +apiVersion: kjobctl.x-k8s.io/v1alpha1 +kind: ApplicationProfile +metadata: + name: xpk-def-app-profile + namespace: default +spec: + supportedModes: + - name: Slurm + template: xpk-def-batch + requiredFlags: [] + - name: Interactive + template: xpk-def-pod + volumeBundles: [] + [XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 [XPK] GKE commands done! Resources are created. diff --git a/goldens/Cluster_create_private.txt b/goldens/Cluster_create_private.txt index 0fc48ea3d..812ed061e 100644 --- a/goldens/Cluster_create_private.txt +++ b/goldens/Cluster_create_private.txt @@ -59,6 +59,24 @@ kubectl get configmap golden-cluster-private-resources-configmap -o=custom-colum [XPK] Creating ConfigMap for cluster [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a +[XPK] Temp file (8669497cfbe494756d36922054f924d7dca463141f0e5d0329e517c880cf2f06) content: +kind: ConfigMap +apiVersion: v1 +metadata: + name: golden-cluster-private-resources-configmap +data: + v5p-8: "1" + +[XPK] Temp file (f9f6378e33248722c61046af0ef32f4648b322d22bcdf3ba4b604bd5c6ad859f) content: +kind: ConfigMap +apiVersion: v1 +metadata: + name: golden-cluster-private-metadata-configmap +data: + xpk_version: v0.14.3 + capacity_type: RESERVATION + reservation_id: golden-reservation + [XPK] Breaking up a total of 2 commands into 1 batches [XPK] Pretending all the jobs succeeded [XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available @@ -67,6 +85,88 @@ gcloud beta compute reservations describe golden-reservation --project=golden-pr kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l +[XPK] Temp file (1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95) content: + +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: manager + app.kubernetes.io/created-by: jobset + app.kubernetes.io/instance: controller-manager + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: deployment + app.kubernetes.io/part-of: jobset + control-plane: controller-manager + name: jobset-controller-manager + namespace: jobset-system +spec: + replicas: 1 + selector: + matchLabels: + control-plane: controller-manager + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + control-plane: controller-manager + spec: + containers: + - args: + - --config=/controller_manager_config.yaml + - --zap-log-level=2 + command: + - /manager + image: registry.k8s.io/jobset/jobset:v0.8.0 + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: manager + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + memory: 4096Mi + requests: + cpu: 1000m + memory: 128Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumeMounts: + - mountPath: /controller_manager_config.yaml + name: manager-config + subPath: controller_manager_config.yaml + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + securityContext: + runAsNonRoot: true + serviceAccountName: jobset-controller-manager + terminationGracePeriodSeconds: 10 + volumes: + - configMap: + name: jobset-manager-config + name: manager-config + - name: cert + secret: + defaultMode: 420 + secretName: jobset-webhook-server-cert + [XPK] Try 1: Updating jobset Controller Manager resources [XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 @@ -82,7 +182,8 @@ kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.s kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.3/manifests.yaml [XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m -[XPK] Applying following Kueue resources: +[XPK] Temp file (1838a525f73d2cfd19aa616665e8b33fcc4ea10ba8d6015a9307109a6be6d372) content: + apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: @@ -201,10 +302,79 @@ kubectl kjob printcrds | kubectl apply --server-side -f - [XPK] Creating kjob CRDs succeeded [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. kubectl get configmap golden-cluster-private-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Temp file (4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61) content: + + apiVersion: kjobctl.x-k8s.io/v1alpha1 + kind: JobTemplate + metadata: + name: xpk-def-batch + namespace: default + template: + spec: + parallelism: 1 + completions: 1 + completionMode: Indexed + template: + spec: + dnsPolicy: ClusterFirstWithHostNet + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + containers: + - name: xpk-batch-container + image: ubuntu:22.04 + workingDir: / + + + priorityClassName: medium + restartPolicy: OnFailure + serviceAccountName: + [XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Temp file (a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8) content: + +apiVersion: v1 +kind: PodTemplate +metadata: + name: xpk-def-pod + namespace: default +template: + spec: + tolerations: + - effect: NoSchedule + key: components.gke.io/gke-managed-components + operator: Equal + value: "true" + containers: + - name: xpk-interactive-container + image: busybox:1.28 + command: [/bin/sh] + workingDir: / + initContainers: + - name: init + image: busybox:1.28 + command: ['/bin/mkdir', '-p', '/'] + serviceAccountName: + [XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Temp file (1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486) content: + +apiVersion: kjobctl.x-k8s.io/v1alpha1 +kind: ApplicationProfile +metadata: + name: xpk-def-app-profile + namespace: default +spec: + supportedModes: + - name: Slurm + template: xpk-def-batch + requiredFlags: [] + - name: Interactive + template: xpk-def-pod + volumeBundles: [] + [XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 [XPK] GKE commands done! Resources are created. diff --git a/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt b/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt index d72320a15..9f94e3c59 100644 --- a/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt +++ b/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt @@ -52,6 +52,23 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf [XPK] Pretending all the jobs succeeded [XPK] Create or delete node pool request complete. [XPK] Creating ConfigMap for cluster +[XPK] Temp file (0604d72ef175c94fc796d8f02cff009b4241e85d444d22d414a56a47764d7bbb) content: +kind: ConfigMap +apiVersion: v1 +metadata: + name: golden-cluster-resources-configmap +data: + tpu7x-8: "1" + +[XPK] Temp file (51bf42f3a2eb3734b89e650bc26bead709461fa30865893815a078a04f7d7444) content: +kind: ConfigMap +apiVersion: v1 +metadata: + name: golden-cluster-metadata-configmap +data: + xpk_version: v0.14.3 + capacity_type: SPOT + [XPK] Breaking up a total of 2 commands into 1 batches [XPK] Pretending all the jobs succeeded [XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available @@ -60,6 +77,88 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l +[XPK] Temp file (1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95) content: + +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: manager + app.kubernetes.io/created-by: jobset + app.kubernetes.io/instance: controller-manager + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: deployment + app.kubernetes.io/part-of: jobset + control-plane: controller-manager + name: jobset-controller-manager + namespace: jobset-system +spec: + replicas: 1 + selector: + matchLabels: + control-plane: controller-manager + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + control-plane: controller-manager + spec: + containers: + - args: + - --config=/controller_manager_config.yaml + - --zap-log-level=2 + command: + - /manager + image: registry.k8s.io/jobset/jobset:v0.8.0 + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: manager + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + memory: 4096Mi + requests: + cpu: 1000m + memory: 128Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumeMounts: + - mountPath: /controller_manager_config.yaml + name: manager-config + subPath: controller_manager_config.yaml + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + securityContext: + runAsNonRoot: true + serviceAccountName: jobset-controller-manager + terminationGracePeriodSeconds: 10 + volumes: + - configMap: + name: jobset-manager-config + name: manager-config + - name: cert + secret: + defaultMode: 420 + secretName: jobset-webhook-server-cert + [XPK] Try 1: Updating jobset Controller Manager resources [XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 @@ -79,7 +178,8 @@ kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=ava gcloud compute machine-types describe tpu7x-standard-4t --project=golden-project --zone=us-central1-a --format='value(guestCpus,memoryMb)' [XPK] The CPU limit is above the available capacity. We will set CPU limit to 10. [XPK] The memory limit is above the available capacity. We will set memory limit to 10Mi. -[XPK] Applying following Kueue resources: +[XPK] Temp file (1ea1a0b1a0ec540d8320ef2a8378363e692a8439192a8f50c4b77fe545dd0a4c) content: + apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: @@ -189,10 +289,79 @@ kubectl kjob printcrds | kubectl apply --server-side -f - [XPK] Creating kjob CRDs succeeded [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Temp file (4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61) content: + + apiVersion: kjobctl.x-k8s.io/v1alpha1 + kind: JobTemplate + metadata: + name: xpk-def-batch + namespace: default + template: + spec: + parallelism: 1 + completions: 1 + completionMode: Indexed + template: + spec: + dnsPolicy: ClusterFirstWithHostNet + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + containers: + - name: xpk-batch-container + image: ubuntu:22.04 + workingDir: / + + + priorityClassName: medium + restartPolicy: OnFailure + serviceAccountName: + [XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Temp file (a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8) content: + +apiVersion: v1 +kind: PodTemplate +metadata: + name: xpk-def-pod + namespace: default +template: + spec: + tolerations: + - effect: NoSchedule + key: components.gke.io/gke-managed-components + operator: Equal + value: "true" + containers: + - name: xpk-interactive-container + image: busybox:1.28 + command: [/bin/sh] + workingDir: / + initContainers: + - name: init + image: busybox:1.28 + command: ['/bin/mkdir', '-p', '/'] + serviceAccountName: + [XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Temp file (1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486) content: + +apiVersion: kjobctl.x-k8s.io/v1alpha1 +kind: ApplicationProfile +metadata: + name: xpk-def-app-profile + namespace: default +spec: + supportedModes: + - name: Slurm + template: xpk-def-batch + requiredFlags: [] + - name: Interactive + template: xpk-def-pod + volumeBundles: [] + [XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 [XPK] GKE commands done! Resources are created. diff --git a/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt b/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt index b5e57c17f..71ab079b7 100644 --- a/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt +++ b/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt @@ -52,6 +52,23 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf [XPK] Pretending all the jobs succeeded [XPK] Create or delete node pool request complete. [XPK] Creating ConfigMap for cluster +[XPK] Temp file (0604d72ef175c94fc796d8f02cff009b4241e85d444d22d414a56a47764d7bbb) content: +kind: ConfigMap +apiVersion: v1 +metadata: + name: golden-cluster-resources-configmap +data: + tpu7x-8: "1" + +[XPK] Temp file (51bf42f3a2eb3734b89e650bc26bead709461fa30865893815a078a04f7d7444) content: +kind: ConfigMap +apiVersion: v1 +metadata: + name: golden-cluster-metadata-configmap +data: + xpk_version: v0.14.3 + capacity_type: SPOT + [XPK] Breaking up a total of 2 commands into 1 batches [XPK] Pretending all the jobs succeeded [XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available @@ -60,6 +77,88 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l +[XPK] Temp file (1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95) content: + +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: manager + app.kubernetes.io/created-by: jobset + app.kubernetes.io/instance: controller-manager + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: deployment + app.kubernetes.io/part-of: jobset + control-plane: controller-manager + name: jobset-controller-manager + namespace: jobset-system +spec: + replicas: 1 + selector: + matchLabels: + control-plane: controller-manager + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + control-plane: controller-manager + spec: + containers: + - args: + - --config=/controller_manager_config.yaml + - --zap-log-level=2 + command: + - /manager + image: registry.k8s.io/jobset/jobset:v0.8.0 + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: manager + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + memory: 4096Mi + requests: + cpu: 1000m + memory: 128Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumeMounts: + - mountPath: /controller_manager_config.yaml + name: manager-config + subPath: controller_manager_config.yaml + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + securityContext: + runAsNonRoot: true + serviceAccountName: jobset-controller-manager + terminationGracePeriodSeconds: 10 + volumes: + - configMap: + name: jobset-manager-config + name: manager-config + - name: cert + secret: + defaultMode: 420 + secretName: jobset-webhook-server-cert + [XPK] Try 1: Updating jobset Controller Manager resources [XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 @@ -79,7 +178,8 @@ kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=ava gcloud compute machine-types describe tpu7x-standard-4t --project=golden-project --zone=us-central1-a --format='value(guestCpus,memoryMb)' [XPK] The CPU limit is below the available capacity, which would lead to underutilization. We will set CPU limit to 10. [XPK] The memory limit is below the available capacity, which would lead to underutilization. We will set the memory limit to 10Mi. -[XPK] Applying following Kueue resources: +[XPK] Temp file (1ea1a0b1a0ec540d8320ef2a8378363e692a8439192a8f50c4b77fe545dd0a4c) content: + apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: @@ -189,10 +289,79 @@ kubectl kjob printcrds | kubectl apply --server-side -f - [XPK] Creating kjob CRDs succeeded [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Temp file (4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61) content: + + apiVersion: kjobctl.x-k8s.io/v1alpha1 + kind: JobTemplate + metadata: + name: xpk-def-batch + namespace: default + template: + spec: + parallelism: 1 + completions: 1 + completionMode: Indexed + template: + spec: + dnsPolicy: ClusterFirstWithHostNet + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + containers: + - name: xpk-batch-container + image: ubuntu:22.04 + workingDir: / + + + priorityClassName: medium + restartPolicy: OnFailure + serviceAccountName: + [XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Temp file (a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8) content: + +apiVersion: v1 +kind: PodTemplate +metadata: + name: xpk-def-pod + namespace: default +template: + spec: + tolerations: + - effect: NoSchedule + key: components.gke.io/gke-managed-components + operator: Equal + value: "true" + containers: + - name: xpk-interactive-container + image: busybox:1.28 + command: [/bin/sh] + workingDir: / + initContainers: + - name: init + image: busybox:1.28 + command: ['/bin/mkdir', '-p', '/'] + serviceAccountName: + [XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Temp file (1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486) content: + +apiVersion: kjobctl.x-k8s.io/v1alpha1 +kind: ApplicationProfile +metadata: + name: xpk-def-app-profile + namespace: default +spec: + supportedModes: + - name: Slurm + template: xpk-def-batch + requiredFlags: [] + - name: Interactive + template: xpk-def-pod + volumeBundles: [] + [XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 [XPK] GKE commands done! Resources are created. diff --git a/goldens/Cluster_create_with_gb200-4.txt b/goldens/Cluster_create_with_gb200-4.txt index cbc85838c..9d5c37ce2 100644 --- a/goldens/Cluster_create_with_gb200-4.txt +++ b/goldens/Cluster_create_with_gb200-4.txt @@ -58,6 +58,24 @@ gcloud compute resource-policies describe gb200-4-1x72-placement-policy --projec [XPK] Creating ConfigMap for cluster [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a +[XPK] Temp file (9476f7fa10da99ed4e0797d6d660cda076b5d6dfcd366a9e2560681f82697e99) content: +kind: ConfigMap +apiVersion: v1 +metadata: + name: golden-cluster-resources-configmap +data: + gb200-4: "2" + +[XPK] Temp file (03abe143f4fc53e4cd8a023dec1bfbe6c92dcad72f5a0a230df6d54fb9fe002b) content: +kind: ConfigMap +apiVersion: v1 +metadata: + name: golden-cluster-metadata-configmap +data: + xpk_version: v0.14.3 + capacity_type: RESERVATION + reservation_id: golden-reservation + [XPK] Breaking up a total of 2 commands into 1 batches [XPK] Pretending all the jobs succeeded [XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available @@ -66,6 +84,88 @@ gcloud beta compute reservations describe golden-reservation --project=golden-pr kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l +[XPK] Temp file (1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95) content: + +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: manager + app.kubernetes.io/created-by: jobset + app.kubernetes.io/instance: controller-manager + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: deployment + app.kubernetes.io/part-of: jobset + control-plane: controller-manager + name: jobset-controller-manager + namespace: jobset-system +spec: + replicas: 1 + selector: + matchLabels: + control-plane: controller-manager + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + control-plane: controller-manager + spec: + containers: + - args: + - --config=/controller_manager_config.yaml + - --zap-log-level=2 + command: + - /manager + image: registry.k8s.io/jobset/jobset:v0.8.0 + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: manager + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + memory: 4096Mi + requests: + cpu: 1000m + memory: 128Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumeMounts: + - mountPath: /controller_manager_config.yaml + name: manager-config + subPath: controller_manager_config.yaml + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + securityContext: + runAsNonRoot: true + serviceAccountName: jobset-controller-manager + terminationGracePeriodSeconds: 10 + volumes: + - configMap: + name: jobset-manager-config + name: manager-config + - name: cert + secret: + defaultMode: 420 + secretName: jobset-webhook-server-cert + [XPK] Try 1: Updating jobset Controller Manager resources [XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 @@ -81,7 +181,8 @@ kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.s kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.3/manifests.yaml [XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m -[XPK] Applying following Kueue resources: +[XPK] Temp file (5c5d70f8d2bbedea9acccd9c1a153e2f55efd31cc61d2b55ecdd4a8f009fab11) content: + apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: @@ -191,10 +292,79 @@ kubectl kjob printcrds | kubectl apply --server-side -f - [XPK] Creating kjob CRDs succeeded [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Temp file (4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61) content: + + apiVersion: kjobctl.x-k8s.io/v1alpha1 + kind: JobTemplate + metadata: + name: xpk-def-batch + namespace: default + template: + spec: + parallelism: 1 + completions: 1 + completionMode: Indexed + template: + spec: + dnsPolicy: ClusterFirstWithHostNet + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + containers: + - name: xpk-batch-container + image: ubuntu:22.04 + workingDir: / + + + priorityClassName: medium + restartPolicy: OnFailure + serviceAccountName: + [XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Temp file (a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8) content: + +apiVersion: v1 +kind: PodTemplate +metadata: + name: xpk-def-pod + namespace: default +template: + spec: + tolerations: + - effect: NoSchedule + key: components.gke.io/gke-managed-components + operator: Equal + value: "true" + containers: + - name: xpk-interactive-container + image: busybox:1.28 + command: [/bin/sh] + workingDir: / + initContainers: + - name: init + image: busybox:1.28 + command: ['/bin/mkdir', '-p', '/'] + serviceAccountName: + [XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Temp file (1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486) content: + +apiVersion: kjobctl.x-k8s.io/v1alpha1 +kind: ApplicationProfile +metadata: + name: xpk-def-app-profile + namespace: default +spec: + supportedModes: + - name: Slurm + template: xpk-def-batch + requiredFlags: [] + - name: Interactive + template: xpk-def-pod + volumeBundles: [] + [XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 [XPK] Installing NCCL Plugin for cluster diff --git a/goldens/Cluster_create_with_shared_reservation.txt b/goldens/Cluster_create_with_shared_reservation.txt index f37e35bdd..8737f1fa9 100644 --- a/goldens/Cluster_create_with_shared_reservation.txt +++ b/goldens/Cluster_create_with_shared_reservation.txt @@ -56,6 +56,24 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf [XPK] Creating ConfigMap for cluster [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. gcloud beta compute reservations describe golden-reservation --project=reservation-project --zone=us-central1-a +[XPK] Temp file (0604d72ef175c94fc796d8f02cff009b4241e85d444d22d414a56a47764d7bbb) content: +kind: ConfigMap +apiVersion: v1 +metadata: + name: golden-cluster-resources-configmap +data: + tpu7x-8: "1" + +[XPK] Temp file (8db9ad667fb3658c85db8248984df3ec0a4556ec66b370ca347916a0135a6d59) content: +kind: ConfigMap +apiVersion: v1 +metadata: + name: golden-cluster-metadata-configmap +data: + xpk_version: v0.14.3 + capacity_type: RESERVATION + reservation_id: projects/reservation-project/reservations/golden-reservation + [XPK] Breaking up a total of 2 commands into 1 batches [XPK] Pretending all the jobs succeeded [XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available @@ -64,6 +82,88 @@ gcloud beta compute reservations describe golden-reservation --project=reservati kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l +[XPK] Temp file (1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95) content: + +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: manager + app.kubernetes.io/created-by: jobset + app.kubernetes.io/instance: controller-manager + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: deployment + app.kubernetes.io/part-of: jobset + control-plane: controller-manager + name: jobset-controller-manager + namespace: jobset-system +spec: + replicas: 1 + selector: + matchLabels: + control-plane: controller-manager + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + control-plane: controller-manager + spec: + containers: + - args: + - --config=/controller_manager_config.yaml + - --zap-log-level=2 + command: + - /manager + image: registry.k8s.io/jobset/jobset:v0.8.0 + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: manager + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + memory: 4096Mi + requests: + cpu: 1000m + memory: 128Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumeMounts: + - mountPath: /controller_manager_config.yaml + name: manager-config + subPath: controller_manager_config.yaml + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + securityContext: + runAsNonRoot: true + serviceAccountName: jobset-controller-manager + terminationGracePeriodSeconds: 10 + volumes: + - configMap: + name: jobset-manager-config + name: manager-config + - name: cert + secret: + defaultMode: 420 + secretName: jobset-webhook-server-cert + [XPK] Try 1: Updating jobset Controller Manager resources [XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 @@ -79,7 +179,8 @@ kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.s kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.3/manifests.yaml [XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m -[XPK] Applying following Kueue resources: +[XPK] Temp file (ce52d2868b681f478f3f12e5696b1609e68b442a32f7f82603ba7064b825cf4f) content: + apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: @@ -189,10 +290,79 @@ kubectl kjob printcrds | kubectl apply --server-side -f - [XPK] Creating kjob CRDs succeeded [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Temp file (4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61) content: + + apiVersion: kjobctl.x-k8s.io/v1alpha1 + kind: JobTemplate + metadata: + name: xpk-def-batch + namespace: default + template: + spec: + parallelism: 1 + completions: 1 + completionMode: Indexed + template: + spec: + dnsPolicy: ClusterFirstWithHostNet + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + containers: + - name: xpk-batch-container + image: ubuntu:22.04 + workingDir: / + + + priorityClassName: medium + restartPolicy: OnFailure + serviceAccountName: + [XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Temp file (a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8) content: + +apiVersion: v1 +kind: PodTemplate +metadata: + name: xpk-def-pod + namespace: default +template: + spec: + tolerations: + - effect: NoSchedule + key: components.gke.io/gke-managed-components + operator: Equal + value: "true" + containers: + - name: xpk-interactive-container + image: busybox:1.28 + command: [/bin/sh] + workingDir: / + initContainers: + - name: init + image: busybox:1.28 + command: ['/bin/mkdir', '-p', '/'] + serviceAccountName: + [XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Temp file (1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486) content: + +apiVersion: kjobctl.x-k8s.io/v1alpha1 +kind: ApplicationProfile +metadata: + name: xpk-def-app-profile + namespace: default +spec: + supportedModes: + - name: Slurm + template: xpk-def-batch + requiredFlags: [] + - name: Interactive + template: xpk-def-pod + volumeBundles: [] + [XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 [XPK] GKE commands done! Resources are created. diff --git a/goldens/NAP_cluster-create.txt b/goldens/NAP_cluster-create.txt index a92230d54..681d5047e 100644 --- a/goldens/NAP_cluster-create.txt +++ b/goldens/NAP_cluster-create.txt @@ -54,6 +54,34 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf [XPK] Enabling Autoprovisioning [XPK] Default Chips quota is minimum: 0, maximum: 4. [XPK] Chips quota is minimum: 0, maximum: 4. XPK will autoprovision 4 chips based on incoming workload requests, keeping at least 0 available at all times, and maximum of 4. If the difference (4 chips) is small, rescaling will not work well. +[XPK] Temp file (6062bfee91f21efca86f2c3261129f06b1896ad9b68d2ecdba9589bea9e15ddf) content: + +management: + autoRepair: true + autoUpgrade: true +scopes: + - "https://www.googleapis.com/auth/devstorage.read_write" +autoprovisioningLocations: + - us-central1-a + +resourceLimits: +- resourceType: 'cpu' + + minimum: 1 + maximum: 1000000 + +- resourceType: 'memory' + + minimum: 1 + maximum: 10000000 + + +- resourceType: tpu7x + minimum: 0 + maximum: 4 + + + [XPK] Task: `Update cluster with autoprovisioning enabled` is implemented by the following command not running since it is a dry run. gcloud container clusters update golden-cluster --project=golden-project --location=us-central1 --enable-autoprovisioning --autoprovisioning-config-file 6062bfee91f21efca86f2c3261129f06b1896ad9b68d2ecdba9589bea9e15ddf [XPK] Task: `Update cluster with autoscaling-profile` is implemented by the following command not running since it is a dry run. @@ -63,6 +91,25 @@ gcloud beta container node-pools list --cluster golden-cluster --project=golden- [XPK] Breaking up a total of 0 commands into 0 batches [XPK] Pretending all the jobs succeeded [XPK] Creating ConfigMap for cluster +[XPK] Temp file (bdf76c6250b016c93566ca5b6d43bcdb2fcc36830987ecceb29d8e314a0dc4e5) content: +kind: ConfigMap +apiVersion: v1 +metadata: + name: golden-cluster-resources-configmap +data: + tpu7x: AUTOPROVISION + minimum_chips: "0" + maximum_chips: "4" + +[XPK] Temp file (90318545b36cafe80f15c3973d9db1c802a9b0f08f3c4ac766a48f1f668ecf80) content: +kind: ConfigMap +apiVersion: v1 +metadata: + name: golden-cluster-metadata-configmap +data: + xpk_version: v0.14.3 + capacity_type: ON_DEMAND + [XPK] Breaking up a total of 2 commands into 1 batches [XPK] Pretending all the jobs succeeded [XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available @@ -71,6 +118,88 @@ gcloud beta container node-pools list --cluster golden-cluster --project=golden- kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l +[XPK] Temp file (1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95) content: + +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: manager + app.kubernetes.io/created-by: jobset + app.kubernetes.io/instance: controller-manager + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: deployment + app.kubernetes.io/part-of: jobset + control-plane: controller-manager + name: jobset-controller-manager + namespace: jobset-system +spec: + replicas: 1 + selector: + matchLabels: + control-plane: controller-manager + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + control-plane: controller-manager + spec: + containers: + - args: + - --config=/controller_manager_config.yaml + - --zap-log-level=2 + command: + - /manager + image: registry.k8s.io/jobset/jobset:v0.8.0 + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: manager + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + memory: 4096Mi + requests: + cpu: 1000m + memory: 128Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumeMounts: + - mountPath: /controller_manager_config.yaml + name: manager-config + subPath: controller_manager_config.yaml + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + securityContext: + runAsNonRoot: true + serviceAccountName: jobset-controller-manager + terminationGracePeriodSeconds: 10 + volumes: + - configMap: + name: jobset-manager-config + name: manager-config + - name: cert + secret: + defaultMode: 420 + secretName: jobset-webhook-server-cert + [XPK] Try 1: Updating jobset Controller Manager resources [XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 @@ -86,7 +215,8 @@ kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.s kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.3/manifests.yaml [XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m -[XPK] Applying following Kueue resources: +[XPK] Temp file (40a7aac5b047c750ee98477984af3d46acd60d164d852eccd1b47a21c4155f2d) content: + apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: @@ -196,10 +326,79 @@ kubectl kjob printcrds | kubectl apply --server-side -f - [XPK] Creating kjob CRDs succeeded [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Temp file (4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61) content: + + apiVersion: kjobctl.x-k8s.io/v1alpha1 + kind: JobTemplate + metadata: + name: xpk-def-batch + namespace: default + template: + spec: + parallelism: 1 + completions: 1 + completionMode: Indexed + template: + spec: + dnsPolicy: ClusterFirstWithHostNet + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + containers: + - name: xpk-batch-container + image: ubuntu:22.04 + workingDir: / + + + priorityClassName: medium + restartPolicy: OnFailure + serviceAccountName: + [XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Temp file (a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8) content: + +apiVersion: v1 +kind: PodTemplate +metadata: + name: xpk-def-pod + namespace: default +template: + spec: + tolerations: + - effect: NoSchedule + key: components.gke.io/gke-managed-components + operator: Equal + value: "true" + containers: + - name: xpk-interactive-container + image: busybox:1.28 + command: [/bin/sh] + workingDir: / + initContainers: + - name: init + image: busybox:1.28 + command: ['/bin/mkdir', '-p', '/'] + serviceAccountName: + [XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Temp file (1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486) content: + +apiVersion: kjobctl.x-k8s.io/v1alpha1 +kind: ApplicationProfile +metadata: + name: xpk-def-app-profile + namespace: default +spec: + supportedModes: + - name: Slurm + template: xpk-def-batch + requiredFlags: [] + - name: Interactive + template: xpk-def-pod + volumeBundles: [] + [XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 [XPK] GKE commands done! Resources are created. diff --git a/goldens/NAP_cluster-create_with_pathways.txt b/goldens/NAP_cluster-create_with_pathways.txt index 432b05b6a..b033cb252 100644 --- a/goldens/NAP_cluster-create_with_pathways.txt +++ b/goldens/NAP_cluster-create_with_pathways.txt @@ -55,6 +55,34 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf [XPK] Enabling Autoprovisioning [XPK] Default Chips quota is minimum: 0, maximum: 4. [XPK] Chips quota is minimum: 0, maximum: 4. XPK will autoprovision 4 chips based on incoming workload requests, keeping at least 0 available at all times, and maximum of 4. If the difference (4 chips) is small, rescaling will not work well. +[XPK] Temp file (6062bfee91f21efca86f2c3261129f06b1896ad9b68d2ecdba9589bea9e15ddf) content: + +management: + autoRepair: true + autoUpgrade: true +scopes: + - "https://www.googleapis.com/auth/devstorage.read_write" +autoprovisioningLocations: + - us-central1-a + +resourceLimits: +- resourceType: 'cpu' + + minimum: 1 + maximum: 1000000 + +- resourceType: 'memory' + + minimum: 1 + maximum: 10000000 + + +- resourceType: tpu7x + minimum: 0 + maximum: 4 + + + [XPK] Task: `Update cluster with autoprovisioning enabled` is implemented by the following command not running since it is a dry run. gcloud container clusters update golden-cluster --project=golden-project --location=us-central1 --enable-autoprovisioning --autoprovisioning-config-file 6062bfee91f21efca86f2c3261129f06b1896ad9b68d2ecdba9589bea9e15ddf [XPK] Task: `Update cluster with autoscaling-profile` is implemented by the following command not running since it is a dry run. @@ -64,6 +92,25 @@ gcloud beta container node-pools list --cluster golden-cluster --project=golden- [XPK] Breaking up a total of 0 commands into 0 batches [XPK] Pretending all the jobs succeeded [XPK] Creating ConfigMap for cluster +[XPK] Temp file (bdf76c6250b016c93566ca5b6d43bcdb2fcc36830987ecceb29d8e314a0dc4e5) content: +kind: ConfigMap +apiVersion: v1 +metadata: + name: golden-cluster-resources-configmap +data: + tpu7x: AUTOPROVISION + minimum_chips: "0" + maximum_chips: "4" + +[XPK] Temp file (90318545b36cafe80f15c3973d9db1c802a9b0f08f3c4ac766a48f1f668ecf80) content: +kind: ConfigMap +apiVersion: v1 +metadata: + name: golden-cluster-metadata-configmap +data: + xpk_version: v0.14.3 + capacity_type: ON_DEMAND + [XPK] Breaking up a total of 2 commands into 1 batches [XPK] Pretending all the jobs succeeded [XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available @@ -72,6 +119,88 @@ gcloud beta container node-pools list --cluster golden-cluster --project=golden- kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run. kubectl get node --no-headers | wc -l +[XPK] Temp file (1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95) content: + +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: manager + app.kubernetes.io/created-by: jobset + app.kubernetes.io/instance: controller-manager + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: deployment + app.kubernetes.io/part-of: jobset + control-plane: controller-manager + name: jobset-controller-manager + namespace: jobset-system +spec: + replicas: 1 + selector: + matchLabels: + control-plane: controller-manager + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + control-plane: controller-manager + spec: + containers: + - args: + - --config=/controller_manager_config.yaml + - --zap-log-level=2 + command: + - /manager + image: registry.k8s.io/jobset/jobset:v0.8.0 + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: manager + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + memory: 4096Mi + requests: + cpu: 1000m + memory: 128Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumeMounts: + - mountPath: /controller_manager_config.yaml + name: manager-config + subPath: controller_manager_config.yaml + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + securityContext: + runAsNonRoot: true + serviceAccountName: jobset-controller-manager + terminationGracePeriodSeconds: 10 + volumes: + - configMap: + name: jobset-manager-config + name: manager-config + - name: cert + secret: + defaultMode: 420 + secretName: jobset-webhook-server-cert + [XPK] Try 1: Updating jobset Controller Manager resources [XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run. kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95 @@ -87,7 +216,8 @@ kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.s kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.3/manifests.yaml [XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run. kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m -[XPK] Applying following Kueue resources: +[XPK] Temp file (f89effb1f55aef327018037d75f743b5c62d59f1f62fddadaaa31f72e5e07bdf) content: + apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: @@ -206,10 +336,79 @@ kubectl kjob printcrds | kubectl apply --server-side -f - [XPK] Creating kjob CRDs succeeded [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true +[XPK] Temp file (4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61) content: + + apiVersion: kjobctl.x-k8s.io/v1alpha1 + kind: JobTemplate + metadata: + name: xpk-def-batch + namespace: default + template: + spec: + parallelism: 1 + completions: 1 + completionMode: Indexed + template: + spec: + dnsPolicy: ClusterFirstWithHostNet + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + containers: + - name: xpk-batch-container + image: ubuntu:22.04 + workingDir: / + + + priorityClassName: medium + restartPolicy: OnFailure + serviceAccountName: + [XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run. kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61 +[XPK] Temp file (a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8) content: + +apiVersion: v1 +kind: PodTemplate +metadata: + name: xpk-def-pod + namespace: default +template: + spec: + tolerations: + - effect: NoSchedule + key: components.gke.io/gke-managed-components + operator: Equal + value: "true" + containers: + - name: xpk-interactive-container + image: busybox:1.28 + command: [/bin/sh] + workingDir: / + initContainers: + - name: init + image: busybox:1.28 + command: ['/bin/mkdir', '-p', '/'] + serviceAccountName: + [XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run. kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8 +[XPK] Temp file (1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486) content: + +apiVersion: kjobctl.x-k8s.io/v1alpha1 +kind: ApplicationProfile +metadata: + name: xpk-def-app-profile + namespace: default +spec: + supportedModes: + - name: Slurm + template: xpk-def-batch + requiredFlags: [] + - name: Interactive + template: xpk-def-pod + volumeBundles: [] + [XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run. kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486 [XPK] GKE commands done! Resources are created. diff --git a/goldens/Workload_create.txt b/goldens/Workload_create.txt index 380633b68..e4719acbf 100644 --- a/goldens/Workload_create.txt +++ b/goldens/Workload_create.txt @@ -15,6 +15,17 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf [XPK] No gcp parallelstore instances to add detected. [XPK] No gce persistent disk instances to add detected. [XPK] No managed lustre instances to add detected. +[XPK] Temp file (4b6736a12db8ea0f78ce793fd0d4ee0c94c652303f1dc0fecad085ea0993f688) content: +FROM python:3.10 + + # Set the working directory in the container + WORKDIR /app + + # Copy all files from local workspace into docker container + COPY . . + + WORKDIR /app + [XPK] Building /tmp into docker image. [XPK] Task: `Building script_dir into docker image` is implemented by the following command not running since it is a dry run. docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94c652303f1dc0fecad085ea0993f688 -t dry-run-runner /tmp @@ -23,6 +34,114 @@ docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94 docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. docker push gcr.io/golden-project/dry-run-runner:prefix-current +[XPK] Temp file (e21c8ebdc21d15a852187058c096898c486d3b1066e67dcfb67e5052a1d0a7fa) content: +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: golden-workload + labels: + kueue.x-k8s.io/queue-name: multislice-queue # Name of the LocalQueue + xpk.google.com/workload: golden-workload + annotations: + alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool # 1:1 job replica to node pool assignment +spec: + ttlSecondsAfterFinished: 43200 + failurePolicy: + rules: + - action: FailJobSet + onJobFailureReasons: + - PodFailurePolicy + maxRestarts: 0 + replicatedJobs: + - name: slice-job + replicas: 1 + template: + spec: + parallelism: 1 # Equal to the number of VMs per slice (or sub-slice). + completions: 1 # Same as the above. + backoffLimit: 0 # When any pod fails, the job is failed + + podFailurePolicy: + rules: + - action: FailJob + onExitCodes: + containerName: jax-tpu + operator: NotIn + values: [42,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255] + template: + metadata: + labels: + xpk.google.com/workload: golden-workload + annotations: + + + spec: + schedulerName: default-scheduler + imagePullSecrets: + - name: None + restartPolicy: Never + + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5p-slice + cloud.google.com/gke-tpu-topology: 2x2x1 + + + priorityClassName: medium + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + terminationGracePeriodSeconds: 30 + containers: + - name: jax-tpu + image: gcr.io/golden-project/dry-run-runner:prefix-current + + env: + ports: + - containerPort: 8471 + - containerPort: 8080 + + securityContext: + privileged: true + command: + - bash + - -c + - | + echo XPK Start: $(date); + _sigterm() (kill -SIGTERM $! 2>/dev/null;); + trap _sigterm SIGTERM; + + (bash hello) & PID=$!; + while kill -0 $PID 2>/dev/null; + do sleep 5; + done; + wait $PID; + EXIT_CODE=$?; + + echo XPK End: $(date); + echo EXIT_CODE=$EXIT_CODE; + + + exit $EXIT_CODE + resources: + limits: + google.com/tpu: 4 + + volumeMounts: + - mountPath: /dev/shm + name: dshm-2 + + + serviceAccountName: + tolerations: + + - operator: "Exists" + key: google.com/tpu + + volumes: + - emptyDir: + medium: Memory + name: dshm-2 + + [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. kubectl apply -f e21c8ebdc21d15a852187058c096898c486d3b1066e67dcfb67e5052a1d0a7fa [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Workload_create_pathways.txt b/goldens/Workload_create_pathways.txt index fe352c2c4..a4f2c9f7c 100644 --- a/goldens/Workload_create_pathways.txt +++ b/goldens/Workload_create_pathways.txt @@ -17,6 +17,17 @@ kubectl get pods -n pathways-job-system --no-headers -o custom-columns=NAME:.met gcloud container clusters list --project=golden-project --filter=name=golden-cluster --format="value(location)" [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" +[XPK] Temp file (4b6736a12db8ea0f78ce793fd0d4ee0c94c652303f1dc0fecad085ea0993f688) content: +FROM python:3.10 + + # Set the working directory in the container + WORKDIR /app + + # Copy all files from local workspace into docker container + COPY . . + + WORKDIR /app + [XPK] Building /tmp into docker image. [XPK] Task: `Building script_dir into docker image` is implemented by the following command not running since it is a dry run. docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94c652303f1dc0fecad085ea0993f688 -t dry-run-runner /tmp @@ -25,6 +36,94 @@ docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94 docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. docker push gcr.io/golden-project/dry-run-runner:prefix-current +[XPK] Temp file (6fb0f350cf4e0dccc71f77392d12db3de6371d5148519657046613794358bfce) content: + + apiVersion: pathways-job.pathways.domain/v1 + kind: PathwaysJob + metadata: + name: golden-workload + labels: + kueue.x-k8s.io/queue-name: multislice-queue # Name of the LocalQueue + xpk.google.com/workload: golden-workload + spec: + maxRestarts: 0 + customComponents: + + + + + workers: + - type: ct5p-hightpu-4t + topology: 2x2x1 + numSlices: 1 + maxSliceRestarts: 1 + terminationGracePeriodSeconds: 30 + priorityClassName: medium + nodeSelector: + + + pathwaysDir: gs://cloud-pathways-staging/tmp #This bucket needs to be created in advance. + controller: + # #Pod template for training, default mode. + deploymentMode: default + mainContainerName: jax-tpu + elasticSlices: 0 + template: + + metadata: + spec: + containers: + - name: jax-tpu + image: gcr.io/golden-project/dry-run-runner:prefix-current + imagePullPolicy: Always + env: + ports: + + + securityContext: + privileged: true + command: + - bash + - -c + - | + echo XPK Start: $(date); + _sigterm() (kill -SIGTERM $! 2>/dev/null;); + trap _sigterm SIGTERM; + + (bash hello) & PID=$!; + while kill -0 $PID 2>/dev/null; + do sleep 5; + done; + wait $PID; + EXIT_CODE=$?; + + echo XPK End: $(date); + echo EXIT_CODE=$EXIT_CODE; + + + exit $EXIT_CODE + resources: + limits: + cpu: "24" + memory: 100G + + volumeMounts: + - mountPath: /tmp + name: shared-tmp + + + nodeSelector: + cloud.google.com/gke-nodepool: cpu-np + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + restartPolicy: Never + volumes: + - hostPath: + path: /tmp + type: DirectoryOrCreate + name: shared-tmp + + [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. kubectl apply -f 6fb0f350cf4e0dccc71f77392d12db3de6371d5148519657046613794358bfce [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. diff --git a/src/xpk/core/kueue_manager.py b/src/xpk/core/kueue_manager.py index ef7738e39..4344ae45d 100644 --- a/src/xpk/core/kueue_manager.py +++ b/src/xpk/core/kueue_manager.py @@ -22,7 +22,6 @@ from jinja2 import Environment, FileSystemLoader from ..utils.topology import get_slice_topology_level, get_topology_product, is_topology_contained -from ..utils.execution_context import is_dry_run from ..utils.kueue import is_queued_cluster from kubernetes.utils import parse_quantity from .capacity import B200_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE @@ -439,8 +438,6 @@ def __get_topology_name_and_yaml( def __apply_manifest(self, manifest: str) -> int: task = "Applying Kueue Custom Resources" - if is_dry_run(): - xpk_print(f"Applying following Kueue resources:{manifest}") tmp_file = write_tmp_file(manifest) command = f"kubectl apply -f {tmp_file}" return run_command_with_updates(command, task) diff --git a/src/xpk/utils/file.py b/src/xpk/utils/file.py index f5242e2d3..858fdfbd8 100644 --- a/src/xpk/utils/file.py +++ b/src/xpk/utils/file.py @@ -18,6 +18,7 @@ import os import hashlib from .execution_context import is_dry_run +from .console import xpk_print def make_tmp_files(per_command_name: list[str]) -> list[str]: @@ -51,7 +52,9 @@ def write_tmp_file(payload: str) -> str: A file object that was written to. """ if is_dry_run(): - return _hash_filename(payload) + name = _hash_filename(payload) + xpk_print(f'Temp file ({name}) content: \n{payload}') + return name with tempfile.NamedTemporaryFile(delete=False) as tmp: with open(file=tmp.name, mode='w', encoding='utf=8') as f: