Skip to content

Commit bdb7d70

Browse files
ChughShilpa authored and sutaakar committed
Add AMD GPU test for ray clusters
1 parent b612ce3 commit bdb7d70

7 files changed

+111
-69
lines changed

.github/workflows/e2e_tests.yaml

+5-1
Original file line number | Diff line number | Diff line change
@@ -89,11 +89,15 @@ jobs:
8989
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
9090
9191
set -euo pipefail
92-
go test -timeout 120m -v -skip "^Test.*Cpu$" ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
92+
go test -timeout 120m -v -run TestMnistRayJobRayClusterAppWrapperCudaGpu ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
9393
9494
- name: Print CodeFlare operator logs
9595
if: always() && steps.deploy.outcome == 'success'
9696
run: |
97+
kubectl get resourceflavor -o yaml
98+
kubectl get clusterqueue -o yaml
99+
kubectl get workload -A -o yaml
100+
kubectl get raycluster -A -o yaml
97101
echo "Printing CodeFlare operator logs"
98102
kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${TEMP_DIR}/codeflare-operator.log
99103

go.mod

+1-1
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ require (
1111
github.com/openshift/api v0.0.0-20240904015708-69df64132c91
1212
github.com/openshift/client-go v0.0.0-20240904130219-3795e907a202
1313
github.com/project-codeflare/appwrapper v1.0.4
14-
github.com/project-codeflare/codeflare-common v0.0.0-20250306164418-eb812487be82
14+
github.com/project-codeflare/codeflare-common v0.0.0-20250317102908-1c124db97844
1515
github.com/ray-project/kuberay/ray-operator v1.2.2
1616
go.uber.org/zap v1.27.0
1717
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56

go.sum

+2-2
Original file line number | Diff line number | Diff line change
@@ -225,8 +225,8 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI
225225
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
226226
github.com/project-codeflare/appwrapper v1.0.4 h1:364zQLX0tsi4LvBBYNKZL7PPbNWPbVU7vK6+/kVV/FQ=
227227
github.com/project-codeflare/appwrapper v1.0.4/go.mod h1:A1b6bMFNMX5Btv3ckgeuAHVVZzp1G30pSBe6BE/xJWE=
228-
github.com/project-codeflare/codeflare-common v0.0.0-20250306164418-eb812487be82 h1:cL1K2+r1lJVwBkhXiVFr2A9DphnylJmilYDIqg/W62M=
229-
github.com/project-codeflare/codeflare-common v0.0.0-20250306164418-eb812487be82/go.mod h1:DPSv5khRiRDFUD43SF8da+MrVQTWmxNhuKJmwSLOyO0=
228+
github.com/project-codeflare/codeflare-common v0.0.0-20250317102908-1c124db97844 h1:hEjZ2pV4Fp81wytijJZ7uHWovKIqirVBA/t1F5hIrbA=
229+
github.com/project-codeflare/codeflare-common v0.0.0-20250317102908-1c124db97844/go.mod h1:DPSv5khRiRDFUD43SF8da+MrVQTWmxNhuKJmwSLOyO0=
230230
github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y=
231231
github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
232232
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=

test/e2e/deployment_appwrapper_test.go

+1-1
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ func TestDeploymentAppWrapper(t *testing.T) {
4545
defer func() {
4646
_ = test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
4747
}()
48-
clusterQueue := createClusterQueue(test, resourceFlavor, 0)
48+
clusterQueue := createClusterQueue(test, resourceFlavor, CPU)
4949
defer func() {
5050
_ = test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
5151
}()

test/e2e/job_appwrapper_test.go

+1-1
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ func TestBatchJobAppWrapper(t *testing.T) {
4343
defer func() {
4444
_ = test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
4545
}()
46-
clusterQueue := createClusterQueue(test, resourceFlavor, 0)
46+
clusterQueue := createClusterQueue(test, resourceFlavor, CPU)
4747
defer func() {
4848
_ = test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
4949
}()

test/e2e/mnist_pytorch_appwrapper_test.go

+5-5
Original file line number | Diff line number | Diff line change
@@ -32,15 +32,15 @@ import (
3232
)
3333

3434
func TestMnistPyTorchAppWrapperCpu(t *testing.T) {
35-
runMnistPyTorchAppWrapper(t, "cpu", 0)
35+
runMnistPyTorchAppWrapper(t, CPU)
3636
}
3737

3838
func TestMnistPyTorchAppWrapperGpu(t *testing.T) {
39-
runMnistPyTorchAppWrapper(t, "gpu", 1)
39+
runMnistPyTorchAppWrapper(t, NVIDIA)
4040
}
4141

4242
// Trains the MNIST dataset as a batch Job in an AppWrapper, and asserts successful completion of the training job.
43-
func runMnistPyTorchAppWrapper(t *testing.T, accelerator string, numberOfGpus int) {
43+
func runMnistPyTorchAppWrapper(t *testing.T, accelerator Accelerator) {
4444
test := With(t)
4545

4646
// Create a namespace
@@ -51,7 +51,7 @@ func runMnistPyTorchAppWrapper(t *testing.T, accelerator string, numberOfGpus in
5151
defer func() {
5252
_ = test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
5353
}()
54-
clusterQueue := createClusterQueue(test, resourceFlavor, numberOfGpus)
54+
clusterQueue := createClusterQueue(test, resourceFlavor, accelerator)
5555
defer func() {
5656
_ = test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
5757
}()
@@ -109,7 +109,7 @@ func runMnistPyTorchAppWrapper(t *testing.T, accelerator string, numberOfGpus in
109109
{Name: "MNIST_DATASET_URL", Value: GetMnistDatasetURL()},
110110
{Name: "PIP_INDEX_URL", Value: GetPipIndexURL()},
111111
{Name: "PIP_TRUSTED_HOST", Value: GetPipTrustedHost()},
112-
{Name: "ACCELERATOR", Value: accelerator},
112+
{Name: "ACCELERATOR", Value: accelerator.Type},
113113
},
114114
Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"},
115115
VolumeMounts: []corev1.VolumeMount{

0 commit comments

Comments
 (0)