[Feature] Improve the observability of integration tests (ray-project#775)

Show debug information when a test case fails.
Make the operator label consistent across the Kustomize and Helm deployment methods.
jasoonn authored and DmitriGekhtman committed Dec 6, 2022
1 parent f0abc1d commit 7b97461
Showing 2 changed files with 23 additions and 15 deletions.
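
The failure paths below all call a shared show_cluster_info helper instead of the ad-hoc kubectl commands they used to run inline. As a rough illustration only (this is not the actual helper in the KubeRay test framework), such a function could simply replay those debug commands for a given namespace; the ray.io/node-type=head selector is the one the tests themselves use:

    # Hedged sketch of a debug-dump helper; the real show_cluster_info may differ.
    import subprocess

    def show_cluster_info(namespace: str) -> None:
        """Dump pod state and logs so a failed test leaves useful output in CI."""
        debug_commands = [
            "kubectl get pods -A",                                    # overall pod state
            f"kubectl describe pods -n {namespace}",                  # events, restarts, scheduling
            f"kubectl logs -n {namespace} -l ray.io/node-type=head",  # Ray head pod logs
        ]
        for cmd in debug_commands:
            # check=False: this is best-effort debugging and must not mask the test failure.
            subprocess.run(cmd, shell=True, check=False)
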
helm-chart/kuberay-operator/templates/deployment.yaml (1 change: 1 addition & 0 deletions)
@@ -17,6 +17,7 @@ spec:
       labels:
         app.kubernetes.io/name: {{ include "kuberay-operator.name" . }}
         app.kubernetes.io/instance: {{ .Release.Name }}
+        app.kubernetes.io/component: kuberay-operator
     spec:
       {{- with .Values.imagePullSecrets }}
       imagePullSecrets:
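
Per the commit description, the added app.kubernetes.io/component label matches the value the Kustomize manifests already apply, so the operator pod can be located with a single label selector no matter how KubeRay was installed. A hypothetical lookup with the official kubernetes Python client (the kubeconfig handling here is illustrative, not part of this commit):

    # Hypothetical example: find the KubeRay operator pod via the shared component label.
    from kubernetes import client, config

    config.load_kube_config()  # assumes a local kubeconfig, e.g. a local test cluster
    core_v1 = client.CoreV1Api()
    pods = core_v1.list_pod_for_all_namespaces(
        label_selector="app.kubernetes.io/component=kuberay-operator")
    for pod in pods.items:
        print(pod.metadata.namespace, pod.metadata.name, pod.status.phase)
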
tests/compatibility-test.py (37 changes: 22 additions & 15 deletions)
@@ -14,7 +14,8 @@
     K8S_CLUSTER_MANAGER,
     OperatorManager,
     RuleSet,
-    shell_subprocess_run
+    shell_subprocess_run,
+    show_cluster_info
 )

 logger = logging.getLogger(__name__)
@@ -177,16 +178,11 @@ def test_kill_head(self):
         rtn = shell_subprocess_run(
             'kubectl wait --for=condition=ready pod -l rayCluster=raycluster-compatibility-test --all --timeout=900s', check = False)
         if rtn != 0:
-            shell_subprocess_run('kubectl get pods -A')
-            shell_subprocess_run(
-                'kubectl describe pod $(kubectl get pods | grep -e "-head" | awk "{print \$1}")')
-            shell_subprocess_run(
-                'kubectl logs $(kubectl get pods | grep -e "-head" | awk "{print \$1}")')
-            shell_subprocess_run(
-                'kubectl logs -n $(kubectl get pods -A | grep -e "-operator" | awk \'{print $1 " " $2}\')')
-        assert rtn == 0
+            show_cluster_info("default")
+            raise Exception(f"Nonzero return code {rtn} in test_kill_head()")

     def test_ray_serve(self):
+        cluster_namespace = "default"
         docker_client = docker.from_env()
         container = docker_client.containers.run(ray_image, remove=True, detach=True, stdin_open=True, tty=True,
                                                  network_mode='host', command=["/bin/sh"])
@@ -198,12 +194,14 @@
         exit_code, _ = utils.exec_run_container(container, f'python3 /usr/local/test_ray_serve_1.py {ray_namespace}', timeout_sec = 180)

         if exit_code != 0:
+            show_cluster_info(cluster_namespace)
             raise Exception(f"There was an exception during the execution of test_ray_serve_1.py. The exit code is {exit_code}." +
                             "See above for command output. The output will be printed by the function exec_run_container.")

         # KubeRay only allows at most 1 head pod per RayCluster instance at the same time. In addition,
         # if we have 0 head pods at this moment, it indicates that the head pod crashes unexpectedly.
-        headpods = utils.get_pod(namespace='default', label_selector='ray.io/node-type=head')
+        headpods = utils.get_pod(namespace=cluster_namespace,
+                                 label_selector='ray.io/node-type=head')
         assert(len(headpods.items) == 1)
         old_head_pod = headpods.items[0]
         old_head_pod_name = old_head_pod.metadata.name
@@ -212,23 +210,27 @@
         # Kill the gcs_server process on head node. If fate sharing is enabled, the whole head node pod
         # will terminate.
         exec_command = ['pkill gcs_server']
-        utils.pod_exec_command(pod_name=old_head_pod_name, namespace='default', exec_command=exec_command)
+        utils.pod_exec_command(pod_name=old_head_pod_name,
+                               namespace=cluster_namespace, exec_command=exec_command)

         # Waiting for all pods become ready and running.
-        utils.wait_for_new_head(old_head_pod_name, restart_count, 'default', timeout=300, retry_interval_ms=1000)
+        utils.wait_for_new_head(old_head_pod_name, restart_count,
+                                cluster_namespace, timeout=300, retry_interval_ms=1000)

         # Try to connect to the deployed model again
         utils.copy_to_container(container, 'tests/scripts', '/usr/local/', 'test_ray_serve_2.py')
         exit_code, _ = utils.exec_run_container(container, f'python3 /usr/local/test_ray_serve_2.py {ray_namespace}', timeout_sec = 180)

         if exit_code != 0:
+            show_cluster_info(cluster_namespace)
             raise Exception(f"There was an exception during the execution of test_ray_serve_2.py. The exit code is {exit_code}." +
                             "See above for command output. The output will be printed by the function exec_run_container.")

         container.stop()
         docker_client.close()

     def test_detached_actor(self):
+        cluster_namespace = "default"
         docker_client = docker.from_env()
         container = docker_client.containers.run(ray_image, remove=True, detach=True, stdin_open=True, tty=True,
                                                  network_mode='host', command=["/bin/sh"])
@@ -240,12 +242,14 @@
         exit_code, _ = utils.exec_run_container(container, f'python3 /usr/local/test_detached_actor_1.py {ray_namespace}', timeout_sec = 180)

         if exit_code != 0:
+            show_cluster_info(cluster_namespace)
             raise Exception(f"There was an exception during the execution of test_detached_actor_1.py. The exit code is {exit_code}." +
                             "See above for command output. The output will be printed by the function exec_run_container.")

         # KubeRay only allows at most 1 head pod per RayCluster instance at the same time. In addition,
         # if we have 0 head pods at this moment, it indicates that the head pod crashes unexpectedly.
-        headpods = utils.get_pod(namespace='default', label_selector='ray.io/node-type=head')
+        headpods = utils.get_pod(namespace=cluster_namespace,
+                                 label_selector='ray.io/node-type=head')
         assert(len(headpods.items) == 1)
         old_head_pod = headpods.items[0]
         old_head_pod_name = old_head_pod.metadata.name
@@ -254,10 +258,12 @@
         # Kill the gcs_server process on head node. If fate sharing is enabled, the whole head node pod
         # will terminate.
         exec_command = ['pkill gcs_server']
-        utils.pod_exec_command(pod_name=old_head_pod_name, namespace='default', exec_command=exec_command)
+        utils.pod_exec_command(pod_name=old_head_pod_name,
+                               namespace=cluster_namespace, exec_command=exec_command)

         # Waiting for all pods become ready and running.
-        utils.wait_for_new_head(old_head_pod_name, restart_count, 'default', timeout=300, retry_interval_ms=1000)
+        utils.wait_for_new_head(old_head_pod_name, restart_count,
+                                cluster_namespace, timeout=300, retry_interval_ms=1000)

         # Try to connect to the detached actor again.
         # [Note] When all pods become running and ready, the RayCluster still needs tens of seconds to relaunch actors. Hence,
@@ -266,6 +272,7 @@
         exit_code, _ = utils.exec_run_container(container, f'python3 /usr/local/test_detached_actor_2.py {ray_namespace}', timeout_sec = 180)

         if exit_code != 0:
+            show_cluster_info(cluster_namespace)
             raise Exception(f"There was an exception during the execution of test_detached_actor_2.py. The exit code is {exit_code}." +
                             "See above for command output. The output will be printed by the function exec_run_container.")

