Azure · r2k1 · May 7, 2026 · May 8, 2026 · May 12, 2026 · May 13, 2026
@@ -13,11 +13,6 @@ import (
 )
 
 const (
-	// cseEventsDir is the directory where CSE task timing events are stored on the VM.
-	// This matches EVENTS_LOGGING_DIR defined in both cse_helpers.sh and cse_start.sh.
-	// Events are written directly here (not in per-handler subdirectories) — each file
-	// is a single-line JSON object named <epoch-ms>.json.
-	cseEventsDir = "/var/log/azure/Microsoft.Azure.Extensions.CustomScript/events/"
 	// provisionJSONPath is the path to the provision.json file with overall boot timing.
 	provisionJSONPath = "/var/log/azure/aks/provision.json"
 )
@@ -33,23 +28,23 @@ type CSETaskTiming struct {
 
 // CSEProvisionTiming represents the overall provisioning timing from provision.json.
 type CSEProvisionTiming struct {
-	ExitCode              string `json:"ExitCode"`
-	ExecDuration          string `json:"ExecDuration"`
-	KernelStartTime       string `json:"KernelStartTime"`
-	CloudInitLocalStart   string `json:"CloudInitLocalStartTime"`
-	CloudInitStart        string `json:"CloudInitStartTime"`
-	CloudFinalStart       string `json:"CloudFinalStartTime"`
-	CSEStartTime          string `json:"CSEStartTime"`
-	GuestAgentStartTime   string `json:"GuestAgentStartTime"`
-	SystemdSummary        string `json:"SystemdSummary"`
-	BootDatapoints        json.RawMessage `json:"BootDatapoints"`
+	ExitCode            string          `json:"ExitCode"`
+	ExecDuration        string          `json:"ExecDuration"`
+	KernelStartTime     string          `json:"KernelStartTime"`
+	CloudInitLocalStart string          `json:"CloudInitLocalStartTime"`
+	CloudInitStart      string          `json:"CloudInitStartTime"`
+	CloudFinalStart     string          `json:"CloudFinalStartTime"`
+	CSEStartTime        string          `json:"CSEStartTime"`
+	GuestAgentStartTime string          `json:"GuestAgentStartTime"`
+	SystemdSummary      string          `json:"SystemdSummary"`
+	BootDatapoints      json.RawMessage `json:"BootDatapoints"`
 }
 
 // CSETimingReport holds all parsed timing data from a VM.
 type CSETimingReport struct {
-	Tasks      []CSETaskTiming
-	Provision  *CSEProvisionTiming
-	taskIndex  map[string]*CSETaskTiming
+	Tasks     []CSETaskTiming
+	Provision *CSEProvisionTiming
+	taskIndex map[string]*CSETaskTiming
 }
 
 // cseEventJSON matches the JSON structure written by logs_to_events() in cse_helpers.sh.
@@ -128,12 +123,9 @@ func ExtractCSETimings(ctx context.Context, s *Scenario) (*CSETimingReport, erro
 
 	// Read all event JSON files from the CSE events directory, explicitly
 	// appending a newline after each file so each JSON document is separated.
-	// Search both the primary events directory and any handler-version subdirectories,
-	// as the Guest Agent may move events between these locations.
-	listCmd := fmt.Sprintf(
-		"sudo find %s /var/log/azure/Microsoft.Azure.Extensions.CustomScript/ -name '*.json' -path '*/events/*' -exec sh -c 'cat \"$1\"; echo' _ {} \\; 2>/dev/null",
-		cseEventsDir,
-	)
+	// Search the CustomScript directory tree for any events/ subdirectories,
+	// as the Guest Agent may store events in handler-version subdirectories.
+	listCmd := "sudo find /var/log/azure/Microsoft.Azure.Extensions.CustomScript/ -name '*.json' -path '*/events/*' -exec sh -c 'cat \"$1\"; echo' _ {} \\; 2>/dev/null"
 	result, err := execScriptOnVm(ctx, s, s.Runtime.VM, listCmd)
 	if err != nil {
 		return nil, fmt.Errorf("failed to read CSE events: %w", err)

@@ -74,7 +74,7 @@ func getClusterKubeClient(ctx context.Context, cluster *armcontainerservice.Mana
 	}, nil
 }
 
-func (k *Kubeclient) WaitUntilPodRunningWithRetry(ctx context.Context, namespace string, labelSelector string, fieldSelector string, maxRetries int) (*corev1.Pod, error) {
+func (k *Kubeclient) WaitUntilPodRunning(ctx context.Context, namespace string, labelSelector string, fieldSelector string) (*corev1.Pod, error) {
 	defer toolkit.LogStepCtxf(ctx, "waiting for pod %s %s in %q namespace", labelSelector, fieldSelector, namespace)()
 	var pod *corev1.Pod
 
@@ -101,22 +101,6 @@ func (k *Kubeclient) WaitUntilPodRunningWithRetry(ctx context.Context, namespace
 			}
 		}
 
-		// Check for FailedCreatePodSandBox events
-		events, err := k.Typed.CoreV1().Events(pod.Namespace).List(ctx, metav1.ListOptions{FieldSelector: "involvedObject.name=" + pod.Name})
-		if err == nil {
-			for _, event := range events.Items {
-				if event.Reason == "FailedCreatePodSandBox" {
-					maxRetries--
-					sandboxErr := fmt.Errorf("pod %s has FailedCreatePodSandBox event: %s", pod.Name, event.Message)
-					if maxRetries <= 0 {
-						return false, sandboxErr
-					}
-					k.Typed.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{GracePeriodSeconds: to.Ptr(int64(0))})
-					return false, nil // Keep polling
-				}
-			}
-		}
-
 		switch pod.Status.Phase {
 		case corev1.PodFailed:
 			logPodDebugInfo(ctx, k, pod)
@@ -142,10 +126,6 @@ func (k *Kubeclient) WaitUntilPodRunningWithRetry(ctx context.Context, namespace
 	return pod, err
 }
 
-func (k *Kubeclient) WaitUntilPodRunning(ctx context.Context, namespace string, labelSelector string, fieldSelector string) (*corev1.Pod, error) {
-	return k.WaitUntilPodRunningWithRetry(ctx, namespace, labelSelector, fieldSelector, 0)
-}
-
 func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssName string) string {
 	defer toolkit.LogStepf(t, "waiting for node %s to be ready", vmssName)()
 	var lastNode *corev1.Node
@@ -199,7 +179,7 @@ func (k *Kubeclient) GetPodNetworkDebugPodForNode(ctx context.Context, kubeNodeN
 	if kubeNodeName == "" {
 		return nil, fmt.Errorf("kubeNodeName must not be empty")
 	}
-	return k.WaitUntilPodRunningWithRetry(ctx, defaultNamespace, fmt.Sprintf("app=%s", podNetworkDebugAppLabel), "spec.nodeName="+kubeNodeName, 3)
+	return k.WaitUntilPodRunning(ctx, defaultNamespace, fmt.Sprintf("app=%s", podNetworkDebugAppLabel), "spec.nodeName="+kubeNodeName)
 }
 
 func logPodDebugInfo(ctx context.Context, kube *Kubeclient, pod *corev1.Pod) {

@@ -10,7 +10,6 @@ import (
 	"github.com/Azure/agentbaker/e2e/config"
 	"github.com/Azure/agentbaker/e2e/toolkit"
 	"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
-	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
@@ -269,15 +268,27 @@ func getIPTablesRulesCompatibleWithEBPFHostRouting() (map[string][]string, []str
 }
 
 // validateWireServerBlocked checks that unprivileged pods cannot reach WireServer.
-// The iptables FORWARD DROP rules blocking pod→WireServer traffic can be transiently
-// absent when kube-proxy or CNI flush/recreate iptables chains during node setup.
-// We resolve the debug pod once up front (outside the retry budget) so that pod
-// scheduling latency doesn't eat into the iptables-check timeout.
+// Wireserver must never be reachable from pods — any successful connection is a
+// security issue, not a transient condition to retry through.
+//
+// We accept two curl exit codes as evidence of a working block:
+//
+//	28 = operation timeout   (FORWARD DROP — packets silently dropped)
+//	 7 = couldn't connect    (FORWARD REJECT — RST / ICMP unreachable)
+//
+// Any other exit code is suspicious and fails the test with full diagnostics:
+//
+//	  0  = wireserver reachable (security regression)
+//	127  = curl missing from debug image (test would otherwise silently bypass)
+//	2/3  = invalid curl args
+//	  6  = DNS resolution issue (wireserver IP is literal — should not happen)
+//
+// We do retry transient kube-apiserver exec hiccups, but never on the curl
+// result itself — a single observation of an unexpected exit code is enough
+// to fail loudly.
 func validateWireServerBlocked(ctx context.Context, s *Scenario) {
 	defer toolkit.LogStep(s.T, "validating wireserver is blocked from unprivileged pods")()
 
-	// Resolve the unprivileged debug pod once — this can take 25-30s on cold nodes.
-	// Using the parent context so it has the full scenario timeout, not the short poll timeout.
 	nonHostPod, err := s.Runtime.Cluster.Kube.GetPodNetworkDebugPodForNode(ctx, s.Runtime.VM.KubeName)
 	require.NoError(s.T, err, "failed to get non host debug pod for wireserver validation")
 
@@ -297,30 +308,37 @@ func validateWireServerBlocked(ctx context.Context, s *Scenario) {
 		},
 	}
 
+	allowedExitCodes := map[string]bool{"28": true, "7": true}
+
 	for _, check := range checks {
-		var lastResult *podExecResult
-		err := wait.PollUntilContextTimeout(ctx, 10*time.Second, 1*time.Minute, true, func(ctx context.Context) (bool, error) {
-			execResult, execErr := execOnUnprivilegedPod(ctx, s.Runtime.Cluster.Kube, nonHostPod.Namespace, nonHostPod.Name, check.cmd)
+		var execResult *podExecResult
+		pollErr := wait.PollUntilContextTimeout(ctx, 5*time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) {
+			r, execErr := execOnUnprivilegedPod(ctx, s.Runtime.Cluster.Kube, nonHostPod.Namespace, nonHostPod.Name, check.cmd)
 			if execErr != nil {
 				s.T.Logf("wireserver check %q: exec error (retrying): %v", check.desc, execErr)
 				return false, nil
 			}
-			lastResult = execResult
-			if lastResult.exitCode == "28" {
-				return true, nil
-			}
-			s.T.Logf("wireserver check %q: expected exit code 28, got %s (retrying)", check.desc, lastResult.exitCode)
-			return false, nil
+			execResult = r
+			return true, nil
 		})
-		if err != nil {
-			s.T.Logf("host IPTABLES: %s", execScriptOnVMForScenario(ctx, s, "sudo iptables -t filter -L FORWARD -v -n --line-numbers").String())
-			if lastResult == nil {
-				require.NoErrorf(s.T, err, "curl to %s did not complete before polling stopped", check.desc)
-			}
-			s.T.Logf("last curl result for %s: %s", check.desc, lastResult.String())
-			assert.Equal(s.T, "28", lastResult.exitCode, "curl to %s expected to fail with timeout, but it didn't after retries", check.desc)
-			s.T.FailNow()
+		require.NoErrorf(s.T, pollErr, "wireserver check %q: exec failed after retries", check.desc)
+
+		if allowedExitCodes[execResult.exitCode] {
+			continue
 		}
+
+		iptablesFwd := execScriptOnVMForScenario(ctx, s, "sudo iptables -t filter -L FORWARD -v -n --line-numbers").String()
+		iptablesKubeFwd := execScriptOnVMForScenario(ctx, s, "sudo iptables -t filter -L KUBE-FORWARD -v -n --line-numbers 2>/dev/null || echo 'chain not found'").String()
+		iptablesSave := execScriptOnVMForScenario(ctx, s, "sudo iptables-save -t filter 2>/dev/null | head -80").String()
+		conntrack := execScriptOnVMForScenario(ctx, s, "sudo conntrack -L -d 168.63.129.16 2>/dev/null || echo 'conntrack not available'").String()
+		s.T.Fatalf("wireserver check %q: unexpected curl exit code %q (want 28 timeout or 7 refused)\n"+
+			"stdout=%q, stderr=%q\n"+
+			"FORWARD chain:\n%s\n"+
+			"KUBE-FORWARD chain:\n%s\n"+
+			"iptables-save filter:\n%s\n"+
+			"conntrack:\n%s",
+			check.desc, execResult.exitCode, execResult.stdout, execResult.stderr,
+			iptablesFwd, iptablesKubeFwd, iptablesSave, conntrack)
 	}
 }