Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 16 additions & 24 deletions e2e/cse_timing.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,6 @@ import (
)

const (
// cseEventsDir is the directory where CSE task timing events are stored on the VM.
// This matches EVENTS_LOGGING_DIR defined in both cse_helpers.sh and cse_start.sh.
// Events are written directly here (not in per-handler subdirectories) — each file
// is a single-line JSON object named <epoch-ms>.json.
cseEventsDir = "/var/log/azure/Microsoft.Azure.Extensions.CustomScript/events/"
// provisionJSONPath is the path to the provision.json file with overall boot timing.
provisionJSONPath = "/var/log/azure/aks/provision.json"
)
Expand All @@ -33,23 +28,23 @@ type CSETaskTiming struct {

// CSEProvisionTiming represents the overall provisioning timing from provision.json.
type CSEProvisionTiming struct {
ExitCode string `json:"ExitCode"`
ExecDuration string `json:"ExecDuration"`
KernelStartTime string `json:"KernelStartTime"`
CloudInitLocalStart string `json:"CloudInitLocalStartTime"`
CloudInitStart string `json:"CloudInitStartTime"`
CloudFinalStart string `json:"CloudFinalStartTime"`
CSEStartTime string `json:"CSEStartTime"`
GuestAgentStartTime string `json:"GuestAgentStartTime"`
SystemdSummary string `json:"SystemdSummary"`
BootDatapoints json.RawMessage `json:"BootDatapoints"`
ExitCode string `json:"ExitCode"`
ExecDuration string `json:"ExecDuration"`
KernelStartTime string `json:"KernelStartTime"`
CloudInitLocalStart string `json:"CloudInitLocalStartTime"`
CloudInitStart string `json:"CloudInitStartTime"`
CloudFinalStart string `json:"CloudFinalStartTime"`
CSEStartTime string `json:"CSEStartTime"`
GuestAgentStartTime string `json:"GuestAgentStartTime"`
SystemdSummary string `json:"SystemdSummary"`
BootDatapoints json.RawMessage `json:"BootDatapoints"`
}

// CSETimingReport holds all parsed timing data from a VM.
type CSETimingReport struct {
Tasks []CSETaskTiming
Provision *CSEProvisionTiming
taskIndex map[string]*CSETaskTiming
Tasks []CSETaskTiming
Provision *CSEProvisionTiming
taskIndex map[string]*CSETaskTiming
}

// cseEventJSON matches the JSON structure written by logs_to_events() in cse_helpers.sh.
Expand Down Expand Up @@ -128,12 +123,9 @@ func ExtractCSETimings(ctx context.Context, s *Scenario) (*CSETimingReport, erro

// Read all event JSON files from the CSE events directory, explicitly
// appending a newline after each file so each JSON document is separated.
// Search both the primary events directory and any handler-version subdirectories,
// as the Guest Agent may move events between these locations.
listCmd := fmt.Sprintf(
"sudo find %s /var/log/azure/Microsoft.Azure.Extensions.CustomScript/ -name '*.json' -path '*/events/*' -exec sh -c 'cat \"$1\"; echo' _ {} \\; 2>/dev/null",
cseEventsDir,
)
// Search the CustomScript directory tree for any events/ subdirectories,
// as the Guest Agent may store events in handler-version subdirectories.
listCmd := "sudo find /var/log/azure/Microsoft.Azure.Extensions.CustomScript/ -name '*.json' -path '*/events/*' -exec sh -c 'cat \"$1\"; echo' _ {} \\; 2>/dev/null"
result, err := execScriptOnVm(ctx, s, s.Runtime.VM, listCmd)
Comment thread
r2k1 marked this conversation as resolved.
if err != nil {
return nil, fmt.Errorf("failed to read CSE events: %w", err)
Expand Down
24 changes: 2 additions & 22 deletions e2e/kube.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ func getClusterKubeClient(ctx context.Context, cluster *armcontainerservice.Mana
}, nil
}

func (k *Kubeclient) WaitUntilPodRunningWithRetry(ctx context.Context, namespace string, labelSelector string, fieldSelector string, maxRetries int) (*corev1.Pod, error) {
func (k *Kubeclient) WaitUntilPodRunning(ctx context.Context, namespace string, labelSelector string, fieldSelector string) (*corev1.Pod, error) {
defer toolkit.LogStepCtxf(ctx, "waiting for pod %s %s in %q namespace", labelSelector, fieldSelector, namespace)()
var pod *corev1.Pod

Expand All @@ -101,22 +101,6 @@ func (k *Kubeclient) WaitUntilPodRunningWithRetry(ctx context.Context, namespace
}
}

// Check for FailedCreatePodSandBox events
events, err := k.Typed.CoreV1().Events(pod.Namespace).List(ctx, metav1.ListOptions{FieldSelector: "involvedObject.name=" + pod.Name})
if err == nil {
for _, event := range events.Items {
if event.Reason == "FailedCreatePodSandBox" {
maxRetries--
sandboxErr := fmt.Errorf("pod %s has FailedCreatePodSandBox event: %s", pod.Name, event.Message)
if maxRetries <= 0 {
return false, sandboxErr
}
k.Typed.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{GracePeriodSeconds: to.Ptr(int64(0))})
return false, nil // Keep polling
}
}
}

switch pod.Status.Phase {
case corev1.PodFailed:
logPodDebugInfo(ctx, k, pod)
Expand All @@ -142,10 +126,6 @@ func (k *Kubeclient) WaitUntilPodRunningWithRetry(ctx context.Context, namespace
return pod, err
}

func (k *Kubeclient) WaitUntilPodRunning(ctx context.Context, namespace string, labelSelector string, fieldSelector string) (*corev1.Pod, error) {
return k.WaitUntilPodRunningWithRetry(ctx, namespace, labelSelector, fieldSelector, 0)
}

func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssName string) string {
defer toolkit.LogStepf(t, "waiting for node %s to be ready", vmssName)()
var lastNode *corev1.Node
Expand Down Expand Up @@ -199,7 +179,7 @@ func (k *Kubeclient) GetPodNetworkDebugPodForNode(ctx context.Context, kubeNodeN
if kubeNodeName == "" {
return nil, fmt.Errorf("kubeNodeName must not be empty")
}
return k.WaitUntilPodRunningWithRetry(ctx, defaultNamespace, fmt.Sprintf("app=%s", podNetworkDebugAppLabel), "spec.nodeName="+kubeNodeName, 3)
return k.WaitUntilPodRunning(ctx, defaultNamespace, fmt.Sprintf("app=%s", podNetworkDebugAppLabel), "spec.nodeName="+kubeNodeName)
Copy link
Copy Markdown
Contributor Author

@r2k1 r2k1 May 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The retry didn't work. It re-created pod with the same name preserving the events.
The retry found previous events and failed the test (when it suppose to continue, k8s automatically retries pod creation in this case, such events are transient and can be ignored)

}

func logPodDebugInfo(ctx context.Context, kube *Kubeclient, pod *corev1.Pod) {
Expand Down
66 changes: 42 additions & 24 deletions e2e/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (
"github.com/Azure/agentbaker/e2e/config"
"github.com/Azure/agentbaker/e2e/toolkit"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
Expand Down Expand Up @@ -269,15 +268,27 @@ func getIPTablesRulesCompatibleWithEBPFHostRouting() (map[string][]string, []str
}

// validateWireServerBlocked checks that unprivileged pods cannot reach WireServer.
// The iptables FORWARD DROP rules blocking pod→WireServer traffic can be transiently
// absent when kube-proxy or CNI flush/recreate iptables chains during node setup.
// We resolve the debug pod once up front (outside the retry budget) so that pod
// scheduling latency doesn't eat into the iptables-check timeout.
// Wireserver must never be reachable from pods — any successful connection is a
// security issue, not a transient condition to retry through.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dont commit and ensure stability before removing the retry. or else this will keep generating issues, this test run with every e2e

//
// We accept two curl exit codes as evidence of a working block:
//
// 28 = operation timeout (FORWARD DROP — packets silently dropped)
// 7 = couldn't connect (FORWARD REJECT — RST / ICMP unreachable)
//
// Any other exit code is suspicious and fails the test with full diagnostics:
//
// 0 = wireserver reachable (security regression)
// 127 = curl missing from debug image (test would otherwise silently bypass)
// 2/3 = invalid curl args
// 6 = DNS resolution issue (wireserver IP is literal — should not happen)
//
// We do retry transient kube-apiserver exec hiccups, but never on the curl
// result itself — a single observation of an unexpected exit code is enough
// to fail loudly.
func validateWireServerBlocked(ctx context.Context, s *Scenario) {
defer toolkit.LogStep(s.T, "validating wireserver is blocked from unprivileged pods")()

// Resolve the unprivileged debug pod once — this can take 25-30s on cold nodes.
// Using the parent context so it has the full scenario timeout, not the short poll timeout.
nonHostPod, err := s.Runtime.Cluster.Kube.GetPodNetworkDebugPodForNode(ctx, s.Runtime.VM.KubeName)
require.NoError(s.T, err, "failed to get non host debug pod for wireserver validation")

Expand All @@ -297,30 +308,37 @@ func validateWireServerBlocked(ctx context.Context, s *Scenario) {
},
}

allowedExitCodes := map[string]bool{"28": true, "7": true}

for _, check := range checks {
var lastResult *podExecResult
err := wait.PollUntilContextTimeout(ctx, 10*time.Second, 1*time.Minute, true, func(ctx context.Context) (bool, error) {
execResult, execErr := execOnUnprivilegedPod(ctx, s.Runtime.Cluster.Kube, nonHostPod.Namespace, nonHostPod.Name, check.cmd)
var execResult *podExecResult
pollErr := wait.PollUntilContextTimeout(ctx, 5*time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) {
r, execErr := execOnUnprivilegedPod(ctx, s.Runtime.Cluster.Kube, nonHostPod.Namespace, nonHostPod.Name, check.cmd)
if execErr != nil {
s.T.Logf("wireserver check %q: exec error (retrying): %v", check.desc, execErr)
return false, nil
}
lastResult = execResult
if lastResult.exitCode == "28" {
return true, nil
}
s.T.Logf("wireserver check %q: expected exit code 28, got %s (retrying)", check.desc, lastResult.exitCode)
return false, nil
execResult = r
return true, nil
})
if err != nil {
s.T.Logf("host IPTABLES: %s", execScriptOnVMForScenario(ctx, s, "sudo iptables -t filter -L FORWARD -v -n --line-numbers").String())
if lastResult == nil {
require.NoErrorf(s.T, err, "curl to %s did not complete before polling stopped", check.desc)
}
s.T.Logf("last curl result for %s: %s", check.desc, lastResult.String())
assert.Equal(s.T, "28", lastResult.exitCode, "curl to %s expected to fail with timeout, but it didn't after retries", check.desc)
s.T.FailNow()
require.NoErrorf(s.T, pollErr, "wireserver check %q: exec failed after retries", check.desc)

if allowedExitCodes[execResult.exitCode] {
continue
}

iptablesFwd := execScriptOnVMForScenario(ctx, s, "sudo iptables -t filter -L FORWARD -v -n --line-numbers").String()
iptablesKubeFwd := execScriptOnVMForScenario(ctx, s, "sudo iptables -t filter -L KUBE-FORWARD -v -n --line-numbers 2>/dev/null || echo 'chain not found'").String()
iptablesSave := execScriptOnVMForScenario(ctx, s, "sudo iptables-save -t filter 2>/dev/null | head -80").String()
conntrack := execScriptOnVMForScenario(ctx, s, "sudo conntrack -L -d 168.63.129.16 2>/dev/null || echo 'conntrack not available'").String()
s.T.Fatalf("wireserver check %q: unexpected curl exit code %q (want 28 timeout or 7 refused)\n"+
"stdout=%q, stderr=%q\n"+
"FORWARD chain:\n%s\n"+
"KUBE-FORWARD chain:\n%s\n"+
"iptables-save filter:\n%s\n"+
"conntrack:\n%s",
check.desc, execResult.exitCode, execResult.stdout, execResult.stderr,
iptablesFwd, iptablesKubeFwd, iptablesSave, conntrack)
}
}

Expand Down
Loading