Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
872 changes: 841 additions & 31 deletions components/linux/action.pb.go

Large diffs are not rendered by default.

96 changes: 95 additions & 1 deletion components/linux/action.proto
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,98 @@ message ConfigureIPTablesSpec {
}

message ConfigureIPTablesStatus {
}
}

// ConfigureStaticRoutes installs one or more static IPv4 routes on the node
// via a systemd oneshot unit that runs `ip route replace` before kubelet
// starts. This is intended for cases where the VM provider's default routing
// is wrong for the cluster — for example, Azure ND-isr SKUs install connected
// /16 routes for the InfiniBand fabric that can shadow legitimate cluster
// CIDRs. More-specific /24 routes added via this action win over the IB /16
// without disturbing peer-to-peer IB traffic.
message ConfigureStaticRoutes {
  // metadata is the api.Metadata attached to this action.
  api.Metadata metadata = 1;

  // spec is the requested static-route configuration.
  ConfigureStaticRoutesSpec spec = 2;

  // status is the result slot for this action. ConfigureStaticRoutesStatus
  // currently declares no fields.
  ConfigureStaticRoutesStatus status = 3;
}

message ConfigureStaticRoutesSpec {
  // enabled gates the whole action: no route is applied unless it is
  // explicitly true. Static-route injection is therefore an intentional
  // opt-in, to be switched on only after a real routing overlap has been
  // confirmed (for example, an IB /16 colliding with cluster/VNet CIDRs).
  //
  // Declared before `routes` although it carries the higher field number;
  // the numbers, not the declaration order, define the wire format.
  bool enabled = 2;

  // routes lists the static routes to install. The order is preserved, but
  // it has no influence on kernel route selection, where the longest
  // matching prefix wins.
  repeated StaticRoute routes = 1;
}

// ConfigureStaticRoutesStatus currently declares no fields.
message ConfigureStaticRoutesStatus {}

// StaticRoute describes a single IPv4 route. IPv6 is intentionally not
// supported; validation rejects non-IPv4 destinations and gateways.
message StaticRoute {
  // destination is the route's target as an IPv4 CIDR, for example
  // "172.16.1.0/24". Required.
  string destination = 1;

  // gateway is the IPv4 address of the next hop. If left empty, the oneshot
  // looks up the default gateway on `dev` at boot time, retrying for a
  // bounded period because DHCP may not have installed the default route
  // yet. When no default gateway shows up within that retry window the
  // oneshot fails, and kubelet will not start.
  string gateway = 2;

  // dev is the outbound interface, for example "eth0". If left empty, the
  // oneshot uses the outbound interface of the IPv4 default route as found
  // at boot time, which works with both classic (eth0) and predictable
  // (ens*, enp*) interface names. When set it must match
  // [A-Za-z0-9_.-]{1,15}.
  string dev = 3;

  // metric is the route metric used for tie-breaking. 0 means default.
  uint32 metric = 4;
}

// CheckRouteOverlap verifies at boot, before kubelet starts, that a list
// of expected IPv4 CIDRs (typically the cluster pod CIDR, service CIDR,
// and API server) actually route via the same interface as the IPv4
// default route. When a CIDR resolves out a different interface — the
// classic symptom is the H200 IB driver shadowing a customer VNet CIDR
// with a connected /16 on ib0 — the check either logs a warning or
// fails the boot, depending on `mode`.
message CheckRouteOverlap {
  // metadata is the api.Metadata attached to this action.
  api.Metadata metadata = 1;

  // spec holds the CIDRs to verify and the failure mode to apply.
  CheckRouteOverlapSpec spec = 2;

  // status is the result slot for this action. CheckRouteOverlapStatus
  // currently declares no fields.
  CheckRouteOverlapStatus status = 3;
}

message CheckRouteOverlapSpec {
  // Mode selects the reaction to a detected overlap.
  enum Mode {
    // Zero value, used when `mode` is left unset.
    // NOTE(review): how the consumer treats this value is not visible here;
    // confirm whether it is rejected or mapped to WARN/STRICT.
    MODE_UNSPECIFIED = 0;

    // Log the overlap and write /run/aks-flex-node/route-overlap.detected;
    // kubelet starts anyway.
    WARN = 1;

    // Log exactly as WARN does, then exit 1 so that kubelet does not start
    // (the unit is RequiredBy=kubelet.service).
    STRICT = 2;
  }

  // expected_cidrs lists the IPv4 CIDRs that kubelet, kube-proxy, and pods
  // must be able to reach through the outbound interface of the IPv4
  // default route. The controller typically fills it from the cluster's pod
  // CIDR, service CIDR, and API server endpoint.
  repeated string expected_cidrs = 1;

  // mode controls what happens when an overlap is detected; see Mode for
  // the meaning of each value. Use STRICT in production where a misrouted
  // node is worse than a node that won't join.
  Mode mode = 2;
}

// CheckRouteOverlapStatus currently declares no fields.
message CheckRouteOverlapStatus {}
15 changes: 15 additions & 0 deletions components/linux/v20260301/assets/check-route-overlap.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[Unit]
Description=AKSFlexNode IPv4 route overlap pre-flight check
# Run once the network is reported online and after static-routes.service,
# so the check observes any routes that unit installed. After= only orders
# the units; it does not pull static-routes.service in.
After=network-online.target static-routes.service
Wants=network-online.target
# The check has to finish before kubelet is started.
Before=kubelet.service

[Service]
# One-shot check: the unit stays "active" after the script exits so that
# dependants see it as satisfied for the rest of the boot.
Type=oneshot
RemainAfterExit=yes
ExecStart=/bin/bash /etc/aks-flex-node/check-route-overlap.sh
StandardOutput=journal
StandardError=journal

[Install]
# Enabling this unit makes kubelet.service require it, so a failing check
# keeps kubelet from starting.
RequiredBy=kubelet.service
54 changes: 54 additions & 0 deletions components/linux/v20260301/assets/check-route-overlap.sh.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/bin/bash
# Generated by AKSFlexNode CheckRouteOverlap. Do not edit. mode={{ .ModeLabel }}
# Abort on the first failing command and on any use of an unset variable.
set -o errexit -o nounset
# Put the sbin directories first so `ip` is found regardless of the caller's PATH.
PATH=/usr/sbin:/sbin:/usr/bin:/bin:${PATH:-}

# Start every run from a clean slate: drop the result markers left behind by
# a previous invocation before producing new ones.
mkdir -p /run/aks-flex-node
for marker in detected ok; do
  rm -f "/run/aks-flex-node/route-overlap.$marker"
done

# DEFAULT_DEV is the outbound interface of the IPv4 default route: the token
# that follows "dev" on the first "default ..." line printed by `ip`.
DEFAULT_DEV=$(ip -4 route show default 2>/dev/null | awk '/^default / {for (i=1;i<=NF;i++) if ($i=="dev") {print $(i+1); exit}}')

# Without a default route there is no reference interface to compare against:
# record that in the "detected" marker and leave with the template-provided
# exit code.
[ -n "$DEFAULT_DEV" ] || {
  echo "check-route-overlap: no IPv4 default route; cannot determine outbound interface" >&2
  echo "no-default-route" > /run/aks-flex-node/route-overlap.detected
  exit {{ .FailExit }}
}

{{- if .HasEntries }}
# Every heredoc line has the form "<cidr>|<probe address>". The kernel is
# asked which interface it would use to reach the probe address, and the
# answer is compared with DEFAULT_DEV.
overlap_found=0
while IFS='|' read -r cidr probe; do
  # Ignore blank lines in the rendered entry list.
  if [ -z "$cidr" ]; then
    continue
  fi

  via_dev=$(ip -4 route get "$probe" 2>/dev/null | awk '{for (i=1;i<=NF;i++) if ($i=="dev") {print $(i+1); exit}}')
  if [ "$via_dev" = "$DEFAULT_DEV" ]; then
    continue
  fi

  # An empty result means the kernel reported no route at all; anything else
  # is a route through an interface other than the default one.
  if [ -z "$via_dev" ]; then
    msg="NO-ROUTE: expected CIDR $cidr (probe $probe) has no IPv4 route; expected via $DEFAULT_DEV"
  else
    msg="OVERLAP: expected CIDR $cidr (probe $probe) routes via $via_dev, expected $DEFAULT_DEV"
  fi
  echo "$msg" >&2
  echo "$msg" >> /run/aks-flex-node/route-overlap.detected
  overlap_found=1
done <<'EOF'
{{ .Entries }}
EOF

# At least one CIDR is misrouted: print remediation advice and leave with the
# template-provided exit code.
if [ "$overlap_found" -ne 0 ]; then
  cat >&2 <<'EOF'
Action: configure spec.staticRoutes on the NodeClass with more-specific
routes for the affected CIDRs, or rebuild the cluster on a non-overlapping
VNet CIDR. For each affected CIDR, add a spec.staticRoutes entry with the
destination CIDR and next-hop/default gateway for the node's normal outbound interface.
EOF
  exit {{ .FailExit }}
fi

# Every probe left through the default interface: record success.
echo "check-route-overlap: all expected CIDRs route via $DEFAULT_DEV"
touch /run/aks-flex-node/route-overlap.ok
exit 0
{{- else }}
# Nothing was configured to check; that still counts as success.
echo "check-route-overlap: no expected CIDRs configured; nothing to check (default dev: $DEFAULT_DEV)"
touch /run/aks-flex-node/route-overlap.ok
exit 0
{{- end }}
16 changes: 16 additions & 0 deletions components/linux/v20260301/assets/static-routes.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[Unit]
Description=Install AKSFlexNode static routes
# Route install must happen before kubelet tries to reach cluster services
# whose CIDR may otherwise be shadowed by provider-installed connected
# routes (e.g. the Azure InfiniBand fabric /16 on ND-isr SKUs).
Before=kubelet.service
After=network-online.target
Wants=network-online.target

[Service]
# One-shot installer: the unit stays "active" after the script exits so that
# dependants see it as satisfied for the rest of the boot.
Type=oneshot
# Run the script through bash explicitly, as check-route-overlap.service
# does, so the unit does not depend on the script's execute bit.
ExecStart=/bin/bash /etc/aks-flex-node/static-routes.sh
RemainAfterExit=yes

[Install]
# Enabling this unit makes kubelet.service require it, so a failed route
# install keeps kubelet from starting.
RequiredBy=kubelet.service
59 changes: 59 additions & 0 deletions components/linux/v20260301/assets/static-routes.sh.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/bin/bash
# Generated by AKSFlexNode ConfigureStaticRoutes. Do not edit.
# Abort on the first failing command and on any use of an unset variable.
set -o errexit -o nounset
# Put the sbin directories first so `ip` is found regardless of the caller's PATH.
PATH=/usr/sbin:/sbin:/usr/bin:/bin:${PATH:-}

# resolve_default_gw <dev>: writes the next hop of the IPv4 default route on
# <dev> to stdout and returns 0. The lookup is attempted 30 times, one second
# apart (~30s in total), because cloud-init / DHCP may not have installed the
# route yet. Returns 1 when no gateway was found on any attempt.
resolve_default_gw() {
  local iface="$1"
  local tries_left=30
  local next_hop
  while [ "$tries_left" -gt 0 ]; do
    # The gateway is the third field of a "default via <gw> ..." line.
    next_hop=$(ip -4 route show default dev "$iface" 2>/dev/null | awk '/^default via/ {print $3; exit}')
    if [ -n "$next_hop" ]; then
      echo "$next_hop"
      return 0
    fi
    sleep 1
    tries_left=$((tries_left - 1))
  done
  return 1
}

# resolve_default_dev: prints the outbound interface of the IPv4 default
# route (e.g. eth0, ens3, enp0s6) and returns 0. The lookup is attempted 30
# times, one second apart (~30s in total), to give DHCP time to install the
# route. Returns 1 when no default route was found on any attempt.
resolve_default_dev() {
  local i dev
  for i in $(seq 1 30); do
    # The interface is the token that follows "dev" on the first
    # "default ..." line; the awk-internal loop variable is unrelated to the
    # shell's $i.
    dev=$(ip -4 route show default 2>/dev/null | awk '/^default / {for (i=1;i<=NF;i++) if ($i=="dev") {print $(i+1); exit}}')
    if [ -n "$dev" ]; then echo "$dev"; return 0; fi
    sleep 1
  done
  return 1
}

{{- if .HasEntries }}
# DEFAULT_DEV memoizes the interface of the IPv4 default route so that the
# lookup (which may retry for ~30s) runs at most once per script run.
DEFAULT_DEV=""

# resolve_default_dev_cached: makes sure DEFAULT_DEV is populated, resolving
# it on first use only. It prints nothing; callers read DEFAULT_DEV after a
# successful return. Returns 1 when no default route could be resolved.
#
# It must be invoked directly and never through $(...): command substitution
# runs in a subshell, so an assignment made there is lost and the lookup
# would be repeated for every route.
resolve_default_dev_cached() {
  if [ -z "$DEFAULT_DEV" ]; then
    DEFAULT_DEV=$(resolve_default_dev) || return 1
  fi
  return 0
}

# Every heredoc line has the form "<destination>|<dev>|<gateway>|<metric>".
# The dev and gateway fields may carry a template-defined placeholder token
# meaning "resolve this from the default route at boot time".
while IFS='|' read -r DEST DEV GW METRIC; do
  # Ignore blank lines in the rendered entry list.
  [ -z "$DEST" ] && continue
  if [ "$DEV" = "{{ .AutoDevToken }}" ]; then
    resolve_default_dev_cached || { echo "no default IPv4 route; cannot install route $DEST" >&2; exit 1; }
    DEV="$DEFAULT_DEV"
  fi
  if [ "$GW" = "{{ .AutoGWToken }}" ]; then
    GW=$(resolve_default_gw "$DEV") || { echo "no default gateway on $DEV after 30s; cannot install route $DEST" >&2; exit 1; }
  fi
  # A metric of 0 means "use the default", so the metric argument is omitted.
  # An empty field is treated like 0 rather than tripping the integer test.
  if [ "${METRIC:-0}" -gt 0 ]; then
    ip -4 route replace "$DEST" via "$GW" dev "$DEV" metric "$METRIC"
  else
    ip -4 route replace "$DEST" via "$GW" dev "$DEV"
  fi
done <<'EOF'
{{ .Entries }}
EOF
exit 0
{{- else }}
# No routes configured; nothing to do.
exit 0
{{- end }}
Loading
Loading