Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 58 additions & 16 deletions e2e/aks_model.go
Original file line number Diff line number Diff line change
Expand Up @@ -310,27 +310,17 @@ func addFirewallRules(
return err
}

// Find the AKS-managed route table currently associated with the subnet.
// We add firewall routes directly to this table so that both pod routes
// (managed by cloud-provider-azure) and firewall routes coexist. Creating
// a separate route table and swapping the subnet association disconnects
// the pod routes and breaks kubenet networking.
// For kubenet, the AKS-managed route table must stay attached so that pod
// routes (managed by cloud-provider-azure) and firewall routes coexist.
// For Azure CNI variants, the subnet may not have any route table, so we
// create and associate a dedicated one before adding the firewall routes.
aksSubnetResp, err := config.Azure.Subnet.Get(ctx, rg, vnet.name, "aks-subnet", nil)
if err != nil {
return fmt.Errorf("failed to get AKS subnet: %w", err)
Comment on lines 317 to 319
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

addFirewallRules fetches the subnet using the hard-coded name "aks-subnet", but the association update path (updateSubnet) always writes to config.Config.DefaultSubnetName. If DEFAULT_SUBNET_NAME is overridden, this can end up updating a different subnet than the one you just read (and the route table association won’t be applied where expected). Consider using config.Config.DefaultSubnetName consistently for the Subnet.Get call, or pass the subnet name through to updateSubnet and use the name from the fetched subnet.

Suggested change
aksSubnetResp, err := config.Azure.Subnet.Get(ctx, rg, vnet.name, "aks-subnet", nil)
if err != nil {
return fmt.Errorf("failed to get AKS subnet: %w", err)
aksSubnetName := config.Config.DefaultSubnetName
aksSubnetResp, err := config.Azure.Subnet.Get(ctx, rg, vnet.name, aksSubnetName, nil)
if err != nil {
return fmt.Errorf("failed to get AKS subnet %q: %w", aksSubnetName, err)

Copilot uses AI. Check for mistakes.
}
if aksSubnetResp.Properties == nil || aksSubnetResp.Properties.RouteTable == nil || aksSubnetResp.Properties.RouteTable.ID == nil {
return fmt.Errorf("AKS subnet has no route table associated")
}

aksRTID := *aksSubnetResp.Properties.RouteTable.ID
parsedRT, err := arm.ParseResourceID(aksRTID)
aksRTName, err := ensureFirewallRouteTable(ctx, clusterModel, vnet.name, aksSubnetResp.Subnet)
if err != nil {
return fmt.Errorf("failed to parse AKS route table resource ID %q: %w", aksRTID, err)
}
aksRTName := parsedRT.Name
if aksRTName == "" {
return fmt.Errorf("parsed empty route table name from resource ID %q", aksRTID)
return err
Comment on lines 317 to +323
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

addFirewallRules reads subnet "aks-subnet" by a hard-coded name, but the new Azure CNI fallback path associates the route table via updateSubnet, which uses config.Config.DefaultSubnetName. If DEFAULT_SUBNET_NAME is overridden, this can update a different subnet than the one you fetched, leaving the AKS subnet without the route table and causing route creation to fail. Suggest using a single source of truth for the subnet name (e.g., config.Config.DefaultSubnetName) and/or threading the actual subnet name through to the update call.

Copilot uses AI. Check for mistakes.
}

// Create AzureFirewallSubnet - this subnet name is required by Azure Firewall
Expand Down Expand Up @@ -456,6 +446,58 @@ func addFirewallRules(
return nil
}

func ensureFirewallRouteTable(
ctx context.Context,
clusterModel *armcontainerservice.ManagedCluster,
vnetName string,
aksSubnet armnetwork.Subnet,
) (string, error) {
if aksSubnet.Properties == nil {
return "", fmt.Errorf("AKS subnet has no properties")
}
if aksSubnet.Properties.RouteTable != nil && aksSubnet.Properties.RouteTable.ID != nil {
aksRTID := *aksSubnet.Properties.RouteTable.ID
parsedRT, err := arm.ParseResourceID(aksRTID)
if err != nil {
return "", fmt.Errorf("failed to parse AKS route table resource ID %q: %w", aksRTID, err)
}
if parsedRT.Name == "" {
return "", fmt.Errorf("parsed empty route table name from resource ID %q", aksRTID)
}
return parsedRT.Name, nil
}

if clusterModel.Properties == nil || clusterModel.Properties.NetworkProfile == nil || clusterModel.Properties.NetworkProfile.NetworkPlugin == nil {
return "", fmt.Errorf("AKS subnet has no route table associated and cluster network plugin is unknown")
}
if *clusterModel.Properties.NetworkProfile.NetworkPlugin == armcontainerservice.NetworkPluginKubenet {
return "", fmt.Errorf("AKS subnet has no route table associated for kubenet cluster")
}

rg := *clusterModel.Properties.NodeResourceGroup
routeTableName := "abe2e-fw-rt"
toolkit.Logf(ctx, "AKS subnet has no route table; creating dedicated firewall route table %q", routeTableName)
poller, err := config.Azure.RouteTables.BeginCreateOrUpdate(ctx, rg, routeTableName, armnetwork.RouteTable{
Location: clusterModel.Location,
}, nil)
if err != nil {
return "", fmt.Errorf("failed to start creating firewall route table %q: %w", routeTableName, err)
}
routeTableResp, err := poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)
if err != nil {
return "", fmt.Errorf("failed to create firewall route table %q: %w", routeTableName, err)
}

aksSubnet.Properties.RouteTable = &armnetwork.RouteTable{
ID: routeTableResp.ID,
}
if err := updateSubnet(ctx, clusterModel, aksSubnet, vnetName); err != nil {
return "", fmt.Errorf("failed to associate firewall route table %q with AKS subnet: %w", routeTableName, err)
}

return routeTableName, nil
}

Comment on lines +494 to +500
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ensureFirewallRouteTable mutates the subnet returned by Subnet.Get, but then calls updateSubnet() which always targets config.Config.DefaultSubnetName rather than the subnet you just retrieved. This makes the association step brittle and (when the default name differs) incorrect. Consider changing updateSubnet to accept an explicit subnet name (or derive it from aksSubnet.ID) and pass the name of the subnet being updated.

Suggested change
if err := updateSubnet(ctx, clusterModel, aksSubnet, vnetName); err != nil {
return "", fmt.Errorf("failed to associate firewall route table %q with AKS subnet: %w", routeTableName, err)
}
return routeTableName, nil
}
subnetName, err := subnetNameFromSubnet(aksSubnet)
if err != nil {
return "", fmt.Errorf("failed to determine AKS subnet name for firewall route table association: %w", err)
}
if err := updateSubnetByName(ctx, clusterModel, aksSubnet, vnetName, subnetName); err != nil {
return "", fmt.Errorf("failed to associate firewall route table %q with AKS subnet %q: %w", routeTableName, subnetName, err)
}
return routeTableName, nil
}
func subnetNameFromSubnet(subnet armnetwork.Subnet) (string, error) {
if subnet.Name != nil && *subnet.Name != "" {
return *subnet.Name, nil
}
if subnet.ID == nil || *subnet.ID == "" {
return "", fmt.Errorf("subnet has neither name nor ID")
}
parsedSubnet, err := arm.ParseResourceID(*subnet.ID)
if err != nil {
return "", fmt.Errorf("failed to parse subnet resource ID %q: %w", *subnet.ID, err)
}
if parsedSubnet.Name == "" {
return "", fmt.Errorf("parsed empty subnet name from resource ID %q", *subnet.ID)
}
return parsedSubnet.Name, nil
}
func updateSubnetByName(ctx context.Context, clusterModel *armcontainerservice.ManagedCluster, subnet armnetwork.Subnet, vnetName, subnetName string) error {
if clusterModel == nil || clusterModel.Properties == nil || clusterModel.Properties.NodeResourceGroup == nil {
return fmt.Errorf("cluster model has no node resource group")
}
subnetsClient, err := armnetwork.NewSubnetsClient(config.Config.SubscriptionID, config.Azure.Credential, config.Azure.ArmOptions)
if err != nil {
return fmt.Errorf("failed to create subnets client: %w", err)
}
poller, err := subnetsClient.BeginCreateOrUpdate(ctx, *clusterModel.Properties.NodeResourceGroup, vnetName, subnetName, subnet, nil)
if err != nil {
return fmt.Errorf("failed to start updating subnet %q: %w", subnetName, err)
}
if _, err := poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions); err != nil {
return fmt.Errorf("failed to update subnet %q: %w", subnetName, err)
}
return nil
}

Copilot uses AI. Check for mistakes.
func addPrivateAzureContainerRegistry(ctx context.Context, cluster *armcontainerservice.ManagedCluster, kube *Kubeclient, kubeletIdentity *armcontainerservice.UserAssignedIdentity, isNonAnonymousPull bool) error {
if cluster == nil || kube == nil || kubeletIdentity == nil {
return errors.New("cluster, kubeclient, and kubeletIdentity cannot be nil when adding Private Azure Container Registry")
Expand Down
5 changes: 3 additions & 2 deletions e2e/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,9 @@ func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.Manag
identity := dag.Go(g, func(ctx context.Context) (*armcontainerservice.UserAssignedIdentity, error) {
return getClusterKubeletIdentity(ctx, cluster)
})
// networkSetup adds firewall routes to the AKS route table or applies
// network-isolated NSG. It must run after bastion (both mutate the
// networkSetup adds firewall routes to the existing AKS route table or
// creates/associates a dedicated one when Azure CNI has none, or applies
// the network-isolated NSG. It must run after bastion (both mutate the
// VNet) and before collectGarbageVMSS (which needs network setup done).
// collectGarbageVMSS also depends on kube to clean up stale K8s Node
// objects whose backing VMSS no longer exist.
Expand Down
3 changes: 2 additions & 1 deletion e2e/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"strings"
"time"

"github.com/Azure/agentbaker/e2e/config"
"github.com/Azure/agentbaker/e2e/toolkit"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -73,7 +74,7 @@ func ValidateCommonLinux(ctx context.Context, s *Scenario) {
}

// localdns is not supported on scriptless, privatekube and VHDUbuntu2204Gen2ContainerdNetworkIsolatedK8sNotCached.
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment above this condition is now stale: the code also skips LocalDNS validation when running in TEST_PRE_PROVISION mode or when VHDCaching is enabled. Update the comment to reflect the full set of reasons LocalDNS validation is skipped so it stays accurate for future debugging.

Suggested change
// localdns is not supported on scriptless, privatekube and VHDUbuntu2204Gen2ContainerdNetworkIsolatedK8sNotCached.
// Skip LocalDNS validation when it is unsupported on the VHD (for example scriptless, privatekube,
// and VHDUbuntu2204Gen2ContainerdNetworkIsolatedK8sNotCached), when running in TEST_PRE_PROVISION
// mode, or when VHDCaching is enabled.

Copilot uses AI. Check for mistakes.
if !s.VHD.UnsupportedLocalDns {
if !s.VHD.UnsupportedLocalDns && !config.Config.TestPreProvision && !s.VHDCaching {
ValidateLocalDNSService(ctx, s, "enabled")
ValidateLocalDNSResolution(ctx, s, "169.254.10.10")
ValidateLocalDNSExporterMetrics(ctx, s)
Expand Down
Loading