From 55caf8683fb0101c49eaa4f2e520740d55ac54a0 Mon Sep 17 00:00:00 2001 From: Bryce Soghigian Date: Mon, 15 Apr 2024 20:37:04 -0700 Subject: [PATCH 1/6] feat: alpha support for artifact streaming --- pkg/apis/crds/karpenter.azure.com_aksnodeclasses.yaml | 4 ++++ pkg/apis/v1alpha2/aksnodeclass.go | 3 +++ pkg/providers/imagefamily/bootstrap/aksbootstrap.go | 4 +++- pkg/providers/imagefamily/bootstrap/bootstrap.go | 1 + .../imagefamily/bootstrap/containerd.toml.gtpl | 10 ++++++++++ pkg/providers/imagefamily/bootstrap/cse_cmd.sh.gtpl | 1 + pkg/providers/imagefamily/ubuntu_2204.go | 1 + pkg/providers/launchtemplate/launchtemplate.go | 1 + pkg/providers/launchtemplate/parameters/types.go | 3 +-- 9 files changed, 25 insertions(+), 3 deletions(-) diff --git a/pkg/apis/crds/karpenter.azure.com_aksnodeclasses.yaml b/pkg/apis/crds/karpenter.azure.com_aksnodeclasses.yaml index 8cb872a0e..0229d2dad 100644 --- a/pkg/apis/crds/karpenter.azure.com_aksnodeclasses.yaml +++ b/pkg/apis/crds/karpenter.azure.com_aksnodeclasses.yaml @@ -46,6 +46,10 @@ spec: AKSNodeClassSpec is the top level specification for the AKS Karpenter Provider. This will contain configuration necessary to launch instances in AKS. properties: + artifactStreamingEnabled: + default: false + description: AritfactStreamingEnabled is a flag to enable https://aka.ms/artifactstreaming + type: boolean imageFamily: default: Ubuntu2204 description: ImageFamily is the image family that instances use. diff --git a/pkg/apis/v1alpha2/aksnodeclass.go b/pkg/apis/v1alpha2/aksnodeclass.go index ce9ea247a..5736ee7ec 100644 --- a/pkg/apis/v1alpha2/aksnodeclass.go +++ b/pkg/apis/v1alpha2/aksnodeclass.go @@ -27,6 +27,9 @@ import ( // AKSNodeClassSpec is the top level specification for the AKS Karpenter Provider. // This will contain configuration necessary to launch instances in AKS. type AKSNodeClassSpec struct { + // AritfactStreamingEnabled is a flag to enable https://aka.ms/artifactstreaming + // +kubebuilder:default=false + ArtifactStreamingEnabled *bool `json:"artifactStreamingEnabled,omitempty"` // +kubebuilder:default=128 // +kubebuilder:validation:Minimum=100 // osDiskSizeGB is the size of the OS disk in GB. diff --git a/pkg/providers/imagefamily/bootstrap/aksbootstrap.go b/pkg/providers/imagefamily/bootstrap/aksbootstrap.go index 20f972131..c0f0e2187 100644 --- a/pkg/providers/imagefamily/bootstrap/aksbootstrap.go +++ b/pkg/providers/imagefamily/bootstrap/aksbootstrap.go @@ -219,6 +219,7 @@ type NodeBootstrapVariables struct { KubenetTemplate string // s static ContainerdConfigContent string // k determined by GPU VM size, WASM support, Kata support IsKata bool // n user-specified + ArtifactStreamingEnabled bool // t user-specified } var ( @@ -384,7 +385,7 @@ var ( KubenetTemplate: base64.StdEncoding.EncodeToString(kubenetTemplate), // s ContainerdConfigContent: "", // kd IsKata: false, // n - + ArtifactStreamingEnabled: false, // td } ) @@ -448,6 +449,7 @@ func (a AKS) applyOptions(nbv *NodeBootstrapVariables) { nbv.GPUDriverVersion = a.GPUDriverVersion nbv.GPUImageSHA = a.GPUImageSHA } + nbv.ArtifactStreamingEnabled = a.ArtifactStreamingEnabled nbv.NeedsCgroupV2 = true // merge and stringify labels kubeletLabels := lo.Assign(kubeletNodeLabelsBase, a.Labels) diff --git a/pkg/providers/imagefamily/bootstrap/bootstrap.go b/pkg/providers/imagefamily/bootstrap/bootstrap.go index 2bf1fab8d..5fb2c87c3 100644 --- a/pkg/providers/imagefamily/bootstrap/bootstrap.go +++ b/pkg/providers/imagefamily/bootstrap/bootstrap.go @@ -33,6 +33,7 @@ type Options struct { GPUDriverVersion string GPUImageSHA string SubnetID string + ArtifactStreamingEnabled bool } // Bootstrapper can be implemented to generate a bootstrap script diff --git a/pkg/providers/imagefamily/bootstrap/containerd.toml.gtpl b/pkg/providers/imagefamily/bootstrap/containerd.toml.gtpl index ed7ef261a..747936d36 100644 --- a/pkg/providers/imagefamily/bootstrap/containerd.toml.gtpl +++ b/pkg/providers/imagefamily/bootstrap/containerd.toml.gtpl @@ -3,6 +3,10 @@ oom_score = 0 [plugins."io.containerd.grpc.v1.cri"] sandbox_image = "mcr.microsoft.com/oss/kubernetes/pause:3.6" [plugins."io.containerd.grpc.v1.cri".containerd] + {{- if .ArtifactStreamingEnabled }} + snapshotter = "overlaybd" + disable_snapshot_annotations = false + {- end}} {{- if .GPUNode }} default_runtime_name = "nvidia-container-runtime" [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia-container-runtime] @@ -42,3 +46,9 @@ oom_score = 0 X-Meta-Source-Client = ["azure/aks"] [metrics] address = "0.0.0.0:10257" +{{- if .ArtifactStreamingEnabled }} +[proxy_plugins] + [proxy_plugins.overlaybd] + type = "snapshot" + address = "/run/overlaybd-snapshotter/overlaybd.sock" +{{- end}} diff --git a/pkg/providers/imagefamily/bootstrap/cse_cmd.sh.gtpl b/pkg/providers/imagefamily/bootstrap/cse_cmd.sh.gtpl index b7cbbabe3..1159eeb62 100644 --- a/pkg/providers/imagefamily/bootstrap/cse_cmd.sh.gtpl +++ b/pkg/providers/imagefamily/bootstrap/cse_cmd.sh.gtpl @@ -18,6 +18,7 @@ REPO_DEPOT_ENDPOINT="{{.AKSCustomCloudRepoDepotEndpoint}}" {{.InitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; {{end}} ADMINUSER={{.AdminUsername}} +ARTIFACT_STREAMING_ENABLED={{.ArtifactStreamingEnabled}} MOBY_VERSION={{.MobyVersion}} TENANT_ID={{.TenantID}} KUBERNETES_VERSION={{.KubernetesVersion}} diff --git a/pkg/providers/imagefamily/ubuntu_2204.go b/pkg/providers/imagefamily/ubuntu_2204.go index a7c3b8ee2..a2f59bea9 100644 --- a/pkg/providers/imagefamily/ubuntu_2204.go +++ b/pkg/providers/imagefamily/ubuntu_2204.go @@ -86,6 +86,7 @@ func (u Ubuntu2204) UserData(kubeletConfig *corev1beta1.KubeletConfiguration, ta GPUDriverVersion: u.Options.GPUDriverVersion, GPUImageSHA: u.Options.GPUImageSHA, SubnetID: u.Options.SubnetID, + ArtifactStreamingEnabled: u.Options.ArtifactStreamingEnabled, }, Arch: u.Options.Arch, TenantID: u.Options.TenantID, diff --git a/pkg/providers/launchtemplate/launchtemplate.go b/pkg/providers/launchtemplate/launchtemplate.go index 3c5e5ac95..a03ca07e7 100644 --- a/pkg/providers/launchtemplate/launchtemplate.go +++ b/pkg/providers/launchtemplate/launchtemplate.go @@ -133,6 +133,7 @@ func (p *Provider) getStaticParameters(ctx context.Context, instanceType *cloudp ClusterName: options.FromContext(ctx).ClusterName, ClusterEndpoint: p.clusterEndpoint, Tags: nodeClass.Spec.Tags, + ArtifactStreamingEnabled: lo.FromPtr(nodeClass.Spec.ArtifactStreamingEnabled), Labels: labels, CABundle: p.caBundle, Arch: arch, diff --git a/pkg/providers/launchtemplate/parameters/types.go b/pkg/providers/launchtemplate/parameters/types.go index 238ce0710..bbec39e81 100644 --- a/pkg/providers/launchtemplate/parameters/types.go +++ b/pkg/providers/launchtemplate/parameters/types.go @@ -40,9 +40,8 @@ type StaticParameters struct { NetworkPlugin string NetworkPolicy string KubernetesVersion string - - // VNET SubnetID string + ArtifactStreamingEnabled bool Tags map[string]string Labels map[string]string From 6a6c9876ed31b301c95a9f53517a41865b4de6d6 Mon Sep 17 00:00:00 2001 From: Bryce Soghigian Date: Tue, 16 Apr 2024 08:34:03 -0700 Subject: [PATCH 2/6] fix: containerd config needs proxy plugins under plugins --- .../imagefamily/bootstrap/containerd.toml.gtpl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/providers/imagefamily/bootstrap/containerd.toml.gtpl b/pkg/providers/imagefamily/bootstrap/containerd.toml.gtpl index 747936d36..651e49c26 100644 --- a/pkg/providers/imagefamily/bootstrap/containerd.toml.gtpl +++ b/pkg/providers/imagefamily/bootstrap/containerd.toml.gtpl @@ -44,11 +44,11 @@ oom_score = 0 config_path = "/etc/containerd/certs.d" [plugins."io.containerd.grpc.v1.cri".registry.headers] X-Meta-Source-Client = ["azure/aks"] + {{- if .ArtifactStreamingEnabled }} + [proxy_plugins] + [proxy_plugins.overlaybd] + type = "snapshot" + address = "/run/overlaybd-snapshotter/overlaybd.sock" + {{- end}} [metrics] address = "0.0.0.0:10257" -{{- if .ArtifactStreamingEnabled }} -[proxy_plugins] - [proxy_plugins.overlaybd] - type = "snapshot" - address = "/run/overlaybd-snapshotter/overlaybd.sock" -{{- end}} From f331a01b91663ecb3313dcd298bc012824512257 Mon Sep 17 00:00:00 2001 From: Bryce Soghigian Date: Wed, 17 Apr 2024 13:50:50 -0700 Subject: [PATCH 3/6] feat: pushing containerd config fix --- .../imagefamily/bootstrap/containerd.toml.gtpl | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pkg/providers/imagefamily/bootstrap/containerd.toml.gtpl b/pkg/providers/imagefamily/bootstrap/containerd.toml.gtpl index 651e49c26..d69410619 100644 --- a/pkg/providers/imagefamily/bootstrap/containerd.toml.gtpl +++ b/pkg/providers/imagefamily/bootstrap/containerd.toml.gtpl @@ -4,9 +4,12 @@ oom_score = 0 sandbox_image = "mcr.microsoft.com/oss/kubernetes/pause:3.6" [plugins."io.containerd.grpc.v1.cri".containerd] {{- if .ArtifactStreamingEnabled }} - snapshotter = "overlaybd" - disable_snapshot_annotations = false - {- end}} + snapshotter = "overlaybd" + disable_snapshot_annotations = false + [proxy_plugins.overlaybd] + type = "snapshot" + address = "/run/overlaybd-snapshotter/overlaybd.sock" + {{- end}} {{- if .GPUNode }} default_runtime_name = "nvidia-container-runtime" [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia-container-runtime] @@ -44,11 +47,5 @@ oom_score = 0 config_path = "/etc/containerd/certs.d" [plugins."io.containerd.grpc.v1.cri".registry.headers] X-Meta-Source-Client = ["azure/aks"] - {{- if .ArtifactStreamingEnabled }} - [proxy_plugins] - [proxy_plugins.overlaybd] - type = "snapshot" - address = "/run/overlaybd-snapshotter/overlaybd.sock" - {{- end}} [metrics] address = "0.0.0.0:10257" From d7e534012313e6f9b1e8665a5d2e8ae2428e7b83 Mon Sep 17 00:00:00 2001 From: Bryce Soghigian Date: Tue, 7 May 2024 00:18:39 -0700 Subject: [PATCH 4/6] fix: new artifact streaming containerd configuration --- .../bootstrap/containerd.toml.gtpl | 38 ++++++++++--------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/pkg/providers/imagefamily/bootstrap/containerd.toml.gtpl b/pkg/providers/imagefamily/bootstrap/containerd.toml.gtpl index d69410619..ae65e262d 100644 --- a/pkg/providers/imagefamily/bootstrap/containerd.toml.gtpl +++ b/pkg/providers/imagefamily/bootstrap/containerd.toml.gtpl @@ -1,14 +1,15 @@ version = 2 oom_score = 0 [plugins."io.containerd.grpc.v1.cri"] - sandbox_image = "mcr.microsoft.com/oss/kubernetes/pause:3.6" + sandbox_image = "mcr.microsoft.com/oss/kubernetes/pause:3.6" [plugins."io.containerd.grpc.v1.cri".containerd] - {{- if .ArtifactStreamingEnabled }} - snapshotter = "overlaybd" - disable_snapshot_annotations = false - [proxy_plugins.overlaybd] - type = "snapshot" - address = "/run/overlaybd-snapshotter/overlaybd.sock" + {{- if .ArtifactStreamingEnabled }} + snapshotter = "overlaybd" + disable_snapshot_annotations = false + [proxy_plugins] + [proxy_plugins.overlaybd] + type = "snapshot" + address = "/run/overlaybd-snapshotter/overlaybd.sock" {{- end}} {{- if .GPUNode }} default_runtime_name = "nvidia-container-runtime" @@ -16,7 +17,7 @@ oom_score = 0 runtime_type = "io.containerd.runc.v2" [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia-container-runtime.options] BinaryName = "/usr/bin/nvidia-container-runtime" - {{- if .NeedsCgroupV2}} + {{- if .NeedsCgroupV2 }} SystemdCgroup = true {{- end}} [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.untrusted] @@ -29,7 +30,7 @@ oom_score = 0 runtime_type = "io.containerd.runc.v2" [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] BinaryName = "/usr/bin/runc" - {{- if .NeedsCgroupV2}} + {{- if .NeedsCgroupV2 }} SystemdCgroup = true {{- end}} [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.untrusted] @@ -37,15 +38,16 @@ oom_score = 0 [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.untrusted.options] BinaryName = "/usr/bin/runc" {{- end}} - {{- if .EnsureNoDupePromiscuousBridge }} + {{- if .EnsureNoDupePromiscuousBridge }} [plugins."io.containerd.grpc.v1.cri".cni] - bin_dir = "/opt/cni/bin" - conf_dir = "/etc/cni/net.d" - conf_template = "/etc/containerd/kubenet_template.conf" - {{- end}} - [plugins."io.containerd.grpc.v1.cri".registry] - config_path = "/etc/containerd/certs.d" - [plugins."io.containerd.grpc.v1.cri".registry.headers] - X-Meta-Source-Client = ["azure/aks"] + bin_dir = "/opt/cni/bin" + conf_dir = "/etc/cni/net.d" + conf_template = "/etc/containerd/kubenet_template.conf" + {{- end}} + [plugins."io.containerd.grpc.v1.cri".registry] + config_path = "/etc/containerd/certs.d" + [plugins."io.containerd.grpc.v1.cri".registry.headers] + X-Meta-Source-Client = ["azure/aks"] [metrics] address = "0.0.0.0:10257" + From 6b99af2d772a1a7adab27844a56411f6cfa28fc5 Mon Sep 17 00:00:00 2001 From: Bryce Soghigian Date: Tue, 7 May 2024 00:25:48 -0700 Subject: [PATCH 5/6] fix: removing nodeclass usage of artifact streaming enabling by default --- pkg/apis/crds/karpenter.azure.com_aksnodeclasses.yaml | 4 ---- pkg/apis/v1alpha2/aksnodeclass.go | 3 --- pkg/providers/imagefamily/bootstrap/aksbootstrap.go | 3 +-- pkg/providers/imagefamily/bootstrap/bootstrap.go | 1 - pkg/providers/imagefamily/ubuntu_2204.go | 1 - pkg/providers/launchtemplate/launchtemplate.go | 1 - pkg/providers/launchtemplate/parameters/types.go | 1 - 7 files changed, 1 insertion(+), 13 deletions(-) diff --git a/pkg/apis/crds/karpenter.azure.com_aksnodeclasses.yaml b/pkg/apis/crds/karpenter.azure.com_aksnodeclasses.yaml index 0229d2dad..8cb872a0e 100644 --- a/pkg/apis/crds/karpenter.azure.com_aksnodeclasses.yaml +++ b/pkg/apis/crds/karpenter.azure.com_aksnodeclasses.yaml @@ -46,10 +46,6 @@ spec: AKSNodeClassSpec is the top level specification for the AKS Karpenter Provider. This will contain configuration necessary to launch instances in AKS. properties: - artifactStreamingEnabled: - default: false - description: AritfactStreamingEnabled is a flag to enable https://aka.ms/artifactstreaming - type: boolean imageFamily: default: Ubuntu2204 description: ImageFamily is the image family that instances use. diff --git a/pkg/apis/v1alpha2/aksnodeclass.go b/pkg/apis/v1alpha2/aksnodeclass.go index 5736ee7ec..ce9ea247a 100644 --- a/pkg/apis/v1alpha2/aksnodeclass.go +++ b/pkg/apis/v1alpha2/aksnodeclass.go @@ -27,9 +27,6 @@ import ( // AKSNodeClassSpec is the top level specification for the AKS Karpenter Provider. // This will contain configuration necessary to launch instances in AKS. type AKSNodeClassSpec struct { - // AritfactStreamingEnabled is a flag to enable https://aka.ms/artifactstreaming - // +kubebuilder:default=false - ArtifactStreamingEnabled *bool `json:"artifactStreamingEnabled,omitempty"` // +kubebuilder:default=128 // +kubebuilder:validation:Minimum=100 // osDiskSizeGB is the size of the OS disk in GB. diff --git a/pkg/providers/imagefamily/bootstrap/aksbootstrap.go b/pkg/providers/imagefamily/bootstrap/aksbootstrap.go index c0f0e2187..d44c9f2ea 100644 --- a/pkg/providers/imagefamily/bootstrap/aksbootstrap.go +++ b/pkg/providers/imagefamily/bootstrap/aksbootstrap.go @@ -385,7 +385,7 @@ var ( KubenetTemplate: base64.StdEncoding.EncodeToString(kubenetTemplate), // s ContainerdConfigContent: "", // kd IsKata: false, // n - ArtifactStreamingEnabled: false, // td + ArtifactStreamingEnabled: true, // td } ) @@ -449,7 +449,6 @@ func (a AKS) applyOptions(nbv *NodeBootstrapVariables) { nbv.GPUDriverVersion = a.GPUDriverVersion nbv.GPUImageSHA = a.GPUImageSHA } - nbv.ArtifactStreamingEnabled = a.ArtifactStreamingEnabled nbv.NeedsCgroupV2 = true // merge and stringify labels kubeletLabels := lo.Assign(kubeletNodeLabelsBase, a.Labels) diff --git a/pkg/providers/imagefamily/bootstrap/bootstrap.go b/pkg/providers/imagefamily/bootstrap/bootstrap.go index 5fb2c87c3..2bf1fab8d 100644 --- a/pkg/providers/imagefamily/bootstrap/bootstrap.go +++ b/pkg/providers/imagefamily/bootstrap/bootstrap.go @@ -33,7 +33,6 @@ type Options struct { GPUDriverVersion string GPUImageSHA string SubnetID string - ArtifactStreamingEnabled bool } // Bootstrapper can be implemented to generate a bootstrap script diff --git a/pkg/providers/imagefamily/ubuntu_2204.go b/pkg/providers/imagefamily/ubuntu_2204.go index a2f59bea9..a7c3b8ee2 100644 --- a/pkg/providers/imagefamily/ubuntu_2204.go +++ b/pkg/providers/imagefamily/ubuntu_2204.go @@ -86,7 +86,6 @@ func (u Ubuntu2204) UserData(kubeletConfig *corev1beta1.KubeletConfiguration, ta GPUDriverVersion: u.Options.GPUDriverVersion, GPUImageSHA: u.Options.GPUImageSHA, SubnetID: u.Options.SubnetID, - ArtifactStreamingEnabled: u.Options.ArtifactStreamingEnabled, }, Arch: u.Options.Arch, TenantID: u.Options.TenantID, diff --git a/pkg/providers/launchtemplate/launchtemplate.go b/pkg/providers/launchtemplate/launchtemplate.go index a03ca07e7..3c5e5ac95 100644 --- a/pkg/providers/launchtemplate/launchtemplate.go +++ b/pkg/providers/launchtemplate/launchtemplate.go @@ -133,7 +133,6 @@ func (p *Provider) getStaticParameters(ctx context.Context, instanceType *cloudp ClusterName: options.FromContext(ctx).ClusterName, ClusterEndpoint: p.clusterEndpoint, Tags: nodeClass.Spec.Tags, - ArtifactStreamingEnabled: lo.FromPtr(nodeClass.Spec.ArtifactStreamingEnabled), Labels: labels, CABundle: p.caBundle, Arch: arch, diff --git a/pkg/providers/launchtemplate/parameters/types.go b/pkg/providers/launchtemplate/parameters/types.go index bbec39e81..06e89c1aa 100644 --- a/pkg/providers/launchtemplate/parameters/types.go +++ b/pkg/providers/launchtemplate/parameters/types.go @@ -41,7 +41,6 @@ type StaticParameters struct { NetworkPolicy string KubernetesVersion string SubnetID string - ArtifactStreamingEnabled bool Tags map[string]string Labels map[string]string From 3dfb230dc70e17a2238371d56f8f11289b8b46d2 Mon Sep 17 00:00:00 2001 From: Bryce Soghigian <49734722+Bryce-Soghigian@users.noreply.github.com> Date: Tue, 7 May 2024 16:36:26 +0000 Subject: [PATCH 6/6] chore: linting --- pkg/providers/imagefamily/bootstrap/aksbootstrap.go | 4 ++-- pkg/providers/launchtemplate/parameters/types.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/providers/imagefamily/bootstrap/aksbootstrap.go b/pkg/providers/imagefamily/bootstrap/aksbootstrap.go index d44c9f2ea..eff3d9411 100644 --- a/pkg/providers/imagefamily/bootstrap/aksbootstrap.go +++ b/pkg/providers/imagefamily/bootstrap/aksbootstrap.go @@ -219,7 +219,7 @@ type NodeBootstrapVariables struct { KubenetTemplate string // s static ContainerdConfigContent string // k determined by GPU VM size, WASM support, Kata support IsKata bool // n user-specified - ArtifactStreamingEnabled bool // t user-specified + ArtifactStreamingEnabled bool // t user-specified } var ( @@ -385,7 +385,7 @@ var ( KubenetTemplate: base64.StdEncoding.EncodeToString(kubenetTemplate), // s ContainerdConfigContent: "", // kd IsKata: false, // n - ArtifactStreamingEnabled: true, // td + ArtifactStreamingEnabled: true, // td } ) diff --git a/pkg/providers/launchtemplate/parameters/types.go b/pkg/providers/launchtemplate/parameters/types.go index 06e89c1aa..823d7d24c 100644 --- a/pkg/providers/launchtemplate/parameters/types.go +++ b/pkg/providers/launchtemplate/parameters/types.go @@ -40,7 +40,7 @@ type StaticParameters struct { NetworkPlugin string NetworkPolicy string KubernetesVersion string - SubnetID string + SubnetID string Tags map[string]string Labels map[string]string