From 568970410e4083dfc04e9ee4379de13c350de868 Mon Sep 17 00:00:00 2001 From: Bob Date: Mon, 1 Feb 2021 19:07:00 +0100 Subject: [PATCH 1/6] [ci] - Add github action (#26) Datadog: **NOT FROM UPSTREAM K8S**. From Datadog: https://github.com/datadog/kubernetes/pull/26 datadog:patch [ghactions] Upgrade to checkout v3 + don't pull history (#56) datadog:patch [.github] - Update modules Update modules to newer version. This should fix issue with permissions and deprecations warnings. Without this the github action was not able to create releases datadog:patch --- .github/workflows/dd-build.yml | 135 +++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 .github/workflows/dd-build.yml diff --git a/.github/workflows/dd-build.yml b/.github/workflows/dd-build.yml new file mode 100644 index 0000000000000..e3d1d51d8b146 --- /dev/null +++ b/.github/workflows/dd-build.yml @@ -0,0 +1,135 @@ +name: Build and Push k8s Release + +on: + push: + # Sequence of patterns matched against refs/heads + tags: + # Push events on datadog tags + - "*-dd*" +permissions: write-all +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + platform: ["linux/arm64","linux/amd64"] + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: 1.22 + - name: Set env + run: echo SANITIZED_TARGET_PLATFORM=${KUBE_BUILD_PLATFORM/\//-} >> $GITHUB_ENV + env: + KUBE_BUILD_PLATFORM: ${{ matrix.platform }} + - name: Cleanup disk space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf /usr/local/share/boost + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/local/.ghcup + - name: Build + env: + KUBE_BUILD_PLATFORMS: ${{ matrix.platform }} + KUBE_RELEASE_RUN_TESTS: n + run: make quick-release KUBE_BUILD_PLATFORMS=$KUBE_BUILD_PLATFORMS + - name: Calculate checksums + id: calculate_checksums + shell: bash + working-directory: _output/release-tars + env: + KUBE_BUILD_PLATFORM: ${{ matrix.platform }} + run: | + TARGET_PLATFORM="${KUBE_BUILD_PLATFORM/\//-}" + for TARGET_FILE in *"${TARGET_PLATFORM}".tar.gz + do + sha256sum "$TARGET_FILE" > "${TARGET_FILE}.sha256sum" + done + - uses: actions/upload-artifact@v4 + with: + name: k8s_output_${{ env.SANITIZED_TARGET_PLATFORM }} + path: _output/release-tars + env: + SANITIZED_TARGET_PLATFORM: ${{ env.SANITIZED_TARGET_PLATFORM }} + release: + permissions: + contents: write + runs-on: ubuntu-latest + needs: build + outputs: + upload_url: ${{ steps.create_release_branch.outputs.upload_url }}${{ steps.create_release_tags.outputs.upload_url }} + steps: + - name: Extract branch name + shell: bash + run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})" + id: extract_branch + env: + GITHUB_REF: ${{ github.ref }} + if: startsWith(github.ref, 'refs/heads/') + - name: Create Release for Branch + id: create_release_branch + uses: softprops/action-gh-release@v2 + if: startsWith(github.ref, 'refs/heads/') + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + token: ${{ secrets.GITHUB_TOKEN }} + name: branch@${{ steps.extract_branch.outputs.branch }} + tag_name: branch@${{ steps.extract_branch.outputs.branch }} + draft: false + prerelease: false + - name: Extract tags name + shell: bash + run: echo "##[set-output name=tags;]$(echo ${GITHUB_REF#refs/tags/})" + id: extract_tags + env: + GITHUB_REF: ${{ github.ref }} + if: startsWith(github.ref, 'refs/tags/') + - name: Create Release for Tags + id: create_release_tags + uses: 
softprops/action-gh-release@v2 + if: ${{ startsWith(github.ref, 'refs/tags/') }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + token: ${{ secrets.GITHUB_TOKEN }} + name: ${{ steps.extract_tags.outputs.tags }} + tag_name: ${{ steps.extract_tags.outputs.tags }} + release_name: ${{ steps.extract_tags.outputs.tags }} + draft: false + prerelease: false + releaseassetsarm: + runs-on: ubuntu-latest + needs: release + strategy: + matrix: + assets: [ + "kubernetes-client", + "kubernetes-node", + "kubernetes-server" + ] + platform: ["linux-arm64","linux-amd64"] + extension: ["tar.gz", "tar.gz.sha256sum"] + steps: + - uses: actions/download-artifact@v4 + with: + name: k8s_output_${{ matrix.platform }} + path: _output/release-tars + github-token: ${{ secrets.GITHUB_TOKEN }} + - name: Display structure of downloaded files + run: ls -R + working-directory: _output + - name: Upload Release Asset + id: upload-release-asset + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.release.outputs.upload_url }} + asset_path: ./_output/release-tars/${{ matrix.assets }}-${{ matrix.platform }}.${{ matrix.extension}} + asset_name: ${{ matrix.assets }}-${{ matrix.platform }}.${{ matrix.extension }} + asset_content_type: application/tar+gzip From 5f85b61db523a988c92e2be2b4f292e3945bf7b6 Mon Sep 17 00:00:00 2001 From: bob Date: Fri, 4 Oct 2024 17:24:50 +0200 Subject: [PATCH 2/6] Sidecar: Enable Lyft type sidecars Datadog: **NOT FROM UPSTREAM K8S**. From Lyft: https://github.com/lyft/kubernetes/commit/266a18a5839fa7a5b588063749c60c2c326b1dd8 This commit is the combination of multiple commits we've done to cherry pick and bring lyft sidecars across the ages. datadog:patch --- pkg/kubelet/kubelet.go | 14 + .../kuberuntime/kuberuntime_container.go | 47 +- .../kuberuntime/kuberuntime_manager.go | 82 ++- .../kuberuntime/kuberuntime_manager_test.go | 569 +++++++++++++++++- .../kuberuntime_termination_order.go | 68 +++ pkg/kubelet/kuberuntime/labels.go | 12 +- pkg/kubelet/status/status_manager.go | 75 ++- test/e2e/node/pods.go | 567 +++-------------- 8 files changed, 932 insertions(+), 502 deletions(-) diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 600a006f0541e..8276ddbae91ea 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -123,6 +123,7 @@ import ( kubetypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/kubelet/userns" "k8s.io/kubernetes/pkg/kubelet/util" + "k8s.io/kubernetes/pkg/kubelet/util/format" "k8s.io/kubernetes/pkg/kubelet/util/manager" "k8s.io/kubernetes/pkg/kubelet/util/queue" "k8s.io/kubernetes/pkg/kubelet/util/sliceutils" @@ -2900,6 +2901,8 @@ func (kl *Kubelet) HandlePodReconcile(pods []*v1.Pod) { // TODO: reconcile being calculated in the config manager is questionable, and avoiding // extra syncs may no longer be necessary. Reevaluate whether Reconcile and Sync can be // merged (after resolving the next two TODOs). + sidecarsStatus := status.GetSidecarsStatus(pod) + klog.Infof("Pod: %s, status: Present=%v,Ready=%v,ContainersWaiting=%v", format.Pod(pod), sidecarsStatus.SidecarsPresent, sidecarsStatus.SidecarsReady, sidecarsStatus.ContainersWaiting) // Reconcile Pod "Ready" condition if necessary. Trigger sync pod for reconciliation. 
// TODO: this should be unnecessary today - determine what is the cause for this to @@ -2912,6 +2915,17 @@ func (kl *Kubelet) HandlePodReconcile(pods []*v1.Pod) { UpdateType: kubetypes.SyncPodSync, StartTime: start, }) + } else if sidecarsStatus.ContainersWaiting { + // if containers aren't running and the sidecars are all ready trigger a sync so that the containers get started + if sidecarsStatus.SidecarsPresent && sidecarsStatus.SidecarsReady { + klog.Infof("Pod: %s: sidecars: sidecars are ready, dispatching work", format.Pod(pod)) + kl.podWorkers.UpdatePod(UpdatePodOptions{ + Pod: pod, + MirrorPod: mirrorPod, + UpdateType: kubetypes.SyncPodSync, + StartTime: start, + }) + } } // After an evicted pod is synced, all dead containers in the pod can be removed. diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container.go b/pkg/kubelet/kuberuntime/kuberuntime_container.go index a1d9453f8918d..ee3427e22110f 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container.go @@ -786,6 +786,12 @@ func (m *kubeGenericRuntimeManager) restoreSpecsFromContainerLabels(ctx context. l := getContainerInfoFromLabels(ctx, s.Labels) a := getContainerInfoFromAnnotations(ctx, s.Annotations) + + annotations := make(map[string]string) + if a.Sidecar { + annotations[fmt.Sprintf("sidecars.lyft.net/container-lifecycle-%s", l.ContainerName)] = "Sidecar" + } + // Notice that the followings are not full spec. The container killing code should not use // un-restored fields. pod = &v1.Pod{ @@ -794,6 +800,7 @@ func (m *kubeGenericRuntimeManager) restoreSpecsFromContainerLabels(ctx context. Name: l.PodName, Namespace: l.PodNamespace, DeletionGracePeriodSeconds: a.PodDeletionGracePeriod, + Annotations: annotations, }, Spec: v1.PodSpec{ TerminationGracePeriodSeconds: a.PodTerminationGracePeriod, @@ -820,8 +827,15 @@ func (m *kubeGenericRuntimeManager) killContainer(ctx context.Context, pod *v1.P var containerSpec *v1.Container if pod != nil { if containerSpec = kubecontainer.GetContainerSpec(pod, containerName); containerSpec == nil { - return fmt.Errorf("failed to get containerSpec %q (id=%q) in pod %q when killing container for reason %q", - containerName, containerID.String(), format.Pod(pod), message) + // after a kubelet restart, it's not 100% certain that the + // pod we're given has the container we need in the spec + // -- we try to recover that here. + restoredPod, restoredContainer, err := m.restoreSpecsFromContainerLabels(ctx, containerID) + if err != nil { + return fmt.Errorf("failed to get containerSpec %q(id=%q) in pod %q when killing container for reason %q. error: %v", + containerName, containerID.String(), format.Pod(pod), message, err) + } + pod, containerSpec = restoredPod, restoredContainer } } else { // Restore necessary information if one of the specs is nil. @@ -885,9 +899,27 @@ func (m *kubeGenericRuntimeManager) killContainer(ctx context.Context, pod *v1.P // killContainersWithSyncResult kills all pod's containers with sync results. 
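// Illustrative sketch (not from the Lyft/Datadog patch itself): the branch added to
// HandlePodReconcile earlier in this patch only dispatches an extra SyncPodSync when
// the main containers are still waiting and every Lyft sidecar reports ready.
// Distilled as a standalone predicate, using a local mirror of the SidecarsStatus
// struct this patch adds to pkg/kubelet/status:

type sidecarsStatus struct {
	SidecarsPresent   bool
	SidecarsReady     bool
	ContainersWaiting bool
}

// shouldDispatchSidecarSync reports whether the kubelet should trigger an extra
// pod sync so that non-sidecar containers get started once all sidecars are ready.
func shouldDispatchSidecarSync(s sidecarsStatus) bool {
	return s.ContainersWaiting && s.SidecarsPresent && s.SidecarsReady
}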
func (m *kubeGenericRuntimeManager) killContainersWithSyncResult(ctx context.Context, pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) (syncResults []*kubecontainer.SyncResult) { logger := klog.FromContext(ctx) + // split out sidecars and non-sidecars + var ( + sidecars []*kubecontainer.Container + nonSidecars []*kubecontainer.Container + ) + + if gracePeriodOverride == nil { + minGracePeriod := int64(minimumGracePeriodInSeconds) + gracePeriodOverride = &minGracePeriod + } + + for _, container := range runningPod.Containers { + if isSidecar(pod, container.Name) { + sidecars = append(sidecars, container) + } else { + nonSidecars = append(nonSidecars, container) + } + } containerResults := make(chan *kubecontainer.SyncResult, len(runningPod.Containers)) - wg := sync.WaitGroup{} + wg := sync.WaitGroup{} wg.Add(len(runningPod.Containers)) var termOrdering *terminationOrdering if types.HasRestartableInitContainer(pod) { @@ -897,6 +929,15 @@ func (m *kubeGenericRuntimeManager) killContainersWithSyncResult(ctx context.Con } termOrdering = newTerminationOrdering(pod, runningContainerNames) } + + if len(sidecars) > 0 { + var runningContainerNames []string + for _, container := range runningPod.Containers { + runningContainerNames = append(runningContainerNames, container.Name) + } + termOrdering = lyftSidecarTerminationOrdering(pod, runningContainerNames) + } + for _, container := range runningPod.Containers { go func(container *kubecontainer.Container) { defer utilruntime.HandleCrash() diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go index a2a112c4bc3e8..3383b96c3555d 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go @@ -65,6 +65,7 @@ import ( "k8s.io/kubernetes/pkg/kubelet/metrics" proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results" "k8s.io/kubernetes/pkg/kubelet/runtimeclass" + "k8s.io/kubernetes/pkg/kubelet/status" "k8s.io/kubernetes/pkg/kubelet/sysctl" "k8s.io/kubernetes/pkg/kubelet/token" "k8s.io/kubernetes/pkg/kubelet/types" @@ -610,6 +611,14 @@ func containerResourcesFromRequirements(requirements *v1.ResourceRequirements) c } } +func isSidecar(pod *v1.Pod, containerName string) bool { + if pod == nil { + klog.V(5).Infof("isSidecar: pod is nil, so returning false") + return false + } + return pod.Annotations[fmt.Sprintf("sidecars.lyft.net/container-lifecycle-%s", containerName)] == "Sidecar" +} + // computePodResizeAction determines the actions required (if any) to resize the given container. // Returns whether to keep (true) or restart (false) the container. // TODO(vibansal): Make this function to be agnostic to whether it is dealing with a restartable init container or not (i.e. remove the argument `isRestartableInitContainer`). 
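// Illustrative sketch (not part of the patch): how a pod opts a container into the
// Lyft sidecar lifecycle, and how the isSidecar helper added above reads it. The
// annotation key format "sidecars.lyft.net/container-lifecycle-<containerName>"
// comes from the patch; the container names "envoy" and "app" are hypothetical,
// and the usual v1/metav1 imports are assumed.
func exampleIsSidecar() {
	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Annotations: map[string]string{
				"sidecars.lyft.net/container-lifecycle-envoy": "Sidecar",
			},
		},
		Spec: v1.PodSpec{
			Containers: []v1.Container{{Name: "envoy"}, {Name: "app"}},
		},
	}
	_ = isSidecar(pod, "envoy") // true: annotation value is "Sidecar"
	_ = isSidecar(pod, "app")   // false: no matching annotation for this container
}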
@@ -1008,6 +1017,17 @@ func (m *kubeGenericRuntimeManager) computePodActions(ctx context.Context, pod * ContainersToKill: make(map[kubecontainer.ContainerID]containerToKillInfo), } + var sidecarNames []string + for _, container := range pod.Spec.Containers { + if isSidecar(pod, container.Name) { + sidecarNames = append(sidecarNames, container.Name) + } + } + + // determine sidecar status + sidecarStatus := status.GetSidecarsStatus(pod) + klog.Infof("Pod: %s, sidecars: %s, status: Present=%v,Ready=%v,ContainersWaiting=%v", format.Pod(pod), sidecarNames, sidecarStatus.SidecarsPresent, sidecarStatus.SidecarsReady, sidecarStatus.ContainersWaiting) + // If we need to (re-)create the pod sandbox, everything will need to be // killed and recreated, and init containers should be purged. if createPodSandbox { @@ -1067,7 +1087,22 @@ func (m *kubeGenericRuntimeManager) computePodActions(ctx context.Context, pod * return changes } - changes.ContainersToStart = containersToStart + if len(sidecarNames) > 0 { + for idx, c := range pod.Spec.Containers { + if isSidecar(pod, c.Name) { + changes.ContainersToStart = append(changes.ContainersToStart, idx) + } + } + return changes + } + // Start all containers by default but exclude the ones that + // succeeded if RestartPolicy is OnFailure + for idx, c := range pod.Spec.Containers { + if containerSucceeded(&c, podStatus) && pod.Spec.RestartPolicy == v1.RestartPolicyOnFailure { + continue + } + changes.ContainersToStart = append(changes.ContainersToStart, idx) + } return changes } @@ -1099,6 +1134,21 @@ func (m *kubeGenericRuntimeManager) computePodActions(ctx context.Context, pod * keepCount := 0 // check the status of containers. for idx, container := range pod.Spec.Containers { + // this works because in other cases, if it was a sidecar, we + // are always allowed to handle the container. + // + // if it is a non-sidecar, and there are no sidecars, then + // we're are also always allowed to restart the container. + // + // if there are sidecars, then we can only restart non-sidecars under + // the following conditions: + // - the non-sidecars have run before (i.e. they are not in a Waiting state) OR + // - the sidecars are ready (we're starting them for the first time) + if !isSidecar(pod, container.Name) && sidecarStatus.SidecarsPresent && sidecarStatus.ContainersWaiting && !sidecarStatus.SidecarsReady { + klog.Infof("Pod: %s, Container: %s, sidecar=%v skipped: Present=%v,Ready=%v,ContainerWaiting=%v", format.Pod(pod), container.Name, isSidecar(pod, container.Name), sidecarStatus.SidecarsPresent, sidecarStatus.SidecarsReady, sidecarStatus.ContainersWaiting) + continue + } + containerStatus := podStatus.FindContainerStatusByName(container.Name) // Call internal container post-stop lifecycle hook for any non-running container so that any @@ -1183,6 +1233,7 @@ func (m *kubeGenericRuntimeManager) computePodActions(ctx context.Context, pod * } if keepCount == 0 && len(changes.ContainersToStart) == 0 { + klog.Infof("Pod: %s: KillPod=true", format.Pod(pod)) changes.KillPod = true // To prevent the restartable init containers to keep pod alive, we should // not restart them. @@ -1642,6 +1693,35 @@ func (m *kubeGenericRuntimeManager) doBackOff(ctx context.Context, pod *v1.Pod, // only hard kill paths are allowed to specify a gracePeriodOverride in the kubelet in order to not corrupt user data. // it is useful when doing SIGKILL for hard eviction scenarios, or max grace period during soft eviction scenarios. 
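// Illustrative sketch (not part of the patch): the gating rule that the patched
// computePodActions applies above, extracted as a predicate. A non-sidecar
// container is skipped only while sidecars are present, the non-sidecars have not
// run yet (still Waiting), and not every sidecar is ready; it assumes the
// pkg/kubelet/status import added by this patch.
func skipNonSidecarForNow(isSidecarContainer bool, s status.SidecarsStatus) bool {
	return !isSidecarContainer &&
		s.SidecarsPresent &&
		s.ContainersWaiting &&
		!s.SidecarsReady
}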
func (m *kubeGenericRuntimeManager) KillPod(ctx context.Context, pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) error { + // if the pod is nil, we need to recover it, so we can get the + // grace period and also the sidecar status. + if pod == nil { + for _, container := range runningPod.Containers { + klog.Infof("Pod: %s, KillPod: pod nil, trying to restore from container %s", runningPod.Name, container.ID) + podSpec, _, err := m.restoreSpecsFromContainerLabels(ctx, container.ID) + if err != nil { + klog.Errorf("Pod: %s, KillPod: couldn't restore: %s", runningPod.Name, container.ID) + continue + } + pod = podSpec + break + } + } + + if gracePeriodOverride == nil && pod != nil { + switch { + case pod.DeletionGracePeriodSeconds != nil: + gracePeriodOverride = pod.DeletionGracePeriodSeconds + case pod.Spec.TerminationGracePeriodSeconds != nil: + gracePeriodOverride = pod.Spec.TerminationGracePeriodSeconds + } + } + + if gracePeriodOverride == nil || *gracePeriodOverride < minimumGracePeriodInSeconds { + min := int64(minimumGracePeriodInSeconds) + gracePeriodOverride = &min + } + err := m.killPodWithSyncResult(ctx, pod, runningPod, gracePeriodOverride) return err.Error() } diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go b/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go index 4dc328bb4998b..fbbc2b5929b76 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go @@ -1155,29 +1155,32 @@ func TestComputePodActions(t *testing.T) { ContainersToKill: map[kubecontainer.ContainerID]containerToKillInfo{}, }, }, - "Verify we do not create a pod sandbox if no ready sandbox for pod with RestartPolicy=OnFailure and all containers succeeded": { - mutatePodFn: func(pod *v1.Pod) { - pod.Spec.RestartPolicy = v1.RestartPolicyOnFailure - }, - mutateStatusFn: func(status *kubecontainer.PodStatus) { - // no ready sandbox - status.SandboxStatuses[0].State = runtimeapi.PodSandboxState_SANDBOX_NOTREADY - status.SandboxStatuses[0].Metadata.Attempt = uint32(1) - // all containers succeeded - for i := range status.ContainerStatuses { - status.ContainerStatuses[i].State = kubecontainer.ContainerStateExited - status.ContainerStatuses[i].ExitCode = 0 - } - }, - actions: podActions{ - SandboxID: baseStatus.SandboxStatuses[0].Id, - Attempt: uint32(2), - CreateSandbox: false, - KillPod: true, - ContainersToStart: []int{}, - ContainersToKill: map[kubecontainer.ContainerID]containerToKillInfo{}, + // For now the a2a9964a66ef481d10db3ffc15d4b26a234d0bdd fix isn't compatible with the sidecar patchset we keep in this fork. Comment out related tests. 
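// Illustrative sketch (not part of the patch): the grace-period fallback the patched
// KillPod above applies when no override is supplied — prefer the pod's
// DeletionGracePeriodSeconds, then TerminationGracePeriodSeconds, and never go below
// the kubelet's minimum. The helper name is hypothetical.
func effectiveGracePeriod(pod *v1.Pod, override *int64) *int64 {
	if override == nil && pod != nil {
		switch {
		case pod.DeletionGracePeriodSeconds != nil:
			override = pod.DeletionGracePeriodSeconds
		case pod.Spec.TerminationGracePeriodSeconds != nil:
			override = pod.Spec.TerminationGracePeriodSeconds
		}
	}
	if override == nil || *override < minimumGracePeriodInSeconds {
		minimum := int64(minimumGracePeriodInSeconds)
		override = &minimum
	}
	return override
}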
+ /* + "Verify we do not create a pod sandbox if no ready sandbox for pod with RestartPolicy=OnFailure and all containers succeeded": { + mutatePodFn: func(pod *v1.Pod) { + pod.Spec.RestartPolicy = v1.RestartPolicyOnFailure + }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + // no ready sandbox + status.SandboxStatuses[0].State = runtimeapi.PodSandboxState_SANDBOX_NOTREADY + status.SandboxStatuses[0].Metadata.Attempt = uint32(1) + // all containers succeeded + for i := range status.ContainerStatuses { + status.ContainerStatuses[i].State = kubecontainer.ContainerStateExited + status.ContainerStatuses[i].ExitCode = 0 + } + }, + actions: podActions{ + SandboxID: baseStatus.SandboxStatuses[0].Id, + Attempt: uint32(2), + CreateSandbox: false, + KillPod: true, + ContainersToStart: []int{}, + ContainersToKill: map[kubecontainer.ContainerID]containerToKillInfo{}, + }, }, - }, + */ "Verify we create a pod sandbox if no ready sandbox for pod with RestartPolicy=Never and no containers have ever been created": { mutatePodFn: func(pod *v1.Pod) { pod.Spec.RestartPolicy = v1.RestartPolicyNever @@ -1500,8 +1503,251 @@ func TestComputePodActionsWithInitContainers(t *testing.T) { if test.mutateStatusFn != nil { test.mutateStatusFn(status) } - tCtx := ktesting.Init(t) - actions := m.computePodActions(tCtx, pod, status) + ctx := context.Background() + actions := m.computePodActions(ctx, pod, status) + verifyActions(t, &test.actions, &actions, desc) + }) + } +} + +func TestComputePodActionsWithInitContainersWithLegacySidecarContainers(t *testing.T) { + _, _, m, err := createTestRuntimeManager() + require.NoError(t, err) + + // Creating a pair reference pod and status for the test cases to refer + // the specific fields. + basePod, baseStatus := makeBasePodAndStatusWithInitContainers() + noAction := podActions{ + SandboxID: baseStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{}, + ContainersToKill: map[kubecontainer.ContainerID]containerToKillInfo{}, + } + + for desc, test := range map[string]struct { + mutatePodFn func(*v1.Pod) + mutateStatusFn func(*kubecontainer.PodStatus) + actions podActions + }{ + "initialization completed; start all containers": { + actions: podActions{ + SandboxID: baseStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{0, 1, 2}, + ContainersToKill: getKillMapWithInitContainers(basePod, baseStatus, []int{}), + }, + }, + "no init containers have been started; start the first one": { + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.ContainerStatuses = nil + }, + actions: podActions{ + SandboxID: baseStatus.SandboxStatuses[0].Id, + NextInitContainerToStart: &basePod.Spec.InitContainers[0], + InitContainersToStart: []int{0}, + ContainersToStart: []int{}, + ContainersToKill: getKillMapWithInitContainers(basePod, baseStatus, []int{}), + }, + }, + "initialization in progress; do nothing": { + mutatePodFn: func(pod *v1.Pod) { pod.Spec.RestartPolicy = v1.RestartPolicyAlways }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.ContainerStatuses[2].State = kubecontainer.ContainerStateRunning + }, + actions: noAction, + }, + "Kill pod and restart the first init container if the pod sandbox is dead": { + mutatePodFn: func(pod *v1.Pod) { pod.Spec.RestartPolicy = v1.RestartPolicyAlways }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.SandboxStatuses[0].State = runtimeapi.PodSandboxState_SANDBOX_NOTREADY + }, + actions: podActions{ + KillPod: true, + CreateSandbox: true, + SandboxID: baseStatus.SandboxStatuses[0].Id, + 
Attempt: uint32(1), + NextInitContainerToStart: &basePod.Spec.InitContainers[0], + InitContainersToStart: []int{0}, + ContainersToStart: []int{}, + ContainersToKill: getKillMapWithInitContainers(basePod, baseStatus, []int{}), + }, + }, + "initialization failed; restart the last init container if RestartPolicy == Always": { + mutatePodFn: func(pod *v1.Pod) { pod.Spec.RestartPolicy = v1.RestartPolicyAlways }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.ContainerStatuses[2].ExitCode = 137 + }, + actions: podActions{ + SandboxID: baseStatus.SandboxStatuses[0].Id, + NextInitContainerToStart: &basePod.Spec.InitContainers[2], + InitContainersToStart: []int{2}, + ContainersToStart: []int{}, + ContainersToKill: getKillMapWithInitContainers(basePod, baseStatus, []int{}), + }, + }, + "initialization failed; restart the last init container if RestartPolicy == OnFailure": { + mutatePodFn: func(pod *v1.Pod) { pod.Spec.RestartPolicy = v1.RestartPolicyOnFailure }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.ContainerStatuses[2].ExitCode = 137 + }, + actions: podActions{ + SandboxID: baseStatus.SandboxStatuses[0].Id, + NextInitContainerToStart: &basePod.Spec.InitContainers[2], + InitContainersToStart: []int{2}, + ContainersToStart: []int{}, + ContainersToKill: getKillMapWithInitContainers(basePod, baseStatus, []int{}), + }, + }, + "initialization failed; kill pod if RestartPolicy == Never": { + mutatePodFn: func(pod *v1.Pod) { pod.Spec.RestartPolicy = v1.RestartPolicyNever }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.ContainerStatuses[2].ExitCode = 137 + }, + actions: podActions{ + KillPod: true, + SandboxID: baseStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{}, + ContainersToKill: getKillMapWithInitContainers(basePod, baseStatus, []int{}), + }, + }, + "init container state unknown; kill and recreate the last init container if RestartPolicy == Always": { + mutatePodFn: func(pod *v1.Pod) { pod.Spec.RestartPolicy = v1.RestartPolicyAlways }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.ContainerStatuses[2].State = kubecontainer.ContainerStateUnknown + }, + actions: podActions{ + SandboxID: baseStatus.SandboxStatuses[0].Id, + NextInitContainerToStart: &basePod.Spec.InitContainers[2], + InitContainersToStart: []int{2}, + ContainersToStart: []int{}, + ContainersToKill: getKillMapWithInitContainers(basePod, baseStatus, []int{2}), + }, + }, + "init container state unknown; kill and recreate the last init container if RestartPolicy == OnFailure": { + mutatePodFn: func(pod *v1.Pod) { pod.Spec.RestartPolicy = v1.RestartPolicyOnFailure }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.ContainerStatuses[2].State = kubecontainer.ContainerStateUnknown + }, + actions: podActions{ + SandboxID: baseStatus.SandboxStatuses[0].Id, + NextInitContainerToStart: &basePod.Spec.InitContainers[2], + InitContainersToStart: []int{2}, + ContainersToStart: []int{}, + ContainersToKill: getKillMapWithInitContainers(basePod, baseStatus, []int{2}), + }, + }, + "init container state unknown; kill pod if RestartPolicy == Never": { + mutatePodFn: func(pod *v1.Pod) { pod.Spec.RestartPolicy = v1.RestartPolicyNever }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.ContainerStatuses[2].State = kubecontainer.ContainerStateUnknown + }, + actions: podActions{ + KillPod: true, + SandboxID: baseStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{}, + ContainersToKill: getKillMapWithInitContainers(basePod, 
baseStatus, []int{}), + }, + }, + "Pod sandbox not ready, init container failed, but RestartPolicy == Never; kill pod only": { + mutatePodFn: func(pod *v1.Pod) { pod.Spec.RestartPolicy = v1.RestartPolicyNever }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.SandboxStatuses[0].State = runtimeapi.PodSandboxState_SANDBOX_NOTREADY + }, + actions: podActions{ + KillPod: true, + CreateSandbox: false, + SandboxID: baseStatus.SandboxStatuses[0].Id, + Attempt: uint32(1), + ContainersToStart: []int{}, + ContainersToKill: getKillMapWithInitContainers(basePod, baseStatus, []int{}), + }, + }, + // For now the a2a9964a66ef481d10db3ffc15d4b26a234d0bdd fix isn't compatible with the sidecar patchset we keep in this fork. Comment out related tests. + /* + "Pod sandbox not ready, and RestartPolicy == Never, but no visible init containers; create a new pod sandbox": { + mutatePodFn: func(pod *v1.Pod) { pod.Spec.RestartPolicy = v1.RestartPolicyNever }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.SandboxStatuses[0].State = runtimeapi.PodSandboxState_SANDBOX_NOTREADY + status.ContainerStatuses = []*kubecontainer.Status{} + }, + actions: podActions{ + KillPod: true, + CreateSandbox: true, + SandboxID: baseStatus.SandboxStatuses[0].Id, + Attempt: uint32(1), + NextInitContainerToStart: &basePod.Spec.InitContainers[0], + InitContainersToStart: []int{0}, + ContainersToStart: []int{}, + ContainersToKill: getKillMapWithInitContainers(basePod, baseStatus, []int{}), + }, + }, + "Pod sandbox not ready, init container failed, and RestartPolicy == OnFailure; create a new pod sandbox": { + mutatePodFn: func(pod *v1.Pod) { pod.Spec.RestartPolicy = v1.RestartPolicyOnFailure }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.SandboxStatuses[0].State = runtimeapi.PodSandboxState_SANDBOX_NOTREADY + status.ContainerStatuses[2].ExitCode = 137 + }, + actions: podActions{ + KillPod: true, + CreateSandbox: true, + SandboxID: baseStatus.SandboxStatuses[0].Id, + Attempt: uint32(1), + NextInitContainerToStart: &basePod.Spec.InitContainers[0], + InitContainersToStart: []int{0}, + ContainersToStart: []int{}, + ContainersToKill: getKillMapWithInitContainers(basePod, baseStatus, []int{}), + }, + }, + */ + "some of the init container statuses are missing but the last init container is running, don't restart preceding ones": { + mutatePodFn: func(pod *v1.Pod) { pod.Spec.RestartPolicy = v1.RestartPolicyAlways }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.ContainerStatuses[2].State = kubecontainer.ContainerStateRunning + status.ContainerStatuses = status.ContainerStatuses[2:] + }, + actions: podActions{ + KillPod: false, + SandboxID: baseStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{}, + ContainersToKill: getKillMapWithInitContainers(basePod, baseStatus, []int{}), + }, + }, + "an init container is in the created state due to an unknown error when starting container; restart it": { + mutatePodFn: func(pod *v1.Pod) { pod.Spec.RestartPolicy = v1.RestartPolicyAlways }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.ContainerStatuses[2].State = kubecontainer.ContainerStateCreated + }, + actions: podActions{ + KillPod: false, + SandboxID: baseStatus.SandboxStatuses[0].Id, + NextInitContainerToStart: &basePod.Spec.InitContainers[2], + InitContainersToStart: []int{2}, + ContainersToStart: []int{}, + ContainersToKill: getKillMapWithInitContainers(basePod, baseStatus, []int{}), + }, + }, + } { + t.Run(desc, func(t *testing.T) { + 
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.LegacySidecarContainers, true) + pod, status := makeBasePodAndStatusWithInitContainers() + if test.mutatePodFn != nil { + test.mutatePodFn(pod) + } + if test.mutateStatusFn != nil { + test.mutateStatusFn(status) + } + ctx := context.Background() + actions := m.computePodActions(ctx, pod, status) + handleRestartableInitContainers := kubelettypes.HasRestartableInitContainer(pod) + if !handleRestartableInitContainers { + // If sidecar containers are disabled or the pod does not have any + // restartable init container, we should not see any + // InitContainersToStart in the actions. + test.actions.InitContainersToStart = nil + } else { + // If sidecar containers are enabled and the pod has any + // restartable init container, we should not see any + // NextInitContainerToStart in the actions. + test.actions.NextInitContainerToStart = nil + } verifyActions(t, &test.actions, &actions, desc) }) } @@ -1961,6 +2207,13 @@ func makeBasePodAndStatusWithRestartableInitContainers() (*v1.Pod, *kubecontaine return pod, status } +func makeBasePodAndStatusWithSidecar() (*v1.Pod, *kubecontainer.PodStatus) { + pod, status := makeBasePodAndStatus() + pod.Annotations = map[string]string{fmt.Sprintf("sidecars.lyft.net/container-lifecycle-%s", pod.Spec.Containers[1].Name): "Sidecar"} + status.ContainerStatuses[1].Hash = kubecontainer.HashContainer(&pod.Spec.Containers[1]) + return pod, status +} + func TestComputePodActionsWithInitAndEphemeralContainers(t *testing.T) { // Make sure existing test cases pass with feature enabled TestComputePodActions(t) @@ -2139,7 +2392,97 @@ func TestComputePodActionsWithContainerRestartRules(t *testing.T) { actions podActions resetStatusFn func(*kubecontainer.PodStatus) }{ - "restart exited containers if RestartPolicy == Always": { + "steady state; do nothing; ignore ephemeral container": { + actions: noAction, + }, + "No ephemeral containers running; start one": { + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.ContainerStatuses = status.ContainerStatuses[:4] + }, + actions: podActions{ + SandboxID: baseStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{}, + ContainersToKill: map[kubecontainer.ContainerID]containerToKillInfo{}, + EphemeralContainersToStart: []int{0}, + }, + }, + "Start second ephemeral container": { + mutatePodFn: func(pod *v1.Pod) { + pod.Spec.EphemeralContainers = append(pod.Spec.EphemeralContainers, v1.EphemeralContainer{ + EphemeralContainerCommon: v1.EphemeralContainerCommon{ + Name: "debug2", + Image: "busybox", + }, + }) + }, + actions: podActions{ + SandboxID: baseStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{}, + ContainersToKill: map[kubecontainer.ContainerID]containerToKillInfo{}, + EphemeralContainersToStart: []int{1}, + }, + }, + "Ephemeral container exited; do not restart": { + mutatePodFn: func(pod *v1.Pod) { pod.Spec.RestartPolicy = v1.RestartPolicyAlways }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.ContainerStatuses[4].State = kubecontainer.ContainerStateExited + }, + actions: podActions{ + SandboxID: baseStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{}, + ContainersToKill: map[kubecontainer.ContainerID]containerToKillInfo{}, + }, + }, + "initialization in progress; start ephemeral container": { + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.ContainerStatuses[3].State = kubecontainer.ContainerStateRunning + status.ContainerStatuses = status.ContainerStatuses[:4] + }, + 
actions: podActions{ + SandboxID: baseStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{}, + ContainersToKill: map[kubecontainer.ContainerID]containerToKillInfo{}, + EphemeralContainersToStart: []int{0}, + }, + }, + // For now the a2a9964a66ef481d10db3ffc15d4b26a234d0bdd fix isn't compatible with the sidecar patchset we keep in this fork. Comment out related tests. + /* + "Create a new pod sandbox if the pod sandbox is dead, init container failed and RestartPolicy == OnFailure": { + mutatePodFn: func(pod *v1.Pod) { pod.Spec.RestartPolicy = v1.RestartPolicyOnFailure }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.SandboxStatuses[0].State = runtimeapi.PodSandboxState_SANDBOX_NOTREADY + status.ContainerStatuses = status.ContainerStatuses[3:] + status.ContainerStatuses[0].ExitCode = 137 + }, + actions: podActions{ + KillPod: true, + CreateSandbox: true, + SandboxID: baseStatus.SandboxStatuses[0].Id, + Attempt: uint32(1), + NextInitContainerToStart: &basePod.Spec.InitContainers[0], + InitContainersToStart: []int{0}, + ContainersToStart: []int{}, + ContainersToKill: getKillMapWithInitContainers(basePod, baseStatus, []int{}), + }, + }, + */ + "Kill pod and do not restart ephemeral container if the pod sandbox is dead": { + mutatePodFn: func(pod *v1.Pod) { pod.Spec.RestartPolicy = v1.RestartPolicyAlways }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + status.SandboxStatuses[0].State = runtimeapi.PodSandboxState_SANDBOX_NOTREADY + }, + actions: podActions{ + KillPod: true, + CreateSandbox: true, + SandboxID: baseStatus.SandboxStatuses[0].Id, + Attempt: uint32(1), + NextInitContainerToStart: &basePod.Spec.InitContainers[0], + InitContainersToStart: []int{0}, + ContainersToStart: []int{}, + ContainersToKill: getKillMapWithInitContainers(basePod, baseStatus, []int{}), + }, + }, + "Kill pod if all containers exited except ephemeral container": { mutatePodFn: func(pod *v1.Pod) { pod.Spec.Containers[0].RestartPolicy = &containerRestartPolicyAlways pod.Spec.Containers[1].RestartPolicy = &containerRestartPolicyAlways @@ -2289,6 +2632,182 @@ func TestSyncPodWithSandboxAndDeletedPod(t *testing.T) { assert.NoError(t, result.Error()) } +func TestComputePodActionsWithSidecar(t *testing.T) { + _, _, m, err := createTestRuntimeManager() + require.NoError(t, err) + + // Createing a pair reference pod and status for the test cases to refer + // the specific fields. + basePod, baseStatus := makeBasePodAndStatusWithSidecar() + for desc, test := range map[string]struct { + mutatePodFn func(*v1.Pod) + mutateStatusFn func(*kubecontainer.PodStatus) + actions podActions + }{ + "Start sidecar containers before non-sidecars when creating a new pod": { + mutateStatusFn: func(status *kubecontainer.PodStatus) { + // No container or sandbox exists. 
+ status.SandboxStatuses = []*runtimeapi.PodSandboxStatus{} + status.ContainerStatuses = []*kubecontainer.Status{} + }, + actions: podActions{ + KillPod: true, + CreateSandbox: true, + Attempt: uint32(0), + ContainersToStart: []int{1}, + ContainersToKill: getKillMap(basePod, baseStatus, []int{}), + }, + }, + "Don't start non-sidecars until sidecars are ready": { + mutatePodFn: func(pod *v1.Pod) { + pod.Status.ContainerStatuses = []v1.ContainerStatus{ + { + Name: "foo1", + State: v1.ContainerState{ + Waiting: &v1.ContainerStateWaiting{}, + }, + }, + { + Name: "foo2", + Ready: false, + }, + { + Name: "foo3", + State: v1.ContainerState{ + Waiting: &v1.ContainerStateWaiting{}, + }, + }, + } + }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + for i := range status.ContainerStatuses { + if i == 1 { + continue + } + status.ContainerStatuses[i].State = "" + } + }, + actions: podActions{ + SandboxID: baseStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{}, + ContainersToKill: getKillMap(basePod, baseStatus, []int{}), + }, + }, + "Start non-sidecars when sidecars are ready": { + mutatePodFn: func(pod *v1.Pod) { + pod.Status.ContainerStatuses = []v1.ContainerStatus{ + { + Name: "foo1", + State: v1.ContainerState{ + Waiting: &v1.ContainerStateWaiting{}, + }, + }, + { + Name: "foo2", + Ready: true, + }, + { + Name: "foo3", + State: v1.ContainerState{ + Waiting: &v1.ContainerStateWaiting{}, + }, + }, + } + }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + for i := range status.ContainerStatuses { + if i == 1 { + continue + } + status.ContainerStatuses[i].State = "" + } + }, + actions: podActions{ + SandboxID: baseStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{0, 2}, + ContainersToKill: getKillMap(basePod, baseStatus, []int{}), + }, + }, + "Restart only sidecars while non-sidecars are waiting": { + mutatePodFn: func(pod *v1.Pod) { + pod.Spec.RestartPolicy = v1.RestartPolicyAlways + pod.Status.ContainerStatuses = []v1.ContainerStatus{ + { + Name: "foo1", + State: v1.ContainerState{ + Waiting: &v1.ContainerStateWaiting{}, + }, + }, + { + Name: "foo2", + Ready: false, + }, + { + Name: "foo3", + State: v1.ContainerState{ + Waiting: &v1.ContainerStateWaiting{}, + }, + }, + } + }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + for i := range status.ContainerStatuses { + if i == 1 { + status.ContainerStatuses[i].State = kubecontainer.ContainerStateExited + } + status.ContainerStatuses[i].State = "" + } + }, + actions: podActions{ + SandboxID: baseStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{1}, + ContainersToKill: getKillMap(basePod, baseStatus, []int{}), + }, + }, + "Restart running non-sidecars despite sidecar becoming not ready ": { + mutatePodFn: func(pod *v1.Pod) { + pod.Spec.RestartPolicy = v1.RestartPolicyAlways + pod.Status.ContainerStatuses = []v1.ContainerStatus{ + { + Name: "foo1", + }, + { + Name: "foo2", + Ready: false, + }, + { + Name: "foo3", + }, + } + }, + mutateStatusFn: func(status *kubecontainer.PodStatus) { + for i := range status.ContainerStatuses { + if i == 1 { + continue + } + status.ContainerStatuses[i].State = kubecontainer.ContainerStateExited + } + }, + actions: podActions{ + SandboxID: baseStatus.SandboxStatuses[0].Id, + ContainersToStart: []int{0, 2}, + ContainersToKill: getKillMap(basePod, baseStatus, []int{}), + }, + }, + } { + pod, status := makeBasePodAndStatusWithSidecar() + if test.mutatePodFn != nil { + test.mutatePodFn(pod) + } + if test.mutateStatusFn != nil { + test.mutateStatusFn(status) + } + ctx := 
context.Background() + actions := m.computePodActions(ctx, pod, status) + verifyActions(t, &test.actions, &actions, desc) + } +} + func makeBasePodAndStatusWithInitAndEphemeralContainers() (*v1.Pod, *kubecontainer.PodStatus) { pod, status := makeBasePodAndStatus() pod.Spec.InitContainers = []v1.Container{ diff --git a/pkg/kubelet/kuberuntime/kuberuntime_termination_order.go b/pkg/kubelet/kuberuntime/kuberuntime_termination_order.go index eee7c869b238a..520b6ec755357 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_termination_order.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_termination_order.go @@ -21,6 +21,7 @@ import ( "time" v1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" podutil "k8s.io/kubernetes/pkg/api/v1/pod" ) @@ -94,6 +95,71 @@ func newTerminationOrdering(pod *v1.Pod, runningContainerNames []string) *termin return to } +// lyftSidecarTerminationOrdering constructs a terminationOrdering based on the pod spec and the currently running containers. +// lyftSidecar will always be last to terminate. +// We assume that the nativeSidecar is always true when using the lyft Sidecar +func lyftSidecarTerminationOrdering(pod *v1.Pod, runningContainerNames []string) *terminationOrdering { + to := &terminationOrdering{ + prereqs: map[string][]chan struct{}{}, + terminated: map[string]chan struct{}{}, + } + + runningContainers := map[string]struct{}{} + for _, name := range runningContainerNames { + runningContainers[name] = struct{}{} + } + + var mainContainerChannels []chan struct{} + var cleanContainerChannels []chan struct{} + + // sidecar containers need to wait on main containers, so we create a channel per main container + // for them to wait on + for _, c := range pod.Spec.Containers { + channel := make(chan struct{}) + to.terminated[c.Name] = channel + mainContainerChannels = append(mainContainerChannels, channel) + + // If it's a lyft sidecar, we want to wait for all other containers to finish + if !isSidecar(pod, c.Name) { + cleanContainerChannels = append(cleanContainerChannels, channel) + } else { + to.prereqs[c.Name] = append(to.prereqs[c.Name], cleanContainerChannels...) + } + // if it's not a running container, pre-close the channel so nothing + // waits on it + if _, isRunning := runningContainers[c.Name]; !isRunning { + close(channel) + } + } + + var previousSidecarName string + for i := range pod.Spec.InitContainers { + // get the init containers in reverse order + ic := pod.Spec.InitContainers[len(pod.Spec.InitContainers)-i-1] + + channel := make(chan struct{}) + to.terminated[ic.Name] = channel + + // if it's not a running container, pre-close the channel so nothing + // waits on it + if _, isRunning := runningContainers[ic.Name]; !isRunning { + close(channel) + } + + if podutil.IsRestartableInitContainer(&ic) { + // sidecars need to wait for all main containers to exit + to.prereqs[ic.Name] = append(to.prereqs[ic.Name], mainContainerChannels...) + + // if there is a later sidecar, this container needs to wait for it to finish + if previousSidecarName != "" { + to.prereqs[ic.Name] = append(to.prereqs[ic.Name], to.terminated[previousSidecarName]) + } + previousSidecarName = ic.Name + } + } + return to +} + // waitForTurn waits until it is time for the container with the specified name to begin terminating, up until // the specified grace period. If gracePeriod = 0, there is no wait. 
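// Illustrative sketch (not part of the patch): what lyftSidecarTerminationOrdering
// above produces for a two-container pod with a non-sidecar "app" and a Lyft
// sidecar "envoy" (hypothetical names; usual v1/metav1 imports assumed). The
// sidecar's prereqs include the non-sidecar's termination channel, so killing
// "envoy" blocks (up to the grace period) until "app" has stopped, while "app"
// has no prereqs and is killed immediately.
func exampleLyftTerminationOrder() {
	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Annotations: map[string]string{
				"sidecars.lyft.net/container-lifecycle-envoy": "Sidecar",
			},
		},
		Spec: v1.PodSpec{
			Containers: []v1.Container{{Name: "app"}, {Name: "envoy"}},
		},
	}
	to := lyftSidecarTerminationOrdering(pod, []string{"app", "envoy"})
	_ = len(to.prereqs["app"])   // 0: non-sidecars terminate first
	_ = len(to.prereqs["envoy"]) // 1: the sidecar waits for "app" to terminate
}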
func (o *terminationOrdering) waitForTurn(name string, gracePeriod int64) float64 { @@ -110,10 +176,12 @@ func (o *terminationOrdering) waitForTurn(name string, gracePeriod int64) float6 case <-c: case <-remainingGrace.C: // grace period expired, so immediately exit + klog.V(3).InfoS("Sidecar container is ready to terminate, grace period is expired", "container", name, "gracePeriod", gracePeriod) return time.Since(start).Seconds() } } + klog.V(3).InfoS("Container is ready to terminate", "container", name) return time.Since(start).Seconds() } diff --git a/pkg/kubelet/kuberuntime/labels.go b/pkg/kubelet/kuberuntime/labels.go index cef97ec0511bc..383a8b8e22cea 100644 --- a/pkg/kubelet/kuberuntime/labels.go +++ b/pkg/kubelet/kuberuntime/labels.go @@ -5,7 +5,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -39,6 +39,7 @@ const ( containerTerminationMessagePolicyLabel = "io.kubernetes.container.terminationMessagePolicy" containerPreStopHandlerLabel = "io.kubernetes.container.preStopHandler" containerPortsLabel = "io.kubernetes.container.ports" + containerSidecarLabel = "com.lyft.sidecars.container-lifecycle" ) type labeledPodSandboxInfo struct { @@ -64,6 +65,7 @@ type labeledContainerInfo struct { type annotatedContainerInfo struct { Hash uint64 RestartCount int + Sidecar bool PodDeletionGracePeriod *int64 PodTerminationGracePeriod *int64 TerminationMessagePath string @@ -119,6 +121,11 @@ func newContainerAnnotations(ctx context.Context, container *v1.Container, pod * annotations[containerTerminationMessagePathLabel] = container.TerminationMessagePath annotations[containerTerminationMessagePolicyLabel] = string(container.TerminationMessagePolicy) + annotations[containerSidecarLabel] = "Default" + if isSidecar(pod, container.Name) { + annotations[containerSidecarLabel] = "Sidecar" + } + if pod.DeletionGracePeriodSeconds != nil { annotations[podDeletionGracePeriodLabel] = strconv.FormatInt(*pod.DeletionGracePeriodSeconds, 10) } @@ -207,6 +214,9 @@ func getContainerInfoFromAnnotations(ctx context.Context, annotations map[string if containerInfo.PodTerminationGracePeriod, err = getInt64PointerFromLabel(logger, annotations, podTerminationGracePeriodLabel); err != nil { logger.Error(err, "Unable to get label value from annotations", "label", podTerminationGracePeriodLabel, "annotations", annotations) } + if getStringValueFromLabel(logger, annotations, containerSidecarLabel) == "Sidecar" { + containerInfo.Sidecar = true + } preStopHandler := &v1.LifecycleHandler{} if found, err := getJSONObjectFromLabel(logger, annotations, containerPreStopHandlerLabel, preStopHandler); err != nil { diff --git a/pkg/kubelet/status/status_manager.go b/pkg/kubelet/status/status_manager.go index 236e06576e7e6..c642a7b7714bf 100644 --- a/pkg/kubelet/status/status_manager.go +++ b/pkg/kubelet/status/status_manager.go @@ -5,7 +5,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -35,13 +35,14 @@ import ( "k8s.io/apimachinery/pkg/util/diff" "k8s.io/apimachinery/pkg/util/wait" utilfeature "k8s.io/apiserver/pkg/util/feature" - "k8s.io/klog/v2" + klog "k8s.io/klog/v2" podutil "k8s.io/kubernetes/pkg/api/v1/pod" "k8s.io/kubernetes/pkg/features" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" "k8s.io/kubernetes/pkg/kubelet/metrics" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" kubeutil "k8s.io/kubernetes/pkg/kubelet/util" + "k8s.io/kubernetes/pkg/kubelet/util/format" statusutil "k8s.io/kubernetes/pkg/util/pod" ) @@ -1344,3 +1345,73 @@ func (m *manager) recordInProgressResizeCount() { } metrics.PodInProgressResizes.Set(float64(inProgressResizeCount)) } + +// SidecarsStatus contains three bools, whether the pod has sidecars, +// if the all the sidecars are ready and if the non sidecars are in a +// waiting state. +type SidecarsStatus struct { + SidecarsPresent bool + SidecarsReady bool + ContainersWaiting bool +} + +// GetSidecarsStatus returns the SidecarsStatus for the given pod +// We assume the worst: if we are unable to determine the status of all containers we make defensive assumptions that +// there are sidecars, they are not ready, and that there are non-sidecars waiting. This is to prevent starting non- +// -sidecars accidentally. +func GetSidecarsStatus(pod *v1.Pod) SidecarsStatus { + var containerStatusesCopy []v1.ContainerStatus + if pod == nil { + klog.Infof("Pod was nil, returning sidecar status that prevents progress") + return SidecarsStatus{SidecarsPresent: true, SidecarsReady: false, ContainersWaiting: true} + } + if pod.Spec.Containers == nil { + klog.Infof("Pod %s: Containers was nil, returning sidecar status that prevents progress", format.Pod(pod)) + return SidecarsStatus{SidecarsPresent: true, SidecarsReady: false, ContainersWaiting: true} + } + if pod.Status.ContainerStatuses == nil { + klog.Infof("Pod %s: ContainerStatuses was nil, doing best effort using spec", format.Pod(pod)) + } else { + // Make a copy of ContainerStatuses to avoid having the carpet pulled from under our feet + containerStatusesCopy = make([]v1.ContainerStatus, len(pod.Status.ContainerStatuses)) + copy(containerStatusesCopy, pod.Status.ContainerStatuses) + } + + sidecarsStatus := SidecarsStatus{SidecarsPresent: false, SidecarsReady: true, ContainersWaiting: false} + for _, container := range pod.Spec.Containers { + foundStatus := false + isSidecar := false + if pod.Annotations[fmt.Sprintf("sidecars.lyft.net/container-lifecycle-%s", container.Name)] == "Sidecar" { + isSidecar = true + sidecarsStatus.SidecarsPresent = true + } + for _, status := range containerStatusesCopy { + if status.Name == container.Name { + foundStatus = true + if isSidecar { + if !status.Ready { + klog.Infof("Pod %s: %s: sidecar not ready", format.Pod(pod), container.Name) + sidecarsStatus.SidecarsReady = false + } else { + klog.Infof("Pod %s: %s: sidecar is ready", format.Pod(pod), container.Name) + } + } else if status.State.Waiting != nil { + // check if non-sidecars have started + klog.Infof("Pod: %s: %s: non-sidecar waiting", format.Pod(pod), container.Name) + sidecarsStatus.ContainersWaiting = true + } + break + } + } + if !foundStatus { + if isSidecar { + klog.Infof("Pod %s: %s (sidecar): status not found, assuming not ready", 
format.Pod(pod), container.Name) + sidecarsStatus.SidecarsReady = false + } else { + klog.Infof("Pod: %s: %s (non-sidecar): status not found, assuming waiting", format.Pod(pod), container.Name) + sidecarsStatus.ContainersWaiting = true + } + } + } + return sidecarsStatus +} diff --git a/test/e2e/node/pods.go b/test/e2e/node/pods.go index dd8528bcf741f..51330d103eb5d 100644 --- a/test/e2e/node/pods.go +++ b/test/e2e/node/pods.go @@ -32,23 +32,17 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/apimachinery/pkg/watch" v1core "k8s.io/client-go/kubernetes/typed/core/v1" - "k8s.io/client-go/util/retry" "k8s.io/kubernetes/pkg/features" - "k8s.io/kubernetes/pkg/kubelet/events" - "k8s.io/kubernetes/test/e2e/feature" "k8s.io/kubernetes/test/e2e/framework" e2ekubelet "k8s.io/kubernetes/test/e2e/framework/kubelet" - e2enode "k8s.io/kubernetes/test/e2e/framework/node" e2epod "k8s.io/kubernetes/test/e2e/framework/pod" imageutils "k8s.io/kubernetes/test/utils/image" admissionapi "k8s.io/pod-security-admission/api" - "k8s.io/utils/ptr" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" @@ -228,494 +222,127 @@ var _ = SIGDescribe("Pods Extended", func() { }) }) - ginkgo.Describe("Pod Container lifecycle", func() { - var podClient *e2epod.PodClient - ginkgo.BeforeEach(func() { - podClient = e2epod.NewPodClient(f) - }) - - ginkgo.It("should not create extra sandbox if all containers are done", func(ctx context.Context) { - ginkgo.By("creating the pod that should always exit 0") - - name := "pod-always-succeed" + string(uuid.NewUUID()) - image := imageutils.GetE2EImage(imageutils.BusyBox) - pod := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - }, - Spec: v1.PodSpec{ - RestartPolicy: v1.RestartPolicyOnFailure, - InitContainers: []v1.Container{ - { - Name: "foo", - Image: image, - Command: []string{ - "/bin/true", - }, - }, - }, - Containers: []v1.Container{ - { - Name: "bar", - Image: image, - Command: []string{ - "/bin/true", - }, - }, - }, - }, - } - - ginkgo.By("submitting the pod to kubernetes") - createdPod := podClient.Create(ctx, pod) - ginkgo.DeferCleanup(func(ctx context.Context) error { - ginkgo.By("deleting the pod") - return podClient.Delete(ctx, pod.Name, metav1.DeleteOptions{}) + /* + // For now the a2a9964a66ef481d10db3ffc15d4b26a234d0bdd fix isn't compatible with + // the sidecar patchset we keep in this fork. Comment out related tests. 
+ ginkgo.Describe("Pod Container lifecycle", func() { + var podClient *e2epod.PodClient + ginkgo.BeforeEach(func() { + podClient = e2epod.NewPodClient(f) }) - framework.ExpectNoError(e2epod.WaitForPodSuccessInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name)) - - var eventList *v1.EventList - var err error - ginkgo.By("Getting events about the pod") - framework.ExpectNoError(wait.Poll(time.Second*2, time.Second*60, func() (bool, error) { - selector := fields.Set{ - "involvedObject.kind": "Pod", - "involvedObject.uid": string(createdPod.UID), - "involvedObject.namespace": f.Namespace.Name, - "source": "kubelet", - }.AsSelector().String() - options := metav1.ListOptions{FieldSelector: selector} - eventList, err = f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, options) - if err != nil { - return false, err - } - if len(eventList.Items) > 0 { - return true, nil - } - return false, nil - })) + ginkgo.It("should not create extra sandbox if all containers are done", func() { + ginkgo.By("creating the pod that should always exit 0") - ginkgo.By("Checking events about the pod") - for _, event := range eventList.Items { - if event.Reason == events.SandboxChanged { - framework.Fail("Unexpected SandboxChanged event") - } - } - }) - - ginkgo.It("evicted pods should be terminal", func(ctx context.Context) { - ginkgo.By("creating the pod that should be evicted") - - name := "pod-should-be-evicted" + string(uuid.NewUUID()) - image := imageutils.GetE2EImage(imageutils.BusyBox) - pod := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - }, - Spec: v1.PodSpec{ - RestartPolicy: v1.RestartPolicyOnFailure, - Containers: []v1.Container{ - { - Name: "bar", - Image: image, - Command: []string{ - "/bin/sh", "-c", "sleep 10; dd if=/dev/zero of=file bs=1M count=10; sleep 10000", - }, - Resources: v1.ResourceRequirements{ - Limits: v1.ResourceList{ - "ephemeral-storage": resource.MustParse("5Mi"), - }, - }}, + name := "pod-always-succeed" + string(uuid.NewUUID()) + image := imageutils.GetE2EImage(imageutils.BusyBox) + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, }, - }, - } - - ginkgo.By("submitting the pod to kubernetes") - podClient.Create(ctx, pod) - ginkgo.DeferCleanup(func(ctx context.Context) error { - ginkgo.By("deleting the pod") - return podClient.Delete(ctx, pod.Name, metav1.DeleteOptions{}) - }) - - err := e2epod.WaitForPodTerminatedInNamespace(ctx, f.ClientSet, pod.Name, "Evicted", f.Namespace.Name) - if err != nil { - framework.Failf("error waiting for pod to be evicted: %v", err) - } - - }) - }) - - ginkgo.Describe("Pod TerminationGracePeriodSeconds is negative", func() { - var podClient *e2epod.PodClient - ginkgo.BeforeEach(func() { - podClient = e2epod.NewPodClient(f) - }) - - ginkgo.It("pod with negative grace period", func(ctx context.Context) { - name := "pod-negative-grace-period" + string(uuid.NewUUID()) - image := imageutils.GetE2EImage(imageutils.BusyBox) - pod := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - }, - Spec: v1.PodSpec{ - RestartPolicy: v1.RestartPolicyOnFailure, - Containers: []v1.Container{ - { - Name: "foo", - Image: image, - Command: []string{ - "/bin/sh", "-c", "sleep 10000", + Spec: v1.PodSpec{ + RestartPolicy: v1.RestartPolicyOnFailure, + InitContainers: []v1.Container{ + { + Name: "foo", + Image: image, + Command: []string{ + "/bin/true", + }, }, }, - }, - TerminationGracePeriodSeconds: ptr.To[int64](-1), - }, - } - - ginkgo.By("submitting the pod to kubernetes") - podClient.Create(ctx, pod) - - pod, err := 
podClient.Get(ctx, pod.Name, metav1.GetOptions{}) - framework.ExpectNoError(err, "failed to query for pod") - - if pod.Spec.TerminationGracePeriodSeconds == nil { - framework.Failf("pod spec TerminationGracePeriodSeconds is nil") - } - - if *pod.Spec.TerminationGracePeriodSeconds != 1 { - framework.Failf("pod spec TerminationGracePeriodSeconds is not 1: %d", *pod.Spec.TerminationGracePeriodSeconds) - } - - // retry if the TerminationGracePeriodSeconds is overrided - // see more in https://github.com/kubernetes/kubernetes/pull/115606 - err = retry.RetryOnConflict(retry.DefaultBackoff, func() error { - pod, err := podClient.Get(ctx, pod.Name, metav1.GetOptions{}) - framework.ExpectNoError(err, "failed to query for pod") - ginkgo.By("updating the pod to have a negative TerminationGracePeriodSeconds") - pod.Spec.TerminationGracePeriodSeconds = ptr.To[int64](-1) - _, err = podClient.PodInterface.Update(ctx, pod, metav1.UpdateOptions{}) - return err - }) - framework.ExpectNoError(err, "failed to update pod") - - pod, err = podClient.Get(ctx, pod.Name, metav1.GetOptions{}) - framework.ExpectNoError(err, "failed to query for pod") - - if pod.Spec.TerminationGracePeriodSeconds == nil { - framework.Failf("pod spec TerminationGracePeriodSeconds is nil") - } - - if *pod.Spec.TerminationGracePeriodSeconds != 1 { - framework.Failf("pod spec TerminationGracePeriodSeconds is not 1: %d", *pod.Spec.TerminationGracePeriodSeconds) - } - - ginkgo.DeferCleanup(func(ctx context.Context) error { - ginkgo.By("deleting the pod") - return podClient.Delete(ctx, pod.Name, metav1.DeleteOptions{}) - }) - }) - }) -}) - -var _ = SIGDescribe("Pods Extended (pod generation)", feature.PodObservedGenerationTracking, framework.WithFeatureGate(features.PodObservedGenerationTracking), func() { - f := framework.NewDefaultFramework("pods") - f.NamespacePodSecurityLevel = admissionapi.LevelBaseline - - ginkgo.Describe("Pod Generation", func() { - var podClient *e2epod.PodClient - ginkgo.BeforeEach(func() { - podClient = e2epod.NewPodClient(f) - }) - - ginkgo.It("pod generation should start at 1 and increment per update", func(ctx context.Context) { - ginkgo.By("creating the pod") - podName := "pod-generation-" + string(uuid.NewUUID()) - pod := e2epod.NewAgnhostPod(f.Namespace.Name, podName, nil, nil, nil) - pod.Spec.InitContainers = []v1.Container{{ - Name: "init-container", - Image: imageutils.GetE2EImage(imageutils.BusyBox), - }} - - ginkgo.By("submitting the pod to kubernetes") - pod = podClient.CreateSync(ctx, pod) - gomega.Expect(pod.Generation).To(gomega.BeEquivalentTo(1)) - ginkgo.DeferCleanup(func(ctx context.Context) error { - ginkgo.By("deleting the pod") - return podClient.Delete(ctx, pod.Name, metav1.DeleteOptions{}) - }) - - ginkgo.By("verifying pod generation bumps as expected") - tests := []struct { - name string - updateFn func(*v1.Pod) - expectGenerationBump bool - }{ - { - name: "empty update", - updateFn: func(pod *v1.Pod) {}, - expectGenerationBump: false, - }, - - { - name: "updating Tolerations to trigger generation bump", - updateFn: func(pod *v1.Pod) { - pod.Spec.Tolerations = []v1.Toleration{ + Containers: []v1.Container{ { - Key: "foo-" + string(uuid.NewUUID()), - Operator: v1.TolerationOpEqual, - Value: "bar", - Effect: v1.TaintEffectNoSchedule, + Name: "bar", + Image: image, + Command: []string{ + "/bin/true", + }, }, - } - }, - expectGenerationBump: true, - }, - - { - name: "updating ActiveDeadlineSeconds to trigger generation bump", - updateFn: func(pod *v1.Pod) { - int5000 := int64(5000) - 
pod.Spec.ActiveDeadlineSeconds = &int5000 - }, - expectGenerationBump: true, - }, - - { - name: "updating container image to trigger generation bump", - updateFn: func(pod *v1.Pod) { - pod.Spec.Containers[0].Image = imageutils.GetE2EImage(imageutils.Nginx) - }, - expectGenerationBump: true, - }, - - { - name: "updating initContainer image to trigger generation bump", - updateFn: func(pod *v1.Pod) { - pod.Spec.InitContainers[0].Image = imageutils.GetE2EImage(imageutils.Pause) - }, - expectGenerationBump: true, - }, - - { - name: "updates to pod metadata should not trigger generation bump", - updateFn: func(pod *v1.Pod) { - pod.SetAnnotations(map[string]string{"key": "value"}) - }, - expectGenerationBump: false, - }, - - { - name: "pod generation updated by client should be ignored", - updateFn: func(pod *v1.Pod) { - pod.SetGeneration(1) + }, }, - expectGenerationBump: false, - }, - } - - expectedPodGeneration := int64(1) - for _, test := range tests { - ginkgo.By(test.name) - podClient.Update(ctx, podName, test.updateFn) - pod, err := podClient.Get(ctx, podName, metav1.GetOptions{}) - framework.ExpectNoError(err, "failed to query for pod") - if test.expectGenerationBump { - expectedPodGeneration++ } - gomega.Expect(pod.Generation).To(gomega.BeEquivalentTo(expectedPodGeneration)) - framework.ExpectNoError(e2epod.WaitForPodObservedGeneration(ctx, f.ClientSet, f.Namespace.Name, pod.Name, expectedPodGeneration, 20*time.Second)) - } - }) - ginkgo.It("custom-set generation on new pods and graceful delete", func(ctx context.Context) { - ginkgo.By("creating the pod") - name := "pod-generation-" + string(uuid.NewUUID()) - value := strconv.Itoa(time.Now().Nanosecond()) - pod := e2epod.NewAgnhostPod(f.Namespace.Name, name, nil, nil, nil) - pod.ObjectMeta.Labels = map[string]string{ - "time": value, - } - pod.SetGeneration(100) - - ginkgo.By("submitting the pod to kubernetes") - pod = podClient.CreateSync(ctx, pod) - - ginkgo.By("verifying the new pod's generation is 1") - gomega.Expect(pod.Generation).To(gomega.BeEquivalentTo(1)) - - ginkgo.By("issue a graceful delete to trigger generation bump") - // We need to wait for the pod to be running, otherwise the deletion - // may be carried out immediately rather than gracefully. - framework.ExpectNoError(e2epod.WaitForPodNameRunningInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name)) - pod, err := podClient.Get(ctx, pod.Name, metav1.GetOptions{}) - framework.ExpectNoError(err, "failed to GET scheduled pod") - - var lastPod v1.Pod - var statusCode int - // Set gracePeriodSeconds to 60 to give us time to verify the generation bump. 
- err = f.ClientSet.CoreV1().RESTClient().Delete().AbsPath("/api/v1/namespaces", pod.Namespace, "pods", pod.Name).Param("gracePeriodSeconds", "60").Do(ctx).StatusCode(&statusCode).Into(&lastPod) - framework.ExpectNoError(err, "failed to use http client to send delete") - gomega.Expect(statusCode).To(gomega.Equal(http.StatusOK), "failed to delete gracefully by client request") - - ginkgo.By("verifying the pod generation was bumped") - pod, err = podClient.Get(ctx, pod.Name, metav1.GetOptions{}) - framework.ExpectNoError(err, "failed to query for pod") - gomega.Expect(pod.Generation).To(gomega.BeEquivalentTo(2)) - }) + ginkgo.By("submitting the pod to kubernetes") + createdPod := podClient.Create(pod) + defer func() { + ginkgo.By("deleting the pod") + podClient.Delete(context.TODO(), pod.Name, metav1.DeleteOptions{}) + }() - ginkgo.It("issue 500 podspec updates and verify generation and observedGeneration eventually converge", func(ctx context.Context) { - ginkgo.By("creating the pod") - name := "pod-generation-" + string(uuid.NewUUID()) - value := strconv.Itoa(time.Now().Nanosecond()) - pod := e2epod.NewAgnhostPod(f.Namespace.Name, name, nil, nil, nil) - pod.ObjectMeta.Labels = map[string]string{ - "time": value, - } - pod.Spec.ActiveDeadlineSeconds = ptr.To[int64](5000) + framework.ExpectNoError(e2epod.WaitForPodSuccessInNamespace(f.ClientSet, pod.Name, f.Namespace.Name)) + + var eventList *v1.EventList + var err error + ginkgo.By("Getting events about the pod") + framework.ExpectNoError(wait.Poll(time.Second*2, time.Second*60, func() (bool, error) { + selector := fields.Set{ + "involvedObject.kind": "Pod", + "involvedObject.uid": string(createdPod.UID), + "involvedObject.namespace": f.Namespace.Name, + "source": "kubelet", + }.AsSelector().String() + options := metav1.ListOptions{FieldSelector: selector} + eventList, err = f.ClientSet.CoreV1().Events(f.Namespace.Name).List(context.TODO(), options) + if err != nil { + return false, err + } + if len(eventList.Items) > 0 { + return true, nil + } + return false, nil + })) - ginkgo.By("submitting the pod to kubernetes") - pod = podClient.CreateSync(ctx, pod) - ginkgo.DeferCleanup(func(ctx context.Context) error { - ginkgo.By("deleting the pod") - return podClient.Delete(ctx, pod.Name, metav1.DeleteOptions{}) + ginkgo.By("Checking events about the pod") + for _, event := range eventList.Items { + if event.Reason == events.SandboxChanged { + framework.Fail("Unexpected SandboxChanged event") + } + } }) - for i := 0; i < 499; i++ { - podClient.Update(ctx, pod.Name, func(pod *v1.Pod) { - *pod.Spec.ActiveDeadlineSeconds-- - }) - } + ginkgo.It("evicted pods should be terminal", func() { + ginkgo.By("creating the pod that should be evicted") - // Verify pod observedGeneration converges to the expected generation. - expectedPodGeneration := int64(500) - framework.ExpectNoError(e2epod.WaitForPodObservedGeneration(ctx, f.ClientSet, f.Namespace.Name, pod.Name, expectedPodGeneration, 30*time.Second)) - - // Verify pod generation converges to the expected generation. - pod, err := podClient.Get(ctx, pod.Name, metav1.GetOptions{}) - framework.ExpectNoError(err, "failed to query for pod") - gomega.Expect(pod.Generation).To(gomega.BeEquivalentTo(expectedPodGeneration)) - }) - - // This is the same test as https://github.com/kubernetes/kubernetes/blob/aa08c90fca8d30038d3f05c0e8f127b540b40289/test/e2e/node/pod_admission.go#L35, - // except that this verifies the pod generation and observedGeneration, which is - // currently behind a feature gate. 
When we GA observedGeneration functionality, - // we can fold these tests together into one. - ginkgo.It("pod rejected by kubelet should have updated generation and observedGeneration", func(ctx context.Context) { - node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet) - framework.ExpectNoError(err, "Failed to get a ready schedulable node") - - // Create a pod that requests more CPU than the node has. - pod := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod-out-of-cpu", - Namespace: f.Namespace.Name, - }, - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pod-out-of-cpu", - Image: imageutils.GetPauseImageName(), - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - v1.ResourceCPU: resource.MustParse("1000000000000"), // requests more CPU than any node has + name := "pod-should-be-evicted" + string(uuid.NewUUID()) + image := imageutils.GetE2EImage(imageutils.BusyBox) + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Spec: v1.PodSpec{ + RestartPolicy: v1.RestartPolicyOnFailure, + Containers: []v1.Container{ + { + Name: "bar", + Image: image, + Command: []string{ + "/bin/sh", "-c", "sleep 10; dd if=/dev/zero of=file bs=1M count=10; sleep 10000", }, - }, + Resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "ephemeral-storage": resource.MustParse("5Mi"), + }, + }}, }, }, - }, - } - - ginkgo.By("submitting the pod to kubernetes") - pod, err = f.ClientSet.CoreV1().Pods(f.Namespace.Name).Create(ctx, pod, metav1.CreateOptions{}) - framework.ExpectNoError(err) - ginkgo.DeferCleanup(func(ctx context.Context) error { - ginkgo.By("deleting the pod") - return podClient.Delete(ctx, pod.Name, metav1.DeleteOptions{}) - }) - - // Wait for the scheduler to update the pod status - err = e2epod.WaitForPodNameUnschedulableInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace) - framework.ExpectNoError(err) - - // Fetch the pod to verify that the scheduler has set the PodScheduled condition - // with observedGeneration. - pod, err = podClient.Get(ctx, pod.Name, metav1.GetOptions{}) - framework.ExpectNoError(err) - gomega.Expect(len(pod.Status.Conditions)).To(gomega.BeEquivalentTo(1)) - gomega.Expect(pod.Status.Conditions[0].Type).To(gomega.BeEquivalentTo(v1.PodScheduled)) - gomega.Expect(pod.Status.Conditions[0].ObservedGeneration).To(gomega.BeEquivalentTo(1)) - - // Force assign the Pod to a node in order to get rejection status. - binding := &v1.Binding{ - ObjectMeta: metav1.ObjectMeta{ - Name: pod.Name, - Namespace: pod.Namespace, - UID: pod.UID, - }, - Target: v1.ObjectReference{ - Kind: "Node", - Name: node.Name, - }, - } - framework.ExpectNoError(f.ClientSet.CoreV1().Pods(pod.Namespace).Bind(ctx, binding, metav1.CreateOptions{})) - - // Kubelet has rejected the pod. - err = e2epod.WaitForPodFailedReason(ctx, f.ClientSet, pod, "OutOfcpu", f.Timeouts.PodStart) - framework.ExpectNoError(err) - - // Fetch the rejected Pod and verify the generation and observedGeneration. 
- gotPod, err := podClient.Get(ctx, pod.Name, metav1.GetOptions{}) - framework.ExpectNoError(err) - gomega.Expect(gotPod.Generation).To(gomega.BeEquivalentTo(1)) - gomega.Expect(gotPod.Status.ObservedGeneration).To(gomega.BeEquivalentTo(1)) - }) - - ginkgo.It("pod observedGeneration field set in pod conditions", func(ctx context.Context) { - ginkgo.By("creating the pod") - name := "pod-generation-" + string(uuid.NewUUID()) - pod := e2epod.NewAgnhostPod(f.Namespace.Name, name, nil, nil, nil) - - // Set the pod image to something that doesn't exist to induce a pull error - // to start with. - agnImage := pod.Spec.Containers[0].Image - pod.Spec.Containers[0].Image = "some-image-that-doesnt-exist" - - ginkgo.By("submitting the pod to kubernetes") - pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Create(ctx, pod, metav1.CreateOptions{}) - framework.ExpectNoError(err) - ginkgo.DeferCleanup(func(ctx context.Context) error { - ginkgo.By("deleting the pod") - return podClient.Delete(ctx, pod.Name, metav1.DeleteOptions{}) - }) + } - expectedPodConditions := []v1.PodConditionType{ - v1.PodReadyToStartContainers, - v1.PodInitialized, - v1.PodReady, - v1.ContainersReady, - v1.PodScheduled, - } + ginkgo.By("submitting the pod to kubernetes") + podClient.Create(pod) + defer func() { + ginkgo.By("deleting the pod") + podClient.Delete(context.TODO(), pod.Name, metav1.DeleteOptions{}) + }() - ginkgo.By("verifying the pod conditions have observedGeneration values") - expectedObservedGeneration := int64(1) - for _, condition := range expectedPodConditions { - framework.ExpectNoError(e2epod.WaitForPodConditionObservedGeneration(ctx, f.ClientSet, f.Namespace.Name, pod.Name, condition, expectedObservedGeneration, 30*time.Second)) - } + err := e2epod.WaitForPodTerminatedInNamespace(f.ClientSet, pod.Name, "Evicted", f.Namespace.Name) + if err != nil { + framework.Failf("error waiting for pod to be evicted: %v", err) + } - ginkgo.By("updating pod to have a valid image") - podClient.Update(ctx, pod.Name, func(pod *v1.Pod) { - pod.Spec.Containers[0].Image = agnImage }) - expectedObservedGeneration++ - - ginkgo.By("verifying the pod conditions have updated observedGeneration values") - for _, condition := range expectedPodConditions { - framework.ExpectNoError(e2epod.WaitForPodConditionObservedGeneration(ctx, f.ClientSet, f.Namespace.Name, pod.Name, condition, expectedObservedGeneration, 30*time.Second)) - } }) - }) + */ }) var _ = SIGDescribe("Pod Extended (container restart policy)", framework.WithFeatureGate(features.ContainerRestartRules), func() { From 7db549285a2810864b8ecdba5302418d7b82d0a7 Mon Sep 17 00:00:00 2001 From: Ayaz Badouraly Date: Mon, 9 Dec 2024 17:09:57 +0100 Subject: [PATCH 3/6] flowcontrol: currentCL can reach maxCL (#87) datadog:patch --- .../pkg/util/flowcontrol/apf_controller.go | 35 ++++++++--- .../util/flowcontrol/exempt_borrowing_test.go | 63 +++++++++++++++++++ 2 files changed, 89 insertions(+), 9 deletions(-) diff --git a/staging/src/k8s.io/apiserver/pkg/util/flowcontrol/apf_controller.go b/staging/src/k8s.io/apiserver/pkg/util/flowcontrol/apf_controller.go index 43765c34252b1..49bd69e153395 100644 --- a/staging/src/k8s.io/apiserver/pkg/util/flowcontrol/apf_controller.go +++ b/staging/src/k8s.io/apiserver/pkg/util/flowcontrol/apf_controller.go @@ -412,8 +412,8 @@ func (cfgCtlr *configController) updateBorrowingLocked(setCompleters bool, plSta cclOfExempt[plName] = minCurrentCL remainingServerCL -= minCurrentCL } else { - // Lower bound on this priority level's adjusted concurreny 
limit is the lesser of: - // - its seat demamd high watermark over the last adjustment period, and + // Lower bound on this priority level's adjusted concurrency limit is the lesser of: + // - its seat demand high watermark over the last adjustment period, and // - its configured concurrency limit. // BUT: we do not want this to be lower than the lower bound from configuration. // See KEP-1040 for a more detailed explanation. @@ -552,7 +552,7 @@ func (cfgCtlr *configController) syncOne() (specificDelay time.Duration, err err // cope with the various dependencies between objects. The process of // digestion is done in four passes over config objects --- three // passes over PriorityLevelConfigurations and one pass over the -// FlowSchemas --- with the work dvided among the passes according to +// FlowSchemas --- with the work divided among the passes according to // those dependencies. type cfgMeal struct { cfgCtlr *configController @@ -563,6 +563,10 @@ type cfgMeal struct { // new configuration shareSum float64 + // The sum of the lendable concurrency shares of the priority + // levels in the new configuration + lendableShareSum float64 + // These keep track of which mandatory priority level config // objects have been digested haveExemptPL, haveCatchAllPL bool @@ -700,7 +704,7 @@ func (cfgCtlr *configController) lockAndDigestConfigObjects(newPLs []*flowcontro // The new config has been constructed cfgCtlr.priorityLevelStates = meal.newPLStates - klog.V(5).InfoS("Switched to new API Priority and Fairness configuration", "maxWaitingRequests", meal.maxWaitingRequests, "maxExecutinRequests", meal.maxExecutingRequests) + klog.V(5).InfoS("Switched to new API Priority and Fairness configuration", "maxWaitingRequests", meal.maxWaitingRequests, "maxExecutingRequests", meal.maxExecutingRequests) metrics.GetWaitingReadonlyConcurrency().SetDenominator(float64(meal.maxWaitingRequests)) metrics.GetWaitingMutatingConcurrency().SetDenominator(float64(meal.maxWaitingRequests)) @@ -738,8 +742,11 @@ func (meal *cfgMeal) digestNewPLsLocked(newPLs []*flowcontrol.PriorityLevelConfi klog.V(3).Infof("Priority level %q was undesired and has become desired again", pl.Name) state.quiescing = false } - nominalConcurrencyShares, _, _ := plSpecCommons(state.pl) + nominalConcurrencyShares, lendablePercent, _ := plSpecCommons(state.pl) meal.shareSum += float64(*nominalConcurrencyShares) + if lendablePercent != nil { + meal.lendableShareSum += float64(*nominalConcurrencyShares) * float64(*lendablePercent) / 100 + } meal.haveExemptPL = meal.haveExemptPL || pl.Name == flowcontrol.PriorityLevelConfigurationNameExempt meal.haveCatchAllPL = meal.haveCatchAllPL || pl.Name == flowcontrol.PriorityLevelConfigurationNameCatchAll } @@ -843,8 +850,11 @@ func (meal *cfgMeal) processOldPLsLocked() { // priority level continues to get a concurrency // allocation determined by all the share values in the // regular way. 
- nominalConcurrencyShares, _, _ := plSpecCommons(plState.pl) + nominalConcurrencyShares, lendablePercent, _ := plSpecCommons(plState.pl) meal.shareSum += float64(*nominalConcurrencyShares) + if lendablePercent != nil { + meal.lendableShareSum += float64(*nominalConcurrencyShares) * float64(*lendablePercent) / 100 + } meal.haveExemptPL = meal.haveExemptPL || plName == flowcontrol.PriorityLevelConfigurationNameExempt meal.haveCatchAllPL = meal.haveCatchAllPL || plName == flowcontrol.PriorityLevelConfigurationNameCatchAll meal.newPLStates[plName] = plState @@ -865,10 +875,17 @@ func (meal *cfgMeal) finishQueueSetReconfigsLocked() { if lendablePercent != nil { lendableCL = int(math.Round(float64(concurrencyLimit) * float64(*lendablePercent) / 100)) } + // By default, the borrowing concurrency limit is set such + // that it does not exceed the total number of lendable seats + // and the max concurrency limit does not exceed the server + // concurrency limit. If borrowingLimitPercent is defined, + // the borrowing concurrency limit is set accordingly. + borrowingCL = meal.cfgCtlr.serverConcurrencyLimit - concurrencyLimit + if plState.pl.Spec.Type == flowcontrol.PriorityLevelEnablementLimited { + borrowingCL = min(borrowingCL, int(math.Ceil(float64(meal.cfgCtlr.serverConcurrencyLimit)*meal.lendableShareSum/meal.shareSum))-lendableCL) + } if borrowingLimitPercent != nil { - borrowingCL = int(math.Round(float64(concurrencyLimit) * float64(*borrowingLimitPercent) / 100)) - } else { - borrowingCL = meal.cfgCtlr.serverConcurrencyLimit + borrowingCL = min(borrowingCL, int(math.Round(float64(concurrencyLimit)*float64(*borrowingLimitPercent)/100))) } metrics.SetPriorityLevelConfiguration(plName, concurrencyLimit, concurrencyLimit-lendableCL, concurrencyLimit+borrowingCL) diff --git a/staging/src/k8s.io/apiserver/pkg/util/flowcontrol/exempt_borrowing_test.go b/staging/src/k8s.io/apiserver/pkg/util/flowcontrol/exempt_borrowing_test.go index 308c972f7cbfa..4845a8c3c6c2f 100644 --- a/staging/src/k8s.io/apiserver/pkg/util/flowcontrol/exempt_borrowing_test.go +++ b/staging/src/k8s.io/apiserver/pkg/util/flowcontrol/exempt_borrowing_test.go @@ -162,3 +162,66 @@ func TestUpdateBorrowing(t *testing.T) { } } + +func TestMaxCL(t *testing.T) { + startTime := time.Now() + clk, _ := testeventclock.NewFake(startTime, 0, nil) + plcExempt := fcboot.MandatoryPriorityLevelConfigurationExempt + plcHigh := fcboot.SuggestedPriorityLevelConfigurationWorkloadHigh + plcMid := fcboot.SuggestedPriorityLevelConfigurationWorkloadLow + plcLow := fcboot.MandatoryPriorityLevelConfigurationCatchAll + plcs := []*flowcontrol.PriorityLevelConfiguration{plcHigh, plcExempt, plcMid, plcLow} + fses := []*flowcontrol.FlowSchema{} + k8sClient := clientsetfake.NewSimpleClientset(plcLow, plcExempt, plcHigh, plcMid) + informerFactory := informers.NewSharedInformerFactory(k8sClient, 0) + flowcontrolClient := k8sClient.FlowcontrolV1() + serverCL := int(*plcHigh.Spec.Limited.NominalConcurrencyShares+ + *plcMid.Spec.Limited.NominalConcurrencyShares+ + *plcLow.Spec.Limited.NominalConcurrencyShares) * 6 + config := TestableConfig{ + Name: "test", + Clock: clk, + AsFieldManager: "testfm", + FoundToDangling: func(found bool) bool { return !found }, + InformerFactory: informerFactory, + FlowcontrolClient: flowcontrolClient, + ServerConcurrencyLimit: serverCL, + ReqsGaugeVec: metrics.PriorityLevelConcurrencyGaugeVec, + ExecSeatsGaugeVec: metrics.PriorityLevelExecutionSeatsGaugeVec, + QueueSetFactory: fqs.NewQueueSetFactory(clk), + } + ctlr := 
newTestableController(config) + _ = ctlr.lockAndDigestConfigObjects(plcs, fses) + if ctlr.nominalCLSum != serverCL { + t.Fatalf("Unexpected rounding: nominalCLSum=%d", ctlr.nominalCLSum) + } + stateExempt := ctlr.priorityLevelStates[plcExempt.Name] + stateHigh := ctlr.priorityLevelStates[plcHigh.Name] + stateMid := ctlr.priorityLevelStates[plcMid.Name] + stateLow := ctlr.priorityLevelStates[plcLow.Name] + + expectedExempt := serverCL + expectedHigh := stateHigh.nominalCL + int(float64(stateMid.nominalCL)*float64(*stateMid.pl.Spec.Limited.LendablePercent)/100) + int(float64(stateLow.nominalCL)*float64(*stateLow.pl.Spec.Limited.LendablePercent)/100) + expectedMid := int(float64(stateHigh.nominalCL)*float64(*stateHigh.pl.Spec.Limited.LendablePercent)/100) + stateMid.nominalCL + int(float64(stateLow.nominalCL)*float64(*stateLow.pl.Spec.Limited.LendablePercent)/100) + expectedLow := int(float64(stateHigh.nominalCL)*float64(*stateHigh.pl.Spec.Limited.LendablePercent)/100) + int(float64(stateMid.nominalCL)*float64(*stateMid.pl.Spec.Limited.LendablePercent)/100) + stateLow.nominalCL + if expected, actual := expectedExempt, stateExempt.maxCL; expected != actual { + t.Errorf("MaxCL: expected %d, got %d for exempt", expected, actual) + } else { + t.Logf("MaxCL: expected and got %d for exempt", expected) + } + if expected, actual := expectedHigh, stateHigh.maxCL; expected != actual { + t.Errorf("MaxCL: expected %d, got %d for hi", expected, actual) + } else { + t.Logf("MaxCL: expected and got %d for hi", expected) + } + if expected, actual := expectedMid, stateMid.maxCL; expected != actual { + t.Errorf("MaxCL: expected %d, got %d for mid", expected, actual) + } else { + t.Logf("MaxCL: expected and got %d for mid", expected) + } + if expected, actual := expectedLow, stateLow.maxCL; expected != actual { + t.Errorf("MaxCL: expected %d, got %d for lo", expected, actual) + } else { + t.Logf("MaxCL: expected and got %d for lo", expected) + } +} From c7cea249a45fbafcc265fd3d78c51d87f4b2db72 Mon Sep 17 00:00:00 2001 From: bob Date: Fri, 20 Sep 2024 21:27:24 +0200 Subject: [PATCH 4/6] [fips] - Enable boringcrypto in the build image. This should allow build to have fips crypto enabled when we build with CGO_ENABLED=1 Modify the github build to do so. datadog:patch [fips] - Enable fips on select component This would enable *fipsonly* on the following component: - kubelet - kubectl - kube-controller-manager - kube-scheduler The apiserver is currently being held back because not all client is going to be fips. Especially in non govcloud environment. 
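For reference, one way to sanity-check that boringcrypto actually made it into a
produced binary (an illustrative check, not part of this patch) is to inspect the
symbol table with go tool nm and look for boringcrypto symbols; the DBG=1 setting
added to the build image keeps the symbols that make this possible. The binary
path below is only an example:

    # point this at any of the binaries listed above, e.g. kubelet or kubectl
    go tool nm ./kubelet | grep -i boringcrypto | head

A binary built without GOEXPERIMENT=boringcrypto should show no such symbols.
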
datadog:patch --- .github/workflows/dd-build.yml | 5 +++-- build/build-image/Dockerfile | 5 +++++ cmd/kube-apiserver/fips.go | 6 ++++++ cmd/kube-controller-manager/fips.go | 6 ++++++ cmd/kube-scheduler/fips.go | 6 ++++++ cmd/kubectl/fips.go | 6 ++++++ cmd/kubelet/fips.go | 6 ++++++ 7 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 cmd/kube-apiserver/fips.go create mode 100644 cmd/kube-controller-manager/fips.go create mode 100644 cmd/kube-scheduler/fips.go create mode 100644 cmd/kubectl/fips.go create mode 100644 cmd/kubelet/fips.go diff --git a/.github/workflows/dd-build.yml b/.github/workflows/dd-build.yml index e3d1d51d8b146..de1703d754c53 100644 --- a/.github/workflows/dd-build.yml +++ b/.github/workflows/dd-build.yml @@ -20,7 +20,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v5 with: - go-version: 1.22 + go-version: 1.24 - name: Set env run: echo SANITIZED_TARGET_PLATFORM=${KUBE_BUILD_PLATFORM/\//-} >> $GITHUB_ENV env: @@ -34,9 +34,10 @@ jobs: sudo rm -rf /usr/local/.ghcup - name: Build env: + GOFLAGS: "-tags=fips" KUBE_BUILD_PLATFORMS: ${{ matrix.platform }} KUBE_RELEASE_RUN_TESTS: n - run: make quick-release KUBE_BUILD_PLATFORMS=$KUBE_BUILD_PLATFORMS + run: make quick-release CGO_ENABLED=1 KUBE_CGO_OVERRIDES="kube-apiserver kube-controller-manager kube-scheduler kubelet" KUBE_BUILD_PLATFORMS=$KUBE_BUILD_PLATFORMS GOFLAGS=$GOFLAGS - name: Calculate checksums id: calculate_checksums shell: bash diff --git a/build/build-image/Dockerfile b/build/build-image/Dockerfile index 4473935769b76..348bfe2a6e664 100644 --- a/build/build-image/Dockerfile +++ b/build/build-image/Dockerfile @@ -55,3 +55,8 @@ ADD rsyncd.password / RUN chmod a+r /rsyncd.password ADD rsyncd.sh / RUN chmod a+rx /rsyncd.sh + +# Enable fips build +ENV GOEXPERIMENT=boringcrypto +# Enable debug to keep symbols around, allowing us to do go tool nm +ENV DBG=1 diff --git a/cmd/kube-apiserver/fips.go b/cmd/kube-apiserver/fips.go new file mode 100644 index 0000000000000..4e3d18460d97a --- /dev/null +++ b/cmd/kube-apiserver/fips.go @@ -0,0 +1,6 @@ +//go:build fips + +package main + +// enforce fips compliance if boringcrypto is enabled +import _ "crypto/tls/fipsonly" diff --git a/cmd/kube-controller-manager/fips.go b/cmd/kube-controller-manager/fips.go new file mode 100644 index 0000000000000..4e3d18460d97a --- /dev/null +++ b/cmd/kube-controller-manager/fips.go @@ -0,0 +1,6 @@ +//go:build fips + +package main + +// enforce fips compliance if boringcrypto is enabled +import _ "crypto/tls/fipsonly" diff --git a/cmd/kube-scheduler/fips.go b/cmd/kube-scheduler/fips.go new file mode 100644 index 0000000000000..4e3d18460d97a --- /dev/null +++ b/cmd/kube-scheduler/fips.go @@ -0,0 +1,6 @@ +//go:build fips + +package main + +// enforce fips compliance if boringcrypto is enabled +import _ "crypto/tls/fipsonly" diff --git a/cmd/kubectl/fips.go b/cmd/kubectl/fips.go new file mode 100644 index 0000000000000..4e3d18460d97a --- /dev/null +++ b/cmd/kubectl/fips.go @@ -0,0 +1,6 @@ +//go:build fips + +package main + +// enforce fips compliance if boringcrypto is enabled +import _ "crypto/tls/fipsonly" diff --git a/cmd/kubelet/fips.go b/cmd/kubelet/fips.go new file mode 100644 index 0000000000000..4e3d18460d97a --- /dev/null +++ b/cmd/kubelet/fips.go @@ -0,0 +1,6 @@ +//go:build fips + +package main + +// enforce fips compliance if boringcrypto is enabled +import _ "crypto/tls/fipsonly" From 4171736df343b9513c4418c193ac778586f69611 Mon Sep 17 00:00:00 2001 From: xyz-li Date: Fri, 20 Dec 2024 10:09:25 +0800 Subject: [PATCH 
5/6] [fix] fix ds controller deletes pod when not match RequiredDuringSchedulingIgnoredDuringExecution Signed-off-by: xyz-li datadog:patch --- pkg/controller/daemon/daemon_controller.go | 35 +++++++- .../daemon/daemon_controller_test.go | 80 ++++++++++++++++++- 2 files changed, 109 insertions(+), 6 deletions(-) diff --git a/pkg/controller/daemon/daemon_controller.go b/pkg/controller/daemon/daemon_controller.go index 1a370a2050b7c..691a046922d48 100644 --- a/pkg/controller/daemon/daemon_controller.go +++ b/pkg/controller/daemon/daemon_controller.go @@ -1289,8 +1289,14 @@ func NodeShouldRunDaemonPod(node *v1.Node, ds *apps.DaemonSet) (bool, bool) { } taints := node.Spec.Taints - fitsNodeName, fitsNodeAffinity, fitsTaints := predicates(pod, node, taints) - if !fitsNodeName || !fitsNodeAffinity { + fitsNodeName := len(pod.Spec.NodeName) == 0 || pod.Spec.NodeName == node.Name + if !fitsNodeName { + return false, false + } + + fitsNodeName, fitsNodeSelector, fitsNodeAffinity, fitsTaints := predicates(pod, node, taints) + + if !fitsNodeName || !fitsNodeSelector { return false, false } @@ -1302,14 +1308,35 @@ func NodeShouldRunDaemonPod(node *v1.Node, ds *apps.DaemonSet) (bool, bool) { return false, !hasUntoleratedTaint } + if !fitsNodeAffinity { + // IgnoredDuringExecution means that if the node labels change after Kubernetes schedules the Pod, the Pod continues to run. + return false, true + } + return true, true } // predicates checks if a DaemonSet's pod can run on a node. -func predicates(pod *v1.Pod, node *v1.Node, taints []v1.Taint) (fitsNodeName, fitsNodeAffinity, fitsTaints bool) { +func predicates(pod *v1.Pod, node *v1.Node, taints []v1.Taint) (fitsNodeName, fitsNodeSelector, fitsNodeAffinity, fitsTaints bool) { fitsNodeName = len(pod.Spec.NodeName) == 0 || pod.Spec.NodeName == node.Name + + if len(pod.Spec.NodeSelector) > 0 { + selector := labels.SelectorFromSet(pod.Spec.NodeSelector) + fitsNodeSelector = selector.Matches(labels.Set(node.Labels)) + } else { + fitsNodeSelector = true + } + + if pod.Spec.Affinity != nil && + pod.Spec.Affinity.NodeAffinity != nil && + pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil { + affinity := nodeaffinity.NewLazyErrorNodeSelector(pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution) + fitsNodeAffinity, _ = affinity.Match(node) + } else { + fitsNodeAffinity = true + } + // Ignore parsing errors for backwards compatibility. - fitsNodeAffinity, _ = nodeaffinity.GetRequiredNodeAffinity(pod).Match(node) _, hasUntoleratedTaint := v1helper.FindMatchingUntoleratedTaint(taints, pod.Spec.Tolerations, func(t *v1.Taint) bool { return t.Effect == v1.TaintEffectNoExecute || t.Effect == v1.TaintEffectNoSchedule }) diff --git a/pkg/controller/daemon/daemon_controller_test.go b/pkg/controller/daemon/daemon_controller_test.go index 1f26d2a416b29..89b0a1cc711e4 100644 --- a/pkg/controller/daemon/daemon_controller_test.go +++ b/pkg/controller/daemon/daemon_controller_test.go @@ -1640,6 +1640,52 @@ func TestNodeAffinityDaemonLaunchesPods(t *testing.T) { } } +// RequiredDuringSchedulingIgnoredDuringExecution means that if the node labels change after Kubernetes schedules the Pod, the Pod continues to run. 
+func TestNodeAffinityAndChangeNodeLabels(t *testing.T) { + logger, _ := ktesting.NewTestContext(t) + for _, strategy := range updateStrategies() { + daemon := newDaemonSet("foo") + daemon.Spec.UpdateStrategy = *strategy + daemon.Spec.Template.Spec.Affinity = &v1.Affinity{ + NodeAffinity: &v1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: "color", + Operator: v1.NodeSelectorOpIn, + Values: []string{simpleNodeLabel["color"]}, + }, + }, + }, + }, + }, + }, + } + _, ctx := ktesting.NewTestContext(t) + + manager, podControl, _, err := newTestController(ctx, daemon) + if err != nil { + t.Fatalf("error creating DaemonSetsController: %v", err) + } + node1 := newNode("node-1", simpleNodeLabel) + node2 := newNode("node-2", simpleNodeLabel) + manager.nodeStore.Add(node1) + manager.nodeStore.Add(node2) + err = manager.dsStore.Add(daemon) + if err != nil { + t.Fatal(err) + } + expectSyncDaemonSets(t, manager, daemon, podControl, 2, 0, 0) + oldNode := node1.DeepCopy() + node1.Labels = nil + manager.updateNode(logger, oldNode, node1) + manager.nodeStore.Add(newNode("node-3", nil)) + expectSyncDaemonSets(t, manager, daemon, podControl, 2, 0, 0) + } +} + func TestNumberReadyStatus(t *testing.T) { for _, strategy := range updateStrategies() { ds := newDaemonSet("foo") @@ -2284,7 +2330,7 @@ func TestNodeShouldRunDaemonPod(t *testing.T) { shouldContinueRunning: true, }, { - predicateName: "ErrPodAffinityNotMatch", + predicateName: "PodAffinityNotMatchDuringExecution", ds: &apps.DaemonSet{ Spec: apps.DaemonSetSpec{ Selector: &metav1.LabelSelector{MatchLabels: simpleDaemonSetLabel}, @@ -2315,7 +2361,7 @@ func TestNodeShouldRunDaemonPod(t *testing.T) { }, }, shouldRun: false, - shouldContinueRunning: false, + shouldContinueRunning: true, }, { predicateName: "ShouldRunDaemonPod", @@ -2497,6 +2543,36 @@ func TestUpdateNode(t *testing.T) { expectedCreates: func() int { return 0 }, preExistingPod: true, }, + { + test: "Node labels changed, ds with NodeAffinity ", + oldNode: newNode("node1", simpleNodeLabel), + newNode: newNode("node1", simpleNodeLabel2), + ds: func() *apps.DaemonSet { + ds := newDaemonSet("ds") + ds.Spec.Template.Spec.Affinity = &v1.Affinity{ + NodeAffinity: &v1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: "color", + Operator: v1.NodeSelectorOpIn, + Values: []string{"blue"}, + }, + }, + }, + }, + }, + }, + } + return ds + }(), + shouldEnqueue: true, + expectedCreates: func() int { + return 1 + }, + }, } for _, c := range cases { for _, strategy := range updateStrategies() { From 8e6ac61e69ee89d20fd712d89a051ba77a989b7a Mon Sep 17 00:00:00 2001 From: Matteo Ruina Date: Tue, 25 Mar 2025 17:12:43 +0100 Subject: [PATCH 6/6] Conflicts errors don't count against retries datadog:patch (#98) --- .../pkg/admission/plugin/resourcequota/controller.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/staging/src/k8s.io/apiserver/pkg/admission/plugin/resourcequota/controller.go b/staging/src/k8s.io/apiserver/pkg/admission/plugin/resourcequota/controller.go index 95c9c84f6f276..5954ba9ee264e 100644 --- a/staging/src/k8s.io/apiserver/pkg/admission/plugin/resourcequota/controller.go +++ b/staging/src/k8s.io/apiserver/pkg/admission/plugin/resourcequota/controller.go @@ -225,7 +225,7 @@ func (e 
*quotaEvaluator) checkAttributes(ns string, admissionAttributes []*admis // updates failed on conflict errors and we have retries left, re-get the failed quota from our cache for the latest version // and recurse into this method with the subset. It's safe for us to evaluate ONLY the subset, because the other quota // documents for these waiters have already been evaluated. Step 1, will mark all the ones that should already have succeeded. -func (e *quotaEvaluator) checkQuotas(quotas []corev1.ResourceQuota, admissionAttributes []*admissionWaiter, remainingRetries int) { +func (e *quotaEvaluator) checkQuotas(quotas []corev1.ResourceQuota, admissionAttributes []*admissionWaiter, retries int) { // yet another copy to compare against originals to see if we actually have deltas originalQuotas, err := copyQuotas(quotas) if err != nil { @@ -277,6 +277,7 @@ func (e *quotaEvaluator) checkQuotas(quotas []corev1.ResourceQuota, admissionAtt // 3. if the quota changed and the update fails, add the original to a retry list var updatedFailedQuotas []corev1.ResourceQuota var lastErr error + remainingRetries := retries - 1 for i := range quotas { newQuota := quotas[i] @@ -286,6 +287,10 @@ func (e *quotaEvaluator) checkQuotas(quotas []corev1.ResourceQuota, admissionAtt } if err := e.quotaAccessor.UpdateQuotaStatus(&newQuota); err != nil { + // If there is a transient update error, it doesn't count against the retry quota + if apierrors.IsConflict(err) { + remainingRetries = retries + } updatedFailedQuotas = append(updatedFailedQuotas, newQuota) lastErr = err } @@ -338,7 +343,7 @@ func (e *quotaEvaluator) checkQuotas(quotas []corev1.ResourceQuota, admissionAtt } } } - e.checkQuotas(quotasToCheck, admissionAttributes, remainingRetries-1) + e.checkQuotas(quotasToCheck, admissionAttributes, remainingRetries) } func copyQuotas(in []corev1.ResourceQuota) ([]corev1.ResourceQuota, error) {