From 17664e133fd54c86da4c44962f4f1ad7c248f5ab Mon Sep 17 00:00:00 2001 From: neaggarwMS <31906480+neaggarwMS@users.noreply.github.com> Date: Mon, 9 May 2022 13:06:01 -0700 Subject: [PATCH 1/5] Fix CNS Reconciling and ASsignmentMode handling for AKS Swift --- cns/service/main.go | 54 +++++++++++++++++------- cns/singletenantcontroller/reconciler.go | 12 ++++-- 2 files changed, 46 insertions(+), 20 deletions(-) diff --git a/cns/service/main.go b/cns/service/main.go index f74e816f74..ef12d93e11 100644 --- a/cns/service/main.go +++ b/cns/service/main.go @@ -857,10 +857,21 @@ func reconcileInitialCNSState(ctx context.Context, cli nodeNetworkConfigGetter, // Convert to CreateNetworkContainerRequest for i := range nnc.Status.NetworkContainers { - ncRequest, err := kubecontroller.CreateNCRequestFromDynamicNC(nnc.Status.NetworkContainers[i]) + var ncRequest *cns.CreateNetworkContainerRequest + var err error + switch nnc.Status.NetworkContainers[i].AssignmentMode { + case v1alpha.Dynamic: + ncRequest, err = kubecontroller.CreateNCRequestFromDynamicNC(nnc.Status.NetworkContainers[i]) + case v1alpha.Static: + default: // For backward compatibility, default will be treated as Static too. + ncRequest, err = kubecontroller.CreateNCRequestFromStaticNC(nnc.Status.NetworkContainers[i]) + } + if err != nil { - return errors.Wrap(err, "failed to convert NNC status to network container request") + return errors.Wrapf(err, "failed to convert NNC status to network container request, " + + "assignmentMode: %s", nnc.Status.NetworkContainers[i].AssignmentMode) } + // rebuild CNS state podInfoByIP, err := podInfoByIPProvider.PodInfoByIP() if err != nil { @@ -949,23 +960,34 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn httpRestServiceImplementation.IPAMPoolMonitor = poolMonitor // reconcile initial CNS state from CNI or apiserver. - // apiserver nnc might not be registered or api server might be down and crashloop backof puts us outside of 5-10 minutes we have for - // aks addons to come up so retry a bit more aggresively here. - // will retry 10 times maxing out at a minute taking about 8 minutes before it gives up. - attempt := 0 - err = retry.Do(func() error { - attempt++ - logger.Printf("reconciling initial CNS state attempt: %d", attempt) - err = reconcileInitialCNSState(ctx, scopedcli, httpRestServiceImplementation, podInfoByIPProvider) + // Only reconcile if there are any existing Pods using NC ips, + // else let the goal state be updated using a regular NNC Reconciler loop + podInfoByIP, err := podInfoByIPProvider.PodInfoByIP() + if err != nil { + return errors.Wrap(err, "failed to provide PodInfoByIP") + } + if len(podInfoByIP) > 0 { + logger.Printf("Reconciling initial CNS state as PodInfoByIP is not empty: %d", len(podInfoByIP)) + + // apiserver nnc might not be registered or api server might be down and crashloop backof puts us outside of 5-10 minutes we have for + // aks addons to come up so retry a bit more aggresively here. + // will retry 10 times maxing out at a minute taking about 8 minutes before it gives up. + attempt := 0 + err = retry.Do(func() error { + attempt++ + logger.Printf("reconciling initial CNS state attempt: %d", attempt) + err = reconcileInitialCNSState(ctx, scopedcli, httpRestServiceImplementation, podInfoByIPProvider) + if err != nil { + logger.Errorf("failed to reconcile initial CNS state, attempt: %d err: %v", attempt, err) + } + return errors.Wrap(err, "failed to initialize CNS state") + }, retry.Context(ctx), retry.Delay(initCNSInitalDelay), retry.MaxDelay(time.Minute)) if err != nil { - logger.Errorf("failed to reconcile initial CNS state, attempt: %d err: %v", attempt, err) + return err } - return errors.Wrap(err, "failed to initialize CNS state") - }, retry.Context(ctx), retry.Delay(initCNSInitalDelay), retry.MaxDelay(time.Minute)) - if err != nil { - return err + logger.Printf("reconciled initial CNS state after %d attempts", attempt) } - logger.Printf("reconciled initial CNS state after %d attempts", attempt) + // start the pool Monitor before the Reconciler, since it needs to be ready to receive an // NodeNetworkConfig update by the time the Reconciler tries to send it. diff --git a/cns/singletenantcontroller/reconciler.go b/cns/singletenantcontroller/reconciler.go index 7ebbbdb308..3a6125b741 100644 --- a/cns/singletenantcontroller/reconciler.go +++ b/cns/singletenantcontroller/reconciler.go @@ -87,14 +87,17 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco // in dynamic, we will also push this NNC to the IPAM Pool Monitor when we're done. listenersToNotify = append(listenersToNotify, r.ipampoolmonitorcli) case v1alpha.Static: + default: // For backward compatibility, default will be treated as Static too. req, err = CreateNCRequestFromStaticNC(nnc.Status.NetworkContainers[i]) - default: - // unrecognized mode, fail out - err = errors.Errorf("unknown NetworkContainer AssignmentMode %s", string(nnc.Status.NetworkContainers[i].AssignmentMode)) } + if err != nil { - return reconcile.Result{}, errors.Wrap(err, "failed to generate CreateNCRequest from NC") + logger.Errorf("[cns-rc] failed to generate CreateNCRequest from NC: %v, assignmentMode %s", err, + nnc.Status.NetworkContainers[i].AssignmentMode) + return reconcile.Result{}, errors.Wrapf(err, "failed to generate CreateNCRequest from NC " + + "assignmentMode %s", nnc.Status.NetworkContainers[i].AssignmentMode) } + responseCode := r.cnscli.CreateOrUpdateNetworkContainerInternal(req) if err := restserver.ResponseCodeToError(responseCode); err != nil { logger.Errorf("[cns-rc] Error creating or updating NC in reconcile: %v", err) @@ -113,6 +116,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco } // we have received and pushed an NNC update, we are "Started" + logger.Printf("[cns-rc] CNS NNC Reconciler Started") r.once.Do(func() { close(r.started) }) return reconcile.Result{}, nil } From 699752e0c395fb7b85ce8e26c378b94b5fb498ed Mon Sep 17 00:00:00 2001 From: neaggarwMS <31906480+neaggarwMS@users.noreply.github.com> Date: Mon, 9 May 2022 13:10:43 -0700 Subject: [PATCH 2/5] Fixed the handling of default mode == Dynamic --- cns/service/main.go | 7 ++++--- cns/singletenantcontroller/reconciler.go | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/cns/service/main.go b/cns/service/main.go index ef12d93e11..9c4fe27de1 100644 --- a/cns/service/main.go +++ b/cns/service/main.go @@ -860,11 +860,12 @@ func reconcileInitialCNSState(ctx context.Context, cli nodeNetworkConfigGetter, var ncRequest *cns.CreateNetworkContainerRequest var err error switch nnc.Status.NetworkContainers[i].AssignmentMode { - case v1alpha.Dynamic: - ncRequest, err = kubecontroller.CreateNCRequestFromDynamicNC(nnc.Status.NetworkContainers[i]) case v1alpha.Static: - default: // For backward compatibility, default will be treated as Static too. ncRequest, err = kubecontroller.CreateNCRequestFromStaticNC(nnc.Status.NetworkContainers[i]) + case v1alpha.Dynamic: + default: // For backward compatibility, default will be treated as Dynamic too. + ncRequest, err = kubecontroller.CreateNCRequestFromDynamicNC(nnc.Status.NetworkContainers[i]) + } if err != nil { diff --git a/cns/singletenantcontroller/reconciler.go b/cns/singletenantcontroller/reconciler.go index 3a6125b741..d7a4fffae3 100644 --- a/cns/singletenantcontroller/reconciler.go +++ b/cns/singletenantcontroller/reconciler.go @@ -82,13 +82,14 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco var req *cns.CreateNetworkContainerRequest var err error switch nnc.Status.NetworkContainers[i].AssignmentMode { + case v1alpha.Static: + req, err = CreateNCRequestFromStaticNC(nnc.Status.NetworkContainers[i]) case v1alpha.Dynamic: + default: // For backward compatibility, default will be treated as Dynamic too. req, err = CreateNCRequestFromDynamicNC(nnc.Status.NetworkContainers[i]) // in dynamic, we will also push this NNC to the IPAM Pool Monitor when we're done. listenersToNotify = append(listenersToNotify, r.ipampoolmonitorcli) - case v1alpha.Static: - default: // For backward compatibility, default will be treated as Static too. - req, err = CreateNCRequestFromStaticNC(nnc.Status.NetworkContainers[i]) + } if err != nil { From 287f81f2aa90d69072a01043a4c03c2d0181d043 Mon Sep 17 00:00:00 2001 From: neaggarwMS <31906480+neaggarwMS@users.noreply.github.com> Date: Wed, 11 May 2022 13:57:25 -0700 Subject: [PATCH 3/5] Incorporate feedback --- cns/service/main.go | 4 +--- cns/singletenantcontroller/reconciler.go | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/cns/service/main.go b/cns/service/main.go index 9c4fe27de1..be34315a6a 100644 --- a/cns/service/main.go +++ b/cns/service/main.go @@ -862,14 +862,13 @@ func reconcileInitialCNSState(ctx context.Context, cli nodeNetworkConfigGetter, switch nnc.Status.NetworkContainers[i].AssignmentMode { case v1alpha.Static: ncRequest, err = kubecontroller.CreateNCRequestFromStaticNC(nnc.Status.NetworkContainers[i]) - case v1alpha.Dynamic: default: // For backward compatibility, default will be treated as Dynamic too. ncRequest, err = kubecontroller.CreateNCRequestFromDynamicNC(nnc.Status.NetworkContainers[i]) } if err != nil { - return errors.Wrapf(err, "failed to convert NNC status to network container request, " + + return errors.Wrapf(err, "failed to convert NNC status to network container request, "+ "assignmentMode: %s", nnc.Status.NetworkContainers[i].AssignmentMode) } @@ -989,7 +988,6 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn logger.Printf("reconciled initial CNS state after %d attempts", attempt) } - // start the pool Monitor before the Reconciler, since it needs to be ready to receive an // NodeNetworkConfig update by the time the Reconciler tries to send it. go func() { diff --git a/cns/singletenantcontroller/reconciler.go b/cns/singletenantcontroller/reconciler.go index d7a4fffae3..bfc26419fc 100644 --- a/cns/singletenantcontroller/reconciler.go +++ b/cns/singletenantcontroller/reconciler.go @@ -84,7 +84,6 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco switch nnc.Status.NetworkContainers[i].AssignmentMode { case v1alpha.Static: req, err = CreateNCRequestFromStaticNC(nnc.Status.NetworkContainers[i]) - case v1alpha.Dynamic: default: // For backward compatibility, default will be treated as Dynamic too. req, err = CreateNCRequestFromDynamicNC(nnc.Status.NetworkContainers[i]) // in dynamic, we will also push this NNC to the IPAM Pool Monitor when we're done. @@ -95,7 +94,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco if err != nil { logger.Errorf("[cns-rc] failed to generate CreateNCRequest from NC: %v, assignmentMode %s", err, nnc.Status.NetworkContainers[i].AssignmentMode) - return reconcile.Result{}, errors.Wrapf(err, "failed to generate CreateNCRequest from NC " + + return reconcile.Result{}, errors.Wrapf(err, "failed to generate CreateNCRequest from NC "+ "assignmentMode %s", nnc.Status.NetworkContainers[i].AssignmentMode) } From 0e2b9719c9c7aea68e839af97880b69f5ddaa11d Mon Sep 17 00:00:00 2001 From: neaggarwMS <31906480+neaggarwMS@users.noreply.github.com> Date: Fri, 13 May 2022 11:09:15 -0700 Subject: [PATCH 4/5] golint fixes --- cns/service/main.go | 4 ++-- cns/singletenantcontroller/reconciler.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cns/service/main.go b/cns/service/main.go index be34315a6a..395dc6dddf 100644 --- a/cns/service/main.go +++ b/cns/service/main.go @@ -859,12 +859,12 @@ func reconcileInitialCNSState(ctx context.Context, cli nodeNetworkConfigGetter, for i := range nnc.Status.NetworkContainers { var ncRequest *cns.CreateNetworkContainerRequest var err error - switch nnc.Status.NetworkContainers[i].AssignmentMode { + + switch nnc.Status.NetworkContainers[i].AssignmentMode { //nolint: exhaustive // skipping dynamic case case v1alpha.Static: ncRequest, err = kubecontroller.CreateNCRequestFromStaticNC(nnc.Status.NetworkContainers[i]) default: // For backward compatibility, default will be treated as Dynamic too. ncRequest, err = kubecontroller.CreateNCRequestFromDynamicNC(nnc.Status.NetworkContainers[i]) - } if err != nil { diff --git a/cns/singletenantcontroller/reconciler.go b/cns/singletenantcontroller/reconciler.go index bfc26419fc..eca8a3f5c7 100644 --- a/cns/singletenantcontroller/reconciler.go +++ b/cns/singletenantcontroller/reconciler.go @@ -81,7 +81,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco for i := range nnc.Status.NetworkContainers { var req *cns.CreateNetworkContainerRequest var err error - switch nnc.Status.NetworkContainers[i].AssignmentMode { + switch nnc.Status.NetworkContainers[i].AssignmentMode { //nolint: exhaustive // skipping dynamic case case v1alpha.Static: req, err = CreateNCRequestFromStaticNC(nnc.Status.NetworkContainers[i]) default: // For backward compatibility, default will be treated as Dynamic too. From 595040a45c9270b2207c1cc9b9560059286d52f5 Mon Sep 17 00:00:00 2001 From: neaggarwMS <31906480+neaggarwMS@users.noreply.github.com> Date: Fri, 13 May 2022 13:15:38 -0700 Subject: [PATCH 5/5] Fix for whyNoLint: include an explanation for nolint directive (gocritic) --- cns/service/main.go | 2 +- cns/singletenantcontroller/reconciler.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cns/service/main.go b/cns/service/main.go index 395dc6dddf..26ce881d97 100644 --- a/cns/service/main.go +++ b/cns/service/main.go @@ -860,7 +860,7 @@ func reconcileInitialCNSState(ctx context.Context, cli nodeNetworkConfigGetter, var ncRequest *cns.CreateNetworkContainerRequest var err error - switch nnc.Status.NetworkContainers[i].AssignmentMode { //nolint: exhaustive // skipping dynamic case + switch nnc.Status.NetworkContainers[i].AssignmentMode { //nolint:exhaustive // skipping dynamic case case v1alpha.Static: ncRequest, err = kubecontroller.CreateNCRequestFromStaticNC(nnc.Status.NetworkContainers[i]) default: // For backward compatibility, default will be treated as Dynamic too. diff --git a/cns/singletenantcontroller/reconciler.go b/cns/singletenantcontroller/reconciler.go index eca8a3f5c7..92cfa9082c 100644 --- a/cns/singletenantcontroller/reconciler.go +++ b/cns/singletenantcontroller/reconciler.go @@ -81,7 +81,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco for i := range nnc.Status.NetworkContainers { var req *cns.CreateNetworkContainerRequest var err error - switch nnc.Status.NetworkContainers[i].AssignmentMode { //nolint: exhaustive // skipping dynamic case + switch nnc.Status.NetworkContainers[i].AssignmentMode { //nolint:exhaustive // skipping dynamic case case v1alpha.Static: req, err = CreateNCRequestFromStaticNC(nnc.Status.NetworkContainers[i]) default: // For backward compatibility, default will be treated as Dynamic too.