diff --git a/Makefile b/Makefile index d50895a184..d7c32f8700 100644 --- a/Makefile +++ b/Makefile @@ -620,10 +620,10 @@ clean: ## Clean build artifacts. LINT_PKG ?= . lint: $(GOLANGCI_LINT) ## Fast lint vs default branch showing only new issues. - $(GOLANGCI_LINT) run --new-from-rev master --timeout 10m -v $(LINT_PKG)/... + GOGC=20 $(GOLANGCI_LINT) run --new-from-rev master --timeout 10m -v $(LINT_PKG)/... lint-all: $(GOLANGCI_LINT) ## Lint the current branch in entirety. - $(GOLANGCI_LINT) run -v $(LINT_PKG)/... + GOGC=20 $(GOLANGCI_LINT) run -v $(LINT_PKG)/... FMT_PKG ?= cni cns npm diff --git a/cns/ipampool/metrics.go b/cns/ipampool/metrics.go index d6d6a2d1c4..01cf9ce652 100644 --- a/cns/ipampool/metrics.go +++ b/cns/ipampool/metrics.go @@ -6,11 +6,14 @@ import ( ) const ( - subnetLabel = "subnet" - subnetCIDRLabel = "subnet_cidr" - podnetARMIDLabel = "podnet_arm_id" - customerMetricLabel = "customer_metric" - customerMetricLabelValue = "customer metric" + subnetLabel = "subnet" + subnetCIDRLabel = "subnet_cidr" + podnetARMIDLabel = "podnet_arm_id" + customerMetricLabel = "customer_metric" + customerMetricLabelValue = "customer metric" + subnetExhaustionStateLabel = "subnet_exhaustion_state" + subnetIPExhausted = 1 + subnetIPNotExhausted = 0 ) var ( @@ -102,6 +105,21 @@ var ( }, []string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel}, ) + ipamSubnetExhaustionState = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "cx_ipam_subnet_exhaustion_state", + Help: "IPAM view of subnet exhaustion state", + ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue}, + }, + []string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel}, + ) + ipamSubnetExhaustionCount = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "cx_ipam_subnet_exhaustion_state_count_total", + Help: "Count of the number of times the ipam pool monitor sees subnet exhaustion", + }, + []string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel, subnetExhaustionStateLabel}, 
+ ) ) func init() { @@ -117,6 +135,8 @@ func init() { ipamPrimaryIPCount, ipamRequestedIPConfigCount, ipamTotalIPCount, + ipamSubnetExhaustionState, + ipamSubnetExhaustionCount, ) } @@ -133,4 +153,9 @@ func observeIPPoolState(state ipPoolState, meta metaState) { ipamPrimaryIPCount.WithLabelValues(labels...).Set(float64(len(meta.primaryIPAddresses))) ipamRequestedIPConfigCount.WithLabelValues(labels...).Set(float64(state.requestedIPs)) ipamTotalIPCount.WithLabelValues(labels...).Set(float64(state.totalIPs)) + exhaustionState := float64(subnetIPNotExhausted) + if meta.exhausted { + exhaustionState = float64(subnetIPExhausted) + } + ipamSubnetExhaustionState.WithLabelValues(labels...).Set(exhaustionState) } diff --git a/cns/ipampool/monitor.go b/cns/ipampool/monitor.go index 84febb5a87..6c0bbed41c 100644 --- a/cns/ipampool/monitor.go +++ b/cns/ipampool/monitor.go @@ -3,6 +3,7 @@ package ipampool import ( "context" "fmt" + "strconv" "sync" "time" @@ -13,6 +14,7 @@ import ( "github.com/Azure/azure-container-networking/crd/clustersubnetstate/api/v1alpha1" "github.com/Azure/azure-container-networking/crd/nodenetworkconfig/api/v1alpha" "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" ) const ( @@ -99,6 +101,10 @@ func (pm *Monitor) Start(ctx context.Context) error { case css := <-pm.cssSource: // received an updated ClusterSubnetState pm.metastate.exhausted = css.Status.Exhausted logger.Printf("subnet exhausted status = %t", pm.metastate.exhausted) + ipamSubnetExhaustionCount.With(prometheus.Labels{ + subnetLabel: pm.metastate.subnet, subnetCIDRLabel: pm.metastate.subnetCIDR, + podnetARMIDLabel: pm.metastate.subnetARMID, subnetExhaustionStateLabel: strconv.FormatBool(pm.metastate.exhausted), + }).Inc() select { default: // if we have NOT initialized and enter this case, we continue out of this iteration and let the for loop begin again. 
@@ -166,8 +172,9 @@ func buildIPPoolState(ips map[string]cns.IPConfigurationStatus, spec v1alpha.Nod totalIPs: int64(len(ips)), requestedIPs: spec.RequestedIPCount, } - for _, v := range ips { - switch v.GetState() { + for i := range ips { + ip := ips[i] + switch ip.GetState() { case types.Assigned: state.allocatedToPods++ case types.Available: @@ -266,7 +273,7 @@ func (pm *Monitor) increasePoolSize(ctx context.Context, meta metaState, state i if _, err := pm.nnccli.UpdateSpec(ctx, &tempNNCSpec); err != nil { // caller will retry to update the CRD again - return err + return errors.Wrap(err, "executing UpdateSpec with NNC CLI") } logger.Printf("[ipam-pool-monitor] Increasing pool size: UpdateCRDSpec succeeded for spec %+v", tempNNCSpec) @@ -308,7 +315,7 @@ func (pm *Monitor) decreasePoolSize(ctx context.Context, meta metaState, state i logger.Printf("[ipam-pool-monitor] Marking IPs as PendingRelease, ipsToBeReleasedCount %d", decreaseIPCountBy) var err error if pendingIPAddresses, err = pm.httpService.MarkIPAsPendingRelease(int(decreaseIPCountBy)); err != nil { - return err + return errors.Wrap(err, "marking IPs that are pending release") } newIpsMarkedAsPending = true @@ -330,7 +337,7 @@ func (pm *Monitor) decreasePoolSize(ctx context.Context, meta metaState, state i _, err := pm.nnccli.UpdateSpec(ctx, &tempNNCSpec) if err != nil { // caller will retry to update the CRD again - return err + return errors.Wrap(err, "executing UpdateSpec with NNC CLI") } logger.Printf("[ipam-pool-monitor] Decreasing pool size: UpdateCRDSpec succeeded for spec %+v", tempNNCSpec) @@ -355,7 +362,7 @@ func (pm *Monitor) cleanPendingRelease(ctx context.Context) error { _, err := pm.nnccli.UpdateSpec(ctx, &tempNNCSpec) if err != nil { // caller will retry to update the CRD again - return err + return errors.Wrap(err, "executing UpdateSpec with NNC CLI") } logger.Printf("[ipam-pool-monitor] cleanPendingRelease: UpdateCRDSpec succeeded for spec %+v", tempNNCSpec) @@ -374,7 +381,8 @@ func 
(pm *Monitor) createNNCSpecForCRD() v1alpha.NodeNetworkConfigSpec { // Get All Pending IPs from CNS and populate it again. pendingIPs := pm.httpService.GetPendingReleaseIPConfigs() - for _, pendingIP := range pendingIPs { + for i := range pendingIPs { + pendingIP := pendingIPs[i] spec.IPsNotInUse = append(spec.IPsNotInUse, pendingIP.ID) } diff --git a/cns/kubecontroller/clustersubnetstate/metrics.go b/cns/kubecontroller/clustersubnetstate/metrics.go new file mode 100644 index 0000000000..35a29ac10e --- /dev/null +++ b/cns/kubecontroller/clustersubnetstate/metrics.go @@ -0,0 +1,25 @@ +package clustersubnetstate + +import ( + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +// Label name used to tag the reconciler counter with the outcome of each ClusterSubnetState fetch ("failed" or "succeeded"). +const ( + cssReconcilerCRDWatcherStateLabel = "css_reconciler_crd_watcher_status" +) + +var cssReconcilerErrorCount = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "cluster_subnet_state_reconciler_crd_watcher_status_count_total", + Help: "Number of ClusterSubnetState reconciles by CRD watcher status (failed or succeeded)", + }, + []string{cssReconcilerCRDWatcherStateLabel}, +) + +func init() { + metrics.Registry.MustRegister( + cssReconcilerErrorCount, + ) +} diff --git a/cns/kubecontroller/clustersubnetstate/reconciler.go b/cns/kubecontroller/clustersubnetstate/reconciler.go index 9c81738501..1afa27e259 100644 --- a/cns/kubecontroller/clustersubnetstate/reconciler.go +++ b/cns/kubecontroller/clustersubnetstate/reconciler.go @@ -5,6 +5,7 @@ import ( "github.com/Azure/azure-container-networking/crd/clustersubnetstate/api/v1alpha1" "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -22,8 +23,10 @@ type Reconciler struct { func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, 
error) { css, err := r.Cli.Get(ctx, req.NamespacedName) if err != nil { + cssReconcilerErrorCount.With(prometheus.Labels{cssReconcilerCRDWatcherStateLabel: "failed"}).Inc() return reconcile.Result{}, errors.Wrapf(err, "failed to get css %s", req.String()) } + cssReconcilerErrorCount.With(prometheus.Labels{cssReconcilerCRDWatcherStateLabel: "succeeded"}).Inc() r.Sink <- *css return reconcile.Result{}, nil }