4 changes: 2 additions & 2 deletions Makefile
@@ -620,10 +620,10 @@ clean: ## Clean build artifacts.
 LINT_PKG ?= .
 
 lint: $(GOLANGCI_LINT) ## Fast lint vs default branch showing only new issues.
-	$(GOLANGCI_LINT) run --new-from-rev master --timeout 10m -v $(LINT_PKG)/...
+	GOGC=20 $(GOLANGCI_LINT) run --new-from-rev master --timeout 10m -v $(LINT_PKG)/...
 
 lint-all: $(GOLANGCI_LINT) ## Lint the current branch in entirety.
-	$(GOLANGCI_LINT) run -v $(LINT_PKG)/...
+	GOGC=20 $(GOLANGCI_LINT) run -v $(LINT_PKG)/...
 
 
 FMT_PKG ?= cni cns npm
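A note on the change above: GOGC sets the Go garbage collector's target percentage, and lowering it from the default 100 to 20 makes golangci-lint collect much more aggressively, trading extra GC CPU for a smaller peak heap on memory-constrained runners. A minimal sketch of the same knob from inside a Go program (illustrative only, not part of this PR):

```go
package main

import (
	"fmt"
	"runtime/debug"
)

func main() {
	// SetGCPercent(20) is the in-process equivalent of GOGC=20: the runtime
	// starts a collection once the heap grows 20% past the live data from the
	// previous cycle, so memory stays lower at the cost of more GC cycles.
	old := debug.SetGCPercent(20) // returns the previous setting (100 by default)
	fmt.Printf("GC target lowered from %d%% to 20%%\n", old)
}
```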
35 changes: 30 additions & 5 deletions cns/ipampool/metrics.go
@@ -6,11 +6,14 @@ import (
 )
 
 const (
-	subnetLabel              = "subnet"
-	subnetCIDRLabel          = "subnet_cidr"
-	podnetARMIDLabel         = "podnet_arm_id"
-	customerMetricLabel      = "customer_metric"
-	customerMetricLabelValue = "customer metric"
+	subnetLabel                = "subnet"
+	subnetCIDRLabel            = "subnet_cidr"
+	podnetARMIDLabel           = "podnet_arm_id"
+	customerMetricLabel        = "customer_metric"
+	customerMetricLabelValue   = "customer metric"
+	subnetExhaustionStateLabel = "subnet_exhaustion_state"
+	subnetIPExhausted          = 1
+	subnetIPNotExhausted       = 0
 )
 
 var (
@@ -102,6 +105,21 @@
 		},
 		[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
 	)
+	ipamSubnetExhaustionState = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name:        "cx_ipam_subnet_exhaustion_state",
+			Help:        "IPAM view of subnet exhaustion state",
+			ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
+		},
+		[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
+	)
+	ipamSubnetExhaustionCount = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "cx_ipam_subnet_exhaustion_state_count_total",
+			Help: "Count of the number of times the ipam pool monitor sees subnet exhaustion",
+		},
+		[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel, subnetExhaustionStateLabel},
+	)
 )
 
 func init() {
@@ -117,6 +135,8 @@ func init() {
 		ipamPrimaryIPCount,
 		ipamRequestedIPConfigCount,
 		ipamTotalIPCount,
+		ipamSubnetExhaustionState,
+		ipamSubnetExhaustionCount,
 	)
 }
 
@@ -133,4 +153,9 @@ func observeIPPoolState(state ipPoolState, meta metaState) {
 	ipamPrimaryIPCount.WithLabelValues(labels...).Set(float64(len(meta.primaryIPAddresses)))
 	ipamRequestedIPConfigCount.WithLabelValues(labels...).Set(float64(state.requestedIPs))
 	ipamTotalIPCount.WithLabelValues(labels...).Set(float64(state.totalIPs))
+	if meta.exhausted {
+		ipamSubnetExhaustionState.WithLabelValues(labels...).Set(float64(subnetIPExhausted))
+	} else {
+		ipamSubnetExhaustionState.WithLabelValues(labels...).Set(float64(subnetIPNotExhausted))
+	}
 }
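For readers unfamiliar with the pattern above: a GaugeVec keeps one gauge per label combination, and encoding the boolean as 0/1 lets dashboards alert on the current exhaustion state of each subnet. A stripped-down, self-contained sketch of the same pattern (names, label values, and wiring here are illustrative, not the package's real code path):

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// exhaustionState mirrors the ipamSubnetExhaustionState gauge added above.
var exhaustionState = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "cx_ipam_subnet_exhaustion_state",
		Help: "IPAM view of subnet exhaustion state",
	},
	[]string{"subnet", "subnet_cidr", "podnet_arm_id"},
)

func setExhausted(subnet, cidr, armID string, exhausted bool) {
	v := 0.0 // corresponds to subnetIPNotExhausted
	if exhausted {
		v = 1.0 // corresponds to subnetIPExhausted
	}
	exhaustionState.WithLabelValues(subnet, cidr, armID).Set(v)
}

func main() {
	prometheus.MustRegister(exhaustionState)
	setExhausted("subnet1", "10.240.0.0/16", "example-arm-id", true)
	fmt.Println("a scrape would now report cx_ipam_subnet_exhaustion_state 1 for subnet1")
}
```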
22 changes: 15 additions & 7 deletions cns/ipampool/monitor.go
@@ -3,6 +3,7 @@ package ipampool
 import (
 	"context"
 	"fmt"
+	"strconv"
 	"sync"
 	"time"
 
@@ -13,6 +14,7 @@ import (
 	"github.com/Azure/azure-container-networking/crd/clustersubnetstate/api/v1alpha1"
 	"github.com/Azure/azure-container-networking/crd/nodenetworkconfig/api/v1alpha"
 	"github.com/pkg/errors"
+	"github.com/prometheus/client_golang/prometheus"
 )
 
 const (
@@ -99,6 +101,10 @@ func (pm *Monitor) Start(ctx context.Context) error {
 		case css := <-pm.cssSource: // received an updated ClusterSubnetState
 			pm.metastate.exhausted = css.Status.Exhausted
 			logger.Printf("subnet exhausted status = %t", pm.metastate.exhausted)
+			ipamSubnetExhaustionCount.With(prometheus.Labels{
+				subnetLabel: pm.metastate.subnet, subnetCIDRLabel: pm.metastate.subnetCIDR,
+				podnetARMIDLabel: pm.metastate.subnetARMID, subnetExhaustionStateLabel: strconv.FormatBool(pm.metastate.exhausted),
+			}).Inc()
 			select {
 			default:
 				// if we have NOT initialized and enter this case, we continue out of this iteration and let the for loop begin again.
@@ -166,8 +172,9 @@ func buildIPPoolState(ips map[string]cns.IPConfigurationStatus, spec v1alpha.Nod
 		totalIPs:     int64(len(ips)),
 		requestedIPs: spec.RequestedIPCount,
 	}
-	for _, v := range ips {
-		switch v.GetState() {
+	for i := range ips {
+		ip := ips[i]
+		switch ip.GetState() {
 		case types.Assigned:
 			state.allocatedToPods++
 		case types.Available:
@@ -266,7 +273,7 @@ func (pm *Monitor) increasePoolSize(ctx context.Context, meta metaState, state i
 
 	if _, err := pm.nnccli.UpdateSpec(ctx, &tempNNCSpec); err != nil {
 		// caller will retry to update the CRD again
-		return err
+		return errors.Wrap(err, "executing UpdateSpec with NNC CLI")
 	}
 
 	logger.Printf("[ipam-pool-monitor] Increasing pool size: UpdateCRDSpec succeeded for spec %+v", tempNNCSpec)
@@ -308,7 +315,7 @@ func (pm *Monitor) decreasePoolSize(ctx context.Context, meta metaState, state i
 		logger.Printf("[ipam-pool-monitor] Marking IPs as PendingRelease, ipsToBeReleasedCount %d", decreaseIPCountBy)
 		var err error
 		if pendingIPAddresses, err = pm.httpService.MarkIPAsPendingRelease(int(decreaseIPCountBy)); err != nil {
-			return err
+			return errors.Wrap(err, "marking IPs that are pending release")
 		}
 
 		newIpsMarkedAsPending = true
@@ -330,7 +337,7 @@ func (pm *Monitor) decreasePoolSize(ctx context.Context, meta metaState, state i
 	_, err := pm.nnccli.UpdateSpec(ctx, &tempNNCSpec)
 	if err != nil {
 		// caller will retry to update the CRD again
-		return err
+		return errors.Wrap(err, "executing UpdateSpec with NNC CLI")
 	}
 
 	logger.Printf("[ipam-pool-monitor] Decreasing pool size: UpdateCRDSpec succeeded for spec %+v", tempNNCSpec)
@@ -355,7 +362,7 @@ func (pm *Monitor) cleanPendingRelease(ctx context.Context) error {
 	_, err := pm.nnccli.UpdateSpec(ctx, &tempNNCSpec)
 	if err != nil {
 		// caller will retry to update the CRD again
-		return err
+		return errors.Wrap(err, "executing UpdateSpec with NNC CLI")
 	}
 
 	logger.Printf("[ipam-pool-monitor] cleanPendingRelease: UpdateCRDSpec succeeded for spec %+v", tempNNCSpec)
@@ -374,7 +381,8 @@ func (pm *Monitor) createNNCSpecForCRD() v1alpha.NodeNetworkConfigSpec {
 
 	// Get All Pending IPs from CNS and populate it again.
 	pendingIPs := pm.httpService.GetPendingReleaseIPConfigs()
-	for _, pendingIP := range pendingIPs {
+	for i := range pendingIPs {
+		pendingIP := pendingIPs[i]
 		spec.IPsNotInUse = append(spec.IPsNotInUse, pendingIP.ID)
 	}
 
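The counter added to Start complements the gauge: because each increment is labeled with strconv.FormatBool(exhausted), each direction of the flip gets its own monotonically increasing series, so flaps between exhausted and healthy remain visible after the fact even though the gauge only shows the latest state. A self-contained sketch of that pattern under assumed names (the real metric and labels live in the ipampool package, and two labels are trimmed here for brevity):

```go
package main

import (
	"strconv"

	"github.com/prometheus/client_golang/prometheus"
)

// exhaustionEvents mirrors ipamSubnetExhaustionCount from the diff above.
var exhaustionEvents = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Name: "cx_ipam_subnet_exhaustion_state_count_total",
		Help: "Count of subnet exhaustion state observations",
	},
	[]string{"subnet", "subnet_exhaustion_state"},
)

// onCSSUpdate stands in for the cssSource case in Start: every CRD event
// increments the series keyed by the boolean state as "true" or "false".
func onCSSUpdate(subnet string, exhausted bool) {
	exhaustionEvents.With(prometheus.Labels{
		"subnet":                  subnet,
		"subnet_exhaustion_state": strconv.FormatBool(exhausted),
	}).Inc()
}

func main() {
	prometheus.MustRegister(exhaustionEvents)
	onCSSUpdate("subnet1", true)  // increments the "true" series
	onCSSUpdate("subnet1", false) // increments the "false" series
}
```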
25 changes: 25 additions & 0 deletions cns/kubecontroller/clustersubnetstate/metrics.go
@@ -0,0 +1,25 @@
+package clustersubnetstate
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+	"sigs.k8s.io/controller-runtime/pkg/metrics"
+)
+
+// Constants to describe the error state boolean values for the cluster subnet state
+const (
+	cssReconcilerCRDWatcherStateLabel = "css_reconciler_crd_watcher_status"
+)
+
+var cssReconcilerErrorCount = prometheus.NewCounterVec(
+	prometheus.CounterOpts{
+		Name: "cluster_subnet_state_reconciler_crd_watcher_status_count_total",
+		Help: "Number of errors in reconciler while watching CRD for subnet exhaustion",
+	},
+	[]string{cssReconcilerCRDWatcherStateLabel},
+)
+
+func init() {
+	metrics.Registry.MustRegister(
+		cssReconcilerErrorCount,
+	)
+}
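Note that this file registers with controller-runtime's metrics.Registry rather than client_golang's default registry, so the counter is exposed by the controller manager's existing /metrics endpoint alongside the built-in controller metrics. If you want to inspect it outside a manager, the registry is an ordinary prometheus.Gatherer and can be served directly; a hedged sketch, with an arbitrary port chosen for local testing:

```go
package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
	"sigs.k8s.io/controller-runtime/pkg/metrics"
)

func main() {
	// metrics.Registry satisfies prometheus.Gatherer, so promhttp can serve
	// it without a manager; normally the controller manager does this for you.
	http.Handle("/metrics", promhttp.HandlerFor(metrics.Registry, promhttp.HandlerOpts{}))
	_ = http.ListenAndServe(":8080", nil)
}
```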
3 changes: 3 additions & 0 deletions cns/kubecontroller/clustersubnetstate/reconciler.go
@@ -5,6 +5,7 @@ import (
 
 	"github.com/Azure/azure-container-networking/crd/clustersubnetstate/api/v1alpha1"
 	"github.com/pkg/errors"
+	"github.com/prometheus/client_golang/prometheus"
 	"k8s.io/apimachinery/pkg/types"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/reconcile"
@@ -22,8 +23,10 @@ type Reconciler struct {
 func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
 	css, err := r.Cli.Get(ctx, req.NamespacedName)
 	if err != nil {
+		cssReconcilerErrorCount.With(prometheus.Labels{cssReconcilerCRDWatcherStateLabel: "failed"}).Inc()
 		return reconcile.Result{}, errors.Wrapf(err, "failed to get css %s", req.String())
 	}
+	cssReconcilerErrorCount.With(prometheus.Labels{cssReconcilerCRDWatcherStateLabel: "succeeded"}).Inc()
 	r.Sink <- *css
 	return reconcile.Result{}, nil
 }
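Since one of the "failed" or "succeeded" series is incremented on every Reconcile, their sum gives total reconcile attempts and their ratio a quick error rate. A sketch of how the counter could be asserted with client_golang's testutil (an assumed test file placed next to metrics.go so the unexported counter is visible; not part of this PR):

```go
package clustersubnetstate

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestReconcileCountsFailures(t *testing.T) {
	failed := cssReconcilerErrorCount.With(prometheus.Labels{cssReconcilerCRDWatcherStateLabel: "failed"})
	before := testutil.ToFloat64(failed)
	failed.Inc() // in the real code this happens when r.Cli.Get returns an error
	if got := testutil.ToFloat64(failed); got != before+1 {
		t.Fatalf("failed count = %v, want %v", got, before+1)
	}
}
```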