4 changes: 2 additions & 2 deletions Makefile
@@ -620,10 +620,10 @@ clean: ## Clean build artifacts.
 LINT_PKG ?= .
 
 lint: $(GOLANGCI_LINT) ## Fast lint vs default branch showing only new issues.
-	$(GOLANGCI_LINT) run --new-from-rev master --timeout 10m -v $(LINT_PKG)/...
+	GOGC=20 $(GOLANGCI_LINT) run --new-from-rev master --timeout 10m -v $(LINT_PKG)/...
 
 lint-all: $(GOLANGCI_LINT) ## Lint the current branch in entirety.
-	$(GOLANGCI_LINT) run -v $(LINT_PKG)/...
+	GOGC=20 $(GOLANGCI_LINT) run -v $(LINT_PKG)/...
 
 
 FMT_PKG ?= cni cns npm
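A note on the change above: GOGC sets the Go garbage collector's target percentage, and lowering it from the default 100 to 20 makes golangci-lint collect much more aggressively, trading extra GC CPU for a smaller peak heap on memory-constrained runners. A minimal sketch of the same knob from inside a Go program (illustrative only, not part of this PR):

```go
package main

import (
	"fmt"
	"runtime/debug"
)

func main() {
	// SetGCPercent(20) is the in-process equivalent of GOGC=20: the runtime
	// starts a collection once the heap grows 20% past the live data from the
	// previous cycle, so memory stays lower at the cost of more GC cycles.
	old := debug.SetGCPercent(20) // returns the previous setting (100 by default)
	fmt.Printf("GC target lowered from %d%% to 20%%\n", old)
}
```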
35 changes: 30 additions & 5 deletions cns/ipampool/metrics.go
@@ -6,11 +6,14 @@ import (
 )
 
 const (
-	subnetLabel              = "subnet"
-	subnetCIDRLabel          = "subnet_cidr"
-	podnetARMIDLabel         = "podnet_arm_id"
-	customerMetricLabel      = "customer_metric"
-	customerMetricLabelValue = "customer metric"
+	subnetLabel                = "subnet"
+	subnetCIDRLabel            = "subnet_cidr"
+	podnetARMIDLabel           = "podnet_arm_id"
+	customerMetricLabel        = "customer_metric"
+	customerMetricLabelValue   = "customer metric"
+	subnetExhaustionStateLabel = "subnet_exhaustion_state"
+	subnetIPExhausted          = 1
+	subnetIPNotExhausted       = 0
 )
 
 var (
@@ -102,6 +105,21 @@
 		},
 		[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
 	)
+	ipamSubnetExhaustionState = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name:        "cx_ipam_subnet_exhaustion_state",
+			Help:        "IPAM view of subnet exhaustion state",
+			ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
+		},
+		[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
+	)
+	ipamSubnetExhaustionCount = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "cx_ipam_subnet_exhaustion_state_count_total",
+			Help: "Count of the number of times the ipam pool monitor sees subnet exhaustion",
+		},
+		[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel, subnetExhaustionStateLabel},
+	)
 )
 
 func init() {
@@ -117,6 +135,8 @@ func init() {
 		ipamPrimaryIPCount,
 		ipamRequestedIPConfigCount,
 		ipamTotalIPCount,
+		ipamSubnetExhaustionState,
+		ipamSubnetExhaustionCount,
 	)
 }
 
@@ -133,4 +153,9 @@ func observeIPPoolState(state ipPoolState, meta metaState) {
 	ipamPrimaryIPCount.WithLabelValues(labels...).Set(float64(len(meta.primaryIPAddresses)))
 	ipamRequestedIPConfigCount.WithLabelValues(labels...).Set(float64(state.requestedIPs))
 	ipamTotalIPCount.WithLabelValues(labels...).Set(float64(state.totalIPs))
+	if meta.exhausted {
+		ipamSubnetExhaustionState.WithLabelValues(labels...).Set(float64(subnetIPExhausted))
+	} else {
+		ipamSubnetExhaustionState.WithLabelValues(labels...).Set(float64(subnetIPNotExhausted))
+	}
 }
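For readers unfamiliar with the pattern above: a GaugeVec keeps one gauge per label combination, and encoding the boolean as 0/1 lets dashboards alert on the current exhaustion state of each subnet. A stripped-down, self-contained sketch of the same pattern (names, label values, and wiring here are illustrative, not the package's real code path):

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// exhaustionState mirrors the ipamSubnetExhaustionState gauge added above.
var exhaustionState = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "cx_ipam_subnet_exhaustion_state",
		Help: "IPAM view of subnet exhaustion state",
	},
	[]string{"subnet", "subnet_cidr", "podnet_arm_id"},
)

func setExhausted(subnet, cidr, armID string, exhausted bool) {
	v := 0.0 // corresponds to subnetIPNotExhausted
	if exhausted {
		v = 1.0 // corresponds to subnetIPExhausted
	}
	exhaustionState.WithLabelValues(subnet, cidr, armID).Set(v)
}

func main() {
	prometheus.MustRegister(exhaustionState)
	setExhausted("subnet1", "10.240.0.0/16", "example-arm-id", true)
	fmt.Println("a scrape would now report cx_ipam_subnet_exhaustion_state 1 for subnet1")
}
```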
22 changes: 15 additions & 7 deletions cns/ipampool/monitor.go
@@ -3,6 +3,7 @@ package ipampool
 import (
 	"context"
 	"fmt"
+	"strconv"
 	"sync"
 	"time"
 
@@ -13,6 +14,7 @@ import (
 	"github.com/Azure/azure-container-networking/crd/clustersubnetstate/api/v1alpha1"
 	"github.com/Azure/azure-container-networking/crd/nodenetworkconfig/api/v1alpha"
 	"github.com/pkg/errors"
+	"github.com/prometheus/client_golang/prometheus"
 )
 
 const (
@@ -99,6 +101,10 @@ func (pm *Monitor) Start(ctx context.Context) error {
 		case css := <-pm.cssSource: // received an updated ClusterSubnetState
 			pm.metastate.exhausted = css.Status.Exhausted
 			logger.Printf("subnet exhausted status = %t", pm.metastate.exhausted)
+			ipamSubnetExhaustionCount.With(prometheus.Labels{
+				subnetLabel: pm.metastate.subnet, subnetCIDRLabel: pm.metastate.subnetCIDR,
+				podnetARMIDLabel: pm.metastate.subnetARMID, subnetExhaustionStateLabel: strconv.FormatBool(pm.metastate.exhausted),
+			}).Inc()
 			select {
 			default:
 				// if we have NOT initialized and enter this case, we continue out of this iteration and let the for loop begin again.
@@ -166,8 +172,9 @@ func buildIPPoolState(ips map[string]cns.IPConfigurationStatus, spec v1alpha.Nod
 		totalIPs:     int64(len(ips)),
 		requestedIPs: spec.RequestedIPCount,
 	}
-	for _, v := range ips {
-		switch v.GetState() {
+	for i := range ips {
+		ip := ips[i]
+		switch ip.GetState() {
 		case types.Assigned:
 			state.allocatedToPods++
 		case types.Available:
@@ -266,7 +273,7 @@ func (pm *Monitor) increasePoolSize(ctx context.Context, meta metaState, state i
 
 	if _, err := pm.nnccli.UpdateSpec(ctx, &tempNNCSpec); err != nil {
 		// caller will retry to update the CRD again
-		return err
+		return errors.Wrap(err, "executing UpdateSpec with NNC CLI")
 	}
 
 	logger.Printf("[ipam-pool-monitor] Increasing pool size: UpdateCRDSpec succeeded for spec %+v", tempNNCSpec)
@@ -308,7 +315,7 @@ func (pm *Monitor) decreasePoolSize(ctx context.Context, meta metaState, state i
 		logger.Printf("[ipam-pool-monitor] Marking IPs as PendingRelease, ipsToBeReleasedCount %d", decreaseIPCountBy)
 		var err error
 		if pendingIPAddresses, err = pm.httpService.MarkIPAsPendingRelease(int(decreaseIPCountBy)); err != nil {
-			return err
+			return errors.Wrap(err, "marking IPs that are pending release")
 		}
 
 		newIpsMarkedAsPending = true
@@ -330,7 +337,7 @@ func (pm *Monitor) decreasePoolSize(ctx context.Context, meta metaState, state i
 	_, err := pm.nnccli.UpdateSpec(ctx, &tempNNCSpec)
 	if err != nil {
 		// caller will retry to update the CRD again
-		return err
+		return errors.Wrap(err, "executing UpdateSpec with NNC CLI")
 	}
 
 	logger.Printf("[ipam-pool-monitor] Decreasing pool size: UpdateCRDSpec succeeded for spec %+v", tempNNCSpec)
@@ -355,7 +362,7 @@ func (pm *Monitor) cleanPendingRelease(ctx context.Context) error {
 	_, err := pm.nnccli.UpdateSpec(ctx, &tempNNCSpec)
 	if err != nil {
 		// caller will retry to update the CRD again
-		return err
+		return errors.Wrap(err, "executing UpdateSpec with NNC CLI")
 	}
 
 	logger.Printf("[ipam-pool-monitor] cleanPendingRelease: UpdateCRDSpec succeeded for spec %+v", tempNNCSpec)
@@ -374,7 +381,8 @@ func (pm *Monitor) createNNCSpecForCRD() v1alpha.NodeNetworkConfigSpec {
 
 	// Get All Pending IPs from CNS and populate it again.
 	pendingIPs := pm.httpService.GetPendingReleaseIPConfigs()
-	for _, pendingIP := range pendingIPs {
+	for i := range pendingIPs {
+		pendingIP := pendingIPs[i]
 		spec.IPsNotInUse = append(spec.IPsNotInUse, pendingIP.ID)
 	}
 
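The counter added to Start complements the gauge: because each increment is labeled with strconv.FormatBool(exhausted), each direction of the flip gets its own monotonically increasing series, so flaps between exhausted and healthy remain visible after the fact even though the gauge only shows the latest state. A self-contained sketch of that pattern under assumed names (the real metric and labels live in the ipampool package, and two labels are trimmed here for brevity):

```go
package main

import (
	"strconv"

	"github.com/prometheus/client_golang/prometheus"
)

// exhaustionEvents mirrors ipamSubnetExhaustionCount from the diff above.
var exhaustionEvents = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Name: "cx_ipam_subnet_exhaustion_state_count_total",
		Help: "Count of subnet exhaustion state observations",
	},
	[]string{"subnet", "subnet_exhaustion_state"},
)

// onCSSUpdate stands in for the cssSource case in Start: every CRD event
// increments the series keyed by the boolean state as "true" or "false".
func onCSSUpdate(subnet string, exhausted bool) {
	exhaustionEvents.With(prometheus.Labels{
		"subnet":                  subnet,
		"subnet_exhaustion_state": strconv.FormatBool(exhausted),
	}).Inc()
}

func main() {
	prometheus.MustRegister(exhaustionEvents)
	onCSSUpdate("subnet1", true)  // increments the "true" series
	onCSSUpdate("subnet1", false) // increments the "false" series
}
```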
25 changes: 25 additions & 0 deletions cns/kubecontroller/clustersubnetstate/metrics.go
@@ -0,0 +1,25 @@
+package clustersubnetstate
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+	"sigs.k8s.io/controller-runtime/pkg/metrics"
+)
+
+// Constants to describe the error state boolean values for the cluster subnet state
+const (
+	cssReconcilerCRDWatcherStateLabel = "css_reconciler_crd_watcher_status"
+)
+
+var cssReconcilerErrorCount = prometheus.NewCounterVec(
+	prometheus.CounterOpts{
+		Name: "cluster_subnet_state_reconciler_crd_watcher_status_count_total",
+		Help: "Number of errors in reconciler while watching CRD for subnet exhaustion",
+	},
+	[]string{cssReconcilerCRDWatcherStateLabel},
+)
+
+func init() {
+	metrics.Registry.MustRegister(
+		cssReconcilerErrorCount,
+	)
+}
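Note that this file registers with controller-runtime's metrics.Registry rather than client_golang's default registry, so the counter is exposed by the controller manager's existing /metrics endpoint alongside the built-in controller metrics. If you want to inspect it outside a manager, the registry is an ordinary prometheus.Gatherer and can be served directly; a hedged sketch, with an arbitrary port chosen for local testing:

```go
package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
	"sigs.k8s.io/controller-runtime/pkg/metrics"
)

func main() {
	// metrics.Registry satisfies prometheus.Gatherer, so promhttp can serve
	// it without a manager; normally the controller manager does this for you.
	http.Handle("/metrics", promhttp.HandlerFor(metrics.Registry, promhttp.HandlerOpts{}))
	_ = http.ListenAndServe(":8080", nil)
}
```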
3 changes: 3 additions & 0 deletions cns/kubecontroller/clustersubnetstate/reconciler.go
@@ -5,6 +5,7 @@ import (
 
 	"github.com/Azure/azure-container-networking/crd/clustersubnetstate/api/v1alpha1"
 	"github.com/pkg/errors"
+	"github.com/prometheus/client_golang/prometheus"
 	"k8s.io/apimachinery/pkg/types"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/reconcile"
@@ -22,8 +23,10 @@ type Reconciler struct {
 func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
 	css, err := r.Cli.Get(ctx, req.NamespacedName)
 	if err != nil {
+		cssReconcilerErrorCount.With(prometheus.Labels{cssReconcilerCRDWatcherStateLabel: "failed"}).Inc()
 		return reconcile.Result{}, errors.Wrapf(err, "failed to get css %s", req.String())
 	}
+	cssReconcilerErrorCount.With(prometheus.Labels{cssReconcilerCRDWatcherStateLabel: "succeeded"}).Inc()
 	r.Sink <- *css
 	return reconcile.Result{}, nil
 }
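Since one of the "failed" or "succeeded" series is incremented on every Reconcile, their sum gives total reconcile attempts and their ratio a quick error rate. A sketch of how the counter could be asserted with client_golang's testutil (an assumed test file placed next to metrics.go so the unexported counter is visible; not part of this PR):

```go
package clustersubnetstate

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestReconcileCountsFailures(t *testing.T) {
	failed := cssReconcilerErrorCount.With(prometheus.Labels{cssReconcilerCRDWatcherStateLabel: "failed"})
	before := testutil.ToFloat64(failed)
	failed.Inc() // in the real code this happens when r.Cli.Get returns an error
	if got := testutil.ToFloat64(failed); got != before+1 {
		t.Fatalf("failed count = %v, want %v", got, before+1)
	}
}
```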