From 79b327d66a75b6ad97b057152cec311e3075eb93 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Mon, 5 Jun 2023 16:29:30 -0700 Subject: [PATCH 01/11] feat: remove metric for max members in ipset and add metric for total pod IPs in cluster --- npm/metrics/ipsets.go | 84 ++++++-------------------- npm/metrics/ipsets_test.go | 11 ++++ npm/metrics/ipsets_windows_test.go | 95 ------------------------------ npm/metrics/prometheus-metrics.go | 22 +++---- 4 files changed, 39 insertions(+), 173 deletions(-) delete mode 100644 npm/metrics/ipsets_windows_test.go diff --git a/npm/metrics/ipsets.go b/npm/metrics/ipsets.go index d012615c42..47526ddd30 100644 --- a/npm/metrics/ipsets.go +++ b/npm/metrics/ipsets.go @@ -5,10 +5,23 @@ import ( "github.com/prometheus/client_golang/prometheus" ) -var ( - ipsetInventoryMap map[string]int - maxMembers = 0 -) +var ipsetInventoryMap map[string]int + +// AddPod increments the number of Pod IPs. +func AddPod() { + podIPTotal.Inc() +} + +// RemovePod decrements the number of Pod IPs. +func RemovePod() { + podIPTotal.Dec() +} + +// PodCount gets the current number of Pod IPs. +// This function is slow. +func PodCount() (int, error) { + return getValue(podIPTotal) +} // IncNumIPSets increments the number of IPSets. func IncNumIPSets() { @@ -47,13 +60,6 @@ func RecordIPSetExecTime(timer *Timer) { // AddEntryToIPSet increments the number of entries for IPSet setName. // It doesn't ever update the number of IPSets. func AddEntryToIPSet(setName string) { - if util.IsWindowsDP() && ipsetInventoryMap[setName] == maxMembers { - // ipsetInventoryMap[setName] returns 0 if the set doesn't exist in the map, which is good in case maxMembers is 0 - // increase maxMembers if this set previously had the max - maxMembers++ - maxIPSetMembers.Set(float64(maxMembers)) - } - numIPSetEntries.Inc() ipsetInventoryMap[setName]++ // adds the setName with value 1 if it doesn't exist updateIPSetInventory(setName) @@ -63,27 +69,6 @@ func AddEntryToIPSet(setName string) { func RemoveEntryFromIPSet(setName string) { _, exists := ipsetInventoryMap[setName] if exists { - if util.IsWindowsDP() && ipsetInventoryMap[setName] == maxMembers && maxMembers > 0 { - // decrease maxMembers if this set previously had the max AND no other set has the max - wasUniqueMax := true - for s, val := range ipsetInventoryMap { - if setName == s { - continue - } - - if val == maxMembers { - wasUniqueMax = false - break - } - } - - if wasUniqueMax { - // decrease the maxMembers since no other set has the max - maxMembers-- - maxIPSetMembers.Set(float64(maxMembers)) - } - } - numIPSetEntries.Dec() ipsetInventoryMap[setName]-- if ipsetInventoryMap[setName] == 0 { @@ -97,32 +82,6 @@ func RemoveEntryFromIPSet(setName string) { // RemoveAllEntriesFromIPSet sets the number of entries for ipset setName to 0. // It doesn't ever update the number of IPSets. func RemoveAllEntriesFromIPSet(setName string) { - if util.IsWindowsDP() && ipsetInventoryMap[setName] == maxMembers && maxMembers > 0 { - // ipsetInventoryMap[setName] returns 0 if the set doesn't exist in the map - // determine the new maxMembers if this set previously had the max - oldMaxMembers := maxMembers - maxMembers = 0 - for s, val := range ipsetInventoryMap { - if setName == s { - continue - } - - if val > maxMembers { - maxMembers = val - - if val == oldMaxMembers { - // the max didn't change - break - } - } - } - - if oldMaxMembers != maxMembers { - // update max if it changed - maxIPSetMembers.Set(float64(maxMembers)) - } - } - numIPSetEntries.Add(-getEntryCountForIPSet(setName)) delete(ipsetInventoryMap, setName) removeFromIPSetInventory(setName) @@ -137,11 +96,6 @@ func DeleteIPSet(setName string) { // ResetIPSetEntries sets the number of entries to 0 for all IPSets. // It doesn't ever update the number of IPSets. func ResetIPSetEntries() { - if util.IsWindowsDP() { - maxMembers = 0 - maxIPSetMembers.Set(float64(maxMembers)) - } - numIPSetEntries.Set(0) for setName := range ipsetInventoryMap { removeFromIPSetInventory(setName) @@ -170,10 +124,6 @@ func GetNumEntriesForIPSet(setName string) (int, error) { return getVecValue(ipsetInventory, labels) } -func MaxIPSetMembers() (int, error) { - return getValue(maxIPSetMembers) -} - // GetIPSetExecCount returns the number of observations for execution time of adding IPSets. // This function is slow. func GetIPSetExecCount() (int, error) { diff --git a/npm/metrics/ipsets_test.go b/npm/metrics/ipsets_test.go index 9c02efd6ca..2e6d54513f 100644 --- a/npm/metrics/ipsets_test.go +++ b/npm/metrics/ipsets_test.go @@ -22,6 +22,17 @@ type testSet struct { entryCount int } +func TestAddPod(t *testing.T) { + podIPTotal.Set(0) + AddPod() + AddPod() + AddPod() + RemovePod() + val, err := PodCount() + require.NoError(t, err, "unexpected error when getting pod count") + require.Equal(t, 2, val, "incorrect pod count") +} + func TestRecordIPSetExecTime(t *testing.T) { testStopAndRecord(t, setExecMetric) } diff --git a/npm/metrics/ipsets_windows_test.go b/npm/metrics/ipsets_windows_test.go deleted file mode 100644 index d51ef26a28..0000000000 --- a/npm/metrics/ipsets_windows_test.go +++ /dev/null @@ -1,95 +0,0 @@ -package metrics - -import ( - "testing" - - "github.com/stretchr/testify/require" -) - -func TestMaxMembers(t *testing.T) { - ResetIPSetEntries() - // set1: 1, set2: 0 - AddEntryToIPSet("test-set1") - assertMaxMembers(t, 1) - // set1: 1, set2: 1 - AddEntryToIPSet("test-set2") - assertMaxMembers(t, 1) - // set1: 2, set2: 1 - AddEntryToIPSet("test-set1") - assertMaxMembers(t, 2) - // set1: 2, set2: 2 - AddEntryToIPSet("test-set2") - assertMaxMembers(t, 2) - // set1: 2, set2: 3 - AddEntryToIPSet("test-set2") - assertMaxMembers(t, 3) - // set1: 1, set2: 3 - RemoveEntryFromIPSet("test-set1") - assertMaxMembers(t, 3) - // set1: 2, set2: 3 - AddEntryToIPSet("test-set1") - assertMaxMembers(t, 3) - // set1: 2, set2: 2 - RemoveEntryFromIPSet("test-set2") - assertMaxMembers(t, 2) - // set1: 2, set2: 1 - RemoveEntryFromIPSet("test-set2") - assertMaxMembers(t, 2) - // set1: 2, set2: 0 - RemoveEntryFromIPSet("test-set2") - assertMaxMembers(t, 2) - // set1: 1, set2: 0 - RemoveEntryFromIPSet("test-set1") - assertMaxMembers(t, 1) - // set1: 0, set2: 0 - RemoveEntryFromIPSet("test-set1") - assertMaxMembers(t, 0) - // set1: 0, set2: 0 - RemoveEntryFromIPSet("test-set2") - assertMaxMembers(t, 0) - // set1: 1 - AddEntryToIPSet("test-set1") - assertMaxMembers(t, 1) - // set1: 0 - ResetIPSetEntries() - assertMaxMembers(t, 0) - - AddEntryToIPSet("test-set1") - AddEntryToIPSet("test-set1") - AddEntryToIPSet("test-set2") - assertMaxMembers(t, 2) - ResetIPSetEntries() - assertMaxMembers(t, 0) - - // set1: 1, set2: 0 - AddEntryToIPSet("test-set1") - assertMaxMembers(t, 1) - // set1: 1, set2: 1 - AddEntryToIPSet("test-set2") - assertMaxMembers(t, 1) - // set1: 2, set2: 1 - AddEntryToIPSet("test-set1") - assertMaxMembers(t, 2) - // set1: 0, set2: 2 - RemoveAllEntriesFromIPSet("test-set1") - assertMaxMembers(t, 1) - // set1: 0, set2: 0 - RemoveAllEntriesFromIPSet("test-set2") - assertMaxMembers(t, 0) - - AddEntryToIPSet("test-set1") - AddEntryToIPSet("test-set1") - AddEntryToIPSet("test-set2") - AddEntryToIPSet("test-set2") - assertMaxMembers(t, 2) - RemoveAllEntriesFromIPSet("test-set1") - assertMaxMembers(t, 2) -} - -func assertMaxMembers(t *testing.T, expectedVal int) { - t.Helper() - - val, err := getValue(maxIPSetMembers) - require.Nil(t, err, "failed to get metric") - require.Equal(t, expectedVal, val, "incorrect max members") -} diff --git a/npm/metrics/prometheus-metrics.go b/npm/metrics/prometheus-metrics.go index b1b7142869..b82457ac95 100644 --- a/npm/metrics/prometheus-metrics.go +++ b/npm/metrics/prometheus-metrics.go @@ -113,7 +113,7 @@ var ( getNetworkFailures prometheus.Counter aclFailures *prometheus.CounterVec setPolicyFailures *prometheus.CounterVec - maxIPSetMembers prometheus.Gauge + podIPTotal prometheus.Gauge ) type RegistryType string @@ -152,6 +152,15 @@ func InitializeAll() { initializeDaemonMetrics() initializeControllerMetrics() + podIPTotal = prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: "pod_ip_total", + Subsystem: "", + Help: "Total number of Pod IPs across the cluster in Linux or Windows nodes", + }, + ) + if util.IsWindowsDP() { InitializeWindowsMetrics() @@ -167,7 +176,7 @@ func InitializeAll() { register(aclFailures, "acl_failure_total", NodeMetrics) register(setPolicyFailures, "setpolicy_failure_total", NodeMetrics) // all new metrics should go on the node metrics URL - register(maxIPSetMembers, "ipset_members_max", NodeMetrics) + register(podIPTotal, "ipset_members_max", NodeMetrics) } log.Logf("Finished initializing all Prometheus metrics") @@ -290,15 +299,6 @@ func InitializeWindowsMetrics() { }, []string{operationLabel, isNestedLabel}, ) - - maxIPSetMembers = prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: namespace, - Name: "ipset_members_max", - Subsystem: windowsPrefix, - Help: "Maximum number of members in a single IPSet", - }, - ) } // GetHandler returns the HTTP handler for the metrics endpoint From 9102ca76e61e90a2071eab7d717ad6173d1d5ab0 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Mon, 5 Jun 2023 16:34:39 -0700 Subject: [PATCH 02/11] feat: update total pod IP metric in pod controller --- npm/pkg/controlplane/controllers/v2/podController.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/npm/pkg/controlplane/controllers/v2/podController.go b/npm/pkg/controlplane/controllers/v2/podController.go index 34764a8475..d3cdfecd01 100644 --- a/npm/pkg/controlplane/controllers/v2/podController.go +++ b/npm/pkg/controlplane/controllers/v2/podController.go @@ -369,6 +369,7 @@ func (c *PodController) syncAddedPod(podObj *corev1.Pod) error { // Create npmPod and add it to the podMap npmPodObj := common.NewNpmPod(podObj) c.podMap[podKey] = npmPodObj + metrics.AddPod() // Get lists of podLabelKey and podLabelKey + podLavelValue ,and then start adding them to ipsets. for labelKey, labelVal := range podObj.Labels { @@ -571,6 +572,7 @@ func (c *PodController) cleanUpDeletedPod(cachedNpmPodKey string) error { return fmt.Errorf("[cleanUpDeletedPod] Error: failed to delete pod from named port ipset with err: %w", err) } + metrics.RemovePod() delete(c.podMap, cachedNpmPodKey) return nil } From ae5b1dc86b94d6a880ea38c84a026371adb51177 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Mon, 5 Jun 2023 17:08:04 -0700 Subject: [PATCH 03/11] log: make sure we log apply dp errors --- .../controlplane/controllers/v2/namespaceController.go | 8 ++++++++ npm/pkg/controlplane/controllers/v2/podController.go | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/npm/pkg/controlplane/controllers/v2/namespaceController.go b/npm/pkg/controlplane/controllers/v2/namespaceController.go index d36d3a0b1b..ca85ea3958 100644 --- a/npm/pkg/controlplane/controllers/v2/namespaceController.go +++ b/npm/pkg/controlplane/controllers/v2/namespaceController.go @@ -245,6 +245,14 @@ func (nsc *NamespaceController) syncNamespace(nsKey string) error { metrics.RecordControllerNamespaceExecTime(timer, operationKind, err != nil && dperr != nil) if dperr != nil { + klog.Errorf("failed to apply dataplane changes while syncing namespace. err: %s", dperr.Error()) + metrics.SendErrorLogAndMetric(util.NSID, "[syncNamespace] failed to apply dataplane changes while syncing namespace. err: %s", dperr.Error()) + + // Seems like setting err below does nothing. + // The return value of syncNamespace is fixed before this deferred func is called + // so modifications to err here do nothing. + // As a result, the controller will not requeue if there is an error applying the dataplane. + // However, a subsequent controller event should Apply Dataplane soon after. if err == nil { err = fmt.Errorf("failed to apply dataplane changes while syncing namespace. err: %w", dperr) } else { diff --git a/npm/pkg/controlplane/controllers/v2/podController.go b/npm/pkg/controlplane/controllers/v2/podController.go index d3cdfecd01..47b4f0c296 100644 --- a/npm/pkg/controlplane/controllers/v2/podController.go +++ b/npm/pkg/controlplane/controllers/v2/podController.go @@ -273,6 +273,14 @@ func (c *PodController) syncPod(key string) error { metrics.RecordControllerPodExecTime(timer, operationKind, err != nil && dperr != nil) if dperr != nil { + klog.Errorf("failed to apply dataplane changes while syncing pod. err: %s", dperr.Error()) + metrics.SendErrorLogAndMetric(util.PodID, "[syncPod] failed to apply dataplane changes while syncing pod. err: %s", dperr.Error()) + + // Seems like setting err below does nothing. + // The return value of syncPod is fixed before this deferred func is called, + // so modifications to err here do nothing. + // As a result, the controller will not requeue if there is an error applying the dataplane. + // However, a subsequent controller event should Apply Dataplane soon after. if err == nil { err = fmt.Errorf("failed to apply dataplane changes while syncing pod. err: %w", dperr) } else { From 3e110a549b523cd60a12a2dfbe3a021e96c6b25f Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Mon, 5 Jun 2023 17:08:23 -0700 Subject: [PATCH 04/11] test: UTs for pod count metric --- .../controllers/v2/podController_test.go | 183 ++++++++++++++++-- 1 file changed, 171 insertions(+), 12 deletions(-) diff --git a/npm/pkg/controlplane/controllers/v2/podController_test.go b/npm/pkg/controlplane/controllers/v2/podController_test.go index d1ae9c5c02..19959c2a70 100644 --- a/npm/pkg/controlplane/controllers/v2/podController_test.go +++ b/npm/pkg/controlplane/controllers/v2/podController_test.go @@ -7,6 +7,7 @@ import ( "reflect" "strconv" "testing" + "time" "github.com/Azure/azure-container-networking/npm/metrics" "github.com/Azure/azure-container-networking/npm/metrics/promutil" @@ -29,6 +30,8 @@ import ( const ( HostNetwork = true NonHostNetwork = false + + sleepDurationForRateLimiter = time.Duration(10 * time.Millisecond) ) // To indicate the object is needed to be DeletedFinalStateUnknown Object @@ -39,6 +42,11 @@ const ( DeletedFinalStateknownObject IsDeletedFinalStateUnknownObject = false ) +var ( + errControllerFake = fmt.Errorf("fake error in controller before applying DP") + errDPFake = fmt.Errorf("fake error when applying DP") +) + type podFixture struct { t *testing.T @@ -190,12 +198,20 @@ type expectedValues struct { } type podPromVals struct { + podCount int expectedAddExecCount int expectedUpdateExecCount int expectedDeleteExecCount int + addFailures int + updateFailures int + deleteFailures int } func (p *podPromVals) testPrometheusMetrics(t *testing.T) { + podCount, err := metrics.PodCount() + promutil.NotifyIfErrors(t, err) + require.Equal(t, p.podCount, podCount, "Count for pod count didn't register correctly in Prometheus") + addExecCount, err := metrics.GetControllerPodExecCount(metrics.CreateOp, false) promutil.NotifyIfErrors(t, err) require.Equal(t, p.expectedAddExecCount, addExecCount, "Count for add execution time didn't register correctly in Prometheus") @@ -309,8 +325,10 @@ func TestAddMultiplePods(t *testing.T) { addPod(t, f, podObj1) testCases := []expectedValues{ - {2, 1, 0, podPromVals{2, 0, 0}}, + {2, 1, 0, podPromVals{2, 2, 0, 0, 0, 0, 0}}, } + // sleep in case rate limiter adds back to workqueue + time.Sleep(sleepDurationForRateLimiter) checkPodTestResult("TestAddMultiplePods", f, testCases) checkNpmPodWithInput("TestAddMultiplePods", f, podObj1) checkNpmPodWithInput("TestAddMultiplePods", f, podObj2) @@ -355,8 +373,10 @@ func TestAddPod(t *testing.T) { addPod(t, f, podObj) testCases := []expectedValues{ - {1, 1, 0, podPromVals{1, 0, 0}}, + {1, 1, 0, podPromVals{1, 1, 0, 0, 0, 0, 0}}, } + // sleep in case rate limiter adds back to workqueue + time.Sleep(sleepDurationForRateLimiter) checkPodTestResult("TestAddPod", f, testCases) checkNpmPodWithInput("TestAddPod", f, podObj) } @@ -381,8 +401,10 @@ func TestAddHostNetworkPod(t *testing.T) { addPod(t, f, podObj) testCases := []expectedValues{ - {0, 0, 0, podPromVals{0, 0, 0}}, + {0, 0, 0, podPromVals{0, 0, 0, 0, 0, 0, 0}}, } + // sleep in case rate limiter adds back to workqueue + time.Sleep(sleepDurationForRateLimiter) checkPodTestResult("TestAddHostNetworkPod", f, testCases) if _, exists := f.podController.podMap[podKey]; exists { @@ -441,15 +463,133 @@ func TestDeletePod(t *testing.T) { } deletePod(t, f, podObj, DeletedFinalStateknownObject) testCases := []expectedValues{ - {0, 1, 0, podPromVals{1, 0, 1}}, + {0, 1, 0, podPromVals{0, 1, 0, 1, 0, 0, 0}}, } + // sleep in case rate limiter adds back to workqueue + time.Sleep(sleepDurationForRateLimiter) checkPodTestResult("TestDeletePod", f, testCases) if _, exists := f.podController.podMap[podKey]; exists { t.Error("TestDeletePod failed @ cached pod obj exists check") } } +func TestDeletePodControllerError(t *testing.T) { + labels := map[string]string{ + "app": "test-pod", + } + podObj := createPod("test-pod", "test-namespace", "0", "1.2.3.4", labels, NonHostNetwork, corev1.PodRunning) + podKey := getKey(podObj, t) + + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + dp := dpmocks.NewMockGenericDataplane(ctrl) + f := newFixture(t, dp) + f.podLister = append(f.podLister, podObj) + f.kubeobjects = append(f.kubeobjects, podObj) + stopCh := make(chan struct{}) + defer close(stopCh) + f.newPodController(stopCh) + + // Add pod section + mockIPSets := []*ipsets.IPSetMetadata{ + ipsets.NewIPSetMetadata("test-namespace", ipsets.Namespace), + ipsets.NewIPSetMetadata("app", ipsets.KeyLabelOfPod), + ipsets.NewIPSetMetadata("app:test-pod", ipsets.KeyValueLabelOfPod), + } + podMetadata1 := dataplane.NewPodMetadata("test-namespace/test-pod", "1.2.3.4", "") + + dp.EXPECT().AddToLists([]*ipsets.IPSetMetadata{kubeAllNamespaces}, mockIPSets[:1]).Return(nil).Times(1) + dp.EXPECT().AddToSets(mockIPSets[:1], podMetadata1).Return(nil).Times(1) + dp.EXPECT().AddToSets(mockIPSets[1:], podMetadata1).Return(nil).Times(1) + if !util.IsWindowsDP() { + dp.EXPECT(). + AddToSets( + []*ipsets.IPSetMetadata{ipsets.NewIPSetMetadata("app:test-pod", ipsets.NamedPorts)}, + dataplane.NewPodMetadata("test-namespace/test-pod", "1.2.3.4,8080", ""), + ). + Return(nil).Times(1) + } + dp.EXPECT().ApplyDataPlane().Return(nil).Times(2) + // Delete pod section + dp.EXPECT().RemoveFromSets(mockIPSets[:1], podMetadata1).Return(nil).Times(1) + dp.EXPECT().RemoveFromSets(mockIPSets[1:], podMetadata1).Return(errControllerFake).Times(1) + deletePod(t, f, podObj, DeletedFinalStateknownObject) + testCases := []expectedValues{ + {1, 1, 1, podPromVals{1, 1, 0, 1, 0, 0, 1}}, + } + + // sleep in case rate limiter adds back to workqueue + time.Sleep(sleepDurationForRateLimiter) + checkPodTestResult("TestDeletePodControllerError", f, testCases) + _, exists := f.podController.podMap[podKey] + require.True(t, exists) +} + +func TestDeletePodDPError(t *testing.T) { + labels := map[string]string{ + "app": "test-pod", + } + podObj := createPod("test-pod", "test-namespace", "0", "1.2.3.4", labels, NonHostNetwork, corev1.PodRunning) + podKey := getKey(podObj, t) + + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + dp := dpmocks.NewMockGenericDataplane(ctrl) + f := newFixture(t, dp) + f.podLister = append(f.podLister, podObj) + f.kubeobjects = append(f.kubeobjects, podObj) + stopCh := make(chan struct{}) + defer close(stopCh) + f.newPodController(stopCh) + + // Add pod section + mockIPSets := []*ipsets.IPSetMetadata{ + ipsets.NewIPSetMetadata("test-namespace", ipsets.Namespace), + ipsets.NewIPSetMetadata("app", ipsets.KeyLabelOfPod), + ipsets.NewIPSetMetadata("app:test-pod", ipsets.KeyValueLabelOfPod), + } + podMetadata1 := dataplane.NewPodMetadata("test-namespace/test-pod", "1.2.3.4", "") + + dp.EXPECT().AddToLists([]*ipsets.IPSetMetadata{kubeAllNamespaces}, mockIPSets[:1]).Return(nil).Times(1) + dp.EXPECT().AddToSets(mockIPSets[:1], podMetadata1).Return(nil).Times(1) + dp.EXPECT().AddToSets(mockIPSets[1:], podMetadata1).Return(nil).Times(1) + if !util.IsWindowsDP() { + dp.EXPECT(). + AddToSets( + []*ipsets.IPSetMetadata{ipsets.NewIPSetMetadata("app:test-pod", ipsets.NamedPorts)}, + dataplane.NewPodMetadata("test-namespace/test-pod", "1.2.3.4,8080", ""), + ). + Return(nil).Times(1) + } + dp.EXPECT().ApplyDataPlane().Return(nil).Times(1) + dp.EXPECT().ApplyDataPlane().Return(errDPFake).Times(1) + // Delete pod section + dp.EXPECT().RemoveFromSets(mockIPSets[:1], podMetadata1).Return(nil).Times(1) + dp.EXPECT().RemoveFromSets(mockIPSets[1:], podMetadata1).Return(nil).Times(1) + if !util.IsWindowsDP() { + dp.EXPECT(). + RemoveFromSets( + []*ipsets.IPSetMetadata{ipsets.NewIPSetMetadata("app:test-pod", ipsets.NamedPorts)}, + dataplane.NewPodMetadata("test-namespace/test-pod", "1.2.3.4,8080", ""), + ). + Return(nil).Times(1) + } + deletePod(t, f, podObj, DeletedFinalStateknownObject) + testCases := []expectedValues{ + {0, 1, 0, podPromVals{0, 1, 0, 1, 0, 0, 0}}, + } + + // sleep in case rate limiter adds back to workqueue + time.Sleep(sleepDurationForRateLimiter) + checkPodTestResult("TestDeletePodDPError", f, testCases) + if _, exists := f.podController.podMap[podKey]; exists { + t.Error("TestDeletePod failed @ cached pod obj exists check") + } +} + func TestDeleteHostNetworkPod(t *testing.T) { labels := map[string]string{ "app": "test-pod", @@ -470,8 +610,10 @@ func TestDeleteHostNetworkPod(t *testing.T) { deletePod(t, f, podObj, DeletedFinalStateknownObject) testCases := []expectedValues{ - {0, 0, 0, podPromVals{0, 0, 0}}, + {0, 0, 0, podPromVals{0, 0, 0, 0, 0, 0, 0}}, } + // sleep in case rate limiter adds back to workqueue + time.Sleep(sleepDurationForRateLimiter) checkPodTestResult("TestDeleteHostNetworkPod", f, testCases) if _, exists := f.podController.podMap[podKey]; exists { t.Error("TestDeleteHostNetworkPod failed @ cached pod obj exists check") @@ -502,8 +644,10 @@ func TestDeletePodWithTombstone(t *testing.T) { f.podController.deletePod(tombstone) testCases := []expectedValues{ - {0, 0, 1, podPromVals{0, 0, 0}}, + {0, 0, 1, podPromVals{0, 0, 0, 0, 0, 0, 0}}, } + // sleep in case rate limiter adds back to workqueue + time.Sleep(sleepDurationForRateLimiter) checkPodTestResult("TestDeletePodWithTombstone", f, testCases) } @@ -557,8 +701,10 @@ func TestDeletePodWithTombstoneAfterAddingPod(t *testing.T) { } deletePod(t, f, podObj, DeletedFinalStateUnknownObject) testCases := []expectedValues{ - {0, 1, 0, podPromVals{1, 0, 1}}, + {0, 1, 0, podPromVals{0, 1, 0, 1, 0, 0, 0}}, } + // sleep in case rate limiter adds back to workqueue + time.Sleep(sleepDurationForRateLimiter) checkPodTestResult("TestDeletePodWithTombstoneAfterAddingPod", f, testCases) } @@ -614,8 +760,10 @@ func TestLabelUpdatePod(t *testing.T) { updatePod(t, f, oldPodObj, newPodObj) testCases := []expectedValues{ - {1, 1, 0, podPromVals{1, 1, 0}}, + {1, 1, 0, podPromVals{1, 1, 1, 0, 0, 0, 0}}, } + // sleep in case rate limiter adds back to workqueue + time.Sleep(sleepDurationForRateLimiter) checkPodTestResult("TestLabelUpdatePod", f, testCases) checkNpmPodWithInput("TestLabelUpdatePod", f, newPodObj) } @@ -678,9 +826,12 @@ func TestEmptyIPUpdate(t *testing.T) { updatePod(t, f, oldPodObj, newPodObj) + // pod is updated to be have no IP, so it should be cleaned up testCases := []expectedValues{ - {0, 1, 0, podPromVals{1, 1, 0}}, + {0, 1, 0, podPromVals{0, 1, 1, 0, 0, 0, 0}}, } + // sleep in case rate limiter adds back to workqueue + time.Sleep(sleepDurationForRateLimiter) checkPodTestResult("TestEmptyIPUpdate", f, testCases) } @@ -703,9 +854,12 @@ func TestEmptyIPAdd(t *testing.T) { f.newPodController(stopCh) addPod(t, f, podObj) + // since the new IP is invalid, adding the new Pod object is ignored testCases := []expectedValues{ - {0, 0, 0, podPromVals{0, 0, 0}}, + {0, 0, 0, podPromVals{0, 0, 0, 0, 0, 0, 0}}, } + // sleep in case rate limiter adds back to workqueue + time.Sleep(sleepDurationForRateLimiter) checkPodTestResult("TestEmptyIPAdd", f, testCases) if _, exists := f.podController.podMap[podKey]; exists { @@ -782,8 +936,10 @@ func TestIPAddressUpdatePod(t *testing.T) { updatePod(t, f, oldPodObj, newPodObj) testCases := []expectedValues{ - {1, 1, 0, podPromVals{1, 1, 0}}, + {1, 1, 0, podPromVals{1, 1, 1, 0, 0, 0, 0}}, } + // sleep in case rate limiter adds back to workqueue + time.Sleep(sleepDurationForRateLimiter) checkPodTestResult("TestIPAddressUpdatePod", f, testCases) checkNpmPodWithInput("TestIPAddressUpdatePod", f, newPodObj) } @@ -843,9 +999,12 @@ func TestPodStatusUpdatePod(t *testing.T) { } updatePod(t, f, oldPodObj, newPodObj) + // pod is updated to be in Succeeded state, so it should be cleaned up testCases := []expectedValues{ - {0, 1, 0, podPromVals{1, 0, 1}}, + {0, 1, 0, podPromVals{0, 1, 0, 1, 0, 0, 0}}, } + // sleep in case rate limiter adds back to workqueue + time.Sleep(sleepDurationForRateLimiter) checkPodTestResult("TestPodStatusUpdatePod", f, testCases) if _, exists := f.podController.podMap[podKey]; exists { t.Error("TestPodStatusUpdatePod failed @ cached pod obj exists check") From df361fc94dcee20de82883e95f4af3f2054f4af0 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Mon, 5 Jun 2023 17:26:54 -0700 Subject: [PATCH 05/11] style: rename metric to customer_pods --- npm/metrics/ipsets.go | 6 +++--- npm/metrics/ipsets_test.go | 2 +- npm/metrics/prometheus-metrics.go | 10 +++++----- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/npm/metrics/ipsets.go b/npm/metrics/ipsets.go index 47526ddd30..f8cde1b822 100644 --- a/npm/metrics/ipsets.go +++ b/npm/metrics/ipsets.go @@ -9,18 +9,18 @@ var ipsetInventoryMap map[string]int // AddPod increments the number of Pod IPs. func AddPod() { - podIPTotal.Inc() + customerPodCount.Inc() } // RemovePod decrements the number of Pod IPs. func RemovePod() { - podIPTotal.Dec() + customerPodCount.Dec() } // PodCount gets the current number of Pod IPs. // This function is slow. func PodCount() (int, error) { - return getValue(podIPTotal) + return getValue(customerPodCount) } // IncNumIPSets increments the number of IPSets. diff --git a/npm/metrics/ipsets_test.go b/npm/metrics/ipsets_test.go index 2e6d54513f..61ad6484b1 100644 --- a/npm/metrics/ipsets_test.go +++ b/npm/metrics/ipsets_test.go @@ -23,7 +23,7 @@ type testSet struct { } func TestAddPod(t *testing.T) { - podIPTotal.Set(0) + customerPodCount.Set(0) AddPod() AddPod() AddPod() diff --git a/npm/metrics/prometheus-metrics.go b/npm/metrics/prometheus-metrics.go index b82457ac95..f58aa8b0b9 100644 --- a/npm/metrics/prometheus-metrics.go +++ b/npm/metrics/prometheus-metrics.go @@ -113,7 +113,7 @@ var ( getNetworkFailures prometheus.Counter aclFailures *prometheus.CounterVec setPolicyFailures *prometheus.CounterVec - podIPTotal prometheus.Gauge + customerPodCount prometheus.Gauge ) type RegistryType string @@ -152,12 +152,12 @@ func InitializeAll() { initializeDaemonMetrics() initializeControllerMetrics() - podIPTotal = prometheus.NewGauge( + customerPodCount = prometheus.NewGauge( prometheus.GaugeOpts{ Namespace: namespace, - Name: "pod_ip_total", + Name: "customer_pods", Subsystem: "", - Help: "Total number of Pod IPs across the cluster in Linux or Windows nodes", + Help: "Number of Pods NPM tracks across the cluster including Linux and Windows nodes", }, ) @@ -176,7 +176,7 @@ func InitializeAll() { register(aclFailures, "acl_failure_total", NodeMetrics) register(setPolicyFailures, "setpolicy_failure_total", NodeMetrics) // all new metrics should go on the node metrics URL - register(podIPTotal, "ipset_members_max", NodeMetrics) + register(customerPodCount, "ipset_members_max", NodeMetrics) } log.Logf("Finished initializing all Prometheus metrics") From 00804effd7740136e91c8fc5598cbeb1834d3288 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Tue, 6 Jun 2023 09:45:55 -0700 Subject: [PATCH 06/11] test: revert changes from previous PR to ipsetMgr windows UTs --- .../ipsets/ipsetmanager_windows_test.go | 275 +----------------- 1 file changed, 1 insertion(+), 274 deletions(-) diff --git a/npm/pkg/dataplane/ipsets/ipsetmanager_windows_test.go b/npm/pkg/dataplane/ipsets/ipsetmanager_windows_test.go index bcfc19661e..959e36cf7e 100644 --- a/npm/pkg/dataplane/ipsets/ipsetmanager_windows_test.go +++ b/npm/pkg/dataplane/ipsets/ipsetmanager_windows_test.go @@ -6,90 +6,12 @@ import ( "github.com/Azure/azure-container-networking/common" "github.com/Azure/azure-container-networking/network/hnswrapper" - "github.com/Azure/azure-container-networking/npm/metrics" "github.com/Azure/azure-container-networking/npm/util" testutils "github.com/Azure/azure-container-networking/test/utils" "github.com/Microsoft/hcsshim/hcn" "github.com/stretchr/testify/require" ) -type winPromVals struct { - getNetworkLatencyCalls int - getNetworkFailures int - createIPSetLatencyCalls int - createIPSetFailures int - createNestedLatencyCalls int - createNestedFailures int - updateIPSetLatencyCalls int - updateIPSetFailures int - updateNestedLatencyCalls int - updateNestedFailures int - deleteIPSetLatencyCalls int - deleteIPSetFailures int - deleteNestedLatencyCalls int - deleteNestedFailures int -} - -func (w winPromVals) test(t *testing.T) { - t.Helper() - - count, err := metrics.TotalGetNetworkLatencyCalls() - require.NoError(t, err, "failed to get metric") - require.Equal(t, w.getNetworkLatencyCalls, count, "incorrect list network latency calls") - - count, err = metrics.TotalGetNetworkFailures() - require.NoError(t, err, "failed to get metric") - require.Equal(t, w.getNetworkFailures, count, "incorrect list network failures") - - count, err = metrics.TotalSetPolicyLatencyCalls(metrics.CreateOp, false) - require.NoError(t, err, "failed to get metric") - require.Equal(t, w.createIPSetLatencyCalls, count, "incorrect create ipset latency calls") - - count, err = metrics.TotalSetPolicyFailures(metrics.CreateOp, false) - require.NoError(t, err, "failed to get metric") - require.Equal(t, w.createIPSetFailures, count, "incorrect create ipset failures") - - count, err = metrics.TotalSetPolicyLatencyCalls(metrics.CreateOp, true) - require.NoError(t, err, "failed to get metric") - require.Equal(t, w.createNestedLatencyCalls, count, "incorrect create nested ipset latency calls") - - count, err = metrics.TotalSetPolicyFailures(metrics.CreateOp, true) - require.NoError(t, err, "failed to get metric") - require.Equal(t, w.createNestedFailures, count, "incorrect create nested ipset failures") - - count, err = metrics.TotalSetPolicyLatencyCalls(metrics.UpdateOp, false) - require.NoError(t, err, "failed to get metric") - require.Equal(t, w.updateIPSetLatencyCalls, count, "incorrect update ipset latency calls") - - count, err = metrics.TotalSetPolicyFailures(metrics.UpdateOp, false) - require.NoError(t, err, "failed to get metric") - require.Equal(t, w.updateIPSetFailures, count, "incorrect update ipset failures") - - count, err = metrics.TotalSetPolicyLatencyCalls(metrics.UpdateOp, true) - require.NoError(t, err, "failed to get metric") - require.Equal(t, w.updateNestedLatencyCalls, count, "incorrect update nested ipset latency calls") - - count, err = metrics.TotalSetPolicyFailures(metrics.UpdateOp, true) - require.NoError(t, err, "failed to get metric") - require.Equal(t, w.updateNestedFailures, count, "incorrect update nested ipset failures") - - count, err = metrics.TotalSetPolicyLatencyCalls(metrics.DeleteOp, false) - require.NoError(t, err, "failed to get metric") - require.Equal(t, w.deleteIPSetLatencyCalls, count, "incorrect delete ipset latency calls") - - count, err = metrics.TotalSetPolicyFailures(metrics.DeleteOp, false) - require.NoError(t, err, "failed to get metric") - require.Equal(t, w.deleteIPSetFailures, count, "incorrect delete ipset failures") - - count, err = metrics.TotalSetPolicyLatencyCalls(metrics.DeleteOp, true) - require.NoError(t, err, "failed to get metric") - require.Equal(t, w.deleteNestedLatencyCalls, count, "incorrect delete nested ipset latency calls") - - count, err = metrics.TotalSetPolicyFailures(metrics.DeleteOp, true) - require.NoError(t, err, "failed to get metric") - require.Equal(t, w.deleteNestedFailures, count, "incorrect delete nested ipset failures") -} - func TestGetIPsFromSelectorIPSets(t *testing.T) { iMgr := NewIPSetManager(applyOnNeedCfg, common.NewMockIOShim([]testutils.TestCmd{})) setsTocreate := []*IPSetMetadata{ @@ -146,8 +68,6 @@ func TestGetIPsFromSelectorIPSets(t *testing.T) { } func TestAddToSetWindows(t *testing.T) { - metrics.InitializeWindowsMetrics() - hns := GetHNSFake(t, "azure") io := common.NewMockIOShimWithFakeHNS(hns) iMgr := NewIPSetManager(applyAlwaysCfg, io) @@ -172,46 +92,11 @@ func TestAddToSetWindows(t *testing.T) { err = iMgr.ApplyIPSets() require.NoError(t, err) - - winPromVals{ - getNetworkLatencyCalls: 1, - createIPSetLatencyCalls: 1, - createNestedLatencyCalls: 1, - }.test(t) } func TestDestroyNPMIPSets(t *testing.T) { - metrics.InitializeWindowsMetrics() - - hns := GetHNSFake(t, "azure") - io := common.NewMockIOShimWithFakeHNS(hns) - iMgr := NewIPSetManager(applyAlwaysCfg, io) - + iMgr := NewIPSetManager(applyAlwaysCfg, common.NewMockIOShim([]testutils.TestCmd{})) require.NoError(t, iMgr.resetIPSets()) - winPromVals{ - getNetworkLatencyCalls: 1, - }.test(t) - - setMetadata := NewIPSetMetadata(testSetName, Namespace) - iMgr.CreateIPSets([]*IPSetMetadata{setMetadata}) - - err := iMgr.AddToSets([]*IPSetMetadata{setMetadata}, testPodIP, testPodKey) - require.NoError(t, err) - - err = iMgr.ApplyIPSets() - require.NoError(t, err) - - winPromVals{ - getNetworkLatencyCalls: 2, - createIPSetLatencyCalls: 1, - }.test(t) - - require.NoError(t, iMgr.resetIPSets()) - winPromVals{ - getNetworkLatencyCalls: 3, - createIPSetLatencyCalls: 1, - deleteIPSetLatencyCalls: 1, - }.test(t) } // create all possible SetTypes @@ -297,8 +182,6 @@ func TestDestroyNPMIPSets(t *testing.T) { // } func TestApplyDeletions(t *testing.T) { - metrics.InitializeWindowsMetrics() - hns := GetHNSFake(t, "azure") io := common.NewMockIOShimWithFakeHNS(hns) iMgr := NewIPSetManager(applyAlwaysCfg, io) @@ -343,18 +226,10 @@ func TestApplyDeletions(t *testing.T) { require.NoError(t, err) verifyHNSCache(t, toAddOrUpdateSetMap, hns) verifyDeletedHNSCache(t, toDeleteSetNames, hns) - - winPromVals{ - getNetworkLatencyCalls: 1, - createIPSetLatencyCalls: 1, - createNestedLatencyCalls: 1, - }.test(t) } // TODO test that a reconcile list is updated func TestFailureOnCreation(t *testing.T) { - metrics.InitializeWindowsMetrics() - hns := GetHNSFake(t, "azure") io := common.NewMockIOShimWithFakeHNS(hns) iMgr := NewIPSetManager(applyAlwaysCfg, io) @@ -387,11 +262,6 @@ func TestFailureOnCreation(t *testing.T) { require.NoError(t, err) verifyHNSCache(t, toAddOrUpdateSetMap, hns) verifyDeletedHNSCache(t, toDeleteSetNames, hns) - - winPromVals{ - getNetworkLatencyCalls: 1, - createIPSetLatencyCalls: 1, - }.test(t) } // TODO test that a reconcile list is updated @@ -448,8 +318,6 @@ func TestFailureOnCreation(t *testing.T) { // TODO test that a reconcile list is updated func TestFailureOnFlush(t *testing.T) { - metrics.InitializeWindowsMetrics() - // This exact scenario wouldn't occur. This error happens when the cache is out of date with the kernel. hns := GetHNSFake(t, "azure") io := common.NewMockIOShimWithFakeHNS(hns) @@ -476,17 +344,10 @@ func TestFailureOnFlush(t *testing.T) { require.NoError(t, err) verifyHNSCache(t, toAddOrUpdateSetMap, hns) verifyDeletedHNSCache(t, toDeleteSetNames, hns) - - winPromVals{ - getNetworkLatencyCalls: 1, - createIPSetLatencyCalls: 1, - }.test(t) } // TODO test that a reconcile list is updated func TestFailureOnDeletion(t *testing.T) { - metrics.InitializeWindowsMetrics() - hns := GetHNSFake(t, "azure") io := common.NewMockIOShimWithFakeHNS(hns) iMgr := NewIPSetManager(applyAlwaysCfg, io) @@ -512,140 +373,6 @@ func TestFailureOnDeletion(t *testing.T) { require.NoError(t, err) verifyHNSCache(t, toAddOrUpdateSetMap, hns) verifyDeletedHNSCache(t, toDeleteSetNames, hns) - - winPromVals{ - getNetworkLatencyCalls: 1, - createIPSetLatencyCalls: 1, - }.test(t) -} - -func TestMetrics(t *testing.T) { - metrics.InitializeWindowsMetrics() - metrics.ResetIPSetEntries() - - hns := GetHNSFake(t, "azure") - io := common.NewMockIOShimWithFakeHNS(hns) - iMgr := NewIPSetManager(applyAlwaysCfg, io) - - nsW := NewIPSetMetadata("w", Namespace) - nsX := NewIPSetMetadata("x", Namespace) - nsY := NewIPSetMetadata("y", Namespace) - nsZ := NewIPSetMetadata("z", Namespace) - nsList1 := NewIPSetMetadata("list1", KeyLabelOfNamespace) - nsList2 := NewIPSetMetadata("list2", KeyLabelOfNamespace) - nsList3 := NewIPSetMetadata("list3", KeyLabelOfNamespace) - nsList4 := NewIPSetMetadata("list4", KeyLabelOfNamespace) - - iMgr.CreateIPSets([]*IPSetMetadata{nsW, nsX, nsY, nsList1, nsList2, nsList3}) - require.NoError(t, iMgr.AddToSets([]*IPSetMetadata{nsW, nsX}, "1.2.3.4", "pod1")) - require.NoError(t, iMgr.AddToLists([]*IPSetMetadata{nsList1, nsList2}, []*IPSetMetadata{nsW})) - require.Nil(t, iMgr.ApplyIPSets()) - - winPromVals{ - getNetworkLatencyCalls: 1, - createIPSetLatencyCalls: 1, - createNestedLatencyCalls: 1, - }.test(t) - - iMgr.CreateIPSets([]*IPSetMetadata{nsZ, nsList4}) - require.Nil(t, iMgr.AddToSets([]*IPSetMetadata{nsY}, "5.6.7.8", "pod2")) - require.Nil(t, iMgr.AddToLists([]*IPSetMetadata{nsList3}, []*IPSetMetadata{nsX})) - require.Nil(t, iMgr.RemoveFromSets([]*IPSetMetadata{nsW, nsX}, "1.2.3.4", "pod1")) - require.Nil(t, iMgr.RemoveFromList(nsList1, []*IPSetMetadata{nsW})) - require.Nil(t, iMgr.RemoveFromList(nsList2, []*IPSetMetadata{nsW})) - iMgr.DeleteIPSet(nsW.GetPrefixName(), util.SoftDelete) - iMgr.DeleteIPSet(nsList1.GetPrefixName(), util.SoftDelete) - require.Nil(t, iMgr.ApplyIPSets()) - - winPromVals{ - getNetworkLatencyCalls: 2, - createIPSetLatencyCalls: 2, - createNestedLatencyCalls: 2, - updateIPSetLatencyCalls: 1, - updateNestedLatencyCalls: 1, - deleteIPSetLatencyCalls: 1, - deleteNestedLatencyCalls: 1, - }.test(t) -} - -func TestMetricsMaxMember(t *testing.T) { - metrics.InitializeWindowsMetrics() - metrics.ResetIPSetEntries() - - hns := GetHNSFake(t, "azure") - io := common.NewMockIOShimWithFakeHNS(hns) - iMgr := NewIPSetManager(applyAlwaysCfg, io) - - nsW := NewIPSetMetadata("w", Namespace) - nsX := NewIPSetMetadata("x", Namespace) - nsY := NewIPSetMetadata("y", Namespace) - nsList1 := NewIPSetMetadata("list1", KeyLabelOfNamespace) - - iMgr.CreateIPSets([]*IPSetMetadata{nsW, nsX, nsY, nsList1}) - count, err := metrics.MaxIPSetMembers() - require.Nil(t, err, "failed to get metric") - require.Equal(t, 0, count) - - require.Nil(t, iMgr.AddToSets([]*IPSetMetadata{nsW, nsX}, "1.1.1.1", "pod1")) - count, err = metrics.MaxIPSetMembers() - require.Nil(t, err, "failed to get metric") - require.Equal(t, 1, count) - - require.Nil(t, iMgr.AddToLists([]*IPSetMetadata{nsList1}, []*IPSetMetadata{nsW})) - count, err = metrics.MaxIPSetMembers() - require.Nil(t, err, "failed to get metric") - require.Equal(t, 1, count) - - require.Nil(t, iMgr.AddToLists([]*IPSetMetadata{nsList1}, []*IPSetMetadata{nsX, nsY})) - count, err = metrics.MaxIPSetMembers() - require.Nil(t, err, "failed to get metric") - require.Equal(t, 3, count) - - require.Nil(t, iMgr.AddToSets([]*IPSetMetadata{nsX}, "2.2.2.2", "pod2")) - count, err = metrics.MaxIPSetMembers() - require.Nil(t, err, "failed to get metric") - require.Equal(t, 3, count) - - require.Nil(t, iMgr.RemoveFromList(nsList1, []*IPSetMetadata{nsX, nsY})) - count, err = metrics.MaxIPSetMembers() - require.Nil(t, err, "failed to get metric") - require.Equal(t, 2, count) - - require.Nil(t, iMgr.AddToSets([]*IPSetMetadata{nsX}, "3.3.3.3", "pod3")) - count, err = metrics.MaxIPSetMembers() - require.Nil(t, err, "failed to get metric") - require.Equal(t, 3, count) - - require.Nil(t, iMgr.AddToSets([]*IPSetMetadata{nsX}, "3.3.3.3", "pod3-changed")) - count, err = metrics.MaxIPSetMembers() - require.Nil(t, err, "failed to get metric") - require.Equal(t, 3, count) - - require.Nil(t, iMgr.AddToSets([]*IPSetMetadata{nsX}, "4.4.4.4", "pod4")) - count, err = metrics.MaxIPSetMembers() - require.Nil(t, err, "failed to get metric") - require.Equal(t, 4, count) - - require.Nil(t, iMgr.RemoveFromSets([]*IPSetMetadata{nsX}, "4.4.4.4", "pod4")) - count, err = metrics.MaxIPSetMembers() - require.Nil(t, err, "failed to get metric") - require.Equal(t, 3, count) - - require.Nil(t, iMgr.RemoveFromSets([]*IPSetMetadata{nsX}, "3.3.3.3", "wrong-pod")) - count, err = metrics.MaxIPSetMembers() - require.Nil(t, err, "failed to get metric") - require.Equal(t, 3, count) - - // can't delete this set - iMgr.DeleteIPSet(nsX.GetPrefixName(), util.SoftDelete) - count, err = metrics.MaxIPSetMembers() - require.Nil(t, err, "failed to get metric") - require.Equal(t, 3, count) - - iMgr.DeleteIPSet(nsX.GetPrefixName(), util.ForceDelete) - count, err = metrics.MaxIPSetMembers() - require.Nil(t, err, "failed to get metric") - require.Equal(t, 1, count) } func verifyHNSCache(t *testing.T, expected map[string]hcn.SetPolicySetting, hns *hnswrapper.Hnsv2wrapperFake) { From 85d66350ac30a59271288140f69d65b37f6838c4 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Tue, 6 Jun 2023 15:29:38 -0700 Subject: [PATCH 07/11] style: rename metric to pods_watched --- npm/metrics/ipsets.go | 6 +++--- npm/metrics/ipsets_test.go | 2 +- npm/metrics/prometheus-metrics.go | 11 ++++++----- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/npm/metrics/ipsets.go b/npm/metrics/ipsets.go index f8cde1b822..a20b711b65 100644 --- a/npm/metrics/ipsets.go +++ b/npm/metrics/ipsets.go @@ -9,18 +9,18 @@ var ipsetInventoryMap map[string]int // AddPod increments the number of Pod IPs. func AddPod() { - customerPodCount.Inc() + podsWatched.Inc() } // RemovePod decrements the number of Pod IPs. func RemovePod() { - customerPodCount.Dec() + podsWatched.Dec() } // PodCount gets the current number of Pod IPs. // This function is slow. func PodCount() (int, error) { - return getValue(customerPodCount) + return getValue(podsWatched) } // IncNumIPSets increments the number of IPSets. diff --git a/npm/metrics/ipsets_test.go b/npm/metrics/ipsets_test.go index 61ad6484b1..99a20305a2 100644 --- a/npm/metrics/ipsets_test.go +++ b/npm/metrics/ipsets_test.go @@ -23,7 +23,7 @@ type testSet struct { } func TestAddPod(t *testing.T) { - customerPodCount.Set(0) + podsWatched.Set(0) AddPod() AddPod() AddPod() diff --git a/npm/metrics/prometheus-metrics.go b/npm/metrics/prometheus-metrics.go index f58aa8b0b9..165eaec411 100644 --- a/npm/metrics/prometheus-metrics.go +++ b/npm/metrics/prometheus-metrics.go @@ -113,7 +113,7 @@ var ( getNetworkFailures prometheus.Counter aclFailures *prometheus.CounterVec setPolicyFailures *prometheus.CounterVec - customerPodCount prometheus.Gauge + podsWatched prometheus.Gauge ) type RegistryType string @@ -152,14 +152,15 @@ func InitializeAll() { initializeDaemonMetrics() initializeControllerMetrics() - customerPodCount = prometheus.NewGauge( + podsWatched = prometheus.NewGauge( prometheus.GaugeOpts{ Namespace: namespace, - Name: "customer_pods", + Name: "pods_watched", Subsystem: "", Help: "Number of Pods NPM tracks across the cluster including Linux and Windows nodes", }, ) + register(podsWatched, "pods_watched", ClusterMetrics) if util.IsWindowsDP() { InitializeWindowsMetrics() @@ -175,8 +176,6 @@ func InitializeAll() { register(getNetworkFailures, "get_network_failure_total", NodeMetrics) register(aclFailures, "acl_failure_total", NodeMetrics) register(setPolicyFailures, "setpolicy_failure_total", NodeMetrics) - // all new metrics should go on the node metrics URL - register(customerPodCount, "ipset_members_max", NodeMetrics) } log.Logf("Finished initializing all Prometheus metrics") @@ -344,6 +343,8 @@ func register(collector prometheus.Collector, name string, registryType Registry err := getRegistry(registryType).Register(collector) if err != nil { log.Errorf("Error creating metric %s", name) + } else { + klog.Infof("registered metric %s to registry %s", name, registryType) } } From f21bf38d98333fd76a5df53bcfa15daf9aa0b2fa Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Tue, 6 Jun 2023 15:29:47 -0700 Subject: [PATCH 08/11] test: try longer wait --- npm/pkg/controlplane/controllers/v2/podController_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/npm/pkg/controlplane/controllers/v2/podController_test.go b/npm/pkg/controlplane/controllers/v2/podController_test.go index 19959c2a70..4cd31e8b12 100644 --- a/npm/pkg/controlplane/controllers/v2/podController_test.go +++ b/npm/pkg/controlplane/controllers/v2/podController_test.go @@ -31,7 +31,7 @@ const ( HostNetwork = true NonHostNetwork = false - sleepDurationForRateLimiter = time.Duration(10 * time.Millisecond) + sleepDurationForRateLimiter = time.Duration(100 * time.Millisecond) ) // To indicate the object is needed to be DeletedFinalStateUnknown Object From 71529a8643c44282c301c9277f82a3437e3b1e4b Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Wed, 7 Jun 2023 10:21:07 -0700 Subject: [PATCH 09/11] debug: tmp log --- npm/pkg/controlplane/controllers/v2/podController.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/npm/pkg/controlplane/controllers/v2/podController.go b/npm/pkg/controlplane/controllers/v2/podController.go index 47b4f0c296..c9d9ed4d86 100644 --- a/npm/pkg/controlplane/controllers/v2/podController.go +++ b/npm/pkg/controlplane/controllers/v2/podController.go @@ -378,6 +378,7 @@ func (c *PodController) syncAddedPod(podObj *corev1.Pod) error { npmPodObj := common.NewNpmPod(podObj) c.podMap[podKey] = npmPodObj metrics.AddPod() + klog.Infof("DEBUGME: ADD pod for %s. npmPodObj: %+v", podKey, npmPodObj) // Get lists of podLabelKey and podLabelKey + podLavelValue ,and then start adding them to ipsets. for labelKey, labelVal := range podObj.Labels { @@ -435,6 +436,9 @@ func (c *PodController) syncAddAndUpdatePod(newPodObj *corev1.Pod) (metrics.Oper if !exists { return metrics.CreateOp, c.syncAddedPod(newPodObj) } + + klog.Infof("DEBUGME: UPDATE pod for %s. npmPodObj: %+v. cached obj: %+v", podKey, newPodObj, cachedNpmPod) + // now we know this is an update event, and we'll return metrics.UpdateOp // Dealing with "updatePod" event - Compare last applied states against current Pod states @@ -581,6 +585,7 @@ func (c *PodController) cleanUpDeletedPod(cachedNpmPodKey string) error { } metrics.RemovePod() + klog.Infof("DEBUGME: REMOVE pod for %s. npmPodObj: %+v", cachedNpmPodKey, cachedNpmPod) delete(c.podMap, cachedNpmPodKey) return nil } From 9dc1195c1286b9604d30cf46f62a1fd25fce47d8 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Wed, 7 Jun 2023 11:34:35 -0700 Subject: [PATCH 10/11] Revert "debug: tmp log" This reverts commit 71529a8643c44282c301c9277f82a3437e3b1e4b. --- npm/pkg/controlplane/controllers/v2/podController.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/npm/pkg/controlplane/controllers/v2/podController.go b/npm/pkg/controlplane/controllers/v2/podController.go index c9d9ed4d86..47b4f0c296 100644 --- a/npm/pkg/controlplane/controllers/v2/podController.go +++ b/npm/pkg/controlplane/controllers/v2/podController.go @@ -378,7 +378,6 @@ func (c *PodController) syncAddedPod(podObj *corev1.Pod) error { npmPodObj := common.NewNpmPod(podObj) c.podMap[podKey] = npmPodObj metrics.AddPod() - klog.Infof("DEBUGME: ADD pod for %s. npmPodObj: %+v", podKey, npmPodObj) // Get lists of podLabelKey and podLabelKey + podLavelValue ,and then start adding them to ipsets. for labelKey, labelVal := range podObj.Labels { @@ -436,9 +435,6 @@ func (c *PodController) syncAddAndUpdatePod(newPodObj *corev1.Pod) (metrics.Oper if !exists { return metrics.CreateOp, c.syncAddedPod(newPodObj) } - - klog.Infof("DEBUGME: UPDATE pod for %s. npmPodObj: %+v. cached obj: %+v", podKey, newPodObj, cachedNpmPod) - // now we know this is an update event, and we'll return metrics.UpdateOp // Dealing with "updatePod" event - Compare last applied states against current Pod states @@ -585,7 +581,6 @@ func (c *PodController) cleanUpDeletedPod(cachedNpmPodKey string) error { } metrics.RemovePod() - klog.Infof("DEBUGME: REMOVE pod for %s. npmPodObj: %+v", cachedNpmPodKey, cachedNpmPod) delete(c.podMap, cachedNpmPodKey) return nil } From 57887398be0a582681d94bb5f8c8d0512be72a12 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Wed, 7 Jun 2023 12:07:05 -0700 Subject: [PATCH 11/11] style: fix whitespace --- npm/metrics/prometheus-metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/npm/metrics/prometheus-metrics.go b/npm/metrics/prometheus-metrics.go index 165eaec411..7ed5f9b2e4 100644 --- a/npm/metrics/prometheus-metrics.go +++ b/npm/metrics/prometheus-metrics.go @@ -113,7 +113,7 @@ var ( getNetworkFailures prometheus.Counter aclFailures *prometheus.CounterVec setPolicyFailures *prometheus.CounterVec - podsWatched prometheus.Gauge + podsWatched prometheus.Gauge ) type RegistryType string