From 5d813100caea748f564fd94f1b45231977cb5ff5 Mon Sep 17 00:00:00 2001 From: Evan Baker Date: Mon, 10 Oct 2022 19:25:46 +0000 Subject: [PATCH 1/2] emit a metric for NC sync and an error if all NCs are not present Signed-off-by: Evan Baker --- cns/restserver/internalapi.go | 26 +++++++++++++++++++++----- cns/restserver/metrics.go | 17 +++++++++++++---- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/cns/restserver/internalapi.go b/cns/restserver/internalapi.go index 6c502f994c..5769b83ce8 100644 --- a/cns/restserver/internalapi.go +++ b/cns/restserver/internalapi.go @@ -157,13 +157,14 @@ func (service *HTTPRestService) SyncHostNCVersion(ctx context.Context, channelMo if err != nil { logger.Errorf("sync host error %v", err) } - syncHostNCVersion.WithLabelValues(strconv.FormatBool(err == nil)).Observe(time.Since(start).Seconds()) + syncHostNCVersionCount.WithLabelValues(strconv.FormatBool(err == nil)).Inc() + syncHostNCVersionLatency.WithLabelValues(strconv.FormatBool(err == nil)).Observe(time.Since(start).Seconds()) } var errNonExistentContainerStatus = errors.New("nonExistantContainerstatus") func (service *HTTPRestService) syncHostNCVersion(ctx context.Context, channelMode string) error { - var hostVersionNeedsUpdateContainers []string + hostVersionNeedsUpdateContainers := map[string]struct{}{} for idx := range service.state.ContainerStatus { // Will open a separate PR to convert all the NC version related variable to int. Change from string to int is a pain. hostVersion, err := strconv.Atoi(service.state.ContainerStatus[idx].HostVersion) @@ -178,7 +179,7 @@ func (service *HTTPRestService) syncHostNCVersion(ctx context.Context, channelMo } // host NC version is the NC version from NMAgent, if it's smaller than NC version from DNC, then append it to indicate it needs update. if hostVersion < dncNcVersion { - hostVersionNeedsUpdateContainers = append(hostVersionNeedsUpdateContainers, service.state.ContainerStatus[idx].ID) + hostVersionNeedsUpdateContainers[service.state.ContainerStatus[idx].ID] = struct{}{} } else if hostVersion > dncNcVersion { logger.Errorf("NC version from NMAgent is larger than DNC, NC version from NMAgent is %d, NC version from DNC is %d", hostVersion, dncNcVersion) } @@ -195,21 +196,29 @@ func (service *HTTPRestService) syncHostNCVersion(ctx context.Context, channelMo for _, nc := range ncList.Containers { newHostNCVersionList[nc.NetworkContainerID] = nc.Version } - for _, ncID := range hostVersionNeedsUpdateContainers { + for ncID := range hostVersionNeedsUpdateContainers { versionStr, ok := newHostNCVersionList[ncID] if !ok { + // NMA doesn't have this NC that we need programmed yet, bail out continue } version, err := strconv.Atoi(versionStr) if err != nil { return errors.Wrapf(err, "failed to parse container version of %s", ncID) } - // Check whether it exist in service state and get the related nc info ncInfo, exist := service.state.ContainerStatus[ncID] if !exist { return errors.Wrapf(errNonExistentContainerStatus, "can't find NC with ID %s in service state, stop updating this host NC version", ncID) } + hostVersion, err := strconv.Atoi(ncInfo.HostVersion) + if err != nil { + return errors.Wrapf(err, "failed to parse host nc version string %s", ncInfo.HostVersion) + } + if hostVersion > version { + logger.Errorf("NC version from NMA is decreasing: have %d, got %d", hostVersion, version) + continue + } if channelMode == cns.CRD { service.MarkIpsAsAvailableUntransacted(ncInfo.ID, version) } @@ -217,6 +226,13 @@ func (service *HTTPRestService) syncHostNCVersion(ctx context.Context, channelMo ncInfo.HostVersion = versionStr service.state.ContainerStatus[ncID] = ncInfo logger.Printf("Updated NC %s host version from %s to %s", ncID, oldHostNCVersion, ncInfo.HostVersion) + // if we successfully updated the NC, pop it from the needs update set. + delete(hostVersionNeedsUpdateContainers, ncID) + } + // if we didn't empty out the needs update set, NMA has not programmed all the NCs we are expecting, and we + // need to return an error indicating that + if len(hostVersionNeedsUpdateContainers) > 0 { + return errors.Errorf("unabled to update some NCs: %v, missing or bad response from NMA", hostVersionNeedsUpdateContainers) } return nil } diff --git a/cns/restserver/metrics.go b/cns/restserver/metrics.go index 78e9fedcf6..a9ba967214 100644 --- a/cns/restserver/metrics.go +++ b/cns/restserver/metrics.go @@ -39,14 +39,22 @@ var ipConfigStatusStateTransitionTime = prometheus.NewHistogramVec( []string{"previous_state", "next_state"}, ) -var syncHostNCVersion = prometheus.NewHistogramVec( +var syncHostNCVersionCount = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "sync_host_nc_version_total", + Help: "Count of Sync Host NC by success or failure", + }, + []string{"ok"}, +) + +var syncHostNCVersionLatency = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Name: "sync_host_nc_version_seconds", + Name: "sync_host_nc_version_latency_seconds", Help: "Sync Host NC Latency", //nolint:gomnd // default bucket consts Buckets: prometheus.ExponentialBuckets(0.001, 2, 15), // 1 ms to ~16 seconds }, - []string{"success"}, + []string{"ok"}, ) func init() { @@ -54,7 +62,8 @@ func init() { httpRequestLatency, ipAssignmentLatency, ipConfigStatusStateTransitionTime, - syncHostNCVersion, + syncHostNCVersionCount, + syncHostNCVersionLatency, ) } From 8601ceb3c6119ca5ced7929c50aa05a92e5cf9b2 Mon Sep 17 00:00:00 2001 From: Evan Baker Date: Wed, 12 Oct 2022 17:20:27 +0000 Subject: [PATCH 2/2] rename variables in nc sync host version for clarity Signed-off-by: Evan Baker --- cns/restserver/internalapi.go | 58 +++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/cns/restserver/internalapi.go b/cns/restserver/internalapi.go index 5769b83ce8..0f6f7f08fc 100644 --- a/cns/restserver/internalapi.go +++ b/cns/restserver/internalapi.go @@ -164,75 +164,79 @@ func (service *HTTPRestService) SyncHostNCVersion(ctx context.Context, channelMo var errNonExistentContainerStatus = errors.New("nonExistantContainerstatus") func (service *HTTPRestService) syncHostNCVersion(ctx context.Context, channelMode string) error { - hostVersionNeedsUpdateContainers := map[string]struct{}{} + outdatedNCs := map[string]struct{}{} for idx := range service.state.ContainerStatus { // Will open a separate PR to convert all the NC version related variable to int. Change from string to int is a pain. - hostVersion, err := strconv.Atoi(service.state.ContainerStatus[idx].HostVersion) + localNCVersion, err := strconv.Atoi(service.state.ContainerStatus[idx].HostVersion) if err != nil { logger.Errorf("Received err when change containerstatus.HostVersion %s to int, err msg %v", service.state.ContainerStatus[idx].HostVersion, err) continue } - dncNcVersion, err := strconv.Atoi(service.state.ContainerStatus[idx].CreateNetworkContainerRequest.Version) + dncNCVersion, err := strconv.Atoi(service.state.ContainerStatus[idx].CreateNetworkContainerRequest.Version) if err != nil { logger.Errorf("Received err when change nc version %s in containerstatus to int, err msg %v", service.state.ContainerStatus[idx].CreateNetworkContainerRequest.Version, err) continue } // host NC version is the NC version from NMAgent, if it's smaller than NC version from DNC, then append it to indicate it needs update. - if hostVersion < dncNcVersion { - hostVersionNeedsUpdateContainers[service.state.ContainerStatus[idx].ID] = struct{}{} - } else if hostVersion > dncNcVersion { - logger.Errorf("NC version from NMAgent is larger than DNC, NC version from NMAgent is %d, NC version from DNC is %d", hostVersion, dncNcVersion) + if localNCVersion < dncNCVersion { + outdatedNCs[service.state.ContainerStatus[idx].ID] = struct{}{} + } else if localNCVersion > dncNCVersion { + logger.Errorf("NC version from NMAgent is larger than DNC, NC version from NMAgent is %d, NC version from DNC is %d", localNCVersion, dncNCVersion) } } - if len(hostVersionNeedsUpdateContainers) == 0 { + if len(outdatedNCs) == 0 { return nil } - ncList, err := service.nmagentClient.GetNCVersionList(ctx) + ncVersionListResp, err := service.nmagentClient.GetNCVersionList(ctx) if err != nil { return errors.Wrap(err, "failed to get nc version list from nmagent") } - newHostNCVersionList := map[string]string{} - for _, nc := range ncList.Containers { - newHostNCVersionList[nc.NetworkContainerID] = nc.Version + nmaNCs := map[string]string{} + for _, nc := range ncVersionListResp.Containers { + nmaNCs[nc.NetworkContainerID] = nc.Version } - for ncID := range hostVersionNeedsUpdateContainers { - versionStr, ok := newHostNCVersionList[ncID] + for ncID := range outdatedNCs { + nmaNCVersionStr, ok := nmaNCs[ncID] if !ok { // NMA doesn't have this NC that we need programmed yet, bail out continue } - version, err := strconv.Atoi(versionStr) + nmaNCVersion, err := strconv.Atoi(nmaNCVersionStr) if err != nil { - return errors.Wrapf(err, "failed to parse container version of %s", ncID) + logger.Errorf("failed to parse container version of %s: %s", ncID, err) + continue } // Check whether it exist in service state and get the related nc info ncInfo, exist := service.state.ContainerStatus[ncID] if !exist { + // if we marked this NC as needs update, but it no longer exists in internal state when we reach + // this point, our internal state has changed unexpectedly and we should bail out and try again. return errors.Wrapf(errNonExistentContainerStatus, "can't find NC with ID %s in service state, stop updating this host NC version", ncID) } - hostVersion, err := strconv.Atoi(ncInfo.HostVersion) + localNCVersion, err := strconv.Atoi(ncInfo.HostVersion) if err != nil { - return errors.Wrapf(err, "failed to parse host nc version string %s", ncInfo.HostVersion) + logger.Errorf("failed to parse host nc version string %s: %s", ncInfo.HostVersion, err) + continue } - if hostVersion > version { - logger.Errorf("NC version from NMA is decreasing: have %d, got %d", hostVersion, version) + if localNCVersion > nmaNCVersion { + logger.Errorf("NC version from NMA is decreasing: have %d, got %d", localNCVersion, nmaNCVersion) continue } if channelMode == cns.CRD { - service.MarkIpsAsAvailableUntransacted(ncInfo.ID, version) + service.MarkIpsAsAvailableUntransacted(ncInfo.ID, nmaNCVersion) } - oldHostNCVersion := ncInfo.HostVersion - ncInfo.HostVersion = versionStr + logger.Printf("Updating NC %s host version from %s to %s", ncID, ncInfo.HostVersion, nmaNCVersionStr) + ncInfo.HostVersion = nmaNCVersionStr + logger.Printf("Updated NC %s host version to %s", ncID, ncInfo.HostVersion) service.state.ContainerStatus[ncID] = ncInfo - logger.Printf("Updated NC %s host version from %s to %s", ncID, oldHostNCVersion, ncInfo.HostVersion) // if we successfully updated the NC, pop it from the needs update set. - delete(hostVersionNeedsUpdateContainers, ncID) + delete(outdatedNCs, ncID) } // if we didn't empty out the needs update set, NMA has not programmed all the NCs we are expecting, and we // need to return an error indicating that - if len(hostVersionNeedsUpdateContainers) > 0 { - return errors.Errorf("unabled to update some NCs: %v, missing or bad response from NMA", hostVersionNeedsUpdateContainers) + if len(outdatedNCs) > 0 { + return errors.Errorf("unabled to update some NCs: %v, missing or bad response from NMA", outdatedNCs) } return nil }