From c542b855ac5e6be7230f8640bde2c95fcdcd3b4e Mon Sep 17 00:00:00 2001 From: behzadm Date: Fri, 10 Feb 2023 13:52:57 -0800 Subject: [PATCH 1/3] Adding a defer func to connecttoTelemetryservice() to prevent CNI from stucking in case of telemetry service failure. --- telemetry/telemetrybuffer.go | 48 ++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/telemetry/telemetrybuffer.go b/telemetry/telemetrybuffer.go index 69be6fb29b..b9181fb234 100644 --- a/telemetry/telemetrybuffer.go +++ b/telemetry/telemetrybuffer.go @@ -328,32 +328,42 @@ func (tb *TelemetryBuffer) ConnectCNIToTelemetryService(telemetryNumRetries, tel path, dir := getTelemetryServiceDirectory() args := []string{"-d", dir} for attempt := 0; attempt < 2; attempt++ { - if err := tb.Connect(); err != nil { - log.Logf("Connection to telemetry socket failed: %v", err) - if runtime.GOOS == "windows" { - if err = netPlugin.LockKeyValueStore(); err != nil { - log.Logf("lock acquire error: %v", err) - return errors.Wrap(err, "lock acquire error") - } - } - if err = tb.Cleanup(FdName); err != nil { - return errors.Wrap(err, "cleanup failed") - } - if err = StartTelemetryService(path, args); err != nil { - return errors.Wrap(err, "StartTelemetryService failed") + if err := tb.ConnectCNIToTelemetryServiceAttempt(telemetryNumRetries, telemetryWaitTimeInMilliseconds, netPlugin, path, args); err != nil { + return errors.Wrap(err, "lock acquire error") + } + } + return nil +} + +// This function is getting called from ConnectCNIToTelemetryService() in each attempt inside for loop +// This function has been created to be able to add defer within the for loop +func (tb *TelemetryBuffer) ConnectCNIToTelemetryServiceAttempt(telemetryNumRetries, telemetryWaitTimeInMilliseconds int, netPlugin *cni.Plugin, path string, args []string) error { + if err := tb.Connect(); err != nil { + log.Logf("Connection to telemetry socket failed: %v", err) + if runtime.GOOS == "windows" { + if err = netPlugin.LockKeyValueStore(); err != nil { + log.Logf("lock acquire error: %v", err) + return errors.Wrap(err, "lock acquire error") } - WaitForTelemetrySocket(telemetryNumRetries, time.Duration(telemetryWaitTimeInMilliseconds)) + } + defer func() { if runtime.GOOS == "windows" { if err = netPlugin.UnLockKeyValueStore(); err != nil { log.Logf("failed to relinquish lock error: %v", err) - return errors.Wrap(err, "failed to relinquish lock error") } } - } else { - tb.Connected = true - log.Logf("Connected to telemetry service") - return nil + }() + if err = tb.Cleanup(FdName); err != nil { + return errors.Wrap(err, "cleanup failed") + } + if err = StartTelemetryService(path, args); err != nil { + return errors.Wrap(err, "StartTelemetryService failed") } + WaitForTelemetrySocket(telemetryNumRetries, time.Duration(telemetryWaitTimeInMilliseconds)) + } else { + tb.Connected = true + log.Logf("Connected to telemetry service") + return nil } return nil } From 795b0d5c4a03aea63bcf88a49539a39853d8ca05 Mon Sep 17 00:00:00 2001 From: behzadm Date: Fri, 10 Feb 2023 14:51:34 -0800 Subject: [PATCH 2/3] fix: addressing the comments for telemetry defer function. --- telemetry/telemetrybuffer.go | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/telemetry/telemetrybuffer.go b/telemetry/telemetrybuffer.go index b9181fb234..6824fb6f56 100644 --- a/telemetry/telemetrybuffer.go +++ b/telemetry/telemetrybuffer.go @@ -328,7 +328,7 @@ func (tb *TelemetryBuffer) ConnectCNIToTelemetryService(telemetryNumRetries, tel path, dir := getTelemetryServiceDirectory() args := []string{"-d", dir} for attempt := 0; attempt < 2; attempt++ { - if err := tb.ConnectCNIToTelemetryServiceAttempt(telemetryNumRetries, telemetryWaitTimeInMilliseconds, netPlugin, path, args); err != nil { + if err := tb.startAndConnectTelemetryService(telemetryNumRetries, telemetryWaitTimeInMilliseconds, netPlugin, path, args); err != nil { return errors.Wrap(err, "lock acquire error") } } @@ -337,7 +337,7 @@ func (tb *TelemetryBuffer) ConnectCNIToTelemetryService(telemetryNumRetries, tel // This function is getting called from ConnectCNIToTelemetryService() in each attempt inside for loop // This function has been created to be able to add defer within the for loop -func (tb *TelemetryBuffer) ConnectCNIToTelemetryServiceAttempt(telemetryNumRetries, telemetryWaitTimeInMilliseconds int, netPlugin *cni.Plugin, path string, args []string) error { +func (tb *TelemetryBuffer) startAndConnectTelemetryService(telemetryNumRetries, telemetryWaitTimeInMilliseconds int, netPlugin *cni.Plugin, path string, args []string) error { if err := tb.Connect(); err != nil { log.Logf("Connection to telemetry socket failed: %v", err) if runtime.GOOS == "windows" { @@ -345,14 +345,12 @@ func (tb *TelemetryBuffer) ConnectCNIToTelemetryServiceAttempt(telemetryNumRetri log.Logf("lock acquire error: %v", err) return errors.Wrap(err, "lock acquire error") } - } - defer func() { - if runtime.GOOS == "windows" { + defer func() { if err = netPlugin.UnLockKeyValueStore(); err != nil { log.Logf("failed to relinquish lock error: %v", err) } - } - }() + }() + } if err = tb.Cleanup(FdName); err != nil { return errors.Wrap(err, "cleanup failed") } From 606431f8b8b66b3d5177e243dd19e23937bbbe53 Mon Sep 17 00:00:00 2001 From: behzadm Date: Fri, 10 Feb 2023 15:08:52 -0800 Subject: [PATCH 3/3] fix: addressing the comments for telemetry defer func. --- telemetry/telemetrybuffer.go | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/telemetry/telemetrybuffer.go b/telemetry/telemetrybuffer.go index 6824fb6f56..77b87b9900 100644 --- a/telemetry/telemetrybuffer.go +++ b/telemetry/telemetrybuffer.go @@ -20,7 +20,6 @@ import ( "github.com/Azure/azure-container-networking/common" "github.com/Azure/azure-container-networking/log" "github.com/Azure/azure-container-networking/platform" - "github.com/pkg/errors" ) // TelemetryConfig - telemetry config read by telemetry service @@ -328,22 +327,19 @@ func (tb *TelemetryBuffer) ConnectCNIToTelemetryService(telemetryNumRetries, tel path, dir := getTelemetryServiceDirectory() args := []string{"-d", dir} for attempt := 0; attempt < 2; attempt++ { - if err := tb.startAndConnectTelemetryService(telemetryNumRetries, telemetryWaitTimeInMilliseconds, netPlugin, path, args); err != nil { - return errors.Wrap(err, "lock acquire error") - } + tb.startAndConnectTelemetryService(telemetryNumRetries, telemetryWaitTimeInMilliseconds, netPlugin, path, args) } return nil } // This function is getting called from ConnectCNIToTelemetryService() in each attempt inside for loop // This function has been created to be able to add defer within the for loop -func (tb *TelemetryBuffer) startAndConnectTelemetryService(telemetryNumRetries, telemetryWaitTimeInMilliseconds int, netPlugin *cni.Plugin, path string, args []string) error { +func (tb *TelemetryBuffer) startAndConnectTelemetryService(telemetryNumRetries, telemetryWaitTimeInMilliseconds int, netPlugin *cni.Plugin, path string, args []string) { if err := tb.Connect(); err != nil { log.Logf("Connection to telemetry socket failed: %v", err) if runtime.GOOS == "windows" { if err = netPlugin.LockKeyValueStore(); err != nil { log.Logf("lock acquire error: %v", err) - return errors.Wrap(err, "lock acquire error") } defer func() { if err = netPlugin.UnLockKeyValueStore(); err != nil { @@ -352,18 +348,16 @@ func (tb *TelemetryBuffer) startAndConnectTelemetryService(telemetryNumRetries, }() } if err = tb.Cleanup(FdName); err != nil { - return errors.Wrap(err, "cleanup failed") + log.Logf("cleanup failed: %v", err) } if err = StartTelemetryService(path, args); err != nil { - return errors.Wrap(err, "StartTelemetryService failed") + log.Logf("StartTelemetryService failed: %v", err) } WaitForTelemetrySocket(telemetryNumRetries, time.Duration(telemetryWaitTimeInMilliseconds)) } else { tb.Connected = true log.Logf("Connected to telemetry service") - return nil } - return nil } func getTelemetryServiceDirectory() (path string, dir string) {