diff --git a/aks-node-controller/app.go b/aks-node-controller/app.go index 97882624e4e..936b5790b11 100644 --- a/aks-node-controller/app.go +++ b/aks-node-controller/app.go @@ -14,16 +14,55 @@ import ( "path/filepath" "strconv" "strings" + "time" + "github.com/Azure/agentbaker/aks-node-controller/helpers" "github.com/Azure/agentbaker/aks-node-controller/parser" "github.com/Azure/agentbaker/aks-node-controller/pkg/nodeconfigutils" "github.com/fsnotify/fsnotify" ) type App struct { - // cmdRunner is a function that runs the given command. + // cmdRun is a function that runs the given command. // the goal of this field is to make it easier to test the app by mocking the command runner. - cmdRunner func(cmd *exec.Cmd) error + cmdRun func(cmd *exec.Cmd) error + eventLogger *helpers.EventLogger +} + +// commandMetadata holds all metadata for a command in one place. +type commandMetadata struct { + taskName string + handler func(*App, context.Context, []string) error +} + +// getCommandRegistry returns the command registry mapping command names to their metadata. +// Adding a new command only requires adding one entry here. +func getCommandRegistry() map[string]commandMetadata { + return map[string]commandMetadata{ + "provision": { + taskName: "Provision", + handler: func(a *App, ctx context.Context, args []string) error { + provisionResult, err := a.runProvision(ctx, args[2:]) + // Always notify after provisioning attempt (success is a no-op inside notifier) + a.writeCompleteFileOnError(provisionResult, err) + return err + }, + }, + "provision-wait": { + taskName: "ProvisionWait", + handler: func(a *App, ctx context.Context, args []string) error { + provisionStatusFiles := ProvisionStatusFiles{ + ProvisionJSONFile: provisionJSONFilePath, + ProvisionCompleteFile: provisionCompleteFilePath, + } + provisionOutput, err := a.ProvisionWait(ctx, provisionStatusFiles) + //nolint:forbidigo // stdout is part of the interface + fmt.Println(provisionOutput) + slog.Info("provision-wait finished", "provisionOutput", provisionOutput) + return err + }, + }, + } } // provision.json values are emitted as strings by the shell jq invocation. @@ -64,25 +103,31 @@ func (a *App) Run(ctx context.Context, args []string) int { } func (a *App) run(ctx context.Context, args []string) error { - if len(args) < 2 { + command := "" + if len(args) >= 2 { + command = args[1] + } + if command == "" { return errors.New("missing command argument") } - switch args[1] { - case "provision": - provisionResult, err := a.runProvision(ctx, args[2:]) - // Always notify after provisioning attempt (success is a no-op inside notifier) - a.writeCompleteFileOnError(provisionResult, err) - return err - case "provision-wait": - provisionStatusFiles := ProvisionStatusFiles{ProvisionJSONFile: provisionJSONFilePath, ProvisionCompleteFile: provisionCompleteFilePath} - provisionOutput, err := a.ProvisionWait(ctx, provisionStatusFiles) - //nolint:forbidigo // stdout is part of the interface - fmt.Println(provisionOutput) - slog.Info("provision-wait finished", "provisionOutput", provisionOutput) - return err - default: - return fmt.Errorf("unknown command: %s", args[1]) + + cmd, ok := getCommandRegistry()[command] + if !ok { + return fmt.Errorf("unknown command: %s", command) + } + + startTime := time.Now() + a.eventLogger.LogEvent(cmd.taskName, "Starting", helpers.EventLevelInformational, startTime, startTime) + + err := cmd.handler(a, ctx, args) + endTime := time.Now() + if err != nil { + message := fmt.Sprintf("aks-node-controller exited with error %s", err.Error()) + a.eventLogger.LogEvent(cmd.taskName, message, helpers.EventLevelError, startTime, endTime) + } else { + a.eventLogger.LogEvent(cmd.taskName, "Completed", helpers.EventLevelInformational, startTime, endTime) } + return err } func (a *App) Provision(ctx context.Context, flags ProvisionFlags) (*ProvisionResult, error) { @@ -129,7 +174,7 @@ func (a *App) Provision(ctx context.Context, flags ProvisionFlags) (*ProvisionRe var stdoutBuf, stderrBuf bytes.Buffer cmd.Stdout = io.MultiWriter(os.Stdout, &stdoutBuf) cmd.Stderr = io.MultiWriter(os.Stderr, &stderrBuf) - err = a.cmdRunner(cmd) + err = a.cmdRun(cmd) exitCode := -1 if cmd.ProcessState != nil { exitCode = cmd.ProcessState.ExitCode() @@ -174,7 +219,7 @@ func (a *App) runProvision(ctx context.Context, args []string) (*ProvisionResult return provisionResult, errors.New(provisionResult.Error) } if *dryRun { - a.cmdRunner = cmdRunnerDryRun + a.cmdRun = cmdRunnerDryRun } return a.Provision(ctx, ProvisionFlags{ProvisionConfig: *provisionConfig}) } diff --git a/aks-node-controller/app_test.go b/aks-node-controller/app_test.go index 77bf9565314..4ae404d4b4c 100644 --- a/aks-node-controller/app_test.go +++ b/aks-node-controller/app_test.go @@ -11,341 +11,248 @@ import ( "testing" "time" + "github.com/Azure/agentbaker/aks-node-controller/helpers" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) -// MockCmdRunner is a simple mock for cmdRunner. -type MockCmdRunner struct { - RunFunc func(cmd *exec.Cmd) error +type testExitError struct { + Code int } -func (m *MockCmdRunner) Run(cmd *exec.Cmd) error { - if m.RunFunc != nil { - return m.RunFunc(cmd) - } - return nil +func (e *testExitError) Error() string { + return "exit status " + strconv.Itoa(e.ExitCode()) } -type ExitError struct { - Code int +func (e *testExitError) ExitCode() int { + return e.Code } -func (e *ExitError) Error() string { - return "exit status " + strconv.Itoa(e.ExitCode()) +type TestAppConfig struct { + RunFunc func(*exec.Cmd) error } -func (e *ExitError) ExitCode() int { - return e.Code +type TestApp struct { + App *App + eventLogger *helpers.EventLogger } -func TestApp_Run(t *testing.T) { - tests := []struct { - name string - args []string - setupMocks func(*MockCmdRunner) - wantExit int - }{ - { - name: "missing command argument", - args: []string{"aks-node-controller"}, - wantExit: 1, - }, - { - name: "unknown command", - args: []string{"aks-node-controller", "unknown"}, - wantExit: 1, - }, - { - name: "provision command with missing flag", - args: []string{"provision"}, - wantExit: 1, - }, - { - name: "provision command with valid flag", - args: []string{"aks-node-controller", "provision", "--provision-config=parser/testdata/test_aksnodeconfig.json"}, - setupMocks: func(mc *MockCmdRunner) { - mc.RunFunc = func(cmd *exec.Cmd) error { - return nil - } - }, - wantExit: 0, - }, - { - name: "provision command with command runner error", - args: []string{"aks-node-controller", "provision", "--provision-config=parser/testdata/test_aksnodeconfig.json"}, - setupMocks: func(mc *MockCmdRunner) { - mc.RunFunc = func(cmd *exec.Cmd) error { - return &ExitError{Code: 666} - } - }, - wantExit: 666, +func NewTestApp(t *testing.T, cfg TestAppConfig) *TestApp { + eventsDir := t.TempDir() + runFunc := cfg.RunFunc + if runFunc == nil { + runFunc = func(*exec.Cmd) error { return nil } + } + eventLogger := helpers.NewEventLogger(eventsDir) + return &TestApp{ + eventLogger: eventLogger, + App: &App{ + cmdRun: runFunc, + eventLogger: eventLogger, }, } +} - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - mc := new(MockCmdRunner) - if tt.setupMocks != nil { - tt.setupMocks(mc) - } +func TestApp_Run(t *testing.T) { + t.Run("missing command argument", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + exitCode := tt.App.Run(context.Background(), []string{"aks-node-controller"}) + assert.Equal(t, 1, exitCode) + }) - app := &App{ - cmdRunner: mc.Run, - } + t.Run("unknown command", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + exitCode := tt.App.Run(context.Background(), []string{"aks-node-controller", "unknown"}) + assert.Equal(t, 1, exitCode) + }) + + t.Run("provision command with missing flag", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + exitCode := tt.App.Run(context.Background(), []string{"aks-node-controller", "provision"}) + assert.Equal(t, 1, exitCode) + }) - exitCode := app.Run(context.Background(), tt.args) - assert.Equal(t, tt.wantExit, exitCode) + t.Run("provision command with valid flag", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + exitCode := tt.App.Run(context.Background(), []string{"aks-node-controller", "provision", "--provision-config=parser/testdata/test_aksnodeconfig.json"}) + assert.Equal(t, 0, exitCode) + + events := tt.eventLogger.Events() + assert.Len(t, events, 2) + assert.Contains(t, events[0].Message, "Starting") + assert.Contains(t, events[1].Message, "Completed") + }) + + t.Run("provision command with command runner error", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{ + RunFunc: func(*exec.Cmd) error { return &testExitError{Code: 666} }, }) - } + exitCode := tt.App.Run(context.Background(), []string{"aks-node-controller", "provision", "--provision-config=parser/testdata/test_aksnodeconfig.json"}) + assert.Equal(t, 666, exitCode) + + events := tt.eventLogger.Events() + assert.Len(t, events, 2) + assert.Equal(t, "Error", events[1].EventLevel) + }) } func TestApp_Provision(t *testing.T) { - tests := []struct { - name string - flags ProvisionFlags - setupMocks func(*MockCmdRunner) - wantErr bool - }{ - { - name: "valid provision config", - flags: ProvisionFlags{ProvisionConfig: "parser/testdata/test_aksnodeconfig.json"}, - wantErr: false, - }, - { - name: "invalid provision config path", - flags: ProvisionFlags{ProvisionConfig: "invalid.json"}, - wantErr: true, - }, - { - name: "command runner error", - flags: ProvisionFlags{ProvisionConfig: "parser/testdata/test_aksnodeconfig.json"}, - setupMocks: func(mc *MockCmdRunner) { - mc.RunFunc = func(cmd *exec.Cmd) error { return errors.New("command runner error") } - }, - wantErr: true, - }, - } + t.Run("valid provision config", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + _, err := tt.App.Provision(context.Background(), ProvisionFlags{ProvisionConfig: "parser/testdata/test_aksnodeconfig.json"}) + assert.NoError(t, err) + }) - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - mc := &MockCmdRunner{} - if tt.setupMocks != nil { - tt.setupMocks(mc) - } - app := &App{cmdRunner: mc.Run} - _, err := app.Provision(context.Background(), tt.flags) - if tt.wantErr { - assert.Error(t, err) - } else { - assert.NoError(t, err) - } + t.Run("invalid provision config path", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + _, err := tt.App.Provision(context.Background(), ProvisionFlags{ProvisionConfig: "invalid.json"}) + assert.Error(t, err) + }) + + t.Run("command runner error", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{ + RunFunc: func(*exec.Cmd) error { return errors.New("command runner error") }, }) - } + _, err := tt.App.Provision(context.Background(), ProvisionFlags{ProvisionConfig: "parser/testdata/test_aksnodeconfig.json"}) + assert.Error(t, err) + }) } func TestApp_Provision_DryRun(t *testing.T) { - app := &App{cmdRunner: cmdRunner} - result := app.Run(context.Background(), []string{"aks-node-controller", "provision", "--provision-config=parser/testdata/test_aksnodeconfig.json", "--dry-run"}) - assert.Equal(t, 0, result) - if reflect.ValueOf(app.cmdRunner).Pointer() != reflect.ValueOf(cmdRunnerDryRun).Pointer() { + tt := NewTestApp(t, TestAppConfig{}) + tt.App.cmdRun = cmdRunner // Use real cmdRunner to test dry-run override + exitCode := tt.App.Run(context.Background(), []string{"aks-node-controller", "provision", "--provision-config=parser/testdata/test_aksnodeconfig.json", "--dry-run"}) + assert.Equal(t, 0, exitCode) + if reflect.ValueOf(tt.App.cmdRun).Pointer() != reflect.ValueOf(cmdRunnerDryRun).Pointer() { t.Fatal("app.cmdRunner is expected to be cmdRunnerDryRun") } } func TestApp_ProvisionWait(t *testing.T) { testData := `{"ExitCode": "0", "Output": "hello world", "Error": ""}` - tests := []struct { - name string - wantsErr bool - errString string - setup func(ProvisionStatusFiles) - }{ - { - name: "event path (file created after call)", - setup: func(provisionStatusFiles ProvisionStatusFiles) { - // This goroutine simulates an external process writing the files after a short delay. - // It's running asynchronously from the main test flow. - go func() { - time.Sleep(150 * time.Millisecond) - _ = os.WriteFile(provisionStatusFiles.ProvisionJSONFile, []byte(testData), 0644) - _, _ = os.Create(provisionStatusFiles.ProvisionCompleteFile) - }() - }, - }, - { - name: "fast path (file exists before call)", - setup: func(provisionStatusFiles ProvisionStatusFiles) { - _ = os.WriteFile(provisionStatusFiles.ProvisionJSONFile, []byte(testData), 0644) - _, _ = os.Create(provisionStatusFiles.ProvisionCompleteFile) // pre-create to trigger immediate return - }, - }, - { - name: "provision completion with failure ExitCode", - wantsErr: true, - errString: "provision failed", - setup: func(provisionStatusFiles ProvisionStatusFiles) { - failJSON := `{"ExitCode": "7", "Error": "boom", "Output": "trace"}` - _ = os.WriteFile(provisionStatusFiles.ProvisionJSONFile, []byte(failJSON), 0644) - _, _ = os.Create(provisionStatusFiles.ProvisionCompleteFile) - }, - }, - { - name: "timeout waiting for completion", - wantsErr: true, - errString: "context deadline exceeded waiting for provision complete", - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - mc := &MockCmdRunner{} - tempDir, err := os.MkdirTemp("", "provisiontest") - assert.NoError(t, err) - tempFile := filepath.Join(tempDir, "testfile.txt") - completeFile := filepath.Join(tempDir, "provision.complete") - defer os.RemoveAll(tempDir) - - p := ProvisionStatusFiles{ProvisionJSONFile: tempFile, ProvisionCompleteFile: completeFile} - ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) - defer cancel() - app := &App{cmdRunner: mc.Run} - if tt.setup != nil { - tt.setup(p) - } - - data, err := app.ProvisionWait(ctx, p) - if tt.wantsErr { - assert.Error(t, err) - assert.Contains(t, err.Error(), tt.errString) - } else { - assert.NoError(t, err) - assert.Equal(t, testData, data) - } - }) - } -} -func TestApp_Run_Integration(t *testing.T) { - t.Run("success case", func(t *testing.T) { - mc := &MockCmdRunner{} - app := &App{cmdRunner: mc.Run} - // Use a valid provision config file from testdata - exitCode := app.Run(context.Background(), []string{"aks-node-controller", "provision", "--provision-config=parser/testdata/test_aksnodeconfig.json"}) - assert.Equal(t, 0, exitCode) - }) + t.Run("event path (file created after call)", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + tempDir := t.TempDir() + p := ProvisionStatusFiles{ + ProvisionJSONFile: filepath.Join(tempDir, "provision.json"), + ProvisionCompleteFile: filepath.Join(tempDir, "provision.complete"), + } - t.Run("failure case - unknown command", func(t *testing.T) { - mc := &MockCmdRunner{} - app := &App{cmdRunner: mc.Run} - exitCode := app.Run(context.Background(), []string{"aks-node-controller", "unknown"}) - assert.Equal(t, 1, exitCode) + go func() { + time.Sleep(150 * time.Millisecond) + _ = os.WriteFile(p.ProvisionJSONFile, []byte(testData), 0644) + _, _ = os.Create(p.ProvisionCompleteFile) + }() + + ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) + defer cancel() + data, err := tt.App.ProvisionWait(ctx, p) + assert.NoError(t, err) + assert.Equal(t, testData, data) }) - t.Run("failure case - missing command argument", func(t *testing.T) { - mc := &MockCmdRunner{} - app := &App{cmdRunner: mc.Run} - exitCode := app.Run(context.Background(), []string{"aks-node-controller"}) - assert.Equal(t, 1, exitCode) + t.Run("fast path (file exists before call)", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + tempDir := t.TempDir() + p := ProvisionStatusFiles{ + ProvisionJSONFile: filepath.Join(tempDir, "provision.json"), + ProvisionCompleteFile: filepath.Join(tempDir, "provision.complete"), + } + _ = os.WriteFile(p.ProvisionJSONFile, []byte(testData), 0644) + _, _ = os.Create(p.ProvisionCompleteFile) + + ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) + defer cancel() + data, err := tt.App.ProvisionWait(ctx, p) + assert.NoError(t, err) + assert.Equal(t, testData, data) }) - t.Run("failure case - command runner returns ExitError", func(t *testing.T) { - mc := &MockCmdRunner{ - RunFunc: func(cmd *exec.Cmd) error { - return &ExitError{Code: 42} - }, + t.Run("provision completion with failure ExitCode", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + tempDir := t.TempDir() + p := ProvisionStatusFiles{ + ProvisionJSONFile: filepath.Join(tempDir, "provision.json"), + ProvisionCompleteFile: filepath.Join(tempDir, "provision.complete"), } - app := &App{cmdRunner: mc.Run} - exitCode := app.Run(context.Background(), []string{"aks-node-controller", "provision", "--provision-config=parser/testdata/test_aksnodeconfig.json"}) - assert.Equal(t, 42, exitCode) + failJSON := `{"ExitCode": "7", "Error": "boom", "Output": "trace"}` + _ = os.WriteFile(p.ProvisionJSONFile, []byte(failJSON), 0644) + _, _ = os.Create(p.ProvisionCompleteFile) + + ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) + defer cancel() + _, err := tt.App.ProvisionWait(ctx, p) + assert.Error(t, err) + assert.Contains(t, err.Error(), "provision failed") }) - t.Run("failure case - command runner returns generic error", func(t *testing.T) { - mc := &MockCmdRunner{ - RunFunc: func(cmd *exec.Cmd) error { - return errors.New("generic error") - }, + t.Run("timeout waiting for completion", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + tempDir := t.TempDir() + p := ProvisionStatusFiles{ + ProvisionJSONFile: filepath.Join(tempDir, "provision.json"), + ProvisionCompleteFile: filepath.Join(tempDir, "provision.complete"), } - app := &App{cmdRunner: mc.Run} - exitCode := app.Run(context.Background(), []string{"aks-node-controller", "provision", "--provision-config=parser/testdata/test_aksnodeconfig.json"}) - assert.Equal(t, 1, exitCode) + + ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) + defer cancel() + _, err := tt.App.ProvisionWait(ctx, p) + assert.Error(t, err) + assert.Contains(t, err.Error(), "context deadline exceeded") }) } func Test_readAndEvaluateProvision(t *testing.T) { - type testCase struct { - name string - fileContent string // raw content to place in file (empty => file absent) - createFile bool // whether to create the file - expectErrSub string // substring that should appear in error; empty means success expected - expectContains string // substring that must appear in successful content - } - - tests := []testCase{ - { - name: "valid provision file", - createFile: true, - fileContent: `{"ExitCode":"0","Output":"ok","Error":"","ExecDuration":"1"}`, - expectContains: `"ExitCode":"0"`, - }, - { - name: "missing provision file", - createFile: false, - expectErrSub: "no such file", - }, - { - name: "invalid provision file (bad JSON)", - createFile: true, - fileContent: `not-json`, - expectErrSub: "invalid character", - }, - { - name: "non-zero ExitCode returns error", - createFile: true, - fileContent: `{"ExitCode":"7","Output":"boom","Error":"bad"}`, - expectErrSub: "provision failed", - }, - { - name: "invalid ExitCode returns error", - createFile: true, - fileContent: `{"ExitCode":"unknown","Output":"boom","Error":"bad"}`, - expectErrSub: "invalid ExitCode", - }, - { - name: "missing ExitCode returns error", - createFile: true, - fileContent: `{"Output":"boom","Error":"bad"}`, - expectErrSub: "missing ExitCode", - }, - } - writeTemp := func(t *testing.T, content string) string { t.Helper() f, err := os.CreateTemp(t.TempDir(), "provision_*.json") - assert.NoError(t, err) - _, errWS := f.WriteString(content) - assert.NoError(t, errWS) + require.NoError(t, err) + _, err = f.WriteString(content) + require.NoError(t, err) f.Close() return f.Name() } - for _, tc := range tests { - tc := tc - t.Run(tc.name, func(t *testing.T) { - p := filepath.Join(t.TempDir(), "does_not_exist.json") - if tc.createFile { - p = writeTemp(t, tc.fileContent) - } - got, err := readAndEvaluateProvision(p) - if tc.expectErrSub != "" { // expected error - assert.Error(t, err, "expected an error") - if err != nil { // avoid panic if err is nil - assert.Contains(t, err.Error(), tc.expectErrSub, "error should contain substring") - } - } else { // success - assert.NoError(t, err, "unexpected error") - if tc.expectContains != "" { - assert.Contains(t, got, tc.expectContains, "content should contain substring") - } - } - }) - } + t.Run("valid provision file", func(t *testing.T) { + p := writeTemp(t, `{"ExitCode":"0","Output":"ok","Error":""}`) + got, err := readAndEvaluateProvision(p) + assert.NoError(t, err) + assert.Contains(t, got, `"ExitCode":"0"`) + }) + + t.Run("missing provision file", func(t *testing.T) { + p := filepath.Join(t.TempDir(), "does_not_exist.json") + _, err := readAndEvaluateProvision(p) + assert.Error(t, err) + assert.Contains(t, err.Error(), "no such file") + }) + + t.Run("invalid provision file (bad JSON)", func(t *testing.T) { + p := writeTemp(t, `not-json`) + _, err := readAndEvaluateProvision(p) + assert.Error(t, err) + assert.Contains(t, err.Error(), "invalid character") + }) + + t.Run("non-zero ExitCode returns error", func(t *testing.T) { + p := writeTemp(t, `{"ExitCode":"7","Output":"boom","Error":"bad"}`) + _, err := readAndEvaluateProvision(p) + assert.Error(t, err) + assert.Contains(t, err.Error(), "provision failed") + }) + + t.Run("invalid ExitCode returns error", func(t *testing.T) { + p := writeTemp(t, `{"ExitCode":"unknown","Output":"boom","Error":"bad"}`) + _, err := readAndEvaluateProvision(p) + assert.Error(t, err) + assert.Contains(t, err.Error(), "invalid ExitCode") + }) + + t.Run("missing ExitCode returns error", func(t *testing.T) { + p := writeTemp(t, `{"Output":"boom","Error":"bad"}`) + _, err := readAndEvaluateProvision(p) + assert.Error(t, err) + assert.Contains(t, err.Error(), "missing ExitCode") + }) } diff --git a/aks-node-controller/helpers/guestagent.go b/aks-node-controller/helpers/guestagent.go new file mode 100644 index 00000000000..364fc5a2255 --- /dev/null +++ b/aks-node-controller/helpers/guestagent.go @@ -0,0 +1,136 @@ +package helpers + +import ( + "encoding/json" + "fmt" + "log/slog" + "os" + "path/filepath" + "time" +) + +// EventLevel represents the severity level of a guest agent event. +type EventLevel string + +const ( + // EventLevelInformational indicates a successful operation or informational message. + EventLevelInformational EventLevel = "Informational" + // EventLevelError indicates a failure or error condition. + EventLevelError EventLevel = "Error" +) + +// GuestAgentEvent represents an event to be logged for the Azure VM guest agent. +type GuestAgentEvent struct { + Timestamp string `json:"Timestamp"` + OperationId string `json:"OperationId"` + Version string `json:"Version"` + TaskName string `json:"TaskName"` + EventLevel string `json:"EventLevel"` + Message string `json:"Message"` + EventPid string `json:"EventPid"` + EventTid string `json:"EventTid"` +} + +// EventLogger writes guest agent events to a directory. +type EventLogger struct { + Dir string +} + +// NewEventLogger creates an EventLogger that writes to the specified directory. +func NewEventLogger(dir string) *EventLogger { + return &EventLogger{Dir: dir} +} + +// LogEvent creates an event file for the Azure VM guest agent. +// +// The implementation follows the established bash pattern used across the codebase +// for event contents, with one intentional divergence in the filename scheme: +// - Filename: Uses current time in nanoseconds to ensure uniqueness. Most bash +// event emitters use millisecond-based filenames (e.g., `date +%s%3N`); this +// implementation uses nanoseconds for greater collision resistance. +// - Timestamp: Event start time in format "2006-01-02 15:04:05.000" +// - OperationId: Event end time in format "2006-01-02 15:04:05.000" +// - Message: Includes timing information (startTime, endTime, durationMs) +func (l *EventLogger) LogEvent(taskName, message string, eventLevel EventLevel, startTime, endTime time.Time) { + if err := os.MkdirAll(l.Dir, 0755); err != nil { + slog.Error("failed to create events logging directory", "path", l.Dir, "error", err) + return + } + + // Use nanosecond timestamp as filename, based on current time to ensure uniqueness + // This provides better collision avoidance than milliseconds + eventsFileName := fmt.Sprintf("%d.json", time.Now().UnixNano()) + eventFilePath := filepath.Join(l.Dir, eventsFileName) + + durationMs := endTime.Sub(startTime).Milliseconds() + timingInfo := fmt.Sprintf("startTime=%s endTime=%s durationMs=%d", + startTime.Format("2006-01-02 15:04:05.000"), + endTime.Format("2006-01-02 15:04:05.000"), + durationMs, + ) + fullMessage := message + if fullMessage == "" { + fullMessage = timingInfo + } else { + fullMessage = fmt.Sprintf("%s | %s", message, timingInfo) + } + + operationID := endTime.Format("2006-01-02 15:04:05.000") + + event := GuestAgentEvent{ + Timestamp: startTime.Format("2006-01-02 15:04:05.000"), + OperationId: operationID, + Version: "1.23", + TaskName: "AKS.AKSNodeController." + taskName, + EventLevel: string(eventLevel), + Message: fullMessage, + EventPid: "0", + EventTid: "0", + } + + data, err := json.Marshal(event) + if err != nil { + slog.Error("failed to marshal guest agent event", "error", err) + return + } + + // Event log files need to be readable by Azure monitoring services. + // #nosec G306 -- Operational event data without sensitive information + if err := os.WriteFile(eventFilePath, data, 0644); err != nil { + slog.Error("failed to write guest agent event file", "path", eventFilePath, "error", err) + } +} + +// Events reads all guest agent event files from the directory. +// Events are returned in filename order (which corresponds to creation time since +// filenames are nanosecond timestamps). This method is primarily useful for testing. +func (l *EventLogger) Events() []GuestAgentEvent { + files, err := os.ReadDir(l.Dir) + if err != nil { + slog.Error("failed to read events directory", "path", l.Dir, "error", err) + return nil + } + + var events []GuestAgentEvent + for _, file := range files { + if file.IsDir() || filepath.Ext(file.Name()) != ".json" { + continue + } + + data, err := os.ReadFile(filepath.Join(l.Dir, file.Name())) + if err != nil { + slog.Error("failed to read event file", "file", file.Name(), "error", err) + return nil + } + + var event GuestAgentEvent + if err := json.Unmarshal(data, &event); err != nil { + slog.Error("failed to unmarshal event file", "file", file.Name(), "error", err) + return nil + } + + events = append(events, event) + } + + return events +} diff --git a/aks-node-controller/helpers/guestagent_test.go b/aks-node-controller/helpers/guestagent_test.go new file mode 100644 index 00000000000..79001ea2805 --- /dev/null +++ b/aks-node-controller/helpers/guestagent_test.go @@ -0,0 +1,34 @@ +package helpers + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestEventLogger_Log(t *testing.T) { + logger := NewEventLogger(t.TempDir()) + logger.LogEvent("Provision", "Starting", EventLevelError, + time.Date(2099, 2, 3, 10, 30, 45, 0, time.UTC), + time.Date(2099, 2, 3, 10, 35, 50, 0, time.UTC)) + logger.LogEvent("Provision", "Completed", EventLevelInformational, time.Now(), time.Now()) + + events := logger.Events() + require.Len(t, events, 2) + + assert.Equal(t, "AKS.AKSNodeController.Provision", events[0].TaskName) + assert.Equal(t, "Error", events[0].EventLevel) + assert.Contains(t, events[0].Message, "Starting") + assert.Contains(t, events[0].Message, "durationMs=305000") + assert.Equal(t, "2099-02-03 10:30:45.000", events[0].Timestamp) + assert.Equal(t, "2099-02-03 10:35:50.000", events[0].OperationId) + assert.Equal(t, "1.23", events[0].Version) +} + +func TestEventLogger_Events_EmptyDirectory(t *testing.T) { + logger := NewEventLogger(t.TempDir()) + events := logger.Events() + assert.Empty(t, events) +} diff --git a/aks-node-controller/main.go b/aks-node-controller/main.go index 07a65e5c0d3..be34e75f27c 100644 --- a/aks-node-controller/main.go +++ b/aks-node-controller/main.go @@ -7,14 +7,21 @@ import ( "log/slog" "os" "path/filepath" + + "github.com/Azure/agentbaker/aks-node-controller/helpers" ) func main() { // defer calls are not executed on os.Exit logCleanup := configureLogging() - app := App{cmdRunner: cmdRunner} + app := App{ + cmdRun: cmdRunner, + eventLogger: helpers.NewEventLogger("/var/log/azure/Microsoft.Azure.Extensions.CustomScript/events"), + } + exitCode := app.Run(context.Background(), os.Args) logCleanup() + os.Exit(exitCode) } diff --git a/parts/linux/cloud-init/artifacts/aks-node-controller-wrapper.sh b/parts/linux/cloud-init/artifacts/aks-node-controller-wrapper.sh index a78d98321e6..b310e44195c 100644 --- a/parts/linux/cloud-init/artifacts/aks-node-controller-wrapper.sh +++ b/parts/linux/cloud-init/artifacts/aks-node-controller-wrapper.sh @@ -1,8 +1,8 @@ #!/bin/bash set -uo pipefail -BIN_PATH="/opt/azure/containers/aks-node-controller" -CONFIG_PATH="/opt/azure/containers/aks-node-controller-config.json" +BIN_PATH="${BIN_PATH:-/opt/azure/containers/aks-node-controller}" +CONFIG_PATH="${CONFIG_PATH:-/opt/azure/containers/aks-node-controller-config.json}" LOGGER_TAG="aks-node-controller-wrapper" log() { @@ -12,6 +12,9 @@ log() { echo "$message" } +# this is to ensure that shellspec won't interpret any further lines below +${__SOURCED__:+return} + log "Launching aks-node-controller with config ${CONFIG_PATH}" "$BIN_PATH" provision --provision-config="$CONFIG_PATH" & child_pid=$!