/
servicemain.go
356 lines (320 loc) · 16.7 KB
/
servicemain.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2023-present Datadog, Inc.
//go:build windows
// Package servicemain provides Windows Service application helpers
package servicemain
import (
"context"
"errors"
"fmt"
"os"
"strconv"
"strings"
"time"
"unsafe"
"github.com/DataDog/datadog-agent/pkg/util/winutil"
"github.com/DataDog/datadog-agent/pkg/util/winutil/messagestrings"
"golang.org/x/sys/windows"
"golang.org/x/sys/windows/svc"
)
const (
	// DefaultHardStopTimeout is how long a stopping service is given to exit cleanly
	// when no override is configured. See Service.HardStopTimeout for details.
	DefaultHardStopTimeout time.Duration = 15 * time.Second

	// EnvHardStopTimeoutOverride names the environment variable a user can set,
	// as a whole number of seconds, to override DefaultHardStopTimeout.
	EnvHardStopTimeoutOverride = "DD_WINDOWS_SERVICE_STOP_TIMEOUT_SECONDS"
)
// DefaultSettings provides default values to Service implementations when embedded.
type DefaultSettings struct{}

// HardStopTimeout provides a default hard stop timeout for Service implementations,
// and allows a user to override the default by setting the DD_WINDOWS_SERVICE_STOP_TIMEOUT_SECONDS
// environment variable to a whole number of seconds.
//
// DefaultHardStopTimeout is returned when the variable is unset or when its value is
// not usable: not an integer, negative, or too large to be represented as a time.Duration.
func (s *DefaultSettings) HardStopTimeout() time.Duration {
	raw, found := os.LookupEnv(EnvHardStopTimeoutOverride)
	if !found {
		return DefaultHardStopTimeout
	}

	seconds, err := strconv.Atoi(raw)
	if err != nil {
		// Not an integer: ignore the override rather than fail the service.
		return DefaultHardStopTimeout
	}

	// Largest whole number of seconds that fits in a time.Duration (int64 nanoseconds).
	// Multiplying anything larger by time.Second silently wraps around.
	const maxSeconds = (1<<63 - 1) / int64(time.Second)
	if seconds < 0 || int64(seconds) > maxSeconds {
		// A negative (or wrapped) duration would make the hard stop timer fire
		// immediately, which is never what the user asked for.
		return DefaultHardStopTimeout
	}

	return time.Duration(seconds) * time.Second
}
// Service defines the interface that applications should implement to run as Windows Services.
// An implementation is passed to Run, which calls these methods from the service control handler.
type Service interface {
// Name returns the string to be used as the source for event log records.
// The same string is passed to svc.Run as the service name.
Name() string
// Init implements application initialization and is run when the service status is SERVICE_START_PENDING.
// The service status is set to SERVICE_RUNNING when Init returns successfully.
// See ErrCleanStopAfterInit if you need to exit without calling Service.Run() or returning a failure.
//
// This function will block service tools like PowerShell's `Start-Service` until it returns.
Init() error
// Run implements all application logic and is run when the service status is SERVICE_RUNNING.
//
// The provided context is cancellable. Run must monitor ctx.Done() and return as soon as possible
// once it is closed. If Run does not return after the context is cancelled, the service is
// hard stopped after the duration returned by HardStopTimeout().
//
// The service will exit when Run returns. Run must return for the service status to be updated to SERVICE_STOPPED.
// If the process exits without setting SERVICE_STOPPED, Service Control Manager (SCM) will treat
// this as an unexpected exit and enter failure/recovery, regardless of the process exit code.
// https://learn.microsoft.com/en-us/windows/win32/api/winsvc/ns-winsvc-service_failure_actionsa
Run(ctx context.Context) error
// HardStopTimeout returns how long to wait, after a stop request is received, for Run to
// return before the service is hard stopped.
//
// Most platforms send SIGKILL if the service does not respond to the initial SIGTERM
// within a set amount of time. The set amount of time differs between platforms but
// is usually also configurable
// - upstart: 5 seconds
// - systemd: 90 seconds
// However, on Windows most service manager tools let the process stay running if it does not stop,
// with different timeouts, and there is no standard way to configure a timeout.
// - PowerShell Restart-Service: (no kill) error after 30 seconds for dependent services
// (no kill) no timeout for a single service
// - net stop: (no kill) error after 20 seconds
// - sc stop: does not wait/block
// - Services.msc: (no kill) error after 125 seconds
// - Host/OS shutdown: shutdown after 20 seconds
// https://learn.microsoft.com/en-us/windows/win32/services/service-control-handler-function
//
// This means that on Windows if our service hangs on stop then it may need to be force killed
// by the user, and may cause uninstall to fail. CM tools like our Ansible plugin use Restart-Service
// when the configuration changes, so it is troublesome for the user if Restart-Service fails.
//
// It seems useful to keep this timeout under the commonly used Windows tools error timeouts, so
// we set it that way for now. However our primary requirement is that it be less than the Agent
// installer timeout (3 minutes).
HardStopTimeout() time.Duration
}
// ErrCleanStopAfterInit should be returned from Service.Init() to report SERVICE_RUNNING and then exit without error after
// a delay. See runTimeExitGate for more information on why the delay is necessary.
//
// Example use case: the service detects that it is not configured and wishes to stop running, but does not want
// an error reported, as failing to start may cause commands like `Restart-Service -Force datadogagent` to fail if run
// after modifying the configuration to disable the service.
//
// If your service detects this state in Service.Run() instead, then you do not need to do anything: returning
// without an error from Run is handled the same way.
//
// We may be able to remove this and runTimeExitGate if we re-work our current model of change config -> `Restart-Service -Force datadogagent`,
// which we currently expect to trigger all services to restart. Perhaps we can use an agent command instead of PowerShell
// and it can check for a special exit code from each of the services. However, we wouldn't be able to use configuration management
// tools' built-in Windows Service commands.
//
// Callers should compare against this value with errors.Is, so a wrapped ErrCleanStopAfterInit is also recognized.
var ErrCleanStopAfterInit = errors.New("the service did not start but requested a clean exit")
// controlHandler implements golang svc.Handler (see Execute) on behalf of a Service.
type controlHandler struct {
// service is the application driven by this handler.
service Service
}
// This function is a clone of "golang.org/x/sys/windows/svc:IsWindowsService", but with a fix
// for Windows containers. Go cloned the .NET implementation of this function, which has since
// been patched to support Windows containers, which don't use Session ID 0 for services.
// https://github.com/dotnet/runtime/pull/74188
// This function can be replaced with go's once go brings in the fix.
func patchedIsWindowsService() (bool, error) {
var currentProcess windows.PROCESS_BASIC_INFORMATION
infoSize := uint32(unsafe.Sizeof(currentProcess))
err := windows.NtQueryInformationProcess(windows.CurrentProcess(), windows.ProcessBasicInformation, unsafe.Pointer(¤tProcess), infoSize, &infoSize)
if err != nil {
return false, err
}
var parentProcess *windows.SYSTEM_PROCESS_INFORMATION
for infoSize = uint32((unsafe.Sizeof(*parentProcess) + unsafe.Sizeof(uintptr(0))) * 1024); ; {
parentProcess = (*windows.SYSTEM_PROCESS_INFORMATION)(unsafe.Pointer(&make([]byte, infoSize)[0]))
err = windows.NtQuerySystemInformation(windows.SystemProcessInformation, unsafe.Pointer(parentProcess), infoSize, &infoSize)
if err == nil {
break
} else if err != windows.STATUS_INFO_LENGTH_MISMATCH {
return false, err
}
}
for ; ; parentProcess = (*windows.SYSTEM_PROCESS_INFORMATION)(unsafe.Pointer(uintptr(unsafe.Pointer(parentProcess)) + uintptr(parentProcess.NextEntryOffset))) {
if parentProcess.UniqueProcessID == currentProcess.InheritedFromUniqueProcessId {
return strings.EqualFold("services.exe", parentProcess.ImageName.String()), nil
}
if parentProcess.NextEntryOffset == 0 {
break
}
}
return false, nil
}
// RunningAsWindowsService returns true if the current process is running as a Windows Service
// and the application should call Run().
//
// If the check itself fails, the error is printed to standard output and false is returned.
func RunningAsWindowsService() bool {
	asService, err := patchedIsWindowsService()
	if err == nil {
		return asService
	}

	fmt.Printf("failed to determine if we are running in an interactive session: %v\n", err)
	return false
}
// Run fulfills the contract required by programs running as Windows Services.
// https://learn.microsoft.com/en-us/windows/win32/services/service-programs
//
// Call Run as early as possible during process initialization. If it is called too late
// you may encounter service start timeout errors from SCM, and if the process exits without
// calling it at all SCM will erroneously report a timeout error, regardless of how fast
// the process exits.
//
// SCM only gives services 30 seconds (by default) to respond after the process is created.
// Specifically, this timeout refers to calling StartServiceCtrlDispatcher, which is called by
// golang's svc.Run. This timeout is adjustable at the host level with the ServicesPipeTimeout registry value.
// https://learn.microsoft.com/en-us/troubleshoot/windows-server/system-management-components/service-not-start-events-7000-7011-time-out-error
//
// Golang initializes all packages before giving control to main(). This means that if the package
// initialization takes longer than the SCM timeout then SCM will kill our process before main()
// is even called. One observed source of extended process initialization times is dependency packages that call
// golang's user.Current() function to get the current user. If this becomes a recurring issue we may
// want to consider calling StartServiceCtrlDispatcher before the go runtime is initialized, for example
// via a C constructor.
func Run(service Service) {
	handler := &controlHandler{service: service}

	handler.eventlog(messagestrings.MSG_SERVICE_STARTING, service.Name())

	// golang svc.Run calls StartServiceCtrlDispatcher, which does not return until the service
	// enters the SERVICE_STOPPED state.
	// golang implements its own ServiceMain function which calls RegisterServiceCtrlHandlerEx, it
	// is up to our Execute() function to handle ChangeRequest's and update SCM with the service status.
	// https://learn.microsoft.com/en-us/windows/win32/api/winsvc/nf-winsvc-startservicectrldispatcherw
	// https://learn.microsoft.com/en-us/windows/win32/api/winsvc/nf-winsvc-registerservicectrlhandlerexw
	// https://learn.microsoft.com/en-us/windows/win32/api/winsvc/nc-winsvc-lpservice_main_functiona
	// https://learn.microsoft.com/en-us/windows/win32/services/writing-a-service-program-s-main-function
	// https://learn.microsoft.com/en-us/windows/win32/services/writing-a-servicemain-function
	if err := svc.Run(service.Name(), handler); err != nil {
		handler.eventlog(messagestrings.MSG_SERVICE_FAILED, err.Error())
		return
	}

	// svc.Run() can return before Execute() ends if there is an error, but since we trigger
	// process exit when svc.Run() returns it's okay if we leak the goroutine.
	handler.eventlog(messagestrings.MSG_SERVICE_STOPPED, service.Name())
}
// runTimeExitGate returns a channel that receives once the service has been alive for a
// minimum amount of time (5 seconds). It is used to ensure the service exits without error
// on short-lived successful stops by keeping the service in the `SERVICE_RUNNING` state
// long enough for a service manager to consider the start successful.
//
// Call it when the service enters the `SERVICE_RUNNING` state, and receive from the returned
// channel to delay the service exit until the timer expires.
//
// On Windows, the Service Control Manager (SCM) requires that dependent services stop. This means that when running
// `Restart-Service datadogagent`, Windows will try to stop the Process Agent, and then to be helpful it will immediately start it again.
// However, if Process Agent is not configured to be running it will exit immediately, which `Restart-Service` will report as an error.
// To avoid the error on a successful exit we must ensure that we are in the RUNNING state long enough for `Restart-Service` or other
// tools to consider the restart successful.
//
// See also ErrCleanStopAfterInit
func runTimeExitGate() <-chan time.Time {
	const minimumRunningTime = 5 * time.Second
	return time.NewTimer(minimumRunningTime).C
}
// eventlog forwards message number msgnum and its single string argument arg to
// winutil.LogEventViewer, using the service's Name() as the first argument.
func (s *controlHandler) eventlog(msgnum uint32, arg string) {
winutil.LogEventViewer(s.service.Name(), msgnum, arg)
}
// Execute is called by golang svc.Run and is responsible for handling the control requests and state transitions for the service
// golang.org/x/sys/windows/svc contains the actual control handler callback and status handle, and communicates with
// our Execute() function via the provided channels.
// https://learn.microsoft.com/en-us/windows/win32/services/service-status-transitions
//
// Sequence: report SERVICE_START_PENDING, call Service.Init(), report SERVICE_RUNNING, start the
// control request loop, call Service.Run(), and report SERVICE_STOP_PENDING on the way out.
//
// Returns: ssec is always false (the exit code is not service specific). errno is 0 when the
// service stops cleanly, and 1 when Init() fails, Run() returns an error, or the service has
// to be hard stopped because Run() did not return within the hard stop timeout.
//
//nolint:revive // TODO(WINA) Fix revive linter
func (s *controlHandler) Execute(args []string, r <-chan svc.ChangeRequest, changes chan<- svc.Status) (ssec bool, errno uint32) {
	// exitCodeFailure is the non-zero exit code returned for every unsuccessful stop.
	const exitCodeFailure uint32 = 1

	// first thing we must do is inform SCM that we are SERVICE_START_PENDING.
	// We keep the commands accepted list empty so SCM knows to wait until we start or stop and
	// won't send any signals. This way we don't have to handle stop controls in the middle of starting.
	// https://learn.microsoft.com/en-us/windows/win32/services/service-servicemain-function
	changes <- svc.Status{State: svc.StartPending}

	executeRun := true
	if err := s.service.Init(); err != nil {
		if !errors.Is(err, ErrCleanStopAfterInit) {
			// Genuine start failure: record it and report a failing exit code.
			s.eventlog(messagestrings.MSG_AGENT_START_FAILURE, err.Error())
			return false, exitCodeFailure
		}
		// Service requested to exit successfully, so no failure is logged. We must enter
		// SERVICE_RUNNING state and stay there for a period of time to ensure the service manager
		// treats the start as successful.
		// See ErrCleanStopAfterInit and runTimeExitGate for more information.
		// We must still process control requests, in case we receive a STOP signal. If we don't
		// respond to a STOP signal within a few seconds it will fail. So continue and enter
		// RUNNING state and start the control handler, but don't execute Service.Run().
		executeRun = false
	}

	// Now tell SCM that we are SERVICE_RUNNING
	// per MSDN: For best system performance, your application should enter the running state within 25-100 milliseconds.
	const runningCmdsAccepted = svc.AcceptStop | svc.AcceptShutdown | svc.AcceptPreShutdown
	changes <- svc.Status{State: svc.Running, Accepts: runningCmdsAccepted}
	s.eventlog(messagestrings.MSG_SERVICE_STARTED, s.service.Name())

	// make sure that we set state to SERVICE_STOP_PENDING when we return so SCM knows
	// not to send anymore control requests.
	defer func() {
		changes <- svc.Status{State: svc.StopPending}
	}()

	// context for Run()
	ctx, cancelfunc := context.WithCancel(context.Background())
	defer cancelfunc()

	// context for exit timeout
	cleanExitCtx, cancelCleanExit := context.WithCancel(context.Background())
	defer cancelCleanExit()

	// goroutine to handle service control requests
	// https://learn.microsoft.com/en-us/windows/win32/api/winsvc/nc-winsvc-lphandler_function
	go s.controlHandlerLoop(cancelfunc, cancelCleanExit, r, changes)

	// Now that we are in SERVICE_RUNNING state, start the exit gate timer.
	exitGate := runTimeExitGate()

	if executeRun {
		// Buffered so the goroutine can always deliver its result and finish, even when
		// nobody is receiving anymore because we already returned after a hard stop.
		runResult := make(chan error, 1)
		go func() {
			// Run the actual agent/service
			runErr := s.service.Run(ctx)
			if runErr != nil {
				s.eventlog(messagestrings.MSG_SERVICE_FAILED, runErr.Error())
			}
			runResult <- runErr
		}()

		select {
		case runErr := <-runResult:
			if runErr != nil {
				// since exitGate is meant to avoid an error, if we are returning
				// with an error then we can skip the exitGate.
				return false, exitCodeFailure
			}
		case <-cleanExitCtx.Done():
			s.eventlog(messagestrings.MSG_SERVICE_FAILED, "service did not cleanly shutdown in a timely manner, hard stopping service")
			// since exitGate is meant to avoid an error, if we are returning
			// with an error then we can skip the exitGate.
			return false, exitCodeFailure
		}
	}

	// Run was skipped or returned success, block to ensure the service is alive long enough to be considered successful.
	<-exitGate

	// golang sets the status to SERVICE_STOPPED with ssec,errno before returning from svc.Run()
	// so we don't need to do so here.
	return false, 0
}
// controlHandlerLoop services control requests received on r until r is closed or a
// stop-type request (Stop, PreShutdown, Shutdown) has been handled.
//
//   - Interrogate: the request's current status is echoed back on changes.
//   - Stop, PreShutdown, Shutdown: SERVICE_STOP_PENDING is reported, cancelFunc is called,
//     the hard stop timer is started with cancelCleanExit, and the loop ends.
//   - Anything else: the unexpected command number is written to the event log.
func (s *controlHandler) controlHandlerLoop(cancelFunc context.CancelFunc, cancelCleanExit context.CancelFunc, r <-chan svc.ChangeRequest, changes chan<- svc.Status) {
	for {
		req, open := <-r
		if !open {
			return
		}

		if req.Cmd == svc.Interrogate {
			// current status query
			changes <- req.CurrentStatus
			continue
		}

		if req.Cmd != svc.Stop && req.Cmd != svc.PreShutdown && req.Cmd != svc.Shutdown {
			// unexpected control
			s.eventlog(messagestrings.MSG_UNEXPECTED_CONTROL_REQUEST, fmt.Sprintf("%d", req.Cmd))
			continue
		}

		// Must report SERVICE_STOP_PENDING within a few seconds of receiving the control request
		// or else the service manager may consider the stop a failure.
		changes <- svc.Status{State: svc.StopPending}

		// stop
		s.eventlog(messagestrings.MSG_RECEIVED_STOP_SVC_COMMAND, s.service.Name())
		cancelFunc()

		// We set SERVICE_STOP_PENDING, so SCM won't send anymore control requests
		// Start our exit timeout timer
		go s.terminateProcessOnTimeout(cancelCleanExit)
		return
	}
}
// terminateProcessOnTimeout waits for the service's HardStopTimeout() and then calls cancelCleanExit.
//
// Does not directly call os.Exit/TerminateProcess.
// We cancel a context that causes controlHandler.Execute to return and cause the service
// to properly shutdown (from Windows perspective), which eventually returns to main to exit.
// Any shutdown operations the Agent may be in the middle of will be abruptly halted, like with a SIGKILL.
func (s *controlHandler) terminateProcessOnTimeout(cancelCleanExit context.CancelFunc) {
	time.Sleep(s.service.HardStopTimeout())
	cancelCleanExit()
}