-
Notifications
You must be signed in to change notification settings - Fork 1.2k
/
gpu_monitoring.go
81 lines (67 loc) · 2.13 KB
/
gpu_monitoring.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2024-present Datadog, Inc.
//go:build linux
package modules
import (
"fmt"
"net/http"
"time"
"go.uber.org/atomic"
"github.com/DataDog/datadog-agent/cmd/system-probe/api/module"
"github.com/DataDog/datadog-agent/cmd/system-probe/config"
sysconfigtypes "github.com/DataDog/datadog-agent/cmd/system-probe/config/types"
"github.com/DataDog/datadog-agent/cmd/system-probe/utils"
"github.com/DataDog/datadog-agent/pkg/gpu"
"github.com/DataDog/datadog-agent/pkg/util/log"
)
var (
	// Compile-time assertion that *GPUMonitoringModule implements module.Module.
	_ module.Module = (*GPUMonitoringModule)(nil)

	// gpuMonitoringConfigNamespaces is the list of configuration namespaces
	// handed to the GPUMonitoring factory; it contains only gpu.GPUConfigNS.
	gpuMonitoringConfigNamespaces = []string{gpu.GPUConfigNS}
)
// GPUMonitoring is the module.Factory that builds the GPU monitoring module.
//
// Fn ignores both of its arguments: it creates a probe with gpu.NewProbe from
// a fresh gpu.NewConfig and wraps it in a GPUMonitoringModule whose last-check
// counter starts at 0. A probe creation failure is returned wrapped, with a
// nil module. NeedsEBPF always reports true.
var GPUMonitoring = module.Factory{
	Name:             config.GPUMonitoringModule,
	ConfigNamespaces: gpuMonitoringConfigNamespaces,
	Fn: func(_ *sysconfigtypes.Config, _ module.FactoryDependencies) (module.Module, error) {
		probe, err := gpu.NewProbe(gpu.NewConfig(), nil)
		if err != nil {
			return nil, fmt.Errorf("unable to start GPU monitoring: %w", err)
		}

		gpuModule := &GPUMonitoringModule{
			Probe:     probe,
			lastCheck: atomic.NewInt64(0),
		}
		return gpuModule, nil
	},
	NeedsEBPF: func() bool { return true },
}
// GPUMonitoringModule is the system-probe module that exposes GPU monitoring
// data over HTTP. It wraps a GPU probe and remembers when that probe was last
// queried through the "/check" endpoint.
type GPUMonitoringModule struct {
	// Probe is embedded, so its exported methods are promoted onto the module.
	// The module declares its own Close, which takes precedence over the
	// promoted one.
	*gpu.Probe

	// lastCheck holds the Unix time, in seconds, of the most recent "/check"
	// request; it is reported by GetStats.
	lastCheck *atomic.Int64
}
// Register attaches the module's HTTP endpoints to httpMux.
//
// A single "/check" handler is installed. On every request it records the
// current Unix time as the last check time, calls GetAndFlush on the
// underlying probe and passes the result to utils.WriteAsJSON. If GetAndFlush
// fails, the error is logged and the client receives a 500 Internal Server
// Error with no body.
//
// Register itself always returns nil.
func (t *GPUMonitoringModule) Register(httpMux *module.Router) error {
	httpMux.HandleFunc("/check", func(w http.ResponseWriter, _ *http.Request) {
		// The timestamp is stored before the probe is queried, so a failed
		// check still updates the value reported by GetStats.
		t.lastCheck.Store(time.Now().Unix())

		stats, err := t.Probe.GetAndFlush()
		if err != nil {
			log.Errorf("Error getting GPU stats: %v", err)
			// Named constant instead of the bare literal 500.
			w.WriteHeader(http.StatusInternalServerError)
			return
		}

		utils.WriteAsJSON(w, stats)
	})

	return nil
}
// GetStats reports the module's internal stats. The returned map has a single
// key, "last_check", whose value is the Unix timestamp in seconds most recently
// stored by the "/check" handler.
func (t *GPUMonitoringModule) GetStats() map[string]interface{} {
	stats := make(map[string]interface{}, 1)
	stats["last_check"] = t.lastCheck.Load()
	return stats
}
// Close shuts the module down by closing the embedded GPU probe.
func (t *GPUMonitoringModule) Close() {
	// Call Close on the Probe field explicitly: t.Close() would resolve to
	// this very method and recurse forever.
	probe := t.Probe
	probe.Close()
}