forked from containers/podman
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcontainer_top_linux.go
421 lines (374 loc) · 12.4 KB
/
container_top_linux.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
//go:build !remote && linux && cgo
package libpod
import (
"bufio"
"bytes"
"errors"
"fmt"
"os"
"os/exec"
"path/filepath"
"runtime"
"slices"
"strconv"
"strings"
"syscall"
"unsafe"
"github.com/containers/podman/v5/libpod/define"
"github.com/containers/podman/v5/pkg/rootless"
"github.com/containers/psgo"
"github.com/containers/storage/pkg/reexec"
"github.com/google/shlex"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
/*
#include <stdlib.h>
void fork_exec_ps();
void create_argv(int len);
void set_argv(int pos, char *arg);
void set_userns();
*/
import "C"
const (
// podmanTopCommand is the reexec key used to safely set up the environment for ps to be executed.
podmanTopCommand = "podman-top"
// podmanTopExitCode is a special exit code to signal that podman itself failed to do something
// in the reexec command, as opposed to ps failing. This is used to give a better error.
podmanTopExitCode = 255
)
// init registers podmanTopMain with the reexec package under the
// podmanTopCommand key so that the binary can be re-executed into it.
func init() {
reexec.Register(podmanTopCommand, podmanTopMain)
}
// podmanTopMain is the entry point of the podman-top reexec command.
// It runs podmanTopInner and terminates the process: exit code 0 when no
// error is returned, otherwise the error text is written to stderr and the
// process exits with podmanTopExitCode.
func podmanTopMain() {
	err := podmanTopInner()
	if err == nil {
		os.Exit(0)
	}
	fmt.Fprint(os.Stderr, err.Error())
	os.Exit(podmanTopExitCode)
}
// podmanTopInner prepares the process and then hands over to ps.
//
// Expected arguments: os.Args = {command name} {pid} {userns(1/0)} {psPath} [args...]
//
// We are reexec'd in a new mount namespace, then we need to set some security settings in
// order to safely execute ps in the container pid namespace. Most notably make sure podman
// and ps are read only to prevent a process from overwriting them.
//
// The steps, in order: lock the OS thread, apply prctl hardening, make the mount tree
// private and read only, verify psPath is not writable, join the pid namespace of {pid},
// optionally call set_userns, build the C argv, close inherited fds, and finally call
// fork_exec_ps. An error is returned if any setup step fails.
func podmanTopInner() error {
if len(os.Args) < 4 {
return fmt.Errorf("internal error, need at least three arguments")
}
// We have to lock the thread as we a) switch namespace below and b) use PR_SET_PDEATHSIG
// Also do not unlock as this thread should not be reused by go, we exit anyway at the end.
runtime.LockOSThread()
if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil {
return fmt.Errorf("PR_SET_PDEATHSIG: %w", err)
}
if err := unix.Prctl(unix.PR_SET_DUMPABLE, 0, 0, 0, 0); err != nil {
return fmt.Errorf("PR_SET_DUMPABLE: %w", err)
}
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return fmt.Errorf("PR_SET_NO_NEW_PRIVS: %w", err)
}
// Make the whole mount tree private before changing any mounts below.
if err := unix.Mount("none", "/", "", unix.MS_REC|unix.MS_PRIVATE, ""); err != nil {
return fmt.Errorf("make / mount private: %w", err)
}
psPath := os.Args[3]
// try to mount everything read only
if err := unix.MountSetattr(0, "/", unix.AT_RECURSIVE, &unix.MountAttr{
Attr_set: unix.MOUNT_ATTR_RDONLY,
}); err != nil {
if err != unix.ENOSYS {
return fmt.Errorf("mount_setattr / readonly: %w", err)
}
// old kernel without mount_setattr, i.e. on RHEL 8.8
// Bind mount the directories readonly for both podman and ps.
// psPath is replaced by its symlink-resolved form so the checked path is the one executed.
psPath, err = remountReadOnly(psPath)
if err != nil {
return err
}
_, err = remountReadOnly(reexec.Self())
if err != nil {
return err
}
}
// extra safety check: make sure the ps path is actually read only
// (a nil error from Access means the path is writable, which we refuse).
err := unix.Access(psPath, unix.W_OK)
if err == nil {
return fmt.Errorf("%q was not mounted read only, this can be dangerous so we will not execute it", psPath)
}
pid := os.Args[1]
// join the pid namespace of pid
pidFD, err := os.Open(fmt.Sprintf("/proc/%s/ns/pid", pid))
if err != nil {
return fmt.Errorf("open pidns: %w", err)
}
if err := unix.Setns(int(pidFD.Fd()), unix.CLONE_NEWPID); err != nil {
return fmt.Errorf("setns NEWPID: %w", err)
}
pidFD.Close()
userns := os.Args[2]
if userns == "1" {
C.set_userns()
}
// Build the argv for ps on the C side: psPath followed by the user supplied arguments.
args := []string{psPath}
args = append(args, os.Args[4:]...)
C.create_argv(C.int(len(args)))
for i, arg := range args {
cArg := C.CString(arg)
C.set_argv(C.int(i), cArg)
// NOTE(review): these deferred frees only run on the error return below; the
// fork_exec_ps call is documented below as always exiting — confirm in the C source.
defer C.free(unsafe.Pointer(cArg))
}
// Now try to close open fds except std streams
// While golang opens everything O_CLOEXEC it could still leak fds from
// the parent, i.e. bash. In this case an attacker might be able to
// read/write from them.
// Do this as the last step, it has to happen before the fork because the child
// will be immediately in the pid namespace so we cannot close them in the child.
entries, err := os.ReadDir("/proc/self/fd")
if err != nil {
return err
}
for _, e := range entries {
i, err := strconv.Atoi(e.Name())
// IsFdInherited checks that we got the fd from a parent process and we only close those;
// closing all fds would include the ones from the go runtime, which
// can then panic because of that.
if err == nil && i > unix.Stderr && rootless.IsFdInherited(i) {
_ = unix.Close(i)
}
}
// this function will always exit for us
C.fork_exec_ps()
return nil
}
// remountReadOnly makes the directory containing path a read only mount.
// The path is first resolved through any symlinks; the parent directory of
// the resolved path is then bind mounted onto itself and remounted read only.
// It returns the resolved path, which can then be used to exec the binary
// knowing it sits on a read only mount, or an error if resolving or mounting
// fails.
func remountReadOnly(path string) (string, error) {
	target, err := filepath.EvalSymlinks(path)
	if err != nil {
		return "", fmt.Errorf("resolve symlink for %s: %w", path, err)
	}

	parent := filepath.Dir(target)
	// Two passes over the same directory: the first creates the mount point,
	// the second remounts it read only.
	passes := []uintptr{
		unix.MS_BIND,
		unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY,
	}
	for _, flags := range passes {
		if mountErr := unix.Mount(parent, parent, "", flags, ""); mountErr != nil {
			return "", fmt.Errorf("mount %s read only: %w", parent, mountErr)
		}
	}
	return target, nil
}
// Top gathers statistics about the running processes in a container and
// returns them as a []string, one entry per output line.
//
// The descriptors (comma-separated values are accepted) are first handed to
// psgo. Only if psgo reports an unknown descriptor does Top fall back to
// ps(1): the host ps(1) inside the container pid namespace when the
// container's capabilities are set and its effective set lacks
// CAP_SYS_PTRACE, otherwise — or when the host attempt fails in a retryable
// way — ps(1) executed inside the container.
//
// It fails when the container has no cgroup, is not running, or when all
// applicable ps mechanisms fail.
func (c *Container) Top(descriptors []string) ([]string, error) {
	if c.config.NoCgroups {
		return nil, fmt.Errorf("cannot run top on container %s as it did not create a cgroup: %w", c.ID(), define.ErrNoCgroups)
	}

	state, err := c.State()
	if err != nil {
		return nil, fmt.Errorf("unable to look up state for %s: %w", c.ID(), err)
	}
	if state != define.ContainerStateRunning {
		return nil, errors.New("top can only be used on running containers")
	}

	// Also support comma-separated input; empty fields are dropped.
	isComma := func(r rune) bool { return r == ',' }
	psgoDescriptors := []string{}
	for _, d := range descriptors {
		psgoDescriptors = append(psgoDescriptors, strings.FieldsFunc(d, isComma)...)
	}

	// If we encounter an ErrUnknownDescriptor error, fall back to executing
	// ps(1). This ensures backwards compatibility for users depending on ps(1)
	// and makes sure we're ~compatible with docker.
	output, psgoErr := c.GetContainerPidInformation(psgoDescriptors)
	switch {
	case psgoErr == nil:
		return output, nil
	case !errors.Is(psgoErr, psgo.ErrUnknownDescriptor):
		return nil, psgoErr
	}

	psDescriptors := descriptors
	if len(descriptors) == 1 {
		// Note that the descriptors to ps(1) must be shlexed (see #12452).
		words, splitErr := shlex.Split(descriptors[0])
		if splitErr != nil {
			return nil, fmt.Errorf("parsing ps args: %w", splitErr)
		}
		psDescriptors = make([]string, 0, len(descriptors))
		for _, w := range words {
			if w == "" {
				continue
			}
			psDescriptors = append(psDescriptors, w)
		}
	}

	// Only use ps(1) from the host when we know the container was not started with CAP_SYS_PTRACE,
	// with it the container can access /proc/$pid/ files and potentially escape the container fs.
	needContainerPS := true
	caps := c.config.Spec.Process.Capabilities
	if caps != nil && !slices.Contains(caps.Effective, "CAP_SYS_PTRACE") {
		var retry bool
		output, retry, err = c.execPS(psDescriptors)
		switch {
		case err == nil:
			needContainerPS = false
		case !retry:
			return nil, err
		default:
			logrus.Warnf("Falling back to container ps(1), could not execute ps(1) from the host: %v", err)
		}
	}
	if needContainerPS {
		output, err = c.execPSinContainer(psDescriptors)
		if err != nil {
			return nil, fmt.Errorf("executing ps(1) in container: %w", err)
		}
	}

	// Trick: filter the ps command itself from the output instead of
	// checking/requiring PIDs in the output.
	psCmdline := strings.Join(descriptors, " ")
	kept := []string{}
	for _, line := range output {
		if strings.Contains(line, psCmdline) {
			continue
		}
		kept = append(kept, line)
	}
	return kept, nil
}
// GetContainerPidInformation returns process-related data of all processes in
// the container. Which data is reported is controlled by `descriptors`, which
// expects format descriptors and supports all AIX format descriptors of ps(1)
// plus some additional ones, for instance to inspect the set of effective
// capabilities. Every element of the returned slice is one row whose fields
// are joined by tabs.
//
// For more details, please refer to github.com/containers/psgo.
func (c *Container) GetContainerPidInformation(descriptors []string) ([]string, error) {
	// NOTE: psgo returns a [][]string to give users the ability to apply
	// filters on the data. We need to change the API here to return a
	// [][]string if we want to make use of filtering.
	joinOpts := psgo.JoinNamespaceOpts{FillMappings: rootless.IsRootless()}
	rows, err := psgo.JoinNamespaceAndProcessInfoWithOptions(strconv.Itoa(c.state.PID), descriptors, &joinOpts)
	if err != nil {
		return nil, err
	}

	lines := make([]string, len(rows))
	for i, fields := range rows {
		lines[i] = strings.Join(fields, "\t")
	}
	return lines, nil
}
// execPS executes ps(1) from the host within the container pid namespace.
//
// The podman binary is re-executed with the podmanTopCommand key in a new
// mount namespace (see podmanTopInner), with an environment reduced to HOME.
// Stdout of the child is collected line by line; stderr is captured and only
// used to build error messages.
//
// It returns the stdout lines, a flag telling the caller whether falling
// back to ps(1) inside the container makes sense, and an error. The flag is
// false when the pipe cannot be created or when ps(1) itself failed (likely
// invalid arguments); it is true when ps is not found on the host or when the
// reexec setup failed.
func (c *Container) execPS(psArgs []string) ([]string, bool, error) {
	rPipe, wPipe, err := os.Pipe()
	if err != nil {
		return nil, false, err
	}
	defer rPipe.Close()

	// The channel is buffered so the reader goroutine can always deliver its
	// error and terminate, even on return paths that never receive from it
	// (e.g. when ps is not found and rPipe gets closed under the scanner).
	outErrChan := make(chan error, 1)
	stdout := []string{}
	go func() {
		defer close(outErrChan)
		scanner := bufio.NewScanner(rPipe)
		for scanner.Scan() {
			stdout = append(stdout, scanner.Text())
		}
		if err := scanner.Err(); err != nil {
			outErrChan <- err
		}
	}()

	psPath, err := exec.LookPath("ps")
	if err != nil {
		wPipe.Close()
		return nil, true, err
	}

	// see podmanTopInner()
	userns := "0"
	if len(c.config.IDMappings.UIDMap) > 0 {
		userns = "1"
	}

	args := append([]string{podmanTopCommand, strconv.Itoa(c.state.PID), userns, psPath}, psArgs...)

	cmd := reexec.Command(args...)
	cmd.SysProcAttr = &syscall.SysProcAttr{
		Unshareflags: unix.CLONE_NEWNS,
	}
	var errBuf bytes.Buffer
	cmd.Stdout = wPipe
	cmd.Stderr = &errBuf
	// nil means use current env so explicitly unset all, to not leak any sensitive env vars
	cmd.Env = []string{fmt.Sprintf("HOME=%s", os.Getenv("HOME"))}

	retryContainerExec := true
	err = cmd.Run()
	// Close our write end so the reader goroutine sees EOF once the child is done.
	wPipe.Close()
	if err != nil {
		exitError := &exec.ExitError{}
		if errors.As(err, &exitError) {
			if exitError.ExitCode() != podmanTopExitCode {
				// ps command failed
				err = fmt.Errorf("ps(1) failed with exit code %d: %s", exitError.ExitCode(), errBuf.String())
				// ps command itself failed: likely invalid args, no point in retrying.
				retryContainerExec = false
			} else {
				// podman-top reexec setup fails somewhere
				err = fmt.Errorf("could not execute ps(1) in the container pid namespace: %s", errBuf.String())
			}
		} else {
			err = fmt.Errorf("could not reexec podman-top command: %w", err)
		}
	}

	// Receiving here also waits for the reader goroutine to finish, so
	// stdout is safe to read afterwards.
	if err := <-outErrChan; err != nil {
		return nil, retryContainerExec, fmt.Errorf("failed to read ps stdout: %w", err)
	}
	return stdout, retryContainerExec, err
}
// execPSinContainer executes ps(1) with the specified args in the container via an exec session.
// This should be a bit safer than execPS() but it requires ps(1) to be installed in the container.
//
// It returns the stdout of ps(1) split into lines. An error is returned when
// the pipe cannot be created, the exec session fails, ps(1) exits with a
// non-zero status (stderr is included in the message), or stdout cannot be
// read.
func (c *Container) execPSinContainer(args []string) ([]string, error) {
	rPipe, wPipe, err := os.Pipe()
	if err != nil {
		return nil, err
	}
	defer rPipe.Close()

	var errBuf bytes.Buffer
	streams := new(define.AttachStreams)
	streams.OutputStream = wPipe
	streams.ErrorStream = &errBuf
	streams.AttachOutput = true
	streams.AttachError = true

	// The channel is buffered so the reader goroutine can always deliver its
	// error and terminate, even on the early returns below that never
	// receive from it (exec error or non-zero exit status).
	outErrChan := make(chan error, 1)
	stdout := []string{}
	go func() {
		defer close(outErrChan)
		scanner := bufio.NewScanner(rPipe)
		for scanner.Scan() {
			stdout = append(stdout, scanner.Text())
		}
		if err := scanner.Err(); err != nil {
			outErrChan <- err
		}
	}()

	cmd := append([]string{"ps"}, args...)
	config := new(ExecConfig)
	config.Command = cmd
	ec, err := c.Exec(config, streams, nil)
	// Close our write end so the reader goroutine sees EOF.
	wPipe.Close()
	if err != nil {
		return nil, err
	} else if ec != 0 {
		return nil, fmt.Errorf("runtime failed with exit status: %d and output: %s", ec, errBuf.String())
	}

	if logrus.GetLevel() >= logrus.DebugLevel {
		// If we're running in debug mode or higher, we might want to have a
		// look at stderr which includes debug logs from conmon.
		logrus.Debug(errBuf.String())
	}

	// Receiving here also waits for the reader goroutine to finish, so
	// stdout is safe to read afterwards.
	if err := <-outErrChan; err != nil {
		return nil, fmt.Errorf("failed to read ps stdout: %w", err)
	}
	return stdout, nil
}