Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Patch set to support systemd as PID1 in container #13525

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions api/client/stop.go
Expand Up @@ -12,6 +12,7 @@ import (
// CmdStop stops one or more running containers.
//
// A running container is stopped by first sending SIGTERM and then SIGKILL if the container fails to stop within a grace period (the default is 10 seconds).
// If the container is running in Init="systemd" mode, docker sends SIGRTMIN+3 instead of SIGTERM.
//
// Usage: docker stop [OPTIONS] CONTAINER [CONTAINER...]
func (cli *DockerCli) CmdStop(args ...string) error {
Expand Down
5 changes: 5 additions & 0 deletions contrib/completion/bash/docker
Expand Up @@ -951,6 +951,7 @@ _docker_run() {
--expose
--group-add
--hostname -h
--init
--ipc
--label -l
--label-file
Expand Down Expand Up @@ -1034,6 +1035,10 @@ _docker_run() {
compopt -o nospace
return
;;
--init)
COMPREPLY=( $( compgen -W 'systemd' -- "$cur" ) )
return
;;
--ipc)
case "$cur" in
*:*)
Expand Down
2 changes: 2 additions & 0 deletions contrib/completion/fish/docker.fish
Expand Up @@ -131,6 +131,7 @@ complete -c docker -A -f -n '__fish_seen_subcommand_from create' -l group-add -d
complete -c docker -A -f -n '__fish_seen_subcommand_from create' -s h -l hostname -d 'Container host name'
complete -c docker -A -f -n '__fish_seen_subcommand_from create' -l help -d 'Print usage'
complete -c docker -A -f -n '__fish_seen_subcommand_from create' -s i -l interactive -d 'Keep STDIN open even if not attached'
# --init is a long-only option; -a is the short form of --attach.
complete -c docker -A -f -n '__fish_seen_subcommand_from create' -l init -d 'Init to systemd.'
complete -c docker -A -f -n '__fish_seen_subcommand_from create' -l ipc -d 'Default is to create a private IPC namespace (POSIX SysV IPC) for the container'
complete -c docker -A -f -n '__fish_seen_subcommand_from create' -l link -d 'Add link to another container in the form of <name|id>:alias'
complete -c docker -A -f -n '__fish_seen_subcommand_from create' -l lxc-conf -d '(lxc exec-driver only) Add custom lxc options --lxc-conf="lxc.cgroup.cpuset.cpus = 0,1"'
Expand Down Expand Up @@ -300,6 +301,7 @@ complete -c docker -A -f -n '__fish_seen_subcommand_from rmi' -a '(__fish_print_
# run
complete -c docker -f -n '__fish_docker_no_subcommand' -a run -d 'Run a command in a new container'
complete -c docker -A -f -n '__fish_seen_subcommand_from run' -s a -l attach -d 'Attach to STDIN, STDOUT or STDERR.'
# --init is a long-only option; -a is the short form of --attach.
complete -c docker -A -f -n '__fish_seen_subcommand_from run' -l init -d 'Init to systemd.'
complete -c docker -A -f -n '__fish_seen_subcommand_from run' -l add-host -d 'Add a custom host-to-IP mapping (host:ip)'
complete -c docker -A -f -n '__fish_seen_subcommand_from run' -s c -l cpu-shares -d 'CPU shares (relative weight)'
complete -c docker -A -f -n '__fish_seen_subcommand_from run' -l cap-add -d 'Add Linux capabilities'
Expand Down
10 changes: 8 additions & 2 deletions daemon/container.go
Expand Up @@ -477,8 +477,14 @@ func (container *Container) Stop(seconds int) error {
}

// 1. Send a SIGTERM
if err := container.killPossiblyDeadProcess(15); err != nil {
logrus.Infof("Failed to send SIGTERM to the process, force killing")
sig := 15
sigString := "SIGTERM"
if container.Config.Init == "systemd" {
sig = 35 //signal.
sigString = "SIGRTMIN + 3"
}
if err := container.killPossiblyDeadProcess(sig); err != nil {
logrus.Infof(fmt.Sprintf("Failed to send %s to the process, force killing", sigString))
if err := container.killPossiblyDeadProcess(9); err != nil {
return err
}
Expand Down
1 change: 1 addition & 0 deletions daemon/container_unix.go
Expand Up @@ -146,6 +146,7 @@ func (container *Container) createDaemonEnvironment(linkedEnv []string) []string
// because the env on the container can override certain default values
// we need to replace the 'env' keys where they match and append anything
// else.
env = append(env, fmt.Sprintf("container_uuid=%s", convertUUID(container.ID)))
env = utils.ReplaceOrAppendEnvValues(env, container.Config.Env)

return env
Expand Down
6 changes: 6 additions & 0 deletions daemon/delete.go
Expand Up @@ -123,6 +123,12 @@ func (daemon *Daemon) rm(container *Container, forceRemove bool) (err error) {
}
}

if path := journalPath(container.ID); path != "" {
if err = os.RemoveAll(path); err != nil {
return fmt.Errorf("Unable to remove journal content %v: %v", container.ID, err)
}
}

if err = os.RemoveAll(container.root); err != nil {
return fmt.Errorf("Unable to remove filesystem for %v: %v", container.ID, err)
}
Expand Down
1 change: 1 addition & 0 deletions daemon/execdriver/driver.go
Expand Up @@ -177,4 +177,5 @@ type Command struct {
FirstStart bool `json:"first_start"`
LayerPaths []string `json:"layer_paths"` // Windows needs to know the layer paths and folder for a command
LayerFolder string `json:"layer_folder"`
TmpDir string `json:"tmpdir"` // Directory used to store docker tmpdirs.
}
44 changes: 44 additions & 0 deletions daemon/execdriver/native/create.go
Expand Up @@ -6,6 +6,8 @@ import (
"errors"
"fmt"
"net"
"os"
"path/filepath"
"strings"
"syscall"

Expand Down Expand Up @@ -223,6 +225,34 @@ func (d *driver) setupRlimits(container *configs.Config, c *execdriver.Command)
}
}

// genPremountCmd builds the commands to run before a tmpfs is mounted over
// dest. When fullDest can be stat'ed, it returns a single tar invocation that
// archives the contents of fullDest into "<c.TmpDir>/<dest with '/' replaced
// by '_'>.tar"; otherwise it returns no commands.
func (d *driver) genPremountCmd(c *execdriver.Command, fullDest string, dest string) []configs.Command {
	archive := fmt.Sprintf("%s/%s.tar", c.TmpDir, strings.Replace(dest, "/", "_", -1))

	// Nothing to preserve when the destination cannot be stat'ed.
	if _, statErr := os.Stat(fullDest); statErr != nil {
		return nil
	}

	return []configs.Command{
		{
			Path: "/usr/bin/tar",
			Args: []string{"-cf", archive, "-C", fullDest, "."},
		},
	}
}

// genPostmountCmd builds the commands to run after a tmpfs has been mounted
// over dest. It returns a tar invocation that extracts
// "<c.TmpDir>/<dest with '/' replaced by '_'>.tar" into fullDest, followed by
// an rm that deletes the archive.
//
// No commands are returned when fullDest cannot be stat'ed. This mirrors
// genPremountCmd, which only creates the archive when os.Stat succeeds;
// checking only os.IsNotExist here would schedule an extraction of an archive
// that was never written if Stat failed for another reason (for example a
// permission error).
func (d *driver) genPostmountCmd(c *execdriver.Command, fullDest string, dest string) []configs.Command {
	if _, err := os.Stat(fullDest); err != nil {
		return nil
	}

	tarFile := fmt.Sprintf("%s/%s.tar", c.TmpDir, strings.Replace(dest, "/", "_", -1))
	return []configs.Command{
		{
			// Restore the content saved before the tmpfs hid it.
			Path: "/usr/bin/tar",
			Args: []string{"-xf", tarFile, "-C", fullDest, "."},
		},
		{
			// The archive is only needed once; remove it afterwards.
			Path: "/usr/bin/rm",
			Args: []string{"-f", tarFile},
		},
	}
}

func (d *driver) setupMounts(container *configs.Config, c *execdriver.Command) error {
userMounts := make(map[string]struct{})
for _, m := range c.Mounts {
Expand All @@ -243,6 +273,20 @@ func (d *driver) setupMounts(container *configs.Config, c *execdriver.Command) e
container.Mounts = defaultMounts

for _, m := range c.Mounts {
if m.Source == "tmpfs" {
dest := filepath.Join(c.Rootfs, m.Destination)
flags := syscall.MS_NOSUID | syscall.MS_NODEV
container.Mounts = append(container.Mounts, &configs.Mount{
Source: m.Source,
Destination: m.Destination,
Device: "tmpfs",
Data: "mode=755,size=65536k",
Flags: flags,
PremountCmds: d.genPremountCmd(c, dest, m.Destination),
PostmountCmds: d.genPostmountCmd(c, dest, m.Destination),
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In what situation would this make sense? How can anything useful and worth preserving be in /run at mount time?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would just allow the image developer to prepopulate the /run directory so that when the container starts up the content is still in the /run directory. If that is what you are asking.

})
continue
}
flags := syscall.MS_BIND | syscall.MS_REC
if !m.Writable {
flags |= syscall.MS_RDONLY
Expand Down
8 changes: 8 additions & 0 deletions daemon/execdriver/native/driver.go
Expand Up @@ -5,6 +5,7 @@ package native
import (
"fmt"
"io"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
Expand Down Expand Up @@ -112,6 +113,13 @@ type execOutput struct {

func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallback execdriver.StartCallback) (execdriver.ExitStatus, error) {
// take the Command and populate the libcontainer.Config from it
var err error
c.TmpDir, err = ioutil.TempDir("", c.ID)
if err != nil {
return execdriver.ExitStatus{ExitCode: -1}, err
}
defer os.RemoveAll(c.TmpDir)

container, err := d.createContainer(c)
if err != nil {
return execdriver.ExitStatus{ExitCode: -1}, err
Expand Down
14 changes: 14 additions & 0 deletions daemon/utils_unix.go
Expand Up @@ -5,6 +5,7 @@ package daemon
import (
"errors"
"fmt"
"os"
"strings"

"github.com/docker/docker/runconfig"
Expand Down Expand Up @@ -46,3 +47,16 @@ func mergeLxcConfIntoOptions(hostConfig *runconfig.HostConfig) ([]string, error)

return out, nil
}

// convertUUID formats the leading characters of id in the 8-4-4-4-12 UUID
// layout: the first 20 characters are split into four dash-separated groups
// and at most the next 12 characters form the final group.
//
// An id shorter than 20 characters cannot fill the fixed-width groups and is
// returned unchanged instead of panicking on an out-of-range slice.
func convertUUID(id string) string {
	if len(id) < 20 {
		return id
	}
	// %.12s truncates the remainder so that at most 32 characters of id are used.
	return fmt.Sprintf("%s-%s-%s-%s-%.12s", id[0:8], id[8:12], id[12:16], id[16:20], id[20:])
}

// journalPath returns the per-container journal directory,
// "/var/log/journal/<first 32 characters of id>".
//
// It returns "" when id is empty or when /var/log/journal does not exist or
// is not a directory. The empty-id check matters because the result may be
// handed to os.MkdirAll and os.RemoveAll: without it an empty id would
// resolve to the journal root itself.
func journalPath(id string) string {
	if id == "" {
		return ""
	}

	finfo, err := os.Stat("/var/log/journal")
	if err != nil || !finfo.IsDir() {
		return ""
	}

	// %.32s keeps only the first 32 characters of the id.
	return fmt.Sprintf("/var/log/journal/%.32s", id)
}
35 changes: 35 additions & 0 deletions daemon/volumes_linux.go
Expand Up @@ -51,6 +51,31 @@ func (container *Container) setupMounts() ([]execdriver.Mount, error) {
}
}

if container.Config.Init == "systemd" {
if container.MountPoints["/run"] == nil {
mounts = append(mounts, execdriver.Mount{Source: "tmpfs", Destination: "/run", Writable: true, Private: true})
}

if container.MountPoints["/sys"] == nil &&
container.MountPoints["/sys/fs"] == nil &&
container.MountPoints["/sys/fs/cgroup"] == nil {
mounts = append(mounts, execdriver.Mount{Source: "/sys/fs/cgroup", Destination: "/sys/fs/cgroup", Writable: false, Private: true})
}

if container.MountPoints["/var"] == nil &&
container.MountPoints["/var/log"] == nil &&
container.MountPoints["/var/log/journal"] == nil {
if journalPath, err := container.setupJournal(); err != nil {
return nil, err
} else {
if journalPath != "" {
label.Relabel(journalPath, container.MountLabel, "Z")
mounts = append(mounts, execdriver.Mount{Source: journalPath, Destination: journalPath, Writable: true, Private: true})
}
}
}
}

mounts = sortMounts(mounts)
return append(mounts, container.networkMounts()...), nil
}
Expand Down Expand Up @@ -398,3 +423,13 @@ func parseVolumeSource(spec string) (string, string, error) {
// BackwardsCompatible reports whether the mount point has a non-empty Source
// or uses the default volume driver.
func (m *mountPoint) BackwardsCompatible() bool {
	if len(m.Source) > 0 {
		return true
	}
	return m.Driver == volume.DefaultDriverName
}

// setupJournal creates the journal directory reported by journalPath for this
// container (mode 0755, including missing parents) and returns its path.
// When journalPath reports no directory it returns "" and a nil error without
// creating anything.
func (container *Container) setupJournal() (string, error) {
	dir := journalPath(container.ID)
	if dir == "" {
		return "", nil
	}
	if mkErr := os.MkdirAll(dir, 0755); mkErr != nil {
		return "", mkErr
	}
	return dir, nil
}
13 changes: 13 additions & 0 deletions docs/reference/run.md
Expand Up @@ -66,6 +66,7 @@ following options.
- [Container Identification](#container-identification)
- [Name (--name)](#name-name)
- [PID Equivalent](#pid-equivalent)
- [INIT Settings (--init)](#init-settings-init)
- [IPC Settings (--ipc)](#ipc-settings-ipc)
- [Network Settings](#network-settings)
- [Restart Policies (--restart)](#restart-policies-restart)
Expand Down Expand Up @@ -201,6 +202,17 @@ more advanced use case would be changing the host's hostname from a container.
> **Note**: `--uts="host"` gives the container full access to change the
> hostname of the host and is therefore considered insecure.

## INIT settings (--init)

--init="" : Enable a pre-configured profile for running init systems within containers.
'systemd': Changes the way docker runs a container, based on the systemd container specification.
* Mounts "/run" as a tmpfs.
* Mounts /sys/fs/cgroup into the container as a read-only volume.
* Adds the container_uuid environment variable.
* Sets up a volume mount at /var/log/journal/UUID, allowing journald data within the container to be seen by the host journalctl.

Default: No profile is enabled.

## IPC settings (--ipc)

--ipc="" : Set the IPC mode for the container,
Expand Down Expand Up @@ -1067,6 +1079,7 @@ container by using one or more `-e` flags, even overriding those mentioned
above, or already defined by the developer with a Dockerfile `ENV`:

$ docker run -e "deep=purple" --rm ubuntu /bin/bash -c export
declare -x container_uuid="be84194d-87f9-08c2-b2e1-67311f4409f5"
declare -x HOME="/"
declare -x HOSTNAME="85bc26a0e200"
declare -x OLDPWD
Expand Down
12 changes: 12 additions & 0 deletions man/docker-create.1.md
Expand Up @@ -29,6 +29,7 @@ docker-create - Create a new container
[**-h**|**--hostname**[=*HOSTNAME*]]
[**--help**]
[**-i**|**--interactive**[=*false*]]
[**--init**[=*INITSYSTEM*]]
[**--ipc**[=*IPC*]]
[**-l**|**--label**[=*[]*]]
[**--label-file**[=*[]*]]
Expand Down Expand Up @@ -143,6 +144,17 @@ two memory nodes.
**-i**, **--interactive**=*true*|*false*
Keep STDIN open even if not attached. The default is *false*.

**--init**=""

Enable a pre-configured profile for running init systems within containers.
Default: No profile is enabled.

'systemd': Changes the way docker runs a container, based on the systemd container specification.
* Mounts "/run" as a tmpfs.
* Mounts /sys/fs/cgroup into the container as a read-only volume.
* Adds the container_uuid environment variable.
* Sets up a volume mount at /var/log/journal/UUID, allowing journald data within the container to be seen by the host journalctl.

**--ipc**=""
Default is to create a private IPC namespace (POSIX SysV IPC) for the container
'container:<name|id>': reuses another container shared memory, semaphores and message queues
Expand Down
16 changes: 16 additions & 0 deletions man/docker-run.1.md
Expand Up @@ -30,6 +30,7 @@ docker-run - Run a command in a new container
[**-h**|**--hostname**[=*HOSTNAME*]]
[**--help**]
[**-i**|**--interactive**[=*false*]]
[**--init**[=*INITSYSTEM*]]
[**--ipc**[=*IPC*]]
[**-l**|**--label**[=*[]*]]
[**--label-file**[=*[]*]]
Expand Down Expand Up @@ -198,6 +199,9 @@ is the case the **--dns** flags is necessary for every run.
environment variables that are available for the process that will be launched
inside of the container.

The container_uuid variable is set automatically to the first 32 characters
of the container ID, formatted as a standard UUID.

**--entrypoint**=""
Overwrite the default ENTRYPOINT of the image

Expand Down Expand Up @@ -234,6 +238,17 @@ ENTRYPOINT.

When set to true, keep stdin open even if not attached. The default is false.

**--init**=""

Enable a pre-configured profile for running init systems within containers.
Default: No profile is enabled.

'systemd': Changes the way docker runs a container, based on the systemd container specification.
* Mounts "/run" as a tmpfs.
* Mounts /sys/fs/cgroup into the container as a read-only volume.
* Adds the container_uuid environment variable.
* Sets up a volume mount at /var/log/journal/UUID, allowing journald data within the container to be seen by the host journalctl.

**--ipc**=""
Default is to create a private IPC namespace (POSIX SysV IPC) for the container
'container:<name|id>': reuses another container shared memory, semaphores and message queues
Expand Down Expand Up @@ -565,6 +580,7 @@ Running the **env** command in the linker container shows environment variables
with the LT (alias) context (**LT_**)

# env
container_uuid=be84194d-87f9-08c2-b2e1-67311f4409f5
HOSTNAME=668231cb0978
TERM=xterm
LT_PORT_80_TCP=tcp://172.17.0.3:80
Expand Down