Skip to content

Commit

Permalink
Add per-endpoint sysctls to DriverOpts
Browse files Browse the repository at this point in the history
Until now it's been possible to set per-interface sysctls using, for
example, '--sysctl net.ipv6.conf.eth0.accept_ra=2'. But, the index in
the interface name is allocated serially, and the numbering in a container
with more than one interface may change when a container is restarted.
The change to make it possible to connect a container to more than one
network when it's created increased the ambiguity.

This change adds label "com.docker.network.endpoint.sysctls" to the
DriverOpts in EndpointSettings. This option is explicitly associated
with the interface.

Settings in "--sysctl" for "eth0" are migrated to DriverOpts.

Because using "--sysctl" with any interface apart from "eth0" would have
unpredictable results, it is now an error to use any other interface name
in the top level "--sysctl" option. The error message includes a hint at
how to use the new per-interface setting.

The per-endpoint sysctl name is a shortened form of the sysctl name,
intended to limit settings to 'net.*', and to eliminate the need to
identify the interface by name. For example:
    net.ipv6.conf.eth0.accept_ra=2
becomes:
    ipv6.conf.accept_ra=2

The value of DriverOpts["com.docker.network.endpoint.sysctls"] is a
comma separated list of these short-form sysctls.

Settings from '--sysctl' are applied by the runtime lib during task
creation. So, task creation fails if the endpoint does not exist.
Applying per-endpoint settings during interface configuration means the
endpoint can be created later, which paves the way for removal of the
SetKey OCI prestart hook.

Unlike other DriverOpts, the sysctl label itself is not driver-specific,
but each driver has a chance to check settings/values and raise an error
if a setting would cause it a problem - no such checks have been added
in this initial version. As a future extension, if required, it would be
possible for the driver to echo back valid/extended/modified settings to
libnetwork for it to apply to the interface. (At that point, the syntax
for the options could become driver specific to allow, for example, a
driver to create more than one interface).

Signed-off-by: Rob Murray <rob.murray@docker.com>
  • Loading branch information
robmry committed Apr 18, 2024
1 parent 8768d60 commit 5d0ab3f
Show file tree
Hide file tree
Showing 9 changed files with 298 additions and 3 deletions.
97 changes: 96 additions & 1 deletion api/server/router/container/container_routes.go
Expand Up @@ -23,6 +23,7 @@ import (
"github.com/docker/docker/api/types/versions"
containerpkg "github.com/docker/docker/container"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/libnetwork/netlabel"
"github.com/docker/docker/pkg/ioutils"
"github.com/docker/docker/runconfig"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
Expand Down Expand Up @@ -619,6 +620,12 @@ func (s *containerRouter) postContainersCreate(ctx context.Context, w http.Respo
warnings = append(warnings, warn)
}

if warn, err := handleSysctlBC(hostConfig, networkingConfig, version); err != nil {
return err
} else if warn != "" {
warnings = append(warnings, warn)
}

if hostConfig.PidsLimit != nil && *hostConfig.PidsLimit <= 0 {
// Don't set a limit if either no limit was specified, or "unlimited" was
// explicitly set.
Expand Down Expand Up @@ -675,7 +682,7 @@ func handleMACAddressBC(config *container.Config, hostConfig *container.HostConf
return "", nil
}
var warning string
if hostConfig.NetworkMode.IsDefault() || hostConfig.NetworkMode.IsBridge() || hostConfig.NetworkMode.IsUserDefined() {
if hostConfig.NetworkMode.IsBridge() || hostConfig.NetworkMode.IsUserDefined() {
ep, err := epConfigForNetMode(version, hostConfig.NetworkMode, networkingConfig)
if err != nil {
return "", errors.Wrap(err, "unable to migrate container-wide MAC address to a specific network")
Expand All @@ -694,6 +701,94 @@ func handleMACAddressBC(config *container.Config, hostConfig *container.HostConf
return warning, nil
}

// handleSysctlBC migrates top level network endpoint-specific '--sysctl'
// settings to an DriverOpts for an endpoint. This is necessary because sysctls
// are applied during container task creation, but sysctls that name an interface
// (for example 'net.ipv6.conf.eth0.forwarding') cannot be applied until the
// interface has been created. So, these settings are removed from hostConfig.Sysctls
// and added to DriverOpts[netlabel.EndpointSysctls].
//
// Because interface names ('ethN') are allocated sequentially, and the order of
// network connections is not deterministic on container restart, only 'eth0'
// would work reliably in a top-level '--sysctl' option, and then only when
// there's a single initial network connection. So, settings for 'eth0' are
// migrated to the primary interface, identified by 'hostConfig.NetworkMode'.
// Settings for other interfaces are treated as errors.
//
// In the DriverOpts, to restrict sysctl settings to those configuring network
// interfaces and because the interface name cannot be determined in advance, a
// short-form of the sysctl name is used. For example, 'net.ipv6.conf.eth0.forwarding'
// becomes 'ipv6.conf.forwarding'. The value in DriverOpts is a comma-separated list
// of these short-form settings.
//
// A warning is generated when settings are migrated.
func handleSysctlBC(
hostConfig *container.HostConfig,
netConfig *network.NetworkingConfig,
version string,
) (string, error) {
if !hostConfig.NetworkMode.IsPrivate() {
return "", nil
}

var ep *network.EndpointSettings
var toDelete []string
var netIfSysctls []string
for k, v := range hostConfig.Sysctls {
// If the sysctl name matches "net.*.*.eth0.*" ...
if spl := strings.SplitN(k, ".", 5); len(spl) == 5 && spl[0] == "net" && strings.HasPrefix(spl[3], "eth") {
// Transform the name to the endpoint-specific short form.
netIfSysctl := fmt.Sprintf("%s.%s.%s=%s", spl[1], spl[2], spl[4], v)
// Find the EndpointConfig to migrate settings to, if not already found.
if ep == nil {
var err error
ep, err = epConfigForNetMode(version, hostConfig.NetworkMode, netConfig)
if err != nil {
return "", fmt.Errorf("unable to find a network for sysctl %s: %w", k, err)
}
}
// Only try to migrate settings for "eth0", anything else would always
// have behaved unpredictably.
if spl[3] != "eth0" {
return "", fmt.Errorf(`unable to determine network endpoint for sysctl %s, use '--network=name=%s,sysctl=%s' or compose 'driver_opts: "%s":"%s"`,
k, hostConfig.NetworkMode.NetworkName(), netIfSysctl,
netlabel.EndpointSysctls, netIfSysctl)
}
// Prepare the migration.
toDelete = append(toDelete, k)
netIfSysctls = append(netIfSysctls, netIfSysctl)
}
}
if ep == nil {
return "", nil
}

// TODO(robmry) - refuse to do the migration, generate an error if API > some-future-version.

newDriverOpt := strings.Join(netIfSysctls, ",")
warning := fmt.Sprintf(`Migrated %s to DriverOpts{"%s":"%s"}. (Use "--network=name=%s,sysctl=%s", or compose "driver_opts".)`,
strings.Join(toDelete, ","),
netlabel.EndpointSysctls, newDriverOpt,
hostConfig.NetworkMode.NetworkName(), strings.Join(netIfSysctls, ",sysctl="))

// Append exiting per-endpoint sysctls to the migrated sysctls (give priority
// to per-endpoint settings).
if ep.DriverOpts == nil {
ep.DriverOpts = map[string]string{}
}
if oldDriverOpt, ok := ep.DriverOpts[netlabel.EndpointSysctls]; ok {
newDriverOpt += "," + oldDriverOpt
}
ep.DriverOpts[netlabel.EndpointSysctls] = newDriverOpt

// Delete migrated settings from the top-level sysctls.
for _, k := range toDelete {
delete(hostConfig.Sysctls, k)
}

return warning, nil
}

// epConfigForNetMode finds, or creates, an entry in netConfig.EndpointsConfig
// corresponding to nwMode.
//
Expand Down
84 changes: 84 additions & 0 deletions api/server/router/container/container_routes_test.go
@@ -1,10 +1,12 @@
package container

import (
"strings"
"testing"

"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/network"
"github.com/docker/docker/libnetwork/netlabel"
"gotest.tools/v3/assert"
is "gotest.tools/v3/assert/cmp"
)
Expand Down Expand Up @@ -228,3 +230,85 @@ func TestEpConfigForNetMode(t *testing.T) {
})
}
}

func TestHandleSysctlBC(t *testing.T) {
testcases := []struct {
name string
networkMode string
sysctls map[string]string
expEpSysctls []string
expSysctls map[string]string
expWarningContains []string
expError string
}{
{
name: "migrate to new ep",
networkMode: "mynet",
sysctls: map[string]string{
"net.ipv6.conf.all.disable_ipv6": "0",
"net.ipv6.conf.eth0.accept_ra": "2",
"net.ipv6.conf.eth0.forwarding": "1",
},
expSysctls: map[string]string{
"net.ipv6.conf.all.disable_ipv6": "0",
},
expEpSysctls: []string{"ipv6.conf.forwarding=1", "ipv6.conf.accept_ra=2"},
expWarningContains: []string{
"Migrated",
"net.ipv6.conf.eth0.accept_ra", "ipv6.conf.accept_ra=2",
"net.ipv6.conf.eth0.forwarding", "ipv6.conf.forwarding=1",
},
},
{
name: "migrate nothing",
networkMode: "mynet",
sysctls: map[string]string{
"net.ipv6.conf.all.disable_ipv6": "0",
},
expSysctls: map[string]string{
"net.ipv6.conf.all.disable_ipv6": "0",
},
},
}

for _, tc := range testcases {
t.Run(tc.name, func(t *testing.T) {
hostCfg := &container.HostConfig{
NetworkMode: container.NetworkMode(tc.networkMode),
Sysctls: map[string]string{},
}
for k, v := range tc.sysctls {
hostCfg.Sysctls[k] = v
}
netCfg := &network.NetworkingConfig{}

warnings, err := handleSysctlBC(hostCfg, netCfg, "1.45")

if tc.expError == "" {
assert.Check(t, err)
} else {
assert.Check(t, is.ErrorContains(err, tc.expError))
}

for _, s := range tc.expWarningContains {
assert.Check(t, is.Contains(warnings, s))
}

assert.Check(t, is.DeepEqual(hostCfg.Sysctls, tc.expSysctls))

ep := netCfg.EndpointsConfig[tc.networkMode]
if ep == nil {
assert.Check(t, is.Nil(tc.expEpSysctls))
} else {
got, ok := ep.DriverOpts[netlabel.EndpointSysctls]
assert.Check(t, ok)
// Check for expected ep-sysctls.
for _, want := range tc.expEpSysctls {
assert.Check(t, is.Contains(got, want))
}
// Check for unexpected ep-sysctls.
assert.Check(t, is.Len(got, len(strings.Join(tc.expEpSysctls, ","))))
}
})
}
}
12 changes: 12 additions & 0 deletions daemon/container_operations.go
Expand Up @@ -614,6 +614,18 @@ func validateEndpointSettings(nw *libnetwork.Network, nwName string, epConfig *n
}
}

if sysctls, ok := epConfig.DriverOpts[netlabel.EndpointSysctls]; ok {
for _, sysctl := range strings.Split(sysctls, ",") {
scname := strings.SplitN(sysctl, ".", 3)
if len(scname) != 3 || (scname[0] != "ipv4" && scname[0] != "ipv6") {
errs = append(errs,
fmt.Errorf(
"unrecognised network interface sysctl '%s'; represent 'net.X.Y.ethN.Z=V' as 'X.Y.Z=V', 'X' must be 'ipv4' or 'ipv6'",
sysctl))
}
}
}

if err := multierror.Join(errs...); err != nil {
return fmt.Errorf("invalid endpoint settings:\n%w", err)
}
Expand Down
30 changes: 30 additions & 0 deletions integration/networking/bridge_test.go
Expand Up @@ -9,8 +9,10 @@ import (

"github.com/docker/docker/api/types"
containertypes "github.com/docker/docker/api/types/container"
apinetwork "github.com/docker/docker/api/types/network"
"github.com/docker/docker/integration/internal/container"
"github.com/docker/docker/integration/internal/network"
"github.com/docker/docker/libnetwork/netlabel"
"github.com/docker/docker/testutil"
"github.com/docker/docker/testutil/daemon"
"github.com/google/go-cmp/cmp/cmpopts"
Expand Down Expand Up @@ -703,3 +705,31 @@ func TestSetInterfaceSysctl(t *testing.T) {
stdout := runRes.Stdout.String()
assert.Check(t, is.Contains(stdout, scName))
}

// Test that it's possible to set a sysctl on an interface in the container
// using DriverOpts.
func TestSetEndpointSysctl(t *testing.T) {
skip.If(t, testEnv.DaemonInfo.OSType == "windows", "no sysctl on Windows")

ctx := setupTest(t)
d := daemon.New(t)
d.StartWithBusybox(ctx, t)
defer d.Stop(t)

c := d.NewClientT(t)
defer c.Close()

const scName = "net.ipv4.conf.eth0.forwarding"
runRes := container.RunAttach(ctx, t, c,
container.WithCmd("sysctl", scName),
container.WithEndpointSettings(apinetwork.NetworkBridge, &apinetwork.EndpointSettings{
DriverOpts: map[string]string{
netlabel.EndpointSysctls: "ipv4.conf.forwarding=1",
},
}),
)
defer c.ContainerRemove(ctx, runRes.ContainerID, containertypes.RemoveOptions{Force: true})

stdout := runRes.Stdout.String()
assert.Check(t, is.Contains(stdout, scName))
}
13 changes: 13 additions & 0 deletions libnetwork/endpoint.go
Expand Up @@ -8,6 +8,7 @@ import (
"encoding/json"
"fmt"
"net"
"strings"
"sync"

"github.com/containerd/log"
Expand Down Expand Up @@ -400,6 +401,18 @@ func (ep *Endpoint) Value() []byte {
return b
}

func (ep *Endpoint) getSysctls() []string {
ep.mu.Lock()
defer ep.mu.Unlock()

if s, ok := ep.generic[netlabel.EndpointSysctls]; ok {
if ss, ok := s.(string); ok {
return strings.Split(ss, ",")
}
}
return nil
}

func (ep *Endpoint) SetValue(value []byte) error {
return json.Unmarshal(value, ep)
}
Expand Down
3 changes: 3 additions & 0 deletions libnetwork/netlabel/labels.go
Expand Up @@ -26,6 +26,9 @@ const (
// DNSServers A list of DNS servers associated with the endpoint
DNSServers = Prefix + ".endpoint.dnsservers"

// EndpointSysctls is a comma separated list of shortened sysctls ('net.X.Y.ethN.Z=V' -> 'X.Y.Z=V').
EndpointSysctls = Prefix + ".endpoint.sysctls"

// EnableIPv6 constant represents enabling IPV6 at network level
EnableIPv6 = Prefix + ".enable_ipv6"

Expand Down

0 comments on commit 5d0ab3f

Please sign in to comment.