Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions npm/http/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@ func NPMRestServerListenAndServe(config npmconfig.Config, npmEncoder json.Marsha
rs.router.Handle(api.ClusterMetricsPath, metrics.GetHandler(metrics.ClusterMetrics))
}

if config.Toggles.EnableHTTPDebugAPI && npmEncoder != nil {
// ACN CLI debug handlerss
// TODO support the debug CLI for v2
// the nil check is for fan-out npm
if config.Toggles.EnableHTTPDebugAPI && npmEncoder != nil && !config.Toggles.EnableV2NPM {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to disable GetCache for V2, this is useful irrespective of debug CLI.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

currently the cache is buggy for v2. Let's fix this in another PR

// ACN CLI debug handlers
rs.router.Handle(api.NPMMgrPath, rs.npmCacheHandler(npmEncoder)).Methods(http.MethodGet)
}

Expand Down
5 changes: 2 additions & 3 deletions npm/pkg/dataplane/policies/chain-management_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,10 +139,9 @@ func isBaseChain(chain string) bool {
- delete all deprecated chains
- delete old v2 policy chains
3. Add/reposition the jump from FORWARD chain to AZURE-NPM chain.
TODO: add this jump (if necessary) in the iptables-restore call

TODO: could use one grep call instead of one for getting the jump line num and one for getting deprecated chains and old v2 policy chains
- should use a grep pattern like so: <line num...AZURE-NPM>|<Chain AZURE-NPM>
TODO: could use one grep call instead of separate calls for getting jump line nums and for getting deprecated chains and old v2 policy chains
- would use a grep pattern like so: <line num...AZURE-NPM>|<Chain AZURE-NPM>
*/
func (pMgr *PolicyManager) bootup(_ []string) error {
klog.Infof("booting up iptables Azure chains")
Expand Down
17 changes: 16 additions & 1 deletion npm/pkg/dataplane/policies/chain-management_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,20 @@ func TestBootupLinux(t *testing.T) {
calls: GetBootupTestCalls(),
wantErr: false,
},
{
name: "success after restore failure (no NPM prior)",
calls: []testutils.TestCmd{
{Cmd: []string{"iptables", "-w", "60", "-D", "FORWARD", "-j", "AZURE-NPM"}, ExitCode: 2}, // AZURE-NPM chain didn't exist
{Cmd: listAllCommandStrings, PipedToCommand: true},
{Cmd: []string{"grep", "Chain AZURE-NPM"}, ExitCode: 1},
fakeIPTablesRestoreFailureCommand, // e.g. xtables lock held by another app. Currently the stdout doesn't matter for retrying
fakeIPTablesRestoreCommand,
{Cmd: listLineNumbersCommandStrings, PipedToCommand: true},
{Cmd: []string{"grep", "AZURE-NPM"}, ExitCode: 1},
{Cmd: []string{"iptables", "-w", "60", "-I", "FORWARD", "-j", "AZURE-NPM", "-m", "conntrack", "--ctstate", "NEW"}},
},
wantErr: false,
},
{
name: "success: v2 existed prior",
calls: []testutils.TestCmd{
Expand Down Expand Up @@ -429,12 +443,13 @@ func TestBootupLinux(t *testing.T) {
wantErr: true,
},
{
name: "failure on restore (no NPM prior)",
name: "failure twice on restore (no NPM prior)",
calls: []testutils.TestCmd{
{Cmd: []string{"iptables", "-w", "60", "-D", "FORWARD", "-j", "AZURE-NPM"}, ExitCode: 2}, // AZURE-NPM chain didn't exist
{Cmd: listAllCommandStrings, PipedToCommand: true},
{Cmd: []string{"grep", "Chain AZURE-NPM"}, ExitCode: 1},
fakeIPTablesRestoreFailureCommand,
fakeIPTablesRestoreFailureCommand,
},
wantErr: true,
},
Expand Down
1 change: 1 addition & 0 deletions npm/pkg/dataplane/policies/policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ type NPMNetworkPolicy struct {
PolicyKey string
// PodSelectorIPSets holds all the IPSets generated from Pod Selector
PodSelectorIPSets []*ipsets.TranslatedIPSet
// TODO change to slice of pointers
// PodSelectorList holds target pod information to avoid duplicatoin in SrcList and DstList fields in ACLs
PodSelectorList []SetInfo
// RuleIPSets holds all IPSets generated from policy's rules
Expand Down
14 changes: 14 additions & 0 deletions npm/pkg/dataplane/policies/policy_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ type UniqueDirection bool
const (
forIngress UniqueDirection = true
forEgress UniqueDirection = false

// 5-6 elements depending on Included boolean
maxLengthForMatchSetSpecs = 6
)

// returns two booleans indicating whether the network policy has ingress and egress respectively
Expand Down Expand Up @@ -83,6 +86,17 @@ func (info SetInfo) comment() string {
return "!" + name
}

func (info SetInfo) matchSetSpecs(matchString string) []string {
specs := make([]string, 0, maxLengthForMatchSetSpecs)
specs = append(specs, util.IptablesModuleFlag, util.IptablesSetModuleFlag)
if !info.Included {
specs = append(specs, util.IptablesNotFlag)
}
hashedSetName := info.IPSet.GetHashedName()
specs = append(specs, util.IptablesMatchSetFlag, hashedSetName, matchString)
return specs
}

func (aclPolicy *ACLPolicy) comment() string {
// cleanPeerList contains peers that aren't namedports
var cleanPeerList []SetInfo
Expand Down
54 changes: 31 additions & 23 deletions npm/pkg/dataplane/policies/policymanager_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,34 @@ import (
)

const (
maxTryCount = 1
unknownLineErrorPattern = "line (\\d+) failed" // TODO this could happen if syntax is off or AZURE-NPM-INGRESS doesn't exist for -A AZURE-NPM-INGRESS -j hash(NP1) ...
knownLineErrorPattern = "Error occurred at line: (\\d+)"

chainSectionPrefix = "chain" // TODO are sections necessary for error handling?
maxLengthForMatchSetSpecs = 6 // 5-6 elements depending on Included boolean
maxTryCount = 2
// the error message doesn't describe the error if this pattern is within the message
// this could happen if syntax is off or AZURE-NPM-INGRESS doesn't exist for -A AZURE-NPM-INGRESS -j hash(NP1) ...
unknownLineErrorPattern = "line (\\d+) failed"
// the error message describes the error if this pattern is within the message
knownLineErrorPattern = "Error occurred at line: (\\d+)"

chainSectionPrefix = "chain"
)

/*
Error handling for iptables-restore:
Currently we retry on any error and will make two tries max.
The section IDs and line error patterns are pointless currently. Although we can eventually use them to skip a section with an error and salvage the rest of the file.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Can you reduce this oneline into two, its too long.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

addressing in followup PR


Known errors that we should retry on:
- exit status 4
- iptables: Resource temporarily unavailable.
- fork/exec /usr/sbin/iptables: resource temporarily unavailable
- Another app is currently holding the xtables lock; still 51s 0us time ahead to have a chance to grab the lock...
Another app is currently holding the xtables lock; still 41s 0us time ahead to have a chance to grab the lock...
Another app is currently holding the xtables lock; still 31s 0us time ahead to have a chance to grab the lock...
Another app is currently holding the xtables lock; still 21s 0us time ahead to have a chance to grab the lock...
Another app is currently holding the xtables lock; still 11s 0us time ahead to have a chance to grab the lock...
Another app is currently holding the xtables lock; still 1s 0us time ahead to have a chance to grab the lock...
Another app is currently holding the xtables lock. Stopped waiting after 60s.
*/

func (pMgr *PolicyManager) addPolicy(networkPolicy *NPMNetworkPolicy, _ map[string]string) error {
// 1. Add rules for the network policies and activate NPM (if necessary).
chainsToCreate := chainNames([]*NPMNetworkPolicy{networkPolicy})
Expand Down Expand Up @@ -117,7 +137,7 @@ func (pMgr *PolicyManager) newCreatorWithChains(chainNames []string) *ioutil.Fil
sectionID := joinWithDash(chainSectionPrefix, chainName)
counters := "-"
creator.AddLine(sectionID, nil, ":"+chainName, "-", counters)
// TODO remove sections??
// TODO remove sections if we never use section-based error handling (e.g. remove the whole section)
}
return creator
}
Expand Down Expand Up @@ -154,10 +174,9 @@ func (pMgr *PolicyManager) deleteJumpRule(policy *NPMNetworkPolicy, direction Un

specs = append([]string{baseChainName}, specs...)
errCode, err := pMgr.runIPTablesCommand(util.IptablesDeletionFlag, specs...)
if err != nil && errCode != couldntLoadTargetErrorCode {
// TODO check rule doesn't exist error code instead because the chain should exist
if err != nil && errCode != doesNotExistErrorCode {
errorString := fmt.Sprintf("failed to delete jump from %s chain to %s chain for policy %s with exit code %d", baseChainName, chainName, policy.PolicyKey, errCode)
log.Errorf(errorString+": %w", err)
log.Errorf("%s: %w", errorString, err)
return npmerrors.SimpleErrorWrapper(errorString, err)
}
return nil
Expand Down Expand Up @@ -264,13 +283,7 @@ func matchSetSpecsForNetworkPolicy(networkPolicy *NPMNetworkPolicy, matchType Ma
specs := make([]string, 0, maxLengthForMatchSetSpecs*len(networkPolicy.PodSelectorList))
matchString := matchType.toIPTablesString()
for _, setInfo := range networkPolicy.PodSelectorList {
// TODO consolidate this code with that in matchSetSpecsFromSetInfo
specs = append(specs, util.IptablesModuleFlag, util.IptablesSetModuleFlag)
if !setInfo.Included {
specs = append(specs, util.IptablesNotFlag)
}
hashedSetName := setInfo.IPSet.GetHashedName()
specs = append(specs, util.IptablesMatchSetFlag, hashedSetName, matchString)
specs = append(specs, setInfo.matchSetSpecs(matchString)...)
}
return specs
}
Expand All @@ -279,12 +292,7 @@ func matchSetSpecsFromSetInfo(setInfoList []SetInfo) []string {
specs := make([]string, 0, maxLengthForMatchSetSpecs*len(setInfoList))
for _, setInfo := range setInfoList {
matchString := setInfo.MatchType.toIPTablesString()
specs = append(specs, util.IptablesModuleFlag, util.IptablesSetModuleFlag)
if !setInfo.Included {
specs = append(specs, util.IptablesNotFlag)
}
hashedSetName := setInfo.IPSet.GetHashedName()
specs = append(specs, util.IptablesMatchSetFlag, hashedSetName, matchString)
specs = append(specs, setInfo.matchSetSpecs(matchString)...)
}
return specs
}
Expand Down
28 changes: 25 additions & 3 deletions npm/pkg/dataplane/policies/policymanager_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,28 @@ func TestCreatorForRemovePolicies(t *testing.T) {
dptestutils.AssertEqualLines(t, expectedLines, actualLines)
}

// similar to TestRemovePolicy in policymanager.go except an error occurs
// similar to TestRemovePolicy in policymanager_test.go except an acceptable error occurs
func TestRemovePoliciesAcceptableError(t *testing.T) {
metrics.ReinitializeAll()
calls := []testutils.TestCmd{
fakeIPTablesRestoreCommand,
// ignore exit code 1
getFakeDeleteJumpCommandWithCode("AZURE-NPM-INGRESS", ingressEgressNetPolIngressJump, 1),
// ignore exit code 1
getFakeDeleteJumpCommandWithCode("AZURE-NPM-EGRESS", ingressEgressNetPolEgressJump, 1),
fakeIPTablesRestoreCommand,
}
ioshim := common.NewMockIOShim(calls)
defer ioshim.VerifyCalls(t, calls)
pMgr := NewPolicyManager(ioshim, ipsetConfig)
require.NoError(t, pMgr.AddPolicy(bothDirectionsNetPol, epList))
require.NoError(t, pMgr.RemovePolicy(bothDirectionsNetPol.PolicyKey, nil))
_, ok := pMgr.GetPolicy(bothDirectionsNetPol.PolicyKey)
require.False(t, ok)
promVals{0, 1}.testPrometheusMetrics(t)
}

// similar to TestRemovePolicy in policymanager_test.go except an error occurs
func TestRemovePoliciesError(t *testing.T) {
tests := []struct {
name string
Expand All @@ -330,21 +351,22 @@ func TestRemovePoliciesError(t *testing.T) {
getFakeDeleteJumpCommand("AZURE-NPM-INGRESS", ingressEgressNetPolIngressJump),
getFakeDeleteJumpCommand("AZURE-NPM-EGRESS", ingressEgressNetPolEgressJump),
fakeIPTablesRestoreFailureCommand,
fakeIPTablesRestoreFailureCommand,
},
},
{
name: "error on delete for ingress",
calls: []testutils.TestCmd{
fakeIPTablesRestoreCommand,
getFakeDeleteJumpCommandWithCode("AZURE-NPM-INGRESS", ingressEgressNetPolIngressJump, 1), // anything but 0 or 2
getFakeDeleteJumpCommandWithCode("AZURE-NPM-INGRESS", ingressEgressNetPolIngressJump, 2), // anything but 0 or 1
},
},
{
name: "error on delete for egress",
calls: []testutils.TestCmd{
fakeIPTablesRestoreCommand,
getFakeDeleteJumpCommand("AZURE-NPM-INGRESS", ingressEgressNetPolIngressJump),
getFakeDeleteJumpCommandWithCode("AZURE-NPM-EGRESS", ingressEgressNetPolEgressJump, 1), // anything but 0 or 2
getFakeDeleteJumpCommandWithCode("AZURE-NPM-EGRESS", ingressEgressNetPolEgressJump, 2), // anything but 0 or 1
},
},
}
Expand Down
8 changes: 5 additions & 3 deletions npm/pkg/dataplane/policies/testutils_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ func GetAddPolicyTestCalls(_ *NPMNetworkPolicy) []testutils.TestCmd {
}

func GetAddPolicyFailureTestCalls(_ *NPMNetworkPolicy) []testutils.TestCmd {
return []testutils.TestCmd{fakeIPTablesRestoreFailureCommand}
return []testutils.TestCmd{fakeIPTablesRestoreFailureCommand, fakeIPTablesRestoreFailureCommand}
}

func GetRemovePolicyTestCalls(policy *NPMNetworkPolicy) []testutils.TestCmd {
Expand All @@ -44,8 +44,10 @@ func GetRemovePolicyTestCalls(policy *NPMNetworkPolicy) []testutils.TestCmd {
// GetRemovePolicyFailureTestCalls fails on the restore
func GetRemovePolicyFailureTestCalls(policy *NPMNetworkPolicy) []testutils.TestCmd {
calls := GetRemovePolicyTestCalls(policy)
calls[len(calls)-1] = fakeIPTablesRestoreFailureCommand // replace the restore success with a failure
return calls
// replace the restore success with a failure
calls[len(calls)-1] = fakeIPTablesRestoreFailureCommand
// add another failure
return append(calls, fakeIPTablesRestoreFailureCommand)
}

func GetBootupTestCalls() []testutils.TestCmd {
Expand Down