Skip to content

Commit

Permalink
Pull request 1924: 6003-relax-rule-validation
Browse files Browse the repository at this point in the history
Updates #6003.

Squashed commit of the following:

commit 1874860
Author: Ainar Garipov <A.Garipov@AdGuard.COM>
Date:   Thu Jul 13 19:36:26 2023 +0300

    filtering/rulelist: imp test

commit 871a41a
Author: Ainar Garipov <A.Garipov@AdGuard.COM>
Date:   Thu Jul 13 19:10:35 2023 +0300

    filtering/rulelist: relax validation
  • Loading branch information
ainar-g committed Jul 13, 2023
1 parent f22d893 commit 2adc862
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 29 deletions.
5 changes: 3 additions & 2 deletions CHANGELOG.md
Expand Up @@ -25,8 +25,9 @@ NOTE: Add new changes BELOW THIS COMMENT.

### Fixed

- `bufio.Scanner: token too long` errors when trying to add filtering-rule lists
with lines over 1024 bytes long ([#6003]).
- `bufio.Scanner: token too long` and other errors when trying to add
filtering-rule lists with lines over 1024 bytes long or containing cosmetic
rules ([#6003]).

### Removed

Expand Down
35 changes: 14 additions & 21 deletions internal/filtering/rulelist/parser.go
Expand Up @@ -6,10 +6,9 @@ import (
"fmt"
"hash/crc32"
"io"
"unicode"
"unicode/utf8"

"github.com/AdguardTeam/golibs/errors"
"golang.org/x/exp/slices"
)

// Parser is a filtering-rule parser that collects data, such as the checksum
Expand Down Expand Up @@ -105,13 +104,11 @@ func (p *Parser) processLine(dst io.Writer, line []byte, lineNum int) (n int, er
badIdx, isRule = p.parseLineTitle(trimmed)
}
if badIdx != -1 {
badRune, _ := utf8.DecodeRune(trimmed[badIdx:])

return 0, fmt.Errorf(
"line %d: character %d: non-printable character %q",
"line %d: character %d: likely binary character %q",
lineNum,
badIdx+bytes.Index(line, trimmed)+1,
badRune,
trimmed[badIdx],
)
}

Expand Down Expand Up @@ -144,41 +141,37 @@ func hasPrefixFold(b, prefix []byte) (ok bool) {
}

// parseLine returns true if the parsed line is a filtering rule. line is
// assumed to be trimmed of whitespace characters. nonPrintIdx is the index of
// the first non-printable character, if any; if there are none, nonPrintIdx is
// -1.
// assumed to be trimmed of whitespace characters. badIdx is the index of the
// first character that may indicate that this is a binary file, or -1 if none.
//
// A line is considered a rule if it's not empty, not a comment, and contains
// no characters that are likely to come from a binary file.
func parseLine(line []byte) (nonPrintIdx int, isRule bool) {
func parseLine(line []byte) (badIdx int, isRule bool) {
if len(line) == 0 || line[0] == '#' || line[0] == '!' {
return -1, false
}

nonPrintIdx = bytes.IndexFunc(line, isNotPrintable)
badIdx = slices.IndexFunc(line, likelyBinary)

return nonPrintIdx, nonPrintIdx == -1
return badIdx, badIdx == -1
}

// isNotPrintable returns true if r is not a printable character that can be
// contained in a filtering rule.
func isNotPrintable(r rune) (ok bool) {
// Tab isn't included into Unicode's graphic symbols, so include it here
// explicitly.
return r != '\t' && !unicode.IsGraphic(r)
// likelyBinary reports whether b looks like a byte taken from a binary file,
// that is, whether it is an ASCII control character (anything below the space
// character, or DEL) other than line feed, carriage return, or horizontal tab.
func likelyBinary(b byte) (ok bool) {
	switch b {
	case '\n', '\r', '\t':
		// Whitespace control characters are exempt from the check.
		return false
	case 0x7f:
		// DEL is the only control character above the space character.
		return true
	default:
		return b < ' '
	}
}

// parseLineTitle is like [parseLine] but additionally looks for a title. line
// is assumed to be trimmed of whitespace characters.
func (p *Parser) parseLineTitle(line []byte) (nonPrintIdx int, isRule bool) {
func (p *Parser) parseLineTitle(line []byte) (badIdx int, isRule bool) {
if len(line) == 0 || line[0] == '#' {
return -1, false
}

if line[0] != '!' {
nonPrintIdx = bytes.IndexFunc(line, isNotPrintable)
badIdx = slices.IndexFunc(line, likelyBinary)

return nonPrintIdx, nonPrintIdx == -1
return badIdx, badIdx == -1
}

const titlePattern = "! Title: "
Expand Down
26 changes: 22 additions & 4 deletions internal/filtering/rulelist/parser_test.go
Expand Up @@ -77,6 +77,14 @@ func TestParser_Parse(t *testing.T) {
wantTitle: "Test Title",
wantRulesNum: 1,
wantWritten: len(testRuleTextBlocked),
}, {
name: "cosmetic_with_zwnj",
in: testRuleTextCosmetic,
wantDst: testRuleTextCosmetic,
wantErrMsg: "",
wantTitle: "",
wantRulesNum: 1,
wantWritten: len(testRuleTextCosmetic),
}, {
name: "bad_char",
in: "! Title: Test Title \n" +
Expand All @@ -85,7 +93,7 @@ func TestParser_Parse(t *testing.T) {
wantDst: testRuleTextBlocked,
wantErrMsg: "line 3: " +
"character 4: " +
"non-printable character '\\x7f'",
"likely binary character '\\x7f'",
wantTitle: "Test Title",
wantRulesNum: 1,
wantWritten: len(testRuleTextBlocked),
Expand Down Expand Up @@ -215,6 +223,14 @@ func BenchmarkParser_Parse(b *testing.B) {

require.NoError(b, errSink)
require.NotNil(b, resSink)

// Most recent result, on a ThinkPad X13 with a Ryzen 7 PRO CPU:
//
// goos: linux
// goarch: amd64
// pkg: github.com/AdguardTeam/AdGuardHome/internal/filtering/rulelist
// cpu: AMD Ryzen 7 PRO 4750U with Radeon Graphics
// BenchmarkParser_Parse-16 100000000 128.0 ns/op 48 B/op 1 allocs/op
}

func FuzzParser_Parse(f *testing.F) {
Expand All @@ -226,15 +242,17 @@ func FuzzParser_Parse(f *testing.F) {
"! Comment",
"! Title ",
"! Title XXX",
testRuleTextBadTab,
testRuleTextBlocked,
testRuleTextCosmetic,
testRuleTextEtcHostsTab,
testRuleTextHTML,
testRuleTextBlocked,
testRuleTextBadTab,
"1.2.3.4",
"1.2.3.4 etc-hosts.example",
">>>\x00<<<",
">>>\x7F<<<",
strings.Repeat("a", n+1),
strings.Repeat("a", rulelist.DefaultRuleBufSize+1),
strings.Repeat("a", bufio.MaxScanTokenSize+1),
}

for _, tc := range testCases {
Expand Down
9 changes: 7 additions & 2 deletions internal/filtering/rulelist/rulelist_test.go
Expand Up @@ -7,8 +7,13 @@ const testTimeout = 1 * time.Second

// Common texts for tests.
const (
testRuleTextHTML = "<!DOCTYPE html>\n"
testRuleTextBlocked = "||blocked.example^\n"
testRuleTextBadTab = "||bad-tab-and-comment.example^\t# A comment.\n"
testRuleTextBlocked = "||blocked.example^\n"
testRuleTextEtcHostsTab = "0.0.0.0 tab..example^\t# A comment.\n"
testRuleTextHTML = "<!DOCTYPE html>\n"

// testRuleTextCosmetic is a cosmetic rule with a zero-width non-joiner.
//
// See https://github.com/AdguardTeam/AdGuardHome/issues/6003.
testRuleTextCosmetic = "||cosmetic.example## :has-text(/\u200c/i)\n"
)

0 comments on commit 2adc862

Please sign in to comment.