// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2016-present Datadog, Inc.
package obfuscate
import (
	"bytes"
	"fmt"
	"unicode"
	"unicode/utf8"

	"github.com/DataDog/datadog-agent/pkg/trace/config"
)
// sql_tokenizer.go implements a lexer-like iterator that tokenizes SQL and CQL
// strings so that an external component can filter or alter each token of the
// string. This implementation can't be used as a real SQL lexer (so a parser
// cannot build an AST) because many rules are ignored to keep the tokenizer
// simple.
// This implementation was inspired by the https://github.com/youtube/vitess SQL parser.
// TODO: add the license to the NOTICE file
// TokenKind specifies the type of the token being scanned. It may be one of the defined
// constants below or in some cases the actual rune itself.
type TokenKind uint32
// EndChar is used to signal that the scanner has finished reading the query. This happens when
// there are no more characters left in the query or when invalid encoding is discovered. EndChar
// is an invalid rune value that cannot be found in any valid string.
const EndChar = unicode.MaxRune + 1
// list of available tokens; this list has been reduced because we don't
// need a full-fledged lexer to implement the obfuscator
const (
LexError = TokenKind(57346) + iota
ID
Limit
Null
String
DoubleQuotedString
DollarQuotedString // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING
DollarQuotedFunc // a dollar-quoted string delimited by the tag "$func$"; gets special treatment when feature "dollar_quoted_func" is set
Number
BooleanLiteral
ValueArg
ListArg
Comment
Variable
Savepoint
PreparedStatement
EscapeSequence
NullSafeEqual
LE
GE
NE
As
From
Update
Insert
Into
Join
TableName
ColonCast
// FilteredGroupable specifies that the given token has been discarded by one of the
// token filters and that it is groupable together with consecutive FilteredGroupable
// tokens.
FilteredGroupable
// FilteredGroupableParenthesis is a parenthesis marked as filtered groupable. It is the
// beginning of either a group of values ('(') or a nested query. We track it as
// a special case for when it may start a nested query as opposed to just another
// value group to be obfuscated.
FilteredGroupableParenthesis
// Filtered specifies that the token is a comma and was discarded by one
// of the filters.
Filtered
// FilteredBracketedIdentifier specifies that we are currently discarding
// a bracketed identifier (MSSQL).
// See issue https://github.com/DataDog/datadog-trace-agent/issues/475.
FilteredBracketedIdentifier
)
var tokenKindStrings = map[TokenKind]string{
LexError: "LexError",
ID: "ID",
Limit: "Limit",
Null: "Null",
String: "String",
DoubleQuotedString: "DoubleQuotedString",
DollarQuotedString: "DollarQuotedString",
DollarQuotedFunc: "DollarQuotedFunc",
Number: "Number",
BooleanLiteral: "BooleanLiteral",
ValueArg: "ValueArg",
ListArg: "ListArg",
Comment: "Comment",
Variable: "Variable",
Savepoint: "Savepoint",
PreparedStatement: "PreparedStatement",
EscapeSequence: "EscapeSequence",
NullSafeEqual: "NullSafeEqual",
LE: "LE",
GE: "GE",
NE: "NE",
As: "As",
From: "From",
Update: "Update",
Insert: "Insert",
Into: "Into",
Join: "Join",
TableName: "TableName",
ColonCast: "ColonCast",
FilteredGroupable: "FilteredGroupable",
FilteredGroupableParenthesis: "FilteredGroupableParenthesis",
Filtered: "Filtered",
FilteredBracketedIdentifier: "FilteredBracketedIdentifier",
}
func (k TokenKind) String() string {
str, ok := tokenKindStrings[k]
if !ok {
return "<unknown>"
}
return str
}
const escapeCharacter = '\\'
// SQLTokenizer is the struct used to generate SQL
// tokens for the parser.
type SQLTokenizer struct {
	pos            int    // byte offset of lastChar
	lastChar       rune   // last read rune
	buf            []byte // buf holds the query that we are parsing
	off            int    // off is the index into buf where the unread portion of the query begins.
	err            error  // any error that occurred while reading
	curlys         uint32 // number of active open curly braces in top-level SQL escape sequences.
	literalEscapes bool   // indicates we should not treat backslashes as escape characters
	seenEscape     bool   // indicates whether this tokenizer has seen an escape character within a string
}
// NewSQLTokenizer creates a new SQLTokenizer for the given SQL string. The literalEscapes argument specifies
// whether escape characters should be treated literally rather than as escapes.
func NewSQLTokenizer(sql string, literalEscapes bool) *SQLTokenizer {
return &SQLTokenizer{
buf: []byte(sql),
literalEscapes: literalEscapes,
}
}
// Reset resets the underlying buffer and positions.
func (tkn *SQLTokenizer) Reset(in string) {
tkn.pos = 0
tkn.lastChar = 0
tkn.buf = []byte(in)
tkn.off = 0
tkn.err = nil
}
// keywords used to recognize string tokens
var keywords = map[string]TokenKind{
"NULL": Null,
"TRUE": BooleanLiteral,
"FALSE": BooleanLiteral,
"SAVEPOINT": Savepoint,
"LIMIT": Limit,
"AS": As,
"FROM": From,
"UPDATE": Update,
"INSERT": Insert,
"INTO": Into,
"JOIN": Join,
}
// Err returns the last error that the tokenizer encountered, or nil.
func (tkn *SQLTokenizer) Err() error { return tkn.err }
func (tkn *SQLTokenizer) setErr(format string, args ...interface{}) {
if tkn.err != nil {
return
}
tkn.err = fmt.Errorf("at position %d: %v", tkn.pos, fmt.Errorf(format, args...))
}
// SeenEscape returns whether this tokenizer has seen an escape character within a scanned string.
func (tkn *SQLTokenizer) SeenEscape() bool { return tkn.seenEscape }
// Scan scans the tokenizer for the next token and returns
// the token type and the token buffer.
func (tkn *SQLTokenizer) Scan() (TokenKind, []byte) {
if tkn.lastChar == 0 {
tkn.advance()
}
tkn.skipBlank()
switch ch := tkn.lastChar; {
case isLeadingLetter(ch):
return tkn.scanIdentifier()
case isDigit(ch):
return tkn.scanNumber(false)
default:
tkn.advance()
if tkn.lastChar == EndChar && tkn.err != nil {
// advance discovered an invalid encoding. We should return early.
return LexError, nil
}
switch ch {
case EndChar:
if tkn.err != nil {
return LexError, nil
}
return EndChar, nil
case ':':
if tkn.lastChar == ':' {
tkn.advance()
return ColonCast, []byte("::")
}
if tkn.lastChar != '=' {
return tkn.scanBindVar()
}
fallthrough
case '=', ',', ';', '(', ')', '+', '*', '&', '|', '^', '~', '[', ']', '?':
return TokenKind(ch), tkn.bytes()
case '.':
if isDigit(tkn.lastChar) {
return tkn.scanNumber(true)
}
return TokenKind(ch), tkn.bytes()
case '/':
switch tkn.lastChar {
case '/':
tkn.advance()
return tkn.scanCommentType1("//")
case '*':
tkn.advance()
return tkn.scanCommentType2()
default:
return TokenKind(ch), tkn.bytes()
}
case '-':
if tkn.lastChar == '-' {
tkn.advance()
return tkn.scanCommentType1("--")
}
return TokenKind(ch), tkn.bytes()
case '#':
tkn.advance()
return tkn.scanCommentType1("#")
case '<':
switch tkn.lastChar {
case '>':
tkn.advance()
return NE, []byte("<>")
case '=':
tkn.advance()
switch tkn.lastChar {
case '>':
tkn.advance()
return NullSafeEqual, []byte("<=>")
default:
return LE, []byte("<=")
}
default:
return TokenKind(ch), tkn.bytes()
}
case '>':
if tkn.lastChar == '=' {
tkn.advance()
return GE, []byte(">=")
}
return TokenKind(ch), tkn.bytes()
case '!':
if tkn.lastChar == '=' {
tkn.advance()
return NE, []byte("!=")
}
tkn.setErr(`expected "=" after "!", got "%c" (%d)`, tkn.lastChar, tkn.lastChar)
return LexError, tkn.bytes()
case '\'':
return tkn.scanString(ch, String)
case '"':
return tkn.scanString(ch, DoubleQuotedString)
case '`':
return tkn.scanLiteralIdentifier('`')
case '%':
if tkn.lastChar == '(' {
return tkn.scanVariableIdentifier('%')
}
if isLetter(tkn.lastChar) {
// format parameter (e.g. '%s')
return tkn.scanFormatParameter('%')
}
// modulo operator (e.g. 'id % 8')
return TokenKind(ch), tkn.bytes()
case '$':
if isDigit(tkn.lastChar) {
// TODO(gbbr): the first digit after $ does not necessarily guarantee
// that this isn't a dollar-quoted string constant. We might eventually
// want to cover for this use-case too (e.g. $1$some text$1$).
return tkn.scanPreparedStatement('$')
}
kind, tok := tkn.scanDollarQuotedString()
if kind == DollarQuotedFunc {
				// this is considered an embedded query; we should try to
				// obfuscate it
out, err := attemptObfuscation(NewSQLTokenizer(string(tok), tkn.literalEscapes))
if err != nil {
// if we can't obfuscate it, treat it as a regular string
return DollarQuotedString, tok
}
tok = append(append([]byte("$func$"), []byte(out.Query)...), []byte("$func$")...)
}
return kind, tok
case '{':
if tkn.pos == 1 || tkn.curlys > 0 {
				// Do not fully obfuscate top-level SQL escape sequences like {[?=]call procedure-name[([parameter][,parameter]...)]}.
// We want these to display a bit more context than just a plain '?'
// See: https://docs.oracle.com/cd/E13157_01/wlevs/docs30/jdbc_drivers/sqlescape.html
tkn.curlys++
return TokenKind(ch), tkn.bytes()
}
return tkn.scanEscapeSequence('{')
case '}':
if tkn.curlys == 0 {
// A closing curly brace has no place outside an in-progress top-level SQL escape sequence
// started by the '{' switch-case.
tkn.setErr(`unexpected byte %d`, ch)
return LexError, tkn.bytes()
}
tkn.curlys--
return TokenKind(ch), tkn.bytes()
default:
tkn.setErr(`unexpected byte %d`, ch)
return LexError, tkn.bytes()
}
}
}
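// exampleScanLoop is a hypothetical helper, not part of the obfuscator's
// API, sketching how Scan is meant to be driven: call it repeatedly until
// it reports EndChar, and consult Err when it reports LexError.
func exampleScanLoop(sql string) {
	tkn := NewSQLTokenizer(sql, false)
	for {
		kind, tok := tkn.Scan()
		switch kind {
		case LexError:
			// the tokenizer records the detailed error internally
			fmt.Printf("lex error: %v\n", tkn.Err())
			return
		case EndChar:
			// no more characters left in the query
			return
		default:
			fmt.Printf("%s %q\n", kind, tok)
		}
	}
}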
func (tkn *SQLTokenizer) skipBlank() {
for unicode.IsSpace(tkn.lastChar) {
tkn.advance()
}
	tkn.bytes() // discard the scanned whitespace
}
// toUpper is a modified version of bytes.ToUpper. It returns an upper-cased version of the byte
// slice src with all Unicode letters mapped to their upper case. It is modified to also accept a
// byte slice dst as an argument, the underlying storage of which (up to the capacity of dst)
// will be used as the destination of the upper-case copy of src, if it fits. As a special case,
// toUpper will return src if the byte slice is already upper-case. This function is used rather
// than bytes.ToUpper to improve the memory performance of the obfuscator by avoiding the
// allocations that bytes.ToUpper would otherwise make.
func toUpper(src, dst []byte) []byte {
dst = dst[:0]
isASCII, hasLower := true, false
for i := 0; i < len(src); i++ {
c := src[i]
if c >= utf8.RuneSelf {
isASCII = false
break
}
hasLower = hasLower || ('a' <= c && c <= 'z')
}
if cap(dst) < len(src) {
dst = make([]byte, 0, len(src))
}
if isASCII { // optimize for ASCII-only byte slices.
if !hasLower {
// Just return src.
return src
}
dst = dst[:len(src)]
for i := 0; i < len(src); i++ {
c := src[i]
if 'a' <= c && c <= 'z' {
c -= 'a' - 'A'
}
dst[i] = c
}
return dst
}
// This *could* be optimized, but it's an uncommon case.
return bytes.Map(unicode.ToUpper, src)
}
func (tkn *SQLTokenizer) scanIdentifier() (TokenKind, []byte) {
tkn.advance()
for isLetter(tkn.lastChar) || isDigit(tkn.lastChar) || tkn.lastChar == '.' || tkn.lastChar == '*' {
tkn.advance()
}
t := tkn.bytes()
// Space allows us to upper-case identifiers 256 bytes long or less without allocating heap
// storage for them, since space is allocated on the stack. A size of 256 bytes was chosen
// based on the allowed length of sql identifiers in various sql implementations.
var space [256]byte
upper := toUpper(t, space[:0])
if keywordID, found := keywords[string(upper)]; found {
return keywordID, t
}
return ID, t
}
func (tkn *SQLTokenizer) scanLiteralIdentifier(quote rune) (TokenKind, []byte) {
tkn.bytes() // throw away initial quote
if !isLetter(tkn.lastChar) && !isDigit(tkn.lastChar) {
tkn.setErr(`unexpected character "%c" (%d) in literal identifier`, tkn.lastChar, tkn.lastChar)
return LexError, tkn.bytes()
}
for tkn.advance(); skipNonLiteralIdentifier(tkn.lastChar); tkn.advance() {
}
t := tkn.bytes()
	// literal identifiers are enclosed between quotes
if tkn.lastChar != quote {
tkn.setErr(`literal identifiers must end in "%c", got "%c" (%d)`, quote, tkn.lastChar, tkn.lastChar)
return LexError, t
}
tkn.advance()
return ID, t
}
func (tkn *SQLTokenizer) scanVariableIdentifier(prefix rune) (TokenKind, []byte) {
for tkn.advance(); tkn.lastChar != ')' && tkn.lastChar != EndChar; tkn.advance() {
}
tkn.advance()
if !isLetter(tkn.lastChar) {
tkn.setErr(`invalid character after variable identifier: "%c" (%d)`, tkn.lastChar, tkn.lastChar)
return LexError, tkn.bytes()
}
tkn.advance()
return Variable, tkn.bytes()
}
func (tkn *SQLTokenizer) scanFormatParameter(prefix rune) (TokenKind, []byte) {
tkn.advance()
return Variable, tkn.bytes()
}
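// For illustration, hypothetical inputs showing how '%' is resolved by
// Scan and the two helpers above:
//
//	%(name)s -> Variable  // pyformat parameter, via scanVariableIdentifier
//	%s       -> Variable  // format parameter, via scanFormatParameter
//	id % 8   -> '%' is returned as a plain token (modulo operator)
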
// scanDollarQuotedString scans a Postgres dollar-quoted string constant.
// See: https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING
func (tkn *SQLTokenizer) scanDollarQuotedString() (TokenKind, []byte) {
kind, tag := tkn.scanString('$', String)
if kind == LexError {
return kind, tkn.bytes()
}
var (
got int
buf bytes.Buffer
)
delim := tag
// on empty strings, tkn.scanString returns the delimiters
if string(delim) != "$$" {
// on non-empty strings, the delimiter is $tag$
delim = append([]byte{'$'}, delim...)
delim = append(delim, '$')
}
for {
ch := tkn.lastChar
tkn.advance()
if ch == EndChar {
tkn.setErr("unexpected EOF in dollar-quoted string")
return LexError, buf.Bytes()
}
if byte(ch) == delim[got] {
got++
if got == len(delim) {
break
}
continue
}
if got > 0 {
_, err := buf.Write(delim[:got])
if err != nil {
tkn.setErr("error reading dollar-quoted string: %v", err)
return LexError, buf.Bytes()
}
got = 0
}
buf.WriteRune(ch)
}
if config.HasFeature("dollar_quoted_func") && string(delim) == "$func$" {
		// dollar_quoted_func: treat "$func$"-delimited dollar-quoted strings
		// differently and do not obfuscate them as a string
return DollarQuotedFunc, buf.Bytes()
}
return DollarQuotedString, buf.Bytes()
}
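// For illustration, hypothetical dollar-quoted inputs (per the Postgres
// rules linked above) and the tokens they yield:
//
//	$$hello$$            -> DollarQuotedString "hello"
//	$tag$hel$$lo$tag$    -> DollarQuotedString "hel$$lo"
//	$func$SELECT 1$func$ -> DollarQuotedFunc "SELECT 1" when the
//	                        "dollar_quoted_func" feature is set,
//	                        DollarQuotedString otherwise
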
func (tkn *SQLTokenizer) scanPreparedStatement(prefix rune) (TokenKind, []byte) {
	// a prepared statement expects a digit identifier like $1
if !isDigit(tkn.lastChar) {
tkn.setErr(`prepared statements must start with digits, got "%c" (%d)`, tkn.lastChar, tkn.lastChar)
return LexError, tkn.bytes()
}
	// scanNumber keeps the prefix rune intact.
	// read the number and return an error, if any
token, buff := tkn.scanNumber(false)
if token == LexError {
tkn.setErr("invalid number")
return LexError, tkn.bytes()
}
return PreparedStatement, buff
}
func (tkn *SQLTokenizer) scanEscapeSequence(braces rune) (TokenKind, []byte) {
for tkn.lastChar != '}' && tkn.lastChar != EndChar {
tkn.advance()
}
	// we've reached the end of the string without finding
	// the closing curly brace
if tkn.lastChar == EndChar {
tkn.setErr("unexpected EOF in escape sequence")
return LexError, tkn.bytes()
}
tkn.advance()
return EscapeSequence, tkn.bytes()
}
func (tkn *SQLTokenizer) scanBindVar() (TokenKind, []byte) {
token := ValueArg
if tkn.lastChar == ':' {
token = ListArg
tkn.advance()
}
if !isLetter(tkn.lastChar) {
tkn.setErr(`bind variables should start with letters, got "%c" (%d)`, tkn.lastChar, tkn.lastChar)
return LexError, tkn.bytes()
}
for isLetter(tkn.lastChar) || isDigit(tkn.lastChar) || tkn.lastChar == '.' {
tkn.advance()
}
return token, tkn.bytes()
}
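// For illustration, hypothetical bind variables and their tokens:
//
//	:id    -> ValueArg ":id"
//	:a.b.c -> ValueArg ":a.b.c"
//
// Note that "::" is consumed by Scan as ColonCast before scanBindVar is
// reached.
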
// scanMantissa advances the tokenizer over consecutive digits of the given base.
func (tkn *SQLTokenizer) scanMantissa(base int) {
for digitVal(tkn.lastChar) < base {
tkn.advance()
}
}
func (tkn *SQLTokenizer) scanNumber(seenDecimalPoint bool) (TokenKind, []byte) {
if seenDecimalPoint {
tkn.scanMantissa(10)
goto exponent
}
if tkn.lastChar == '0' {
// int or float
tkn.advance()
if tkn.lastChar == 'x' || tkn.lastChar == 'X' {
// hexadecimal int
tkn.advance()
tkn.scanMantissa(16)
} else {
// octal int or float
seenDecimalDigit := false
tkn.scanMantissa(8)
if tkn.lastChar == '8' || tkn.lastChar == '9' {
// illegal octal int or float
seenDecimalDigit = true
tkn.scanMantissa(10)
}
if tkn.lastChar == '.' || tkn.lastChar == 'e' || tkn.lastChar == 'E' {
goto fraction
}
// octal int
if seenDecimalDigit {
// tkn.setErr called in caller
return LexError, tkn.bytes()
}
}
goto exit
}
// decimal int or float
tkn.scanMantissa(10)
fraction:
if tkn.lastChar == '.' {
tkn.advance()
tkn.scanMantissa(10)
}
exponent:
if tkn.lastChar == 'e' || tkn.lastChar == 'E' {
tkn.advance()
if tkn.lastChar == '+' || tkn.lastChar == '-' {
tkn.advance()
}
tkn.scanMantissa(10)
}
exit:
t := tkn.bytes()
if len(t) == 0 {
return LexError, nil
}
return Number, t
}
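// For illustration, hypothetical numeric inputs accepted by scanNumber:
//
//	42     -> Number "42"      // decimal integer
//	0x1F   -> Number "0x1F"    // hexadecimal integer
//	0755   -> Number "0755"    // octal integer
//	1.5e-3 -> Number "1.5e-3"  // fraction and exponent
//	.25    -> Number ".25"     // via Scan's '.' case (seenDecimalPoint)
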
func (tkn *SQLTokenizer) scanString(delim rune, kind TokenKind) (TokenKind, []byte) {
buf := bytes.NewBuffer(tkn.buf[:0])
for {
ch := tkn.lastChar
tkn.advance()
if ch == delim {
if tkn.lastChar == delim {
// doubling a delimiter is the default way to embed the delimiter within a string
tkn.advance()
} else {
// a single delimiter denotes the end of the string
break
}
} else if ch == escapeCharacter {
tkn.seenEscape = true
if !tkn.literalEscapes {
// treat as an escape character
ch = tkn.lastChar
tkn.advance()
}
}
if ch == EndChar {
tkn.setErr("unexpected EOF in string")
return LexError, buf.Bytes()
}
buf.WriteRune(ch)
}
	if (kind == ID && buf.Len() == 0) || bytes.IndexFunc(buf.Bytes(), func(r rune) bool { return !unicode.IsSpace(r) }) == -1 {
// This string is an empty or white-space only identifier.
// We should keep the start and end delimiters in order to
// avoid creating invalid queries.
// See: https://github.com/DataDog/datadog-trace-agent/issues/316
return kind, append(runeBytes(delim), runeBytes(delim)...)
}
return kind, buf.Bytes()
}
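// For illustration, hypothetical single-quoted inputs scanned with
// literalEscapes set to false:
//
//	'it''s' -> String "it's"  // doubled delimiter embeds the quote
//	'a\'b'  -> String "a'b"   // backslash acts as an escape character
//
// With literalEscapes set to true, the backslash is kept verbatim and the
// string ends at the next single quote.
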
func (tkn *SQLTokenizer) scanCommentType1(prefix string) (TokenKind, []byte) {
for tkn.lastChar != EndChar {
if tkn.lastChar == '\n' {
tkn.advance()
break
}
tkn.advance()
}
return Comment, tkn.bytes()
}
func (tkn *SQLTokenizer) scanCommentType2() (TokenKind, []byte) {
for {
if tkn.lastChar == '*' {
tkn.advance()
if tkn.lastChar == '/' {
tkn.advance()
break
}
continue
}
if tkn.lastChar == EndChar {
tkn.setErr("unexpected EOF in comment")
return LexError, tkn.bytes()
}
tkn.advance()
}
return Comment, tkn.bytes()
}
// advance advances the tokenizer to the next rune. If the decoder encounters an error decoding, or
// the end of the buffer is reached, tkn.lastChar will be set to EndChar. In case of a decoding
// error, tkn.err will also be set.
func (tkn *SQLTokenizer) advance() {
ch, n := utf8.DecodeRune(tkn.buf[tkn.off:])
if ch == utf8.RuneError && n < 2 {
tkn.pos++
tkn.lastChar = EndChar
if n == 1 {
tkn.setErr("invalid UTF-8 encoding beginning with 0x%x", tkn.buf[tkn.off])
}
return
}
if tkn.lastChar != 0 || tkn.pos > 0 {
// we are past the first character
tkn.pos += n
}
tkn.off += n
tkn.lastChar = ch
}
// bytes returns all the bytes that were advanced over since its last call.
// This excludes tkn.lastChar, which will remain in the buffer.
func (tkn *SQLTokenizer) bytes() []byte {
if tkn.lastChar == EndChar {
ret := tkn.buf[:tkn.off]
tkn.buf = tkn.buf[tkn.off:]
tkn.off = 0
return ret
}
lastLen := utf8.RuneLen(tkn.lastChar)
ret := tkn.buf[:tkn.off-lastLen]
tkn.buf = tkn.buf[tkn.off-lastLen:]
tkn.off = lastLen
return ret
}
func skipNonLiteralIdentifier(ch rune) bool {
	return isLetter(ch) || isDigit(ch) || ch == '.' || ch == '-'
}
func isLeadingLetter(ch rune) bool {
return unicode.IsLetter(ch) || ch == '_' || ch == '@'
}
func isLetter(ch rune) bool {
return isLeadingLetter(ch) || ch == '#'
}
func digitVal(ch rune) int {
switch {
case '0' <= ch && ch <= '9':
return int(ch) - '0'
case 'a' <= ch && ch <= 'f':
return int(ch) - 'a' + 10
case 'A' <= ch && ch <= 'F':
return int(ch) - 'A' + 10
}
return 16 // larger than any legal digit val
}
func isDigit(ch rune) bool { return '0' <= ch && ch <= '9' }
// runeBytes converts the given rune to a slice of bytes.
func runeBytes(r rune) []byte {
buf := make([]byte, utf8.UTFMax)
n := utf8.EncodeRune(buf, r)
return buf[:n]
}