Skip to content

Commit

Permalink
Handle empty strings encoded in UTF16
Browse files Browse the repository at this point in the history
Fix #53
  • Loading branch information
n10v committed Mar 25, 2020
1 parent 3da5773 commit 051e4b2
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 4 deletions.
4 changes: 3 additions & 1 deletion buf_reader.go
Expand Up @@ -170,7 +170,9 @@ func (br *bufReader) ReadText(encoding Encoding) []byte {
text, br.err = br.readTillDelims(delims)

// See https://github.com/bogem/id3v2/issues/51.
if encoding.Equals(EncodingUTF16) {
if encoding.Equals(EncodingUTF16) &&
// See https://github.com/bogem/id3v2/issues/53#issuecomment-604038434.
!bytes.Equal(text, BOM) {
text = append(text, br.ReadByte())
}

Expand Down
38 changes: 38 additions & 0 deletions buf_reader_test.go
Expand Up @@ -52,6 +52,44 @@ func TestReadTillZero(t *testing.T) {
}
}

// TestReadTextUTF16WithLeadingEmptyString tests if string encoded in UTF16 with BOM
// with leading empty string with same encoding is read correctly.
//
// E.g. this can happen in comment frame with empty description and encoded in UTF16 with BOM.
//
// See https://github.com/bogem/id3v2/issues/53.
func TestReadTextUTF16WithLeadingEmptyString(t *testing.T) {
t.Parallel()

sampleText1 := append(BOM, EncodingUTF16.TerminationBytes...)

utf16C := []byte{0x43, 0x00} // "C" char in UTF-16.
sampleText2 := append(BOM, append(utf16C, EncodingUTF16.TerminationBytes...)...)

sampleText := append(sampleText1, sampleText2...)

bufReader := newBufReader(bytes.NewReader(sampleText))

text := decodeText(bufReader.ReadText(EncodingUTF16), EncodingUTF16)
if text != "" {
t.Errorf("Expected empty text, got: %v", text)
}
// bufReader should only read sampleText1, so Buffered() should return len of sampleText2.
if bufReader.buf.Buffered() != len(sampleText2) {
t.Errorf("Expected buffered: %v, got %v", len(sampleText2), bufReader.buf.Buffered())
}

text = decodeText(bufReader.ReadText(EncodingUTF16), EncodingUTF16)
utf8C := "C"
if text != utf8C {
t.Errorf("Expected text: %v, got: %v", utf8C, text)
}
// bufReader.buf should be empty, because it should read the whole sampleText.
if bufReader.buf.Buffered() != 0 {
t.Errorf("Expected buffered: 0, got %v", bufReader.buf.Buffered())
}
}

func TestNext(t *testing.T) {
t.Parallel()

Expand Down
14 changes: 12 additions & 2 deletions encoding.go
Expand Up @@ -86,6 +86,10 @@ var (
xencodingUTF8 = newXEncodingWrapper(unicode.UTF8)
)

// BOM is the two-byte sequence 0xFF 0xFE, the byte order mark that prefixes
// little-endian UTF-16 encoded text.
// See https://en.wikipedia.org/wiki/Byte_order_mark.
//
// NOTE(review): BOM is an exported, mutable slice; callers should treat it as
// read-only and copy it before appending to or modifying it.
var BOM = []byte{0xFF, 0xFE}

// getEncoding returns Encoding in accordance with ID3v2 key.
func getEncoding(key byte) Encoding {
if key > 3 {
Expand Down Expand Up @@ -115,14 +119,20 @@ func decodeText(src []byte, from Encoding) string {
return string(src)
}

// If src is just BOM, then it's an empty string.
if from.Equals(EncodingUTF16) && bytes.Equal(src, BOM) {
return ""
}

fromXEncoding := resolveXEncoding(src, from)
result, err := fromXEncoding.Decoder().Bytes(src)
if err != nil {
return string(src)
}

// HACK: Delete REPLACEMENT CHARACTER (�) if encoding went wrong.
// See https://apps.timwhitlock.info/unicode/inspect?s=%EF%BF%BD
// See https://apps.timwhitlock.info/unicode/inspect?s=%EF%BF%BD.
// See https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8.
if from.Equals(EncodingUTF16) {
// bytes.Replace(s, old, new, -1) is the same as bytes.ReplaceAll(s, old, new),
// but bytes.ReplaceAll is only added in Go 1.12.
Expand Down Expand Up @@ -159,7 +169,7 @@ func resolveXEncoding(src []byte, encoding Encoding) xencodingWrapper {
case 0:
return xencodingISO
case 1:
if len(src) > 2 && src[0] == 0xFF && src[1] == 0xFE {
if len(src) > 2 && bytes.Equal(src[:2], BOM) {
return xencodingUTF16LEBOM
}
return xencodingUTF16BEBOM
Expand Down
3 changes: 2 additions & 1 deletion encoding_test.go
Expand Up @@ -12,7 +12,8 @@ func TestDecodeText(t *testing.T) {
utf8 string
}{
{[]byte{0x48, 0xE9, 0x6C, 0x6C, 0xF6}, EncodingISO, "Héllö"},
{[]byte{0xFF, 0xFE, 0x48, 0x00, 0xE9, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0xF6, 0x00}, EncodingUTF16, "Héllö"}, // UTF-16LE with BOM
{[]byte{0xFF, 0xFE, 0x48, 0x00, 0xE9, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0xF6, 0x00}, EncodingUTF16, "Héllö"},
{[]byte{0xFF, 0xFE}, EncodingUTF16, ""},
{[]byte{0x00, 0x48, 0x00, 0xE9, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0xF6}, EncodingUTF16BE, "Héllö"},
}

Expand Down

0 comments on commit 051e4b2

Please sign in to comment.