Skip to content

Commit

Permalink
apacheGH-35190: [Go] Correctly handle null values in CSV reader (apac…
Browse files Browse the repository at this point in the history
…he#35191)

The way the CSV reader handles null values currently makes it impossible for empty strings to be interpreted as null, even when this is explicitly enabled through the `csv.WithNullReader(stringsCanBeNull: true, "", "NULL", "null")` option. This change fixes this.
* Closes: apache#35190

Authored-by: Herman Schaaf <hermanschaaf@gmail.com>
Signed-off-by: Matt Topol <zotthewizard@gmail.com>
  • Loading branch information
hermanschaaf authored and ArgusLi committed May 15, 2023
1 parent 92e587f commit ad541bb
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 25 deletions.
2 changes: 1 addition & 1 deletion go/arrow/csv/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -766,7 +766,7 @@ func (r *Reader) parseList(field array.Builder, str string) {

func (r *Reader) parseBinaryType(field array.Builder, str string) {
// specialize the implementation when we know we cannot have nulls
if str != "" && r.isNull(str) {
if r.isNull(str) {
field.AppendNull()
return
}
Expand Down
65 changes: 43 additions & 22 deletions go/arrow/csv/reader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,28 +290,42 @@ func TestCSVReaderParseError(t *testing.T) {

func TestCSVReader(t *testing.T) {
tests := []struct {
Name string
File string
Header bool
}{{
Name: "NoHeader",
File: "testdata/types.csv",
Header: false,
}, {
Name: "Header",
File: "testdata/header.csv",
Header: true,
}}
Name string
File string
Header bool
StringsCanBeNull bool
}{
{
Name: "NoHeader",
File: "testdata/types.csv",
Header: false,
}, {
Name: "Header",
File: "testdata/header.csv",
Header: true,
},
{
Name: "NoHeader_StringsCanBeNull",
File: "testdata/types.csv",
Header: false,
StringsCanBeNull: true,
}, {
Name: "Header_StringsCanBeNull",
File: "testdata/header.csv",
Header: true,
StringsCanBeNull: true,
},
}
for _, test := range tests {
t.Run(test.Name, func(t *testing.T) {
testCSVReader(t, test.File, test.Header)
testCSVReader(t, test.File, test.Header, test.StringsCanBeNull)
})
}
}

var defaultNullValues = []string{"", "NULL", "null", "N/A"}

func testCSVReader(t *testing.T, filepath string, withHeader bool) {
func testCSVReader(t *testing.T, filepath string, withHeader bool, stringsCanBeNull bool) {
mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
defer mem.AssertSize(t, 0)

Expand Down Expand Up @@ -345,7 +359,7 @@ func testCSVReader(t *testing.T, filepath string, withHeader bool) {
csv.WithAllocator(mem),
csv.WithComment('#'), csv.WithComma(';'),
csv.WithHeader(withHeader),
csv.WithNullReader(true, defaultNullValues...),
csv.WithNullReader(stringsCanBeNull, defaultNullValues...),
)
defer r.Release()

Expand All @@ -372,7 +386,14 @@ func testCSVReader(t *testing.T, filepath string, withHeader bool) {
t.Fatalf("invalid number of rows: got=%d, want=%d", got, want)
}

want := `rec[0]["bool"]: [true]
str1Value := `""`
str2Value := `"null"`
if stringsCanBeNull {
str1Value = "(null)"
str2Value = "(null)"
}

want := fmt.Sprintf(`rec[0]["bool"]: [true]
rec[0]["i8"]: [-1]
rec[0]["i16"]: [-1]
rec[0]["i32"]: [-1]
Expand All @@ -399,10 +420,10 @@ rec[1]["u32"]: [2]
rec[1]["u64"]: [2]
rec[1]["f32"]: [2.2]
rec[1]["f64"]: [2.2]
rec[1]["str"]: ["str-2"]
rec[1]["str"]: [%s]
rec[1]["ts"]: [1652140799000]
rec[1]["list(i64)"]: [[]]
rec[1]["binary"]: [""]
rec[1]["binary"]: [(null)]
rec[1]["uuid"]: ["00000000-0000-0000-0000-000000000002"]
rec[2]["bool"]: [(null)]
rec[2]["i8"]: [(null)]
Expand All @@ -415,12 +436,12 @@ rec[2]["u32"]: [(null)]
rec[2]["u64"]: [(null)]
rec[2]["f32"]: [(null)]
rec[2]["f64"]: [(null)]
rec[2]["str"]: [(null)]
rec[2]["str"]: [%s]
rec[2]["ts"]: [(null)]
rec[2]["list(i64)"]: [(null)]
rec[2]["binary"]: [(null)]
rec[2]["uuid"]: [(null)]
`
`, str1Value, str2Value)
got, want := out.String(), want
require.Equal(t, want, got)

Expand All @@ -434,7 +455,7 @@ rec[2]["uuid"]: [(null)]
csv.WithAllocator(mem),
csv.WithComment('#'), csv.WithComma(';'),
csv.WithHeader(withHeader),
csv.WithNullReader(true),
csv.WithNullReader(stringsCanBeNull),
)

r.Next()
Expand Down Expand Up @@ -894,7 +915,7 @@ func TestInferCSVOptions(t *testing.T) {
}, nil)
expRec, _, _ := array.RecordFromJSON(mem, expSchema, strings.NewReader(`[
{"f64": 1.1, "i32": -1, "bool": true, "str": "str-1", "i64": -1, "u64": 1, "i8": -1},
{"f64": 2.2, "i32": -2, "bool": false, "str": "str-2", "i64": -2, "u64": 2, "i8": -2},
{"f64": 2.2, "i32": -2, "bool": false, "str": null, "i64": -2, "u64": 2, "i8": -2},
{"f64": null, "i32": null, "bool": null, "str": null, "i64": null, "u64": null, "i8": null}
]`))
defer expRec.Release()
Expand Down
2 changes: 1 addition & 1 deletion go/arrow/csv/testdata/header.csv
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@
#
bool;i8;i16;i32;i64;u8;u16;u32;u64;f32;f64;str;ts;list(i64);binary;uuid
true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;str-1;2022-05-09T00:01:01;{1,2,3};AAEC;00000000-0000-0000-0000-000000000001
false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;str-2;2022-05-09T23:59:59;{};;00000000-0000-0000-0000-000000000002
false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;;2022-05-09T23:59:59;{};;00000000-0000-0000-0000-000000000002
null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null
2 changes: 1 addition & 1 deletion go/arrow/csv/testdata/types.csv
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@
#
## supported types: bool;int8;int16;int32;int64;uint8;uint16;uint32;uint64;float32;float64;string;timestamp;binary;uuid
true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;str-1;2022-05-09T00:01:01;{1,2,3};AAEC;00000000-0000-0000-0000-000000000001
false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;str-2;2022-05-09T23:59:59;{};;00000000-0000-0000-0000-000000000002
false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;;2022-05-09T23:59:59;{};;00000000-0000-0000-0000-000000000002
null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null

0 comments on commit ad541bb

Please sign in to comment.