diff --git a/src/iterators.zig b/src/iterators.zig
index 8160bcb..abcda9c 100644
--- a/src/iterators.zig
+++ b/src/iterators.zig
@@ -2,6 +2,7 @@
 //! [Released under GNU LGPLv3]
 const std = @import("std");
 const TableError = @import("zig-csv.zig").TableError;
+const Allocator = std.mem.Allocator;
 
 /// A struct for iterating over or fetching rows from a parsed table
 pub const TableIterator = struct {
@@ -10,6 +11,8 @@ pub const TableIterator = struct {
     delimiter: []const u8,
     header: []const []const u8,
     body: []const []const u8,
+    allocator: Allocator,
+    check_quote: bool,
 
     /// Reset the iterator for the function TableIterator.next
     pub fn reset(self: *TableIterator) void {
@@ -23,6 +26,8 @@ pub const TableIterator = struct {
         const row = RowIterator{
             .header = self.header,
             .row = std.mem.splitSequence(u8, self.body[self.iterator_index], self.delimiter),
+            .allocator = self.allocator,
+            .check_quote = self.check_quote,
         };
 
         self.iterator_index += 1;
@@ -37,6 +42,8 @@ pub const TableIterator = struct {
         return RowIterator{
             .header = self.header,
             .row = std.mem.splitSequence(u8, self.body[row_index], self.delimiter),
+            .allocator = self.allocator,
+            .check_quote = self.check_quote,
         };
     }
 };
@@ -57,6 +64,8 @@ pub const RowIterator = struct {
     iterator_index: usize = 0,
     header: []const []const u8,
     row: std.mem.SplitIterator(u8, .sequence),
+    allocator: Allocator,
+    check_quote: bool,
 
     /// Reset the iterator for the function RowIterator.next
     pub fn reset(self: *RowIterator) void {
@@ -69,12 +78,22 @@ pub const RowIterator = struct {
         const value = self.row.next();
         if (value == null) return null;
 
-        const item = RowItem{
+        var item = RowItem{
             .column_index = self.iterator_index,
             .key = self.header[self.iterator_index],
             .value = value.?,
         };
 
+        // A value that opens a double quote without closing it was split on a delimiter
+        // inside the quotes: re-join the following fragments until the closing quote
+        if (self.check_quote and item.value.len > 0 and item.value[0] == '"' and item.value[item.value.len - 1] != '"') {
+            while (item.value[item.value.len - 1] != '"') {
+                // An unterminated quote ends the row; stop instead of unwrapping null
+                const fragment = self.row.next() orelse break;
+                item.value = std.mem.concat(self.allocator, u8, &[_][]const u8{ item.value, self.row.delimiter, fragment }) catch item.value;
+            }
+        }
+
         self.iterator_index += 1;
 
         return item;
@@ -85,16 +104,23 @@ pub const RowIterator = struct {
         var iterator = std.mem.splitSequence(u8, self.row.buffer, self.row.delimiter);
         var current_column_index: usize = 0;
 
-        while (iterator.next()) |value| : (current_column_index += 1) {
-            if (current_column_index == target_column_index) {
-                return RowItem{
-                    .column_index = current_column_index,
-                    .key = self.header[current_column_index],
-                    .value = value,
-                };
+        if (self.check_quote) {
+            return RowItem{
+                .column_index = target_column_index,
+                .key = self.header[target_column_index],
+                .value = try getColumnItemInQuote(u8, &iterator, target_column_index, self.allocator),
+            };
+        } else {
+            while (iterator.next()) |value| : (current_column_index += 1) {
+                if (current_column_index == target_column_index) {
+                    return RowItem{
+                        .column_index = current_column_index,
+                        .key = self.header[current_column_index],
+                        .value = value,
+                    };
+                }
             }
         }
-
         return TableError.IndexNotFound;
     }
 };
@@ -114,20 +140,32 @@ pub const ColumnIterator = struct {
     column_index: usize,
     delimiter: []const u8,
     body: []const []const u8,
+    allocator: Allocator,
+    check_quote: bool,
 
     // Create a ColumnItem from a row
     fn rowToColumnItem(self: ColumnIterator, row: []const u8) ColumnItem {
         var item: ColumnItem = undefined;
         var values = std.mem.splitSequence(u8, row, self.delimiter);
 
-        var current_index: usize = 0;
-        while (values.next()) |value| : (current_index += 1) {
-            if (current_index == self.column_index) {
+        if (self.check_quote) {
+            const value: ?[]const u8 = getColumnItemInQuote(u8, &values, self.column_index, self.allocator) catch null;
+            if (value != null) {
                 item = ColumnItem{
                     .row_index = self.iterator_index,
-                    .value = value,
+                    .value = value.?,
                 };
             }
+        } else {
+            var current_index: usize = 0;
+            while (values.next()) |value| : (current_index += 1) {
+                if (current_index == self.column_index) {
+                    item = ColumnItem{
+                        .row_index = self.iterator_index,
+                        .value = value,
+                    };
+                }
+            }
         }
 
         return item;
@@ -160,3 +198,35 @@ pub const ColumnIterator = struct {
         return item;
     }
 };
+
+/// Return the value of a column in a row, while discarding delimiters inside "double quotes"
+/// Fragments of a quoted target value are re-joined with the delimiter into a new slice
+/// allocated with `allocator`; this function never frees that slice
+/// Returns TableError.IndexNotFound when the iterator ends before `target_index` is reached
+pub fn getColumnItemInQuote(comptime T: type, split_iterator: *std.mem.SplitIterator(T, .sequence), target_index: usize, allocator: std.mem.Allocator) TableError![]const T {
+    var index: usize = 0;
+    var in_quote = false;
+    var item_in_quote: []const T = &[_]T{};
+
+    while (split_iterator.next()) |item| {
+        if (!in_quote and item.len > 1 and item[0] == '"' and item[item.len - 1] != '"') { // check if item is the beginning of a double quoted value
+            in_quote = true;
+            if (index == target_index) item_in_quote = item;
+            continue;
+        } else if (in_quote) { // process item inside double quote
+            // allocate if item needs to be returned
+            if (index == target_index) {
+                item_in_quote = try std.mem.concat(allocator, T, &[_][]const T{ item_in_quote, split_iterator.delimiter, item });
+            }
+            if (item.len == 0 or item[item.len - 1] != '"') continue;
+            // item is the end of the double quoted value
+            in_quote = false;
+        }
+
+        // return item value
+        if (item_in_quote.len > 0) return item_in_quote else if (index == target_index) return item;
+        index += 1;
+    }
+
+    return TableError.IndexNotFound;
+}
diff --git a/src/zig-csv.zig b/src/zig-csv.zig
index b2efe36..9332af4 100644
--- a/src/zig-csv.zig
+++ b/src/zig-csv.zig
@@ -7,6 +7,7 @@ const ArrayList = std.ArrayList;
 const TableIterator = @import("iterators.zig").TableIterator;
 const RowIterator = @import("iterators.zig").RowIterator;
 const ColumnIterator = @import("iterators.zig").ColumnIterator;
+const getColumnItemInQuote = @import("iterators.zig").getColumnItemInQuote;
 
 /// A structure for storing settings for use with struct Table
 pub const Settings = struct {
@@ -14,6 +15,8 @@ pub const Settings = struct {
     delimiter: []const u8,
     /// The terminator that defines when a row of delimiter-separated values is terminated
     terminator: []const u8,
+    /// The check_quote discards delimiters inside "double quotes" when separating values
+    check_quote: bool = false,
 
     /// A function that returns the default settings that are most commonly used for CSV data
     /// { .delimiter = ",", .terminator = "\n" }
@@ -59,7 +62,13 @@ pub const Table = struct {
     body: std.ArrayListAligned([]const u8, null),
 
     // Return the item with the matching index from an iterator struct std.mem.SplitIterator(T)
-    fn splitIteratorGetIndex(comptime T: type, split_iterator: *std.mem.SplitIterator(T, .sequence), target_index: usize) TableError![]const T {
+    // With settings.check_quote set, the lookup is delegated to getColumnItemInQuote,
+    // which allocates re-joined quoted values with self.arena_allocator
+    fn splitIteratorGetIndex(self: *Table, comptime T: type, split_iterator: *std.mem.SplitIterator(T, .sequence), target_index: usize) TableError![]const T {
+        if (self.settings.check_quote) {
+            return getColumnItemInQuote(T, split_iterator, target_index, self.arena_allocator.allocator());
+        }
+
         var index: usize = 0;
 
         while (split_iterator.next()) |item| : (index += 1) {
@@ -120,11 +129,13 @@ pub const Table = struct {
     }
 
     /// Returns a struct TableIterator containing all rows inside struct Table
-    pub fn getAllRows(self: Table) TableIterator {
+    pub fn getAllRows(self: *Table) TableIterator {
         return TableIterator{
             .delimiter = self.settings.delimiter,
             .header = self.header.items,
             .body = self.body.items,
+            .allocator = self.arena_allocator.allocator(),
+            .check_quote = self.settings.check_quote,
         };
     }
 
@@ -144,7 +155,7 @@ pub const Table = struct {
     }
 
     /// Return a slice of row indexes by a provided column index and searched value
-    pub fn findRowIndexesByValue(self: Table, allocator: Allocator, column_index: usize, searched_value: []const u8) TableError![]usize {
+    pub fn findRowIndexesByValue(self: *Table, allocator: Allocator, column_index: usize, searched_value: []const u8) TableError![]usize {
         var row_indexes = ArrayList(usize).init(allocator);
 
         if (column_index >= self.header.items.len) return TableError.IndexNotFound;
@@ -153,7 +164,7 @@ pub const Table = struct {
             const row_count = std.mem.count(u8, row, self.settings.delimiter) + 1;
             var row_values = std.mem.splitSequence(u8, row, self.settings.delimiter);
             if (column_index >= row_count) return TableError.MissingValue;
-            const value = try Table.splitIteratorGetIndex(u8, &row_values, column_index);
+            const value = try self.splitIteratorGetIndex(u8, &row_values, column_index);
 
             if (std.mem.eql(u8, value, searched_value)) {
                 try row_indexes.append(row_index);
@@ -166,11 +177,13 @@ pub const Table = struct {
     }
 
     /// Returns a struct ColumnIterator, containing all elements of a given column by its index
-    pub fn getColumnByIndex(self: Table, column_index: usize) ColumnIterator {
+    pub fn getColumnByIndex(self: *Table, column_index: usize) ColumnIterator {
         return ColumnIterator{
             .body = self.body.items,
             .delimiter = self.settings.delimiter,
             .column_index = column_index,
+            .allocator = self.arena_allocator.allocator(),
+            .check_quote = self.settings.check_quote,
         };
     }
 
@@ -181,6 +194,8 @@ pub const Table = struct {
         return RowIterator{
             .header = self.header.items,
             .row = std.mem.splitSequence(u8, self.body.items[row_index], self.settings.delimiter),
+            .allocator = self.arena_allocator.allocator(),
+            .check_quote = self.settings.check_quote,
         };
     }
 