Skip to content

Commit

Permalink
Fix unicsv reader delimiter detection (#1287)
Browse files Browse the repository at this point in the history
* Fix unicsv reader delimiter detection.

Any delimiter sequence that is enclosed has never been considered
a delimiter when splitting a line into fields.  However, previously
an enclosed potential delimiter could be detected when scanning the
first line, and then erroneously used as the delimiter.

* tweak auto

* add unicsv delimiter detection test
  • Loading branch information
tsteven4 committed Jun 29, 2024
1 parent f778134 commit 00c23a1
Show file tree
Hide file tree
Showing 8 changed files with 65 additions and 20 deletions.
9 changes: 8 additions & 1 deletion csv_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@ csv_dequote(const QString& string, const QString& enclosure)
/*****************************************************************************/
QStringList
csv_linesplit(const QString& string, const QString& delimited_by,
const QString& enclosed_in, const int line_no, CsvQuoteMethod method)
const QString& enclosed_in, const int line_no, CsvQuoteMethod method,
bool* delimiter_detected)
{
QStringList retval;

Expand All @@ -162,6 +163,7 @@ csv_linesplit(const QString& string, const QString& delimited_by,
* whitespace eater consume the space.
*/
QString delimiter = delimited_by;
bool delimiter_seen = false;
if (delimited_by == ", ") {
delimiter = ",";
}
Expand Down Expand Up @@ -194,8 +196,10 @@ csv_linesplit(const QString& string, const QString& delimited_by,
if (!enclosed) {
if ((dlen > 0) && string.mid(p).startsWith(delimiter)) {
dfound = true;
delimiter_seen = true;
} else if (hyper_whitespace_delimiter && string.at(p).isSpace()) {
dfound = true;
delimiter_seen = true;
while ((p < string.size()) && string.at(p).isSpace()) {
p++;
}
Expand Down Expand Up @@ -235,6 +239,9 @@ csv_linesplit(const QString& string, const QString& delimited_by,
retval.append(value);

}
if (delimiter_detected != nullptr) {
*delimiter_detected = delimiter_seen;
}
return retval;
}
/*****************************************************************************/
Expand Down
3 changes: 2 additions & 1 deletion csv_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ enum class CsvQuoteMethod {historic, rfc4180};

QStringList
csv_linesplit(const QString& string, const QString& delimited_by,
const QString& enclosed_in, int line_no, CsvQuoteMethod method = CsvQuoteMethod::historic);
const QString& enclosed_in, int line_no, CsvQuoteMethod method = CsvQuoteMethod::historic,
bool* delimiter_detected = nullptr);

int
dec_to_intdeg(double d);
Expand Down
2 changes: 2 additions & 0 deletions reference/unidelim.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
lat,lon,"foo; bar;","bam wham",name,desc,"zoom|zap",notes
41.90270080,12.49623520,this,that,"Roma, 🇮🇹","my ""roam'n"" holiday",the other thing,fun
10 changes: 10 additions & 0 deletions reference/unidelim.gpx
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<gpx version="1.0" creator="GPSBabel - https://www.gpsbabel.org" xmlns="http://www.topografix.com/GPX/1/0">
<time>1970-01-01T00:00:00Z</time>
<bounds minlat="41.902700800" minlon="12.496235200" maxlat="41.902700800" maxlon="12.496235200"/>
<wpt lat="41.902700800" lon="12.496235200">
<name>Roma, 🇮🇹</name>
<cmt>my &quot;roam'n&quot; holiday</cmt>
<desc>fun</desc>
</wpt>
</gpx>
5 changes: 5 additions & 0 deletions testo.d/unicsv.test
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,8 @@ gpsbabel -i unicsv -f ${REFERENCE}/pretty_degree.csv -o unicsv,grid=1 -F ${TMPDI
compare ${REFERENCE}/pretty_degree1.csv ${TMPDIR}/pretty_degree1.csv
gpsbabel -i unicsv -f ${REFERENCE}/pretty_degree.csv -o unicsv,grid=2 -F ${TMPDIR}/pretty_degree2.csv
compare ${REFERENCE}/pretty_degree2.csv ${TMPDIR}/pretty_degree2.csv

# delimiter detection
gpsbabel -i unicsv -f ${REFERENCE}/unidelim.csv -o gpx -F ${TMPDIR}/unidelim.gpx
compare ${REFERENCE}/unidelim.gpx ${TMPDIR}/unidelim.gpx

43 changes: 29 additions & 14 deletions unicsv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -385,19 +385,27 @@ void
UnicsvFormat::unicsv_fondle_header(QString header)
{
/* Convert the entire header to lower case for convenience.
* If we see a tab in that header, we decree it to be tabsep.
*/
unicsv_fieldsep = ",";
if (header.contains('\t')) {
unicsv_fieldsep = "\t";
} else if (header.contains(';')) {
unicsv_fieldsep = ";";
} else if (header.contains('|')) {
unicsv_fieldsep = "|";
}
header = header.toLower();

const QStringList values = csv_linesplit(header, unicsv_fieldsep, "\"", 0, CsvQuoteMethod::rfc4180);
/* Find the separator and split the line into fields.
* If we see an unenclosed tab that is the separator.
* Otherwise, if we see an unenclosed semicolon that is the separator.
* Otherwise, if we see an unenclosed vertical bar that is the separator.
* Otherwise the separator is a comma.
*/
const QList<const char*> delimiters = {"\t", ";", "|", ","};
unicsv_fieldsep = delimiters.last();
QStringList values;
bool delimiter_detected;
for (const auto* delimiter : delimiters) {
values = csv_linesplit(header, delimiter, kUnicsvQuoteChar, unicsv_lineno, CsvQuoteMethod::rfc4180, &delimiter_detected);
if (delimiter_detected) {
unicsv_fieldsep = delimiter;
break;
}
}

for (auto value : values) {
value = value.trimmed();

Expand All @@ -411,8 +419,12 @@ UnicsvFormat::unicsv_fondle_header(QString header)
}
f++;
}
if ((f->name.isEmpty()) && global_opts.debug_level) {
warning(MYNAME ": Unhandled column \"%s\".\n", qPrintable(value));
if (global_opts.debug_level) {
if ((f->name.isEmpty()) && global_opts.debug_level) {
warning(MYNAME ": Unhandled column \"%s\".\n", qPrintable(value));
} else {
warning(MYNAME ": Interpreting column \"%s\" as %s(%d).\n", qPrintable(value), qPrintable(f->name), f->type);
}
}

/* handle some special items */
Expand Down Expand Up @@ -456,10 +468,12 @@ UnicsvFormat::rd_init(const QString& fname)

fin = new gpsbabel::TextStream;
fin->open(fname, QIODevice::ReadOnly, MYNAME, opt_codec);
unicsv_lineno = 0;
if (opt_fields) {
QString fields = QString(opt_fields).replace("+", ",");
unicsv_fondle_header(fields);
} else if (buff = fin->readLine(), !buff.isNull()) {
} else if (buff = fin->readLine(); !buff.isNull()) {
++unicsv_lineno;
unicsv_fondle_header(buff);
} else {
unicsv_fieldsep = nullptr;
Expand Down Expand Up @@ -508,7 +522,7 @@ UnicsvFormat::unicsv_parse_one_line(const QString& ibuf)
wpt->longitude = kUnicsvUnknown;

int column = -1;
const QStringList values = csv_linesplit(ibuf, unicsv_fieldsep, "\"", 0, CsvQuoteMethod::rfc4180);
const QStringList values = csv_linesplit(ibuf, unicsv_fieldsep, kUnicsvQuoteChar, unicsv_lineno, CsvQuoteMethod::rfc4180);
for (auto value : values) {
if (++column >= unicsv_fields_tab.size()) {
break; /* ignore extra fields on line */
Expand Down Expand Up @@ -1060,6 +1074,7 @@ UnicsvFormat::read()
}

while ((buff = fin->readLine(), !buff.isNull())) {
++unicsv_lineno;
buff = buff.trimmed();
if (buff.isEmpty() || buff.startsWith('#')) {
continue;
Expand Down
3 changes: 2 additions & 1 deletion unicsv.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ class UnicsvFormat : public Format

/* Constants */

/* "UNICSV_FIELD_SEP" and "UNICSV_LINE_SEP" are only used by the writer */
/* "kUnicsvFieldSep" and "kUnicsvLineSep" are only used by the writer */

static constexpr const char* kUnicsvFieldSep = ",";
static constexpr const char* kUnicsvLineSep = "\r\n";
Expand Down Expand Up @@ -189,6 +189,7 @@ class UnicsvFormat : public Format
double unicsv_depthscale{};
double unicsv_proximityscale{};
const char* unicsv_fieldsep{nullptr};
int unicsv_lineno{0};
gpsbabel::TextStream* fin{nullptr};
gpsbabel::TextStream* fout{nullptr};
gpsdata_type unicsv_data_type{unknown_gpsdata};
Expand Down
10 changes: 7 additions & 3 deletions xmldoc/formats/unicsv.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,13 @@
figure out what data it has and writes headers and all the data it can.
</para>
<para>
If the first line contains any tabs, the data lines are assumed
to be tab separated. Otherwise the fields are assumed to be
separated by commas.
Fields may be enclosed in double quotes. To include a double quote inside a quoted field, escape it with another double quote.
</para>
<para>
If the first line contains any unenclosed tabs, then the data lines are assumed to be tab separated.
Otherwise, if the first line contains any unenclosed semicolons, then fields are assumed to be separated by semicolons.
Otherwise, if the first line contains any unenclosed vertical bars, then fields are assumed to be separated by vertical bars.
Otherwise, the fields are assumed to be separated by commas.
</para>
<para>
The list of keywords include:
Expand Down

0 comments on commit 00c23a1

Please sign in to comment.