From 00c23a133da89924b3b746260cadafcc8d2a9d20 Mon Sep 17 00:00:00 2001 From: tsteven4 <13596209+tsteven4@users.noreply.github.com> Date: Sat, 29 Jun 2024 12:49:56 -0600 Subject: [PATCH] Fix unicsv reader delimiter detection (#1287) * Fix unicsv reader delimiter detection. Any delimiter sequence that is enclosed has never been considered a delimiter when splitting a line into fields. However, previously an enclosed potential delimiter could be detected when scanning the first line, and then erroneously used as the delimiter. * tweak auto * add unicsv delimiter detection test --- csv_util.cc | 9 +++++++- csv_util.h | 3 ++- reference/unidelim.csv | 2 ++ reference/unidelim.gpx | 10 +++++++++ testo.d/unicsv.test | 5 +++++ unicsv.cc | 43 ++++++++++++++++++++++++++------------- unicsv.h | 3 ++- xmldoc/formats/unicsv.xml | 10 ++++++--- 8 files changed, 65 insertions(+), 20 deletions(-) create mode 100644 reference/unidelim.csv create mode 100644 reference/unidelim.gpx diff --git a/csv_util.cc b/csv_util.cc index 1a7ad8420..0fde1d128 100644 --- a/csv_util.cc +++ b/csv_util.cc @@ -148,7 +148,8 @@ csv_dequote(const QString& string, const QString& enclosure) /*****************************************************************************/ QStringList csv_linesplit(const QString& string, const QString& delimited_by, - const QString& enclosed_in, const int line_no, CsvQuoteMethod method) + const QString& enclosed_in, const int line_no, CsvQuoteMethod method, + bool* delimiter_detected) { QStringList retval; @@ -162,6 +163,7 @@ csv_linesplit(const QString& string, const QString& delimited_by, * whitespace eater consume the space. */ QString delimiter = delimited_by; + bool delimiter_seen = false; if (delimited_by == ", ") { delimiter = ","; } @@ -194,8 +196,10 @@ csv_linesplit(const QString& string, const QString& delimited_by, if (!enclosed) { if ((dlen > 0) && string.mid(p).startsWith(delimiter)) { dfound = true; + delimiter_seen = true; } else if (hyper_whitespace_delimiter && string.at(p).isSpace()) { dfound = true; + delimiter_seen = true; while ((p < string.size()) && string.at(p).isSpace()) { p++; } @@ -235,6 +239,9 @@ csv_linesplit(const QString& string, const QString& delimited_by, retval.append(value); } + if (delimiter_detected != nullptr) { + *delimiter_detected = delimiter_seen; + } return retval; } /*****************************************************************************/ diff --git a/csv_util.h b/csv_util.h index 149b4ff00..3a9f96185 100644 --- a/csv_util.h +++ b/csv_util.h @@ -43,7 +43,8 @@ enum class CsvQuoteMethod {historic, rfc4180}; QStringList csv_linesplit(const QString& string, const QString& delimited_by, - const QString& enclosed_in, int line_no, CsvQuoteMethod method = CsvQuoteMethod::historic); + const QString& enclosed_in, int line_no, CsvQuoteMethod method = CsvQuoteMethod::historic, + bool* delimiter_detected = nullptr); int dec_to_intdeg(double d); diff --git a/reference/unidelim.csv b/reference/unidelim.csv new file mode 100644 index 000000000..370084bd5 --- /dev/null +++ b/reference/unidelim.csv @@ -0,0 +1,2 @@ +lat,lon,"foo; bar;","bam wham",name,desc,"zoom|zap",notes +41.90270080,12.49623520,this,that,"Roma, 🇮🇹","my ""roam'n"" holiday",the other thing,fun diff --git a/reference/unidelim.gpx b/reference/unidelim.gpx new file mode 100644 index 000000000..b5812dd96 --- /dev/null +++ b/reference/unidelim.gpx @@ -0,0 +1,10 @@ + + + + + + Roma, 🇮🇹 + my "roam'n" holiday + fun + + diff --git a/testo.d/unicsv.test b/testo.d/unicsv.test index 6c31bba94..99bfdc681 100644 --- a/testo.d/unicsv.test +++ b/testo.d/unicsv.test @@ -54,3 +54,8 @@ gpsbabel -i unicsv -f ${REFERENCE}/pretty_degree.csv -o unicsv,grid=1 -F ${TMPDI compare ${REFERENCE}/pretty_degree1.csv ${TMPDIR}/pretty_degree1.csv gpsbabel -i unicsv -f ${REFERENCE}/pretty_degree.csv -o unicsv,grid=2 -F ${TMPDIR}/pretty_degree2.csv compare ${REFERENCE}/pretty_degree2.csv ${TMPDIR}/pretty_degree2.csv + +# delimiter detection +gpsbabel -i unicsv -f ${REFERENCE}/unidelim.csv -o gpx -F ${TMPDIR}/unidelim.gpx +compare ${REFERENCE}/unidelim.gpx ${TMPDIR}/unidelim.gpx + diff --git a/unicsv.cc b/unicsv.cc index 934df3866..642af6262 100644 --- a/unicsv.cc +++ b/unicsv.cc @@ -385,19 +385,27 @@ void UnicsvFormat::unicsv_fondle_header(QString header) { /* Convert the entire header to lower case for convenience. - * If we see a tab in that header, we decree it to be tabsep. */ - unicsv_fieldsep = ","; - if (header.contains('\t')) { - unicsv_fieldsep = "\t"; - } else if (header.contains(';')) { - unicsv_fieldsep = ";"; - } else if (header.contains('|')) { - unicsv_fieldsep = "|"; - } header = header.toLower(); - const QStringList values = csv_linesplit(header, unicsv_fieldsep, "\"", 0, CsvQuoteMethod::rfc4180); + /* Find the separator and split the line into fields. + * If we see an unenclosd tab that is the separator. + * Otherwise, if we see an unenclosed semicolon that is the separator. + * Otherwise, if we see an unenclosed vertical bar that is the separator. + * Otherwise the separator is a comma. + */ + const QList delimiters = {"\t", ";", "|", ","}; + unicsv_fieldsep = delimiters.last(); + QStringList values; + bool delimiter_detected; + for (const auto* delimiter : delimiters) { + values = csv_linesplit(header, delimiter, kUnicsvQuoteChar, unicsv_lineno, CsvQuoteMethod::rfc4180, &delimiter_detected); + if (delimiter_detected) { + unicsv_fieldsep = delimiter; + break; + } + } + for (auto value : values) { value = value.trimmed(); @@ -411,8 +419,12 @@ UnicsvFormat::unicsv_fondle_header(QString header) } f++; } - if ((f->name.isEmpty()) && global_opts.debug_level) { - warning(MYNAME ": Unhandled column \"%s\".\n", qPrintable(value)); + if (global_opts.debug_level) { + if ((f->name.isEmpty()) && global_opts.debug_level) { + warning(MYNAME ": Unhandled column \"%s\".\n", qPrintable(value)); + } else { + warning(MYNAME ": Interpreting column \"%s\" as %s(%d).\n", qPrintable(value), qPrintable(f->name), f->type); + } } /* handle some special items */ @@ -456,10 +468,12 @@ UnicsvFormat::rd_init(const QString& fname) fin = new gpsbabel::TextStream; fin->open(fname, QIODevice::ReadOnly, MYNAME, opt_codec); + unicsv_lineno = 0; if (opt_fields) { QString fields = QString(opt_fields).replace("+", ","); unicsv_fondle_header(fields); - } else if (buff = fin->readLine(), !buff.isNull()) { + } else if (buff = fin->readLine(); !buff.isNull()) { + ++unicsv_lineno; unicsv_fondle_header(buff); } else { unicsv_fieldsep = nullptr; @@ -508,7 +522,7 @@ UnicsvFormat::unicsv_parse_one_line(const QString& ibuf) wpt->longitude = kUnicsvUnknown; int column = -1; - const QStringList values = csv_linesplit(ibuf, unicsv_fieldsep, "\"", 0, CsvQuoteMethod::rfc4180); + const QStringList values = csv_linesplit(ibuf, unicsv_fieldsep, kUnicsvQuoteChar, unicsv_lineno, CsvQuoteMethod::rfc4180); for (auto value : values) { if (++column >= unicsv_fields_tab.size()) { break; /* ignore extra fields on line */ @@ -1060,6 +1074,7 @@ UnicsvFormat::read() } while ((buff = fin->readLine(), !buff.isNull())) { + ++unicsv_lineno; buff = buff.trimmed(); if (buff.isEmpty() || buff.startsWith('#')) { continue; diff --git a/unicsv.h b/unicsv.h index a8c86664e..a775e4729 100644 --- a/unicsv.h +++ b/unicsv.h @@ -148,7 +148,7 @@ class UnicsvFormat : public Format /* Constants */ - /* "UNICSV_FIELD_SEP" and "UNICSV_LINE_SEP" are only used by the writer */ + /* "kUnicsvFieldSep" and "kUnicsvLineSep" are only used by the writer */ static constexpr const char* kUnicsvFieldSep = ","; static constexpr const char* kUnicsvLineSep = "\r\n"; @@ -189,6 +189,7 @@ class UnicsvFormat : public Format double unicsv_depthscale{}; double unicsv_proximityscale{}; const char* unicsv_fieldsep{nullptr}; + int unicsv_lineno{0}; gpsbabel::TextStream* fin{nullptr}; gpsbabel::TextStream* fout{nullptr}; gpsdata_type unicsv_data_type{unknown_gpsdata}; diff --git a/xmldoc/formats/unicsv.xml b/xmldoc/formats/unicsv.xml index 297a81a98..d75579c74 100644 --- a/xmldoc/formats/unicsv.xml +++ b/xmldoc/formats/unicsv.xml @@ -4,9 +4,13 @@ figure out what data it has and writes headers and all the data it can. - If the first line contains any tabs, the data lines are assumed - to be tab separated. Otherwise the fields are assumed to be - separated by commas. + Fields may be enclosed in double quotes. To include a double quote inside quotes escape it with another double quote. + + + If the first line contains any unenclosed tabs then the data lines are assumed to be tab separated. + Otherwise if the first line contains any unenclosed semicolons then fields are assumed to be separated by semicolons. + Otherwise if the first line contains any unenclosed vertical bars then fields are assumed to be separated by vertical bars. + Otherwise the fields are assumed to be separated by commas. The list of keywords include: