Fix unicsv reader delimiter detection (#1287)

* Fix unicsv reader delimiter detection. Any delimiter sequence that is enclosed has never been considered a delimiter when splitting a line into fields. However, previously an enclosed potential delimiter could be detected when scanning the first line, and then erroneously used as the delimiter. * tweak auto * add unicsv delimiter detection test
GPSBabel · Jun 29, 2024 · 00c23a1 · 00c23a1
1 parent f778134
commit 00c23a1
Show file tree

Hide file tree

Showing 8 changed files with 65 additions and 20 deletions.
diff --git a/csv_util.cc b/csv_util.cc
@@ -148,7 +148,8 @@ csv_dequote(const QString& string, const QString& enclosure)
 /*****************************************************************************/
 QStringList
 csv_linesplit(const QString& string, const QString& delimited_by,
-              const QString& enclosed_in, const int line_no, CsvQuoteMethod method)
+              const QString& enclosed_in, const int line_no, CsvQuoteMethod method,
+              bool* delimiter_detected)
 {
   QStringList retval;
 
@@ -162,6 +163,7 @@ csv_linesplit(const QString& string, const QString& delimited_by,
    * whitespace eater consume the space.
    */
   QString delimiter = delimited_by;
+  bool delimiter_seen = false;
   if (delimited_by == ", ") {
     delimiter = ",";
   }
@@ -194,8 +196,10 @@ csv_linesplit(const QString& string, const QString& delimited_by,
       if (!enclosed) {
         if ((dlen > 0) && string.mid(p).startsWith(delimiter)) {
           dfound = true;
+          delimiter_seen = true;
         } else if (hyper_whitespace_delimiter && string.at(p).isSpace()) {
           dfound = true;
+          delimiter_seen = true;
           while ((p < string.size()) && string.at(p).isSpace()) {
             p++;
           }
@@ -235,6 +239,9 @@ csv_linesplit(const QString& string, const QString& delimited_by,
     retval.append(value);
 
   }
+  if (delimiter_detected != nullptr) {
+    *delimiter_detected = delimiter_seen;
+  }
   return retval;
 }
 /*****************************************************************************/

diff --git a/csv_util.h b/csv_util.h
@@ -43,7 +43,8 @@ enum class CsvQuoteMethod {historic, rfc4180};
 
 QStringList
 csv_linesplit(const QString& string, const QString& delimited_by,
-              const QString& enclosed_in, int line_no, CsvQuoteMethod method = CsvQuoteMethod::historic);
+              const QString& enclosed_in, int line_no, CsvQuoteMethod method = CsvQuoteMethod::historic,
+              bool* delimiter_detected = nullptr);
 
 int
 dec_to_intdeg(double d);

diff --git a/reference/unidelim.csv b/reference/unidelim.csv
@@ -0,0 +1,2 @@
+lat,lon,"foo; bar;","bam  wham",name,desc,"zoom|zap",notes
+41.90270080,12.49623520,this,that,"Roma, 🇮🇹","my ""roam'n"" holiday",the other thing,fun
diff --git a/reference/unidelim.gpx b/reference/unidelim.gpx
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gpx version="1.0" creator="GPSBabel - https://www.gpsbabel.org" xmlns="http://www.topografix.com/GPX/1/0">
+  <time>1970-01-01T00:00:00Z</time>
+  <bounds minlat="41.902700800" minlon="12.496235200" maxlat="41.902700800" maxlon="12.496235200"/>
+  <wpt lat="41.902700800" lon="12.496235200">
+    <name>Roma, 🇮🇹</name>
+    <cmt>my &quot;roam'n&quot; holiday</cmt>
+    <desc>fun</desc>
+  </wpt>
+</gpx>
diff --git a/testo.d/unicsv.test b/testo.d/unicsv.test
@@ -54,3 +54,8 @@ gpsbabel -i unicsv -f ${REFERENCE}/pretty_degree.csv -o unicsv,grid=1 -F ${TMPDI
 compare ${REFERENCE}/pretty_degree1.csv ${TMPDIR}/pretty_degree1.csv
 gpsbabel -i unicsv -f ${REFERENCE}/pretty_degree.csv -o unicsv,grid=2 -F ${TMPDIR}/pretty_degree2.csv
 compare ${REFERENCE}/pretty_degree2.csv ${TMPDIR}/pretty_degree2.csv
+
+# delimiter detection
+gpsbabel -i unicsv -f ${REFERENCE}/unidelim.csv -o gpx -F ${TMPDIR}/unidelim.gpx
+compare ${REFERENCE}/unidelim.gpx ${TMPDIR}/unidelim.gpx
+
diff --git a/unicsv.cc b/unicsv.cc
@@ -385,19 +385,27 @@ void
 UnicsvFormat::unicsv_fondle_header(QString header)
 {
   /* Convert the entire header to lower case for convenience.
-   * If we see a tab in that header, we decree it to be tabsep.
    */
-  unicsv_fieldsep = ",";
-  if (header.contains('\t')) {
-    unicsv_fieldsep = "\t";
-  } else if (header.contains(';')) {
-    unicsv_fieldsep = ";";
-  } else if (header.contains('|')) {
-    unicsv_fieldsep = "|";
-  }
   header = header.toLower();
 
-  const QStringList values = csv_linesplit(header, unicsv_fieldsep, "\"", 0, CsvQuoteMethod::rfc4180);
+  /* Find the separator and split the line into fields.
+   * If we see an unenclosed tab that is the separator.
+   * Otherwise, if we see an unenclosed semicolon that is the separator.
+   * Otherwise, if we see an unenclosed vertical bar that is the separator.
+   * Otherwise the separator is a comma.
+   */
+  const QList<const char*> delimiters = {"\t", ";", "|", ","};
+  unicsv_fieldsep = delimiters.last();
+  QStringList values;
+  bool delimiter_detected;
+  for (const auto* delimiter : delimiters) {
+    values = csv_linesplit(header, delimiter, kUnicsvQuoteChar, unicsv_lineno, CsvQuoteMethod::rfc4180, &delimiter_detected);
+    if (delimiter_detected) {
+      unicsv_fieldsep = delimiter;
+      break;
+    }
+  }
+
   for (auto value : values) {
     value = value.trimmed();
 
@@ -411,8 +419,12 @@ UnicsvFormat::unicsv_fondle_header(QString header)
       }
       f++;
     }
-    if ((f->name.isEmpty()) && global_opts.debug_level) {
-      warning(MYNAME ": Unhandled column \"%s\".\n", qPrintable(value));
+    if (global_opts.debug_level) {
+      if ((f->name.isEmpty()) && global_opts.debug_level) {
+        warning(MYNAME ": Unhandled column \"%s\".\n", qPrintable(value));
+      } else {
+        warning(MYNAME ": Interpreting column \"%s\" as %s(%d).\n", qPrintable(value), qPrintable(f->name), f->type);
+      }
     }
 
     /* handle some special items */
@@ -456,10 +468,12 @@ UnicsvFormat::rd_init(const QString& fname)
 
   fin = new gpsbabel::TextStream;
   fin->open(fname, QIODevice::ReadOnly, MYNAME, opt_codec);
+  unicsv_lineno = 0;
   if (opt_fields) {
     QString fields = QString(opt_fields).replace("+", ",");
     unicsv_fondle_header(fields);
-  } else if (buff = fin->readLine(), !buff.isNull()) {
+  } else if (buff = fin->readLine(); !buff.isNull()) {
+    ++unicsv_lineno;
     unicsv_fondle_header(buff);
   } else {
     unicsv_fieldsep = nullptr;
@@ -508,7 +522,7 @@ UnicsvFormat::unicsv_parse_one_line(const QString& ibuf)
   wpt->longitude = kUnicsvUnknown;
 
   int column = -1;
-  const QStringList values = csv_linesplit(ibuf, unicsv_fieldsep, "\"", 0, CsvQuoteMethod::rfc4180);
+  const QStringList values = csv_linesplit(ibuf, unicsv_fieldsep, kUnicsvQuoteChar, unicsv_lineno, CsvQuoteMethod::rfc4180);
   for (auto value : values) {
     if (++column >= unicsv_fields_tab.size()) {
       break;  /* ignore extra fields on line */
@@ -1060,6 +1074,7 @@ UnicsvFormat::read()
   }
 
   while ((buff = fin->readLine(), !buff.isNull())) {
+    ++unicsv_lineno;
     buff = buff.trimmed();
     if (buff.isEmpty() || buff.startsWith('#')) {
       continue;

diff --git a/unicsv.h b/unicsv.h
@@ -148,7 +148,7 @@ class UnicsvFormat : public Format
 
   /* Constants */
 
-  /* "UNICSV_FIELD_SEP" and "UNICSV_LINE_SEP" are only used by the writer */
+  /* "kUnicsvFieldSep" and "kUnicsvLineSep" are only used by the writer */
 
   static constexpr const char* kUnicsvFieldSep = ",";
   static constexpr const char* kUnicsvLineSep = "\r\n";
@@ -189,6 +189,7 @@ class UnicsvFormat : public Format
   double unicsv_depthscale{};
   double unicsv_proximityscale{};
   const char* unicsv_fieldsep{nullptr};
+  int unicsv_lineno{0};
   gpsbabel::TextStream* fin{nullptr};
   gpsbabel::TextStream* fout{nullptr};
   gpsdata_type unicsv_data_type{unknown_gpsdata};

diff --git a/xmldoc/formats/unicsv.xml b/xmldoc/formats/unicsv.xml
@@ -4,9 +4,13 @@
    figure out what data it has and writes headers and all the data it can.
 </para>
 <para>
-   If the first line contains any tabs, the data lines are assumed
-   to be tab separated.   Otherwise the fields are assumed to be
-   separated by commas.
+   Fields may be enclosed in double quotes.  To include a double quote inside an enclosed field, escape it with another double quote.
+</para>
+<para>
+   If the first line contains any unenclosed tabs then the data lines are assumed to be tab separated.
+   Otherwise if the first line contains any unenclosed semicolons then fields are assumed to be separated by semicolons.
+   Otherwise if the first line contains any unenclosed vertical bars then fields are assumed to be separated by vertical bars.
+   Otherwise the fields are assumed to be separated by commas.
 </para>
 <para>
    The list of keywords include: