Fix #15: Implement CsvParser.Feature.SKIP_EMPTY_LINES

FasterXML · Oct 8, 2019 · 32437c0 · 32437c0
1 parent 4eff590
commit 32437c0
Show file tree

Hide file tree

Showing 8 changed files with 235 additions and 45 deletions.
diff --git a/csv/pom.xml b/csv/pom.xml
@@ -40,7 +40,7 @@ abstractions.
     <dependency>
       <groupId>com.google.guava</groupId>
       <artifactId>guava</artifactId>
-      <version>18.0</version>
+      <version>25.0-jre</version>
       <scope>test</scope>
     </dependency>
   </dependencies>

diff --git a/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvParser.java b/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvParser.java
@@ -73,13 +73,14 @@ public enum Feature
         IGNORE_TRAILING_UNMAPPABLE(false),
 
         /**
-         * Feature that allows skipping input lines that are completely empty, instead
+         * Feature that allows skipping input lines that are completely empty or blank
+         * (composed only of space characters), instead
          * of being decoded as lines of just a single column with empty String value (or,
          * depending on binding, `null`).
          *<p>
          * Feature is disabled by default.
          *
-         * @since 2.9
+         * @since 2.10
          */
         SKIP_EMPTY_LINES(false),
 
@@ -787,19 +788,19 @@ protected void _readHeaderLine() throws IOException {
      */
     protected JsonToken _handleStartDoc() throws IOException
     {
-        // also, if comments enabled, may need to skip leading ones
-        _reader.skipLeadingComments();
+        // also, if comments enabled, or skip empty lines, may need to skip leading ones
+        _reader.skipLinesWhenNeeded();
         // First things first: are we expecting header line? If so, read, process
         if (_schema.usesHeader()) {
             _readHeaderLine();
-            _reader.skipLeadingComments();
+            _reader.skipLinesWhenNeeded();
         }
         // and if we are to skip the first data line, skip it
         if (_schema.skipsFirstDataRow()) {
             _reader.skipLine();
-            _reader.skipLeadingComments();
+            _reader.skipLinesWhenNeeded();
         }
-        
+
         // Only one real complication, actually; empty documents (zero bytes).
         // Those have no entries. Should be easy enough to detect like so:
         final boolean wrapAsArray = Feature.WRAP_AS_ARRAY.enabledIn(_formatFeatures);

diff --git a/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/impl/CsvDecoder.java b/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/impl/CsvDecoder.java
@@ -65,7 +65,12 @@ public class CsvDecoder
     protected boolean _trimSpaces;
 
     protected boolean _allowComments;
-
+
+    /**
+     * @since 2.10.1
+     */
+    protected boolean _skipBlankLines; // NOTE: can be final in 3.0, not before
+
     /**
      * Maximum of quote character, linefeeds (\r and \n), escape character.
      */
@@ -111,14 +116,14 @@ public class CsvDecoder
      * needs to be handled (indicates end-of-record).
      */
     protected int _pendingLF = 0;
-    
+
     /**
      * Flag that indicates whether parser is closed or not. Gets
      * set when parser is either closed by explicit call
      * ({@link #close}) or when end-of-input is reached.
      */
     protected boolean _closed;
-    
+
     /*
     /**********************************************************************
     /* Current input location information
@@ -152,7 +157,7 @@ public class CsvDecoder
      * For big (gigabyte-sized) sizes are possible, needs to be long,
      * unlike pointers and sizes related to in-memory buffers.
      */
-    protected long _tokenInputTotal = 0; 
+    protected long _tokenInputTotal = 0;
 
     /**
      * Input row on which current token starts, 1-based
@@ -202,8 +207,7 @@ public class CsvDecoder
 
     final static double MIN_INT_D = Integer.MIN_VALUE;
     final static double MAX_INT_D = Integer.MAX_VALUE;
-
-
+
     // Digits, numeric
     final protected static int INT_0 = '0';
     final protected static int INT_1 = '1';
@@ -254,8 +258,8 @@ public class CsvDecoder
     /**********************************************************************
      */
 
-    @SuppressWarnings("deprecation")
-    public CsvDecoder(CsvParser owner, IOContext ctxt, Reader r, CsvSchema schema, TextBuffer textBuffer,
+    public CsvDecoder(CsvParser owner, IOContext ctxt, Reader r, CsvSchema schema,
+            TextBuffer textBuffer,
             int stdFeatures, int csvFeatures)
     {
         _owner = owner;
@@ -266,6 +270,7 @@ public CsvDecoder(CsvParser owner, IOContext ctxt, Reader r, CsvSchema schema, T
         final boolean legacy = JsonParser.Feature.ALLOW_YAML_COMMENTS.enabledIn(stdFeatures);
         _allowComments = legacy | CsvParser.Feature.ALLOW_COMMENTS.enabledIn(csvFeatures);
         _trimSpaces = CsvParser.Feature.TRIM_SPACES.enabledIn(csvFeatures);
+        _skipBlankLines = CsvParser.Feature.SKIP_EMPTY_LINES.enabledIn(csvFeatures);
         _inputBuffer = ctxt.allocTokenBuffer();
         _bufferRecyclable = true; // since we allocated it
         _inputSource = r;
@@ -292,6 +297,7 @@ public void setSchema(CsvSchema schema)
      */
     public void overrideFormatFeatures(int csvFeatures) {
         _trimSpaces = CsvParser.Feature.TRIM_SPACES.enabledIn(csvFeatures);
+        _skipBlankLines = CsvParser.Feature.SKIP_EMPTY_LINES.enabledIn(csvFeatures);
     }
 
     /*
@@ -482,39 +488,53 @@ public boolean startNewLine() throws IOException
             }
             _handleLF();
         }
-        /* For now, we will only require that there is SOME data
-         * following linefeed -- even spaces will do.
-         * In future we may want to use better heuristics to possibly
-         * skip trailing empty line?
-         */
-        if ((_inputPtr >= _inputEnd) && !loadMore()) {
-            return false;
-        }
-
-        if (_allowComments && _inputBuffer[_inputPtr] == '#') {
-            int i = _skipCommentLines();
-            // end-of-input?
-            if (i < 0) {
-                return false;
-            }
-            // otherwise push last read char back
-            --_inputPtr;
-        }
-        return true;
+        return skipLinesWhenNeeded();
     }
 
-    public void skipLeadingComments() throws IOException
-    {
-        if (_allowComments) {
-            if ((_inputPtr < _inputEnd) || loadMore()) {
-                if (_inputBuffer[_inputPtr] == '#') {
-                    _skipCommentLines();
-                    --_inputPtr;
+    /**
+     * When comments are allowed or blank-line skipping is enabled, skips lines made only of ' ' characters and (if comments are allowed) lines whose first non-space character is '#'; otherwise skips nothing.
+     * @return true if a line remains to be parsed, false if the end of input was reached
+     * @throws IOException presumably propagated from hasMoreInput() / _handleLF() (not visible here; confirm)
+     * @since 2.10.1
+     */
+    public boolean skipLinesWhenNeeded() throws IOException {
+        if (!(_allowComments || _skipBlankLines)) {
+            return hasMoreInput();
+        }
+        int firstCharacterPtr = _inputPtr; // index in _inputBuffer where the line being examined starts
+        while (hasMoreInput()) {
+            char ch = _inputBuffer[_inputPtr++];
+            if (ch == '\r' || ch == '\n') {
+                _pendingLF = ch;
+                _handleLF();
+                // the line just ended was skippable; track the start of the new line
+                firstCharacterPtr = _inputPtr;
+                continue;
+            }
+            if (ch == ' ') {
+                // only ' ' counts as blank here; skipped whichever of the two features is enabled
+                continue;
+            }
+            if (_allowComments) {
+                if (_inputBuffer[firstCharacterPtr] == '#') {
+                    // the tracked line start is '#': comment line, skip every character up to the line feed
+                    continue;
+                }
+                if (ch == '#') {
+                    // we reach this point when only spaces precede the hash character on this line
+                    // move firstCharacterPtr onto the '#' so the check above skips the rest of the line
+                    firstCharacterPtr = _inputPtr-1;
+                    continue;
                 }
             }
+            // non-skippable character: rewind to the tracked start of this line so it is parsed in full
+            // NOTE(review): firstCharacterPtr predates later hasMoreInput() calls; confirm it stays valid if the buffer is refilled
+            _inputPtr = firstCharacterPtr;
+            return true; // processing can go on
         }
+        return false; // end of input
     }
-    
+
     protected int _skipCommentLines() throws IOException
     {
         while ((_inputPtr < _inputEnd) || loadMore()) {

diff --git a/csv/src/test/java/com/fasterxml/jackson/dataformat/csv/deser/CommentsTest.java b/csv/src/test/java/com/fasterxml/jackson/dataformat/csv/deser/CommentsTest.java
@@ -8,7 +8,7 @@
 // Tests for [csv#56]
 public class CommentsTest extends ModuleTestBase
 {
-    final String CSV_WITH_COMMENTS = "x,y\n# comment!\na,b\n# another...\n";
+    final String CSV_WITH_COMMENTS = "x,y\n# comment!\na,b\n   # another...\n";
 
     public void testWithoutComments() throws Exception
     {

diff --git a/csv/src/test/java/com/fasterxml/jackson/dataformat/csv/deser/SkipBlankLines15Test.java b/csv/src/test/java/com/fasterxml/jackson/dataformat/csv/deser/SkipBlankLines15Test.java
@@ -0,0 +1,159 @@
+package com.fasterxml.jackson.dataformat.csv.deser;
+
+import com.fasterxml.jackson.databind.ObjectReader;
+import com.fasterxml.jackson.dataformat.csv.CsvParser;
+import com.fasterxml.jackson.dataformat.csv.ModuleTestBase;
+
+import static org.junit.Assert.assertArrayEquals;
+
+// for [dataformats-text#15]: Allow skipping of empty lines
+public class SkipBlankLines15Test extends ModuleTestBase {
+
+    private static final String CSV_WITH_EMPTY_LINE = "1,\"xyz\"\n\ntrue,\n";
+    private static final String CSV_WITH_BLANK_LINE = "1,\"xyz\"\n   \ntrue,\n";
+    private static final String CSV_WITH_BLANK_LINE_AND_COMMENT = "1,\"xyz\"\n \n  #comment\n\ntrue,\n";
+    private static final String CSV_WITH_FIRST_BLANK_LINE = "\n1,\"xyz\"\ntrue,\n";
+    private static final String CSV_WITH_TRAILING_BLANK_LINES = "1,\"xyz\"\ntrue,\n  \n\n";
+
+    public void testCsvWithEmptyLineSkipBlankLinesFeatureDisabled() throws Exception {
+        String[][] rows = mapperForCsvAsArray().readValue(CSV_WITH_EMPTY_LINE);
+        // First, verify default behavior:
+        assertArrayEquals(expected(
+                row("1", "xyz"),
+                row(""),
+                row("true", "")
+        ), rows);
+    }
+
+    public void testCsvWithEmptyLineSkipBlankLinesFeatureEnabled() throws Exception {
+        String[][] rows = mapperForCsvAsArray()
+                .with(CsvParser.Feature.SKIP_EMPTY_LINES)
+                .readValue(CSV_WITH_EMPTY_LINE);
+        // empty line is skipped
+        assertArrayEquals(expected(
+                row("1", "xyz"),
+                row("true", "")
+        ), rows);
+    }
+
+
+    public void testCsvWithBlankLineSkipBlankLinesFeatureDisabled() throws Exception {
+        String[][] rows = mapperForCsvAsArray()
+                .readValue(CSV_WITH_BLANK_LINE);
+        // First, verify default behavior:
+        assertArrayEquals(expected(
+                row("1", "xyz"),
+                row("   "),
+                row("true", "")
+        ), rows);
+    }
+
+    public void testCsvWithBlankLineSkipBlankLinesFeatureEnabled() throws Exception {
+        String[][] rows = mapperForCsvAsArray()
+                .with(CsvParser.Feature.SKIP_EMPTY_LINES)
+                .readValue(CSV_WITH_BLANK_LINE);
+        // blank line is skipped
+        assertArrayEquals(expected(
+                row("1", "xyz"),
+                row("true", "")
+        ), rows);
+    }
+
+    public void testCsvWithBlankLineAndCommentSkipBlankLinesFeatureDisabled() throws Exception {
+        String[][] rows = mapperForCsvAsArray()
+                .readValue(CSV_WITH_BLANK_LINE_AND_COMMENT);
+        // First, verify default behavior:
+        assertArrayEquals(expected(
+                row("1", "xyz"),
+                row(" "),
+                row("  #comment"),
+                row(""),
+                row("true", "")
+        ), rows);
+    }
+
+    public void testCsvWithBlankLineAndCommentSkipBlankLinesFeatureEnabled() throws Exception {
+        String[][] rows = mapperForCsvAsArray()
+                .with(CsvParser.Feature.SKIP_EMPTY_LINES)
+                .readValue(CSV_WITH_BLANK_LINE_AND_COMMENT);
+        // blank/empty lines are skipped
+        assertArrayEquals(expected(
+                row("1", "xyz"),
+                row("  #comment"),
+                row("true", "")
+        ), rows);
+    }
+
+    public void testCsvWithBlankLineAndCommentSkipBlankLinesFeatureEnabledAndAllowComments() throws Exception {
+        String[][] rows = mapperForCsvAsArray()
+                .with(CsvParser.Feature.SKIP_EMPTY_LINES)
+                .with(CsvParser.Feature.ALLOW_COMMENTS)
+                .readValue(CSV_WITH_BLANK_LINE_AND_COMMENT);
+        // blank/empty/comment lines are skipped
+        assertArrayEquals(expected(
+                row("1", "xyz"),
+                row("true", "")
+        ), rows);
+    }
+
+    public void testCsvWithFirstBlankLineSkipBlankLinesFeatureDisabled() throws Exception {
+        String[][] rows = mapperForCsvAsArray()
+                .readValue(CSV_WITH_FIRST_BLANK_LINE);
+        // First, verify default behavior:
+        assertArrayEquals(expected(
+                row(""),
+                row("1", "xyz"),
+                row("true", "")
+        ), rows);
+    }
+
+    public void testCsvWithFirstBlankLineSkipBlankLinesFeatureEnabled() throws Exception {
+        String[][] rows = mapperForCsvAsArray()
+                .with(CsvParser.Feature.SKIP_EMPTY_LINES)
+                .readValue(CSV_WITH_FIRST_BLANK_LINE);
+        // blank line is skipped
+        assertArrayEquals(expected(
+                row("1", "xyz"),
+                row("true", "")
+        ), rows);
+    }
+
+
+    public void testCsvWithTrailingBlankLineSkipBlankLinesFeatureDisabled() throws Exception {
+        String[][] rows = mapperForCsvAsArray()
+                .readValue(CSV_WITH_TRAILING_BLANK_LINES);
+        // First, verify default behavior:
+        assertArrayEquals(expected(
+                row("1", "xyz"),
+                row("true", ""),
+                row("  "),
+                row("")
+        ), rows);
+    }
+
+    public void testCsvWithTrailingBlankLineSkipBlankLinesFeatureEnabled() throws Exception {
+        String[][] rows = mapperForCsvAsArray()
+                .with(CsvParser.Feature.SKIP_EMPTY_LINES)
+                .readValue(CSV_WITH_TRAILING_BLANK_LINES);
+        // the trailing blank ("  ") and empty lines are skipped: no extra rows after the data
+        assertArrayEquals(expected(
+                row("1", "xyz"),
+                row("true", "")
+        ), rows);
+    }
+
+    private ObjectReader mapperForCsvAsArray() {
+        // when wrapped as an array, we'll get array of Lists:
+        return mapperForCsv()
+                .readerFor(String[][].class)
+                .with(CsvParser.Feature.WRAP_AS_ARRAY);
+    }
+
+    private String[][] expected(String[]... rowInputs) {
+        return rowInputs;
+    }
+
+    private  String[] row(String... cellInputs) {
+        return cellInputs;
+    }
+}
diff --git a/pom.xml b/pom.xml
@@ -3,7 +3,7 @@
   <parent>
     <groupId>com.fasterxml.jackson</groupId>
     <artifactId>jackson-base</artifactId>
-    <version>2.10.0</version>
+    <version>2.10.1-SNAPSHOT</version>
   </parent>
   <groupId>com.fasterxml.jackson.dataformat</groupId>
   <artifactId>jackson-dataformats-text</artifactId>

diff --git a/release-notes/CREDITS-2.x b/release-notes/CREDITS-2.x
@@ -79,3 +79,8 @@ Matti Bickel (wundrian@github)
 Maarten Winkels (mwinkels@github)
 * Contributed fix for #83: Update index of sequence context
  (2.10.0)
+
+Vincent Boulaye (vboulaye@github)
+* Implemented #15: Add a `CsvParser.Feature.SKIP_EMPTY_LINES` to allow
+  skipping empty rows
+ (2.10.1)