Skip to content

Commit

Permalink
Fix #15: Implement CsvParser.Feature.SKIP_EMPTY_LINES
Browse files Browse the repository at this point in the history
  • Loading branch information
vboulaye authored and cowtowncoder committed Oct 8, 2019
1 parent 4eff590 commit 32437c0
Show file tree
Hide file tree
Showing 8 changed files with 235 additions and 45 deletions.
2 changes: 1 addition & 1 deletion csv/pom.xml
Expand Up @@ -40,7 +40,7 @@ abstractions.
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>18.0</version>
<version>25.0-jre</version>
<scope>test</scope>
</dependency>
</dependencies>
Expand Down
Expand Up @@ -73,13 +73,14 @@ public enum Feature
IGNORE_TRAILING_UNMAPPABLE(false),

/**
* Feature that allows skipping input lines that are completely empty or blank
* (composed only of whitespace), instead of being decoded as lines of just a
* single column with an empty/blank String value (or, depending on binding, `null`).
*<p>
* Feature is disabled by default.
*
* @since 2.9
* @since 2.10
*/
SKIP_EMPTY_LINES(false),

Expand Down Expand Up @@ -787,19 +788,19 @@ protected void _readHeaderLine() throws IOException {
*/
protected JsonToken _handleStartDoc() throws IOException
{
// also, if comments enabled, may need to skip leading ones
_reader.skipLeadingComments();
// also, if comments enabled, or skip empty lines, may need to skip leading ones
_reader.skipLinesWhenNeeded();
// First things first: are we expecting header line? If so, read, process
if (_schema.usesHeader()) {
_readHeaderLine();
_reader.skipLeadingComments();
_reader.skipLinesWhenNeeded();
}
// and if we are to skip the first data line, skip it
if (_schema.skipsFirstDataRow()) {
_reader.skipLine();
_reader.skipLeadingComments();
_reader.skipLinesWhenNeeded();
}

// Only one real complication, actually; empty documents (zero bytes).
// Those have no entries. Should be easy enough to detect like so:
final boolean wrapAsArray = Feature.WRAP_AS_ARRAY.enabledIn(_formatFeatures);
Expand Down
Expand Up @@ -65,7 +65,12 @@ public class CsvDecoder
protected boolean _trimSpaces;

protected boolean _allowComments;


/**
* @since 2.10.1
*/
protected boolean _skipBlankLines; // NOTE: can be final in 3.0, not before

/**
* Maximum of quote character, linefeeds (\r and \n), escape character.
*/
Expand Down Expand Up @@ -111,14 +116,14 @@ public class CsvDecoder
* needs to be handled (indicates end-of-record).
*/
protected int _pendingLF = 0;

/**
* Flag that indicates whether parser is closed or not. Gets
* set when parser is either closed by explicit call
* ({@link #close}) or when end-of-input is reached.
*/
protected boolean _closed;

/*
/**********************************************************************
/* Current input location information
Expand Down Expand Up @@ -152,7 +157,7 @@ public class CsvDecoder
* For big (gigabyte-sized) sizes are possible, needs to be long,
* unlike pointers and sizes related to in-memory buffers.
*/
protected long _tokenInputTotal = 0;
protected long _tokenInputTotal = 0;

/**
* Input row on which current token starts, 1-based
Expand Down Expand Up @@ -202,8 +207,7 @@ public class CsvDecoder

final static double MIN_INT_D = Integer.MIN_VALUE;
final static double MAX_INT_D = Integer.MAX_VALUE;



// Digits, numeric
final protected static int INT_0 = '0';
final protected static int INT_1 = '1';
Expand Down Expand Up @@ -254,8 +258,8 @@ public class CsvDecoder
/**********************************************************************
*/

@SuppressWarnings("deprecation")
public CsvDecoder(CsvParser owner, IOContext ctxt, Reader r, CsvSchema schema, TextBuffer textBuffer,
public CsvDecoder(CsvParser owner, IOContext ctxt, Reader r, CsvSchema schema,
TextBuffer textBuffer,
int stdFeatures, int csvFeatures)
{
_owner = owner;
Expand All @@ -266,6 +270,7 @@ public CsvDecoder(CsvParser owner, IOContext ctxt, Reader r, CsvSchema schema, T
final boolean legacy = JsonParser.Feature.ALLOW_YAML_COMMENTS.enabledIn(stdFeatures);
_allowComments = legacy | CsvParser.Feature.ALLOW_COMMENTS.enabledIn(csvFeatures);
_trimSpaces = CsvParser.Feature.TRIM_SPACES.enabledIn(csvFeatures);
_skipBlankLines = CsvParser.Feature.SKIP_EMPTY_LINES.enabledIn(csvFeatures);
_inputBuffer = ctxt.allocTokenBuffer();
_bufferRecyclable = true; // since we allocated it
_inputSource = r;
Expand All @@ -292,6 +297,7 @@ public void setSchema(CsvSchema schema)
*/
public void overrideFormatFeatures(int csvFeatures) {
// Re-derive the space-trimming flag from the newly supplied CSV feature flags
_trimSpaces = CsvParser.Feature.TRIM_SPACES.enabledIn(csvFeatures);
// Likewise re-derive whether empty/blank lines are to be skipped
_skipBlankLines = CsvParser.Feature.SKIP_EMPTY_LINES.enabledIn(csvFeatures);
// NOTE(review): _allowComments is assigned in the constructor but is not refreshed here -- confirm this is intentional
}

/*
Expand Down Expand Up @@ -482,39 +488,53 @@ public boolean startNewLine() throws IOException
}
_handleLF();
}
/* For now, we will only require that there is SOME data
* following linefeed -- even spaces will do.
* In future we may want to use better heuristics to possibly
* skip trailing empty line?
*/
if ((_inputPtr >= _inputEnd) && !loadMore()) {
return false;
}

if (_allowComments && _inputBuffer[_inputPtr] == '#') {
int i = _skipCommentLines();
// end-of-input?
if (i < 0) {
return false;
}
// otherwise push last read char back
--_inputPtr;
}
return true;
return skipLinesWhenNeeded();
}

public void skipLeadingComments() throws IOException
{
if (_allowComments) {
if ((_inputPtr < _inputEnd) || loadMore()) {
if (_inputBuffer[_inputPtr] == '#') {
_skipCommentLines();
--_inputPtr;
/**
* Skips over leading lines that should not be parsed as data, leaving the input
* pointer at the start of the first line that has to be decoded.
*<p>
* When neither comment handling ({@code _allowComments}) nor blank-line skipping
* ({@code _skipBlankLines}) is active, nothing is consumed and the method only
* reports whether more input is available.
* Otherwise lines that are empty or consist only of space characters are skipped;
* when comments are allowed, lines whose first non-space character is {@code '#'}
* are skipped as well.
*<p>
* Note that, as written, empty/space-only lines are skipped whenever either of the
* two flags is set (the blank check is not conditional on {@code _skipBlankLines}),
* and only {@code ' '} is treated as blank (tabs are not).
*
* @return {@code true} if a line that needs to be parsed was found (input pointer is
*   rewound to the start of that line); {@code false} if the end of input was reached
* @throws IOException propagated from the underlying input handling
* @since 2.10.1
*/
public boolean skipLinesWhenNeeded() throws IOException {
// Fast path: no skipping feature active, just report input availability
if (!(_allowComments || _skipBlankLines)) {
return hasMoreInput();
}
// Index (within _inputBuffer) of the first character of the line being examined
// NOTE(review): this index is kept across hasMoreInput() calls; if that call can
// refill/replace _inputBuffer, the index may no longer point at the line start -- confirm
int firstCharacterPtr = _inputPtr;
while (hasMoreInput()) {
char ch = _inputBuffer[_inputPtr++];
if (ch == '\r' || ch == '\n') {
// line ended without any significant character: record the linefeed and let
// _handleLF() process it before moving on to the next line
_pendingLF = ch;
_handleLF();
// track the start of the new line
firstCharacterPtr = _inputPtr;
continue;
}
if (ch == ' ') {
// skip all blanks (in both comments/blanks skip mode)
continue;
}
if (_allowComments) {
if (_inputBuffer[firstCharacterPtr] == '#') {
// on a commented line, skip everything
continue;
}
if (ch == '#') {
// we reach this point when whitespaces precedes the hash character
// move the firstCharacterPtr to the '#' location in order to skip the line completely
firstCharacterPtr = _inputPtr-1;
continue;
}
}
// we reached a non skippable character, this line needs to be parsed
// rollback the input pointer to the beginning of the line
// (any leading spaces of that line are therefore left in place for the caller)
_inputPtr = firstCharacterPtr;
return true; // processing can go on
}
return false; // end of input
}

protected int _skipCommentLines() throws IOException
{
while ((_inputPtr < _inputEnd) || loadMore()) {
Expand Down
Expand Up @@ -8,7 +8,7 @@
// Tests for [csv#56]
public class CommentsTest extends ModuleTestBase
{
final String CSV_WITH_COMMENTS = "x,y\n# comment!\na,b\n# another...\n";
final String CSV_WITH_COMMENTS = "x,y\n# comment!\na,b\n # another...\n";

public void testWithoutComments() throws Exception
{
Expand Down
@@ -0,0 +1,159 @@
package com.fasterxml.jackson.dataformat.csv.deser;

import com.fasterxml.jackson.databind.ObjectReader;
import com.fasterxml.jackson.dataformat.csv.CsvParser;
import com.fasterxml.jackson.dataformat.csv.ModuleTestBase;

import static org.junit.Assert.assertArrayEquals;

/**
 * Tests for [dataformats-text#15]: allow skipping of empty lines via
 * {@link CsvParser.Feature#SKIP_EMPTY_LINES}.
 *<p>
 * Each input document is read twice: once with default settings (empty/blank lines
 * are reported as single-column rows) and once with the feature enabled (such lines
 * are expected to be dropped).
 */
public class SkipBlankLines15Test extends ModuleTestBase {

    // Completely empty line between two data rows
    private static final String CSV_WITH_EMPTY_LINE = "1,\"xyz\"\n\ntrue,\n";
    // Line containing a single space between two data rows
    private static final String CSV_WITH_BLANK_LINE = "1,\"xyz\"\n \ntrue,\n";
    // Blank line, indented comment line and empty line between two data rows
    private static final String CSV_WITH_BLANK_LINE_AND_COMMENT = "1,\"xyz\"\n \n #comment\n\ntrue,\n";
    // Document starting with an empty line
    private static final String CSV_WITH_FIRST_BLANK_LINE = "\n1,\"xyz\"\ntrue,\n";
    // Document ending with a blank line followed by an empty line
    private static final String CSV_WITH_TRAILING_BLANK_LINES = "1,\"xyz\"\ntrue,\n \n\n";

    public void testCsvWithEmptyLineSkipBlankLinesFeatureDisabled() throws Exception {
        String[][] rows = mapperForCsvAsArray().readValue(CSV_WITH_EMPTY_LINE);
        // First, verify default behavior:
        assertArrayEquals(expected(
                row("1", "xyz"),
                row(""),
                row("true", "")
        ), rows);
    }

    public void testCsvWithEmptyLineSkipBlankLinesFeatureEnabled() throws Exception {
        String[][] rows = mapperForCsvAsArray()
                .with(CsvParser.Feature.SKIP_EMPTY_LINES)
                .readValue(CSV_WITH_EMPTY_LINE);
        // empty line is skipped
        assertArrayEquals(expected(
                row("1", "xyz"),
                row("true", "")
        ), rows);
    }

    public void testCsvWithBlankLineSkipBlankLinesFeatureDisabled() throws Exception {
        String[][] rows = mapperForCsvAsArray()
                .readValue(CSV_WITH_BLANK_LINE);
        // First, verify default behavior:
        assertArrayEquals(expected(
                row("1", "xyz"),
                row(" "),
                row("true", "")
        ), rows);
    }

    public void testCsvWithBlankLineSkipBlankLinesFeatureEnabled() throws Exception {
        String[][] rows = mapperForCsvAsArray()
                .with(CsvParser.Feature.SKIP_EMPTY_LINES)
                .readValue(CSV_WITH_BLANK_LINE);
        // blank line is skipped
        assertArrayEquals(expected(
                row("1", "xyz"),
                row("true", "")
        ), rows);
    }

    public void testCsvWithBlankLineAndCommentSkipBlankLinesFeatureDisabled() throws Exception {
        String[][] rows = mapperForCsvAsArray()
                .readValue(CSV_WITH_BLANK_LINE_AND_COMMENT);
        // First, verify default behavior:
        assertArrayEquals(expected(
                row("1", "xyz"),
                row(" "),
                row(" #comment"),
                row(""),
                row("true", "")
        ), rows);
    }

    public void testCsvWithBlankLineAndCommentSkipBlankLinesFeatureEnabled() throws Exception {
        String[][] rows = mapperForCsvAsArray()
                .with(CsvParser.Feature.SKIP_EMPTY_LINES)
                .readValue(CSV_WITH_BLANK_LINE_AND_COMMENT);
        // blank/empty lines are skipped; the comment line is kept since comments are not enabled
        assertArrayEquals(expected(
                row("1", "xyz"),
                row(" #comment"),
                row("true", "")
        ), rows);
    }

    public void testCsvWithBlankLineAndCommentSkipBlankLinesFeatureEnabledAndAllowComments() throws Exception {
        String[][] rows = mapperForCsvAsArray()
                .with(CsvParser.Feature.SKIP_EMPTY_LINES)
                .with(CsvParser.Feature.ALLOW_COMMENTS)
                .readValue(CSV_WITH_BLANK_LINE_AND_COMMENT);
        // blank/empty/comment lines are skipped
        assertArrayEquals(expected(
                row("1", "xyz"),
                row("true", "")
        ), rows);
    }

    public void testCsvWithFirstBlankLineSkipBlankLinesFeatureDisabled() throws Exception {
        String[][] rows = mapperForCsvAsArray()
                .readValue(CSV_WITH_FIRST_BLANK_LINE);
        // First, verify default behavior:
        assertArrayEquals(expected(
                row(""),
                row("1", "xyz"),
                row("true", "")
        ), rows);
    }

    public void testCsvWithFirstBlankLineSkipBlankLinesFeatureEnabled() throws Exception {
        String[][] rows = mapperForCsvAsArray()
                .with(CsvParser.Feature.SKIP_EMPTY_LINES)
                .readValue(CSV_WITH_FIRST_BLANK_LINE);
        // blank line is skipped
        assertArrayEquals(expected(
                row("1", "xyz"),
                row("true", "")
        ), rows);
    }

    public void testCsvWithTrailingBlankLineSkipBlankLinesFeatureDisabled() throws Exception {
        String[][] rows = mapperForCsvAsArray()
                .readValue(CSV_WITH_TRAILING_BLANK_LINES);
        // First, verify default behavior:
        assertArrayEquals(expected(
                row("1", "xyz"),
                row("true", ""),
                row(" "),
                row("")
        ), rows);
    }

    public void testCsvWithTrailingBlankLineSkipBlankLinesFeatureEnabled() throws Exception {
        // Must read the document with TRAILING blank lines: using the
        // leading-blank-line fixture here would leave this case untested
        String[][] rows = mapperForCsvAsArray()
                .with(CsvParser.Feature.SKIP_EMPTY_LINES)
                .readValue(CSV_WITH_TRAILING_BLANK_LINES);
        // blank lines are skipped
        assertArrayEquals(expected(
                row("1", "xyz"),
                row("true", "")
        ), rows);
    }

    /**
     * Builds a reader that binds the whole CSV document to a {@code String[][]}
     * (one {@code String[]} per row), with {@link CsvParser.Feature#WRAP_AS_ARRAY} enabled.
     */
    private ObjectReader mapperForCsvAsArray() {
        // when wrapped as an array, rows are bound as an array of String arrays:
        return mapperForCsv()
                .readerFor(String[][].class)
                .with(CsvParser.Feature.WRAP_AS_ARRAY);
    }

    /** Readability helper: the expected rows of a document, in order. */
    private String[][] expected(String[]... rowInputs) {
        return rowInputs;
    }

    /** Readability helper: the expected cell values of a single row. */
    private String[] row(String... cellInputs) {
        return cellInputs;
    }
}
2 changes: 1 addition & 1 deletion pom.xml
Expand Up @@ -3,7 +3,7 @@
<parent>
<groupId>com.fasterxml.jackson</groupId>
<artifactId>jackson-base</artifactId>
<version>2.10.0</version>
<version>2.10.1-SNAPSHOT</version>
</parent>
<groupId>com.fasterxml.jackson.dataformat</groupId>
<artifactId>jackson-dataformats-text</artifactId>
Expand Down
5 changes: 5 additions & 0 deletions release-notes/CREDITS-2.x
Expand Up @@ -79,3 +79,8 @@ Matti Bickel (wundrian@github)
Maarten Winkels (mwinkels@github)
* Contributed fix for #83: Update index of sequence context
(2.10.0)

Vincent Boulaye (vboulaye@github)
* Implemented #15: Add a `CsvParser.Feature.SKIP_EMPTY_LINES` to allow
skipping empty rows
(2.10.1)

0 comments on commit 32437c0

Please sign in to comment.