Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle Byte Order Mark (BOM) correctly for CSVs. Fixes #6527 #6528

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 3 additions & 2 deletions main/src/com/google/refine/importers/FixedWidthImporter.java
Expand Up @@ -27,11 +27,12 @@

package com.google.refine.importers;

import static com.google.refine.importing.ImportingUtilities.getInputStreamReader;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
Expand Down Expand Up @@ -184,7 +185,7 @@ static private ArrayList<Object> getCells(String line, int[] widths) {
static public int[] guessColumnWidths(File file, String encoding) {
try {
InputStream is = new FileInputStream(file);
Reader reader = (encoding != null) ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
Reader reader = getInputStreamReader(is, encoding);
LineNumberReader lineNumberReader = new LineNumberReader(reader);

try {
Expand Down
Expand Up @@ -37,7 +37,6 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
Expand Down Expand Up @@ -242,7 +241,7 @@ static public class Separator {

static public CsvFormat guessFormat(File file, String encoding) {
try (InputStream is = new FileInputStream(file);
Reader reader = encoding != null ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
Reader reader = ImportingUtilities.getInputStreamReader(is, encoding);
LineNumberReader lineNumberReader = new LineNumberReader(reader)) {
CsvParserSettings settings = new CsvParserSettings();
// We could provide a set of delimiters to consider below if we wanted to restrict this
Expand All @@ -265,7 +264,7 @@ static public Separator guessSeparator(File file, String encoding) {
static public Separator guessSeparator(File file, String encoding, boolean handleQuotes) {
try {
try (InputStream is = new FileInputStream(file);
Reader reader = encoding != null ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
Reader reader = ImportingUtilities.getInputStreamReader(is, encoding);
LineNumberReader lineNumberReader = new LineNumberReader(reader)) {

List<Separator> separators = new ArrayList<>();
Expand Down Expand Up @@ -340,4 +339,5 @@ static public Separator guessSeparator(File file, String encoding, boolean handl
}
return null;
}

}
6 changes: 3 additions & 3 deletions main/src/com/google/refine/importers/TextFormatGuesser.java
Expand Up @@ -27,12 +27,13 @@

package com.google.refine.importers;

import static com.google.refine.importing.ImportingUtilities.getInputStreamReader;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;

import com.google.common.base.CharMatcher;
Expand All @@ -56,8 +57,7 @@ public String guess(File file, String encoding, String seedFormat) {
}

InputStream bis = new BoundedInputStream(fis, 64 * 1024); // TODO: This seems like a lot
try (BufferedReader reader = new BufferedReader(
encoding != null ? new InputStreamReader(bis, encoding) : new InputStreamReader(bis))) {
try (BufferedReader reader = new BufferedReader(getInputStreamReader(bis, encoding))) {
int totalChars = 0;
long openBraces = 0;
int closeBraces = 0;
Expand Down
35 changes: 15 additions & 20 deletions main/src/com/google/refine/importing/ImportingUtilities.java
Expand Up @@ -44,7 +44,6 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
Expand Down Expand Up @@ -107,6 +106,17 @@ public class ImportingUtilities {

final public static List<String> allowedProtocols = Arrays.asList("http", "https", "ftp", "sftp");

/**
 * Creates a reader over {@code inputStream} for the requested encoding.
 * <ul>
 * <li>{@code null} encoding: the reader is built without an explicit charset.</li>
 * <li>{@link EncodingGuesser#UTF_8_BOM}: the stream is wrapped in a {@link UnicodeBOMInputStream} and decoded
 * as UTF-8.</li>
 * <li>anything else: the name is handed to {@link InputStreamReader} as-is.</li>
 * </ul>
 *
 * @param inputStream
 *            the raw byte stream to decode
 * @param encoding
 *            charset name, the {@code UTF_8_BOM} marker, or {@code null}
 * @return a reader decoding {@code inputStream}
 * @throws IOException
 *             if the BOM-aware wrapper cannot be created or the encoding name is rejected by
 *             {@link InputStreamReader}
 */
public static InputStreamReader getInputStreamReader(InputStream inputStream, String encoding) throws IOException {
    if (encoding != null) {
        // The BOM marker is a pseudo-encoding name rather than a charset the JDK can look up,
        // so it has to be intercepted before the name reaches InputStreamReader.
        final boolean isBomMarker = EncodingGuesser.UTF_8_BOM.equals(encoding);
        if (isBomMarker) {
            // NOTE(review): the 'true' flag presumably asks the wrapper to skip the BOM bytes - confirm
            InputStream bomAwareStream = new UnicodeBOMInputStream(inputStream, true);
            return new InputStreamReader(bomAwareStream, UTF_8);
        }
        return new InputStreamReader(inputStream, encoding);
    }
    // No encoding supplied: let InputStreamReader pick its default charset.
    return new InputStreamReader(inputStream);
}

static public interface Progress {

public void setProgress(String message, int percent);
Expand Down Expand Up @@ -568,26 +578,11 @@ static public Reader getReaderFromStream(InputStream inputStream, ObjectNode fil
if (encoding == null) {
encoding = commonEncoding;
}
if (encoding != null) {

// Special case for UTF-8 with BOM
if (EncodingGuesser.UTF_8_BOM.equals(encoding)) {
try {
return new InputStreamReader(new UnicodeBOMInputStream(inputStream, true), UTF_8);
} catch (IOException e) {
throw new RuntimeException("Exception from UnicodeBOMInputStream", e);
}
} else {
try {
return new InputStreamReader(inputStream, encoding);
} catch (UnsupportedEncodingException e) {
// This should never happen since they picked from a list of supported encodings
throw new RuntimeException("Unsupported encoding: " + encoding, e);
}
}

try {
return getInputStreamReader(inputStream, encoding);
} catch (IOException e) {
throw new RuntimeException("Exception getting InputStreamReader", e);
}
return new InputStreamReader(inputStream);
}

static public File getFile(ImportingJob job, ObjectNode fileRecord) {
Expand Down