Skip to content

Commit

Permalink
[dbs-leipzig#965] Escape delimiters in csv strings (dbs-leipzig#1005)
Browse files Browse the repository at this point in the history
  • Loading branch information
timo95 authored and merando committed Oct 8, 2018
1 parent af0bfd9 commit a48a268
Show file tree
Hide file tree
Showing 22 changed files with 646 additions and 150 deletions.
Expand Up @@ -22,6 +22,7 @@
import org.gradoop.common.model.impl.pojo.Element;
import org.gradoop.flink.io.impl.csv.functions.ElementToPropertyMetaData;
import org.gradoop.flink.io.impl.csv.functions.ReducePropertyMetaData;
import org.gradoop.flink.io.impl.csv.functions.StringEscaper;
import org.gradoop.flink.io.impl.csv.indexed.functions.MultipleFileOutputFormat;
import org.gradoop.flink.io.impl.csv.metadata.MetaDataParser;
import org.gradoop.flink.model.api.epgm.GraphCollection;
Expand Down Expand Up @@ -125,7 +126,8 @@ protected String getEdgeCSVPath() {
*/
protected String getGraphHeadCSVPath(String label) {
Objects.requireNonNull(label);
label = MultipleFileOutputFormat.cleanFilename(label);
label = MultipleFileOutputFormat
.cleanFilename(StringEscaper.escape(label, CSVConstants.ESCAPED_CHARACTERS));
return csvRoot +
GRAPH_HEAD_PATH +
CSVConstants.DIRECTORY_SEPARATOR +
Expand All @@ -142,7 +144,8 @@ protected String getGraphHeadCSVPath(String label) {
*/
protected String getVertexCSVPath(String label) {
Objects.requireNonNull(label);
label = MultipleFileOutputFormat.cleanFilename(label);
label = MultipleFileOutputFormat
.cleanFilename(StringEscaper.escape(label, CSVConstants.ESCAPED_CHARACTERS));
return csvRoot +
VERTEX_PATH +
CSVConstants.DIRECTORY_SEPARATOR +
Expand All @@ -159,7 +162,8 @@ protected String getVertexCSVPath(String label) {
*/
protected String getEdgeCSVPath(String label) {
Objects.requireNonNull(label);
label = MultipleFileOutputFormat.cleanFilename(label);
label = MultipleFileOutputFormat
.cleanFilename(StringEscaper.escape(label, CSVConstants.ESCAPED_CHARACTERS));
return csvRoot +
EDGE_PATH +
CSVConstants.DIRECTORY_SEPARATOR +
Expand Down
Expand Up @@ -15,16 +15,20 @@
*/
package org.gradoop.flink.io.impl.csv;

import com.google.common.collect.ImmutableSet;

import java.util.Set;

/**
* Constants needed for CSV parsing.
*/
public class CSVConstants {
/**
* Used to separate the tokens (id, label, values) in the CSV file.
* Used to separate the tokens (id, label, values) in the CSV files.
*/
public static final String TOKEN_DELIMITER = ";";
/**
* Used to separate the property values in the CSV file.
* Used to separate the property values in the CSV files.
*/
public static final String VALUE_DELIMITER = "|";
/**
Expand Down Expand Up @@ -59,4 +63,9 @@ public class CSVConstants {
* File name for indexed data
*/
public static final String SIMPLE_FILE = "data.csv";
/**
* Characters to be escaped in csv strings
*/
public static final Set<Character> ESCAPED_CHARACTERS = ImmutableSet
.of('\\', ';', ',', '|', ':', '\n', '=');
}
Expand Up @@ -47,11 +47,12 @@ public CSVLineToEdge(EPGMEdgeFactory<Edge> epgmEdgeFactory) {
@Override
public Edge map(String csvLine) throws Exception {
String[] tokens = split(csvLine, 6);
String label = StringEscaper.unescape(tokens[4]);
return edgeFactory.initEdge(GradoopId.fromString(tokens[0]),
tokens[4],
label,
GradoopId.fromString(tokens[2]),
GradoopId.fromString(tokens[3]),
parseProperties(CSVConstants.EDGE_TYPE, tokens[4], tokens[5]),
parseProperties(CSVConstants.EDGE_TYPE, label, tokens[5]),
parseGradoopIds(tokens[1]));
}
}
Expand Up @@ -28,7 +28,6 @@
import org.gradoop.flink.io.impl.csv.metadata.PropertyMetaData;

import java.util.List;
import java.util.regex.Pattern;

/**
* Base class for reading an {@link Element} from CSV. Handles the {@link MetaData} which is
Expand All @@ -41,10 +40,6 @@ abstract class CSVLineToElement<E extends Element> extends RichMapFunction<Strin
* Stores the properties for the {@link Element} to be parsed.
*/
private final Properties properties;
/**
* Needed for splitting the input.
*/
private final String valueDelimiter = Pattern.quote(CSVConstants.VALUE_DELIMITER);
/**
* Meta data that provides parsers for a specific {@link Element}.
*/
Expand Down Expand Up @@ -75,7 +70,8 @@ public void open(Configuration parameters) throws Exception {
* @return parsed properties
*/
Properties parseProperties(String type, String label, String propertyValueString) {
String[] propertyValues = propertyValueString.split(valueDelimiter);
String[] propertyValues = StringEscaper
.split(propertyValueString, CSVConstants.VALUE_DELIMITER);
List<PropertyMetaData> metaDataList = metaData.getPropertyMetaData(type, label);
properties.clear();
for (int i = 0; i < propertyValues.length; i++) {
Expand Down Expand Up @@ -108,13 +104,11 @@ GradoopIdSet parseGradoopIds(String gradoopIdsString) {
/**
* Splits the specified string.
*
* Note: Using {@link Pattern#split(CharSequence)} leads to a significant performance loss.
*
* @param s string
* @param limit resulting array length
* @return tokens
*/
public String[] split(String s, int limit) {
return s.split(CSVConstants.TOKEN_DELIMITER, limit);
return StringEscaper.split(s, CSVConstants.TOKEN_DELIMITER, limit);
}
}
Expand Up @@ -47,10 +47,11 @@ public CSVLineToGraphHead(EPGMGraphHeadFactory<GraphHead> graphHeadFactory) {
@Override
public GraphHead map(String csvLine) throws Exception {
String[] tokens = split(csvLine, 3);
String label = StringEscaper.unescape(tokens[1]);
return graphHeadFactory.initGraphHead(
GradoopId.fromString(tokens[0]),
tokens[1],
parseProperties(CSVConstants.GRAPH_TYPE, tokens[1], tokens[2])
label,
parseProperties(CSVConstants.GRAPH_TYPE, label, tokens[2])
);
}
}
Expand Up @@ -47,10 +47,11 @@ public CSVLineToVertex(EPGMVertexFactory<Vertex> epgmVertexFactory) {
@Override
public Vertex map(String csvLine) throws Exception {
String[] tokens = split(csvLine, 4);
String label = StringEscaper.unescape(tokens[2]);
return vertexFactory.initVertex(
GradoopId.fromString(tokens[0]),
tokens[2],
parseProperties(CSVConstants.VERTEX_TYPE, tokens[2], tokens[3]),
label,
parseProperties(CSVConstants.VERTEX_TYPE, label, tokens[3]),
parseGradoopIds(tokens[1])
);

Expand Down
Expand Up @@ -40,7 +40,7 @@ public CSVEdge map(Edge edge) throws Exception {
csvEdge.setGradoopIds(collectionToCsvString(edge.getGraphIds()));
csvEdge.setSourceId(edge.getSourceId().toString());
csvEdge.setTargetId(edge.getTargetId().toString());
csvEdge.setLabel(edge.getLabel());
csvEdge.setLabel(StringEscaper.escape(edge.getLabel(), CSVConstants.ESCAPED_CHARACTERS));
csvEdge.setProperties(getPropertyString(edge, CSVConstants.EDGE_TYPE));
return csvEdge;
}
Expand Down
Expand Up @@ -95,10 +95,10 @@ private String propertyValueToCsvString(PropertyValue p) {
return collectionToCsvString((Collection) p.getObject());
} else if (p.isMap()) {
return p.getMap().entrySet().stream()
.map(e -> e.getKey().toString() + CSVConstants.MAP_SEPARATOR + e.getValue().toString())
.map(e -> escape(e.getKey()) + CSVConstants.MAP_SEPARATOR + escape(e.getValue()))
.collect(Collectors.joining(CSVConstants.LIST_DELIMITER, "{", "}"));
} else {
return p.toString();
return escape(p);
}
}

Expand All @@ -110,7 +110,20 @@ private String propertyValueToCsvString(PropertyValue p) {
*/
String collectionToCsvString(Collection<?> collection) {
return collection.stream()
.map(Object::toString)
.map(o -> o instanceof PropertyValue ? escape((PropertyValue) o) : o.toString())
.collect(Collectors.joining(CSVConstants.LIST_DELIMITER, "[", "]"));
}

/**
 * Returns an escaped string representation of a property value.
 *
 * Only values for which {@link PropertyValue#isString()} holds are passed through
 * {@link StringEscaper#escape} with {@link CSVConstants#ESCAPED_CHARACTERS}; any other value
 * is returned as its unmodified {@code toString()} result.
 *
 * @param propertyValue property value to be escaped
 * @return escaped string representation
 */
private static String escape(PropertyValue propertyValue) {
  return propertyValue.isString() ?
    StringEscaper.escape(propertyValue.toString(), CSVConstants.ESCAPED_CHARACTERS) :
    propertyValue.toString();
}
}
Expand Up @@ -62,16 +62,13 @@ public Tuple3<String, String, Set<String>> map(E e) throws Exception {
} else {
throw new Exception("Unsupported element class");
}

reuseTuple.f1 = e.getLabel();

reuseTuple.f1 = StringEscaper.escape(e.getLabel(), CSVConstants.ESCAPED_CHARACTERS);
reuseTuple.f2.clear();
if (e.getProperties() != null) {
for (Property property : e.getProperties()) {
reuseTuple.f2.add(MetaDataParser.getPropertyMetaData(property));
}
}

return reuseTuple;
}
}
Expand Up @@ -37,7 +37,8 @@ public class GraphHeadToCSVGraphHead extends ElementToCSV<GraphHead, CSVGraphHea
@Override
public CSVGraphHead map(GraphHead graphHead) throws Exception {
csvGraphHead.setId(graphHead.getId().toString());
csvGraphHead.setLabel(graphHead.getLabel());
csvGraphHead.setLabel(StringEscaper.escape(graphHead.getLabel(),
CSVConstants.ESCAPED_CHARACTERS));
csvGraphHead.setProperties(getPropertyString(graphHead, CSVConstants.GRAPH_TYPE));
return csvGraphHead;
}
Expand Down

0 comments on commit a48a268

Please sign in to comment.