Skip to content

Commit

Permalink
WARCBolt to handle incorrect URIs gracefully, fixes #560
Browse files (browse the repository at this point in the history)
  • Loading branch information
jnioche committed Apr 7, 2018
1 parent b39b37c commit 62d4424
Showing 1 changed file with 8 additions and 5 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -21,6 +21,8 @@
import com.digitalpebble.stormcrawler.protocol.HttpHeaders;

import org.apache.storm.tuple.Tuple;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Generate a byte representation of a WARC entry from a tuple **/
@SuppressWarnings("serial")
Expand All @@ -30,6 +32,9 @@ public class WARCRecordFormat implements RecordFormat {
private static final String CRLF = "\r\n";
private static final byte[] CRLF_BYTES = { 13, 10 };

private static final Logger LOG = LoggerFactory
.getLogger(WARCRecordFormat.class);

public static final SimpleDateFormat WARC_DF = new SimpleDateFormat(
"yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.ENGLISH);

Expand Down Expand Up @@ -168,17 +173,15 @@ public byte[] format(Tuple tuple) {
.append(CRLF);
}

String targetURI = null;

// must be a valid URI
try {
String normalised = url.replaceAll(" ", "%20");
URI uri = URI.create(normalised);
targetURI = uri.toASCIIString();
String targetURI = URI.create(normalised).toASCIIString();
buffer.append("WARC-Target-URI").append(": ").append(targetURI)
.append(CRLF);
} catch (Exception e) {
throw new RuntimeException("Invalid URI " + url);
LOG.warn("Incorrect URI: {}", url);
return new byte[] {};
}

// provide a ContentType if type response
Expand Down

0 comments on commit 62d4424

Please sign in to comment.