Permalink
Browse files

Support for hexadecimal DocId and StreamId.

  • Loading branch information...
MikeHopcroft committed Jul 20, 2016
1 parent 5dab1e0 commit e5b51a0df6b263bd939b7fa135001a0543d6302b
@@ -37,6 +37,9 @@ of this software and associated documentation files (the "Software"), to deal
public class WikipediaDumpProcessor {
private static final int titleStreamId = 0;
private static final int bodyStreamId = 1;
InputStream inputStream;
OutputStream outputStream;
Scanner scanner;
@@ -87,22 +90,18 @@ private void ProcessDocumentHeader() throws Exception {
throw new RuntimeException("Malformed document header.");
}
String id = matcher.group(1);
int documentId = Integer.parseUnsignedInt(matcher.group(1));
emit(String.format("%016x", documentId));
String title = matcher.group(2);
try (StreamScope scope = new StreamScope("title")) {
try (StreamScope scope = new StreamScope(titleStreamId)) {
emit(title);
}
// TODO: Should id be an integer in the file?
try (StreamScope scope = new StreamScope("id")) {
emit(id);
}
}
private void ProcessAllContentLines() throws Exception {
try (StreamScope scope = new StreamScope("content")) {
try (StreamScope scope = new StreamScope(bodyStreamId)) {
while (true) {
String line = PeekLine();
if (!IsDocumentEnd(line)) {
@@ -196,8 +195,8 @@ public void close() throws Exception {
private class StreamScope implements java.lang.AutoCloseable {
public StreamScope(String name) {
emit(name);
public StreamScope(int streamId) {
emit(String.format("%02x", streamId));
}
@Override
@@ -69,9 +69,9 @@ public void testWikipediaToCorpus() {
"</doc>\n";
byte[] expected =
("title\0one\0\0id\000123\0\0content\0body\0text\0\0\0" +
"title\0two\0\0id\000456\0\0content\0some\0more\0body\0text\0\0\0" +
"\0").getBytes(StandardCharsets.UTF_8);
("000000000000007b\00000\000one\000\00001\000body\000text\000\000\000" +
"00000000000001c8\00000\000two\000\00001\000some\000more\000body\000text\000\000\000" +
"\000").getBytes(StandardCharsets.UTF_8);
InputStream input = new ByteArrayInputStream(wikipedia.getBytes(StandardCharsets.UTF_8));
ByteArrayOutputStream output = new ByteArrayOutputStream();

0 comments on commit e5b51a0

Please sign in to comment.