|
|
@@ -37,6 +37,9 @@ of this software and associated documentation files (the "Software"), to deal |
|
|
|
|
|
|
|
|
public class WikipediaDumpProcessor {
|
|
|
+ private static final int titleStreamId = 0;
|
|
|
+ private static final int bodyStreamId = 1;
|
|
|
+
|
|
|
InputStream inputStream;
|
|
|
OutputStream outputStream;
|
|
|
Scanner scanner;
|
|
|
@@ -87,22 +90,18 @@ private void ProcessDocumentHeader() throws Exception { |
|
|
throw new RuntimeException("Malformed document header.");
|
|
|
}
|
|
|
|
|
|
- String id = matcher.group(1);
|
|
|
+ int documentId = Integer.parseUnsignedInt(matcher.group(1));
|
|
|
+ emit(String.format("%016x", documentId));
|
|
|
String title = matcher.group(2);
|
|
|
|
|
|
- try (StreamScope scope = new StreamScope("title")) {
|
|
|
+ try (StreamScope scope = new StreamScope(titleStreamId)) {
|
|
|
emit(title);
|
|
|
}
|
|
|
-
|
|
|
- // TODO: Should id be an integer in the file?
|
|
|
- try (StreamScope scope = new StreamScope("id")) {
|
|
|
- emit(id);
|
|
|
- }
|
|
|
}
|
|
|
|
|
|
|
|
|
private void ProcessAllContentLines() throws Exception {
|
|
|
- try (StreamScope scope = new StreamScope("content")) {
|
|
|
+ try (StreamScope scope = new StreamScope(bodyStreamId)) {
|
|
|
while (true) {
|
|
|
String line = PeekLine();
|
|
|
if (!IsDocumentEnd(line)) {
|
|
|
@@ -196,8 +195,8 @@ public void close() throws Exception { |
|
|
|
|
|
|
|
|
private class StreamScope implements java.lang.AutoCloseable {
|
|
|
- public StreamScope(String name) {
|
|
|
- emit(name);
|
|
|
+ public StreamScope(int streamId) {
|
|
|
+ emit(String.format("%02x", streamId));
|
|
|
}
|
|
|
|
|
|
@Override
|
|
|
|
0 comments on commit
e5b51a0