Skip to content

Commit 60bd896

Browse files
author
Shawn Smith
committed
Fix JSON field extraction with a mix of nested objects.
1 parent e3bb04e commit 60bd896

File tree

3 files changed

+41
-37
lines changed

3 files changed

+41
-37
lines changed

mr/src/main/java/org/elasticsearch/hadoop/serialization/ParsingUtils.java

Lines changed: 23 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -158,38 +158,18 @@ public static List<Object> values(Parser parser, String... paths) {
158158
}
159159

160160
private static void doFind(Parser parser, List<Matcher> currentMatchers, int level, int maxNesting) {
161-
String currentName;
162161
Token token = parser.currentToken();
163-
164162
if (token == null) {
165-
token = parser.nextToken();
163+
// advance to the initial START_OBJECT token
164+
parser.nextToken();
166165
}
167-
List<Matcher> nextLevel = null;
168-
169-
while ((token = parser.nextToken()) != null) {
170-
if (token == Token.START_OBJECT) {
171-
if (level < maxNesting) {
172-
if (nextLevel != null) {
173-
doFind(parser, nextLevel, level + 1, maxNesting);
174-
}
175-
// first round - a bit exceptional
176-
else if (level == -1) {
177-
doFind(parser, currentMatchers, level + 1, maxNesting);
178-
}
179-
// no need to go deeper, there are no matchers
180-
else {
181-
parser.skipChildren();
182-
}
183-
}
184-
else {
185-
parser.skipChildren();
186-
}
187-
}
188-
else if (token == Token.FIELD_NAME) {
189-
currentName = parser.currentName();
190166

167+
while ((token = parser.nextToken()) != null && token != Token.END_OBJECT) {
168+
if (token == Token.FIELD_NAME) {
169+
String currentName = parser.currentName();
191170
Object value = null;
192171
boolean valueRead = false;
172+
List<Matcher> nextLevel = null;
193173

194174
for (Matcher matcher : currentMatchers) {
195175
if (matcher.matches(currentName, level)) {
@@ -225,15 +205,24 @@ else if (token == Token.FIELD_NAME) {
225205
}
226206
}
227207
}
208+
209+
if (!valueRead) {
210+
// must parse or skip the value
211+
switch (parser.nextToken()) {
212+
case START_OBJECT:
213+
if (level < maxNesting && nextLevel != null) {
214+
doFind(parser, nextLevel, level + 1, maxNesting);
215+
} else {
216+
parser.skipChildren();
217+
}
218+
break;
219+
case START_ARRAY:
220+
// arrays are not handled; simply ignore
221+
parser.skipChildren();
222+
break;
223+
}
224+
}
228225
}
229-
else if (token == Token.END_OBJECT) {
230-
// end current block
231-
}
232-
// arrays are not handled; simply ignore
233-
else if (token == Token.START_ARRAY) {
234-
parser.skipChildren();
235-
}
236-
// ignore other tokens
237226
}
238227
}
239228

mr/src/test/java/org/elasticsearch/hadoop/serialization/JsonValuePathTest.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,4 +104,14 @@ public void testCorrectLevelMatched() throws Exception {
104104
assertEquals(1, vals.size());
105105
assertThat(vals.get(0).toString(), containsString("CA"));
106106
}
107+
108+
@Test
109+
public void testMixedLevels() throws Exception {
110+
List<Object> vals = ParsingUtils.values(parser, "firstName", "address.building.floors", "address.decor.walls", "zzz");
111+
assertEquals(4, vals.size());
112+
assertEquals("John", vals.get(0));
113+
assertEquals(10, vals.get(1));
114+
assertEquals("white", vals.get(2));
115+
assertEquals("end", vals.get(3));
116+
}
107117
}

mr/src/test/resources/org/elasticsearch/hadoop/serialization/parser-test-nested.json

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"bogus":true,
77
"state":"bogus",
88
"building": {
9-
"hight":"bogus",
9+
"height":"bogus",
1010
"floors": 0,
1111
"flats": 0
1212
}
@@ -17,10 +17,14 @@
1717
"state": "NY",
1818
"postalCode": 10021,
1919
"building": {
20-
"hight":"tall",
20+
"height":"tall",
2121
"floors": 10,
2222
"flats": 40
2323
},
24+
"decor": {
25+
"walls": "white",
26+
"floors": "parquet"
27+
},
2428
"firstName":"should-not-be-picked"
2529
},
2630
"phoneNumbers": [
@@ -44,5 +48,6 @@
4448
},
4549
"small-array": [
4650
"foo", "bar"
47-
]
51+
],
52+
"zzz": "end"
4853
}

0 commit comments

Comments
 (0)