Skip to content

Commit

Permalink
Fix bug in the transformation of the intervals from token-based to ch…
Browse files Browse the repository at this point in the history
…aracter-based when the same token occurs again later in the text and the annotation consists of a single token
  • Loading branch information
lfoppiano committed May 4, 2024
1 parent cedee64 commit 83c7a10
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 3 deletions.
Expand Up @@ -1586,7 +1586,7 @@ public static List<OffsetPosition> matchTokenAndString(List<LayoutToken> layoutT
if (StringUtils.isNotEmpty(accumulator)) {
int start = text.indexOf(accumulator.toString(), pos);
newPositions.add(new OffsetPosition(start, start + accumulator.toString().length()));
pos = textPositionOfToken;
pos = textPositionOfToken + 1;
break;
}
pos = textPositionOfToken;
Expand All @@ -1596,7 +1596,7 @@ public static List<OffsetPosition> matchTokenAndString(List<LayoutToken> layoutT
int start = text.indexOf(accumulator.toString(), pos);
newPositions.add(new OffsetPosition(start, start + accumulator.toString().length()));
accumulator = new StringBuilder();
pos = textPositionOfToken;
pos = textPositionOfToken + 1;
}

}
Expand Down
Expand Up @@ -13,7 +13,6 @@
import java.util.regex.Matcher;

import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.startsWith;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.hasSize;
import static org.junit.Assert.*;
Expand Down Expand Up @@ -513,4 +512,39 @@ public void testMatchTokenAndString_twoElementsWithEqualValue() throws Exception
assertThat(input.substring(url3.start, url3.end), is("Claudio Stalder"));

}

/**
 * Regression check for {@code TextUtilities.matchTokenAndString} on a text in which
 * the same token text ("IGC") is annotated twice, each time as a short span.
 * <p>
 * The token-based spans are converted to character-based spans and the last two
 * results are expected to point at two different occurrences of "IGC" in the input
 * (characters 349-352 and 397-400) rather than at the same one.
 */
@Test
public void testMatchTokenAndString_twoElementsWithEqualValue2() throws Exception {
    final String input = "We thank Felix Randow, Shigeki Higashiyama and Feng Zhang for plasmids.We thank Florian Steinberg for discussions and disclosure of unpublished results.We thank Matthew Freeman for helpful discussions.We express our deep gratitude to Moises Mallo for advice concerning CRISPR plus CRISPR reagents.We are grateful for the assistance of Ana Nóvoa and IGC's transgenics and mouse facilities.We thank IGC's cell sorting/flow cytometry, sequencing, and histopathology facilities.";

    // Tokenise the raw text with the project analyzer so that spans can be given as token indices.
    List<LayoutToken> layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);

    // Annotation spans expressed over the token list (not over characters).
    // The last two entries are the ones checked below against the two "IGC" mentions.
    List<OffsetPosition> tokenSpans = Arrays.asList(
        new OffsetPosition(4, 7),
        new OffsetPosition(9, 12),
        new OffsetPosition(15, 18),
        new OffsetPosition(27, 30),
        new OffsetPosition(49, 52),
        new OffsetPosition(71, 74),
        new OffsetPosition(103, 106),
        new OffsetPosition(109, 110),
        new OffsetPosition(125, 126)
    );

    List<OffsetPosition> charSpans = TextUtilities.matchTokenAndString(layoutTokens, input, tokenSpans);

    // One character span is expected per token span.
    assertThat(charSpans, hasSize(9));

    // First "IGC" occurrence.
    OffsetPosition firstIgc = charSpans.get(7);
    assertThat(firstIgc.start, is(349));
    assertThat(firstIgc.end, is(352));
    assertThat(input.substring(firstIgc.start, firstIgc.end), is("IGC"));

    // Second "IGC" occurrence: must resolve further along in the text, not to the first one again.
    OffsetPosition secondIgc = charSpans.get(8);
    assertThat(secondIgc.start, is(397));
    assertThat(secondIgc.end, is(400));
    assertThat(input.substring(secondIgc.start, secondIgc.end), is("IGC"));
}
}

0 comments on commit 83c7a10

Please sign in to comment.