-
Notifications
You must be signed in to change notification settings - Fork 23
SDK Annotate
If users do not want to use the Annotation CLI tool, it is also straightforward to annotate a document programmatically.
The following source code snippet shows how to annotate a document by creating a processing pipeline and using the data provided in the "example" folder.
// Annotates one raw-text document using two dictionary matchers and one
// machine-learning model matcher, then writes the resulting annotations to
// an output file through an A1Writer.

// Set files
String documentFile = "example/annotate/in/22528326.txt";
String outputFile = "example/annotate/out/22528326.a1";

// Set resources
String dictionary1File = "example/dictionaries/Body_Part_Organ_or_Organ_Component_T023_ANAT.tsv";
String dictionary2File = "example/dictionaries/Disease_or_Syndrome_T047_DISO.tsv";
String modelFile = "example/models/prge/prge.properties";

// Create reader
Reader reader = new RawReader();

// Create and launch the parser. Everything after this point runs inside
// try/finally so the parser is closed even when a later step fails.
Parser parser = new GDepParser(ParserLanguage.ENGLISH, ParserLevel.CHUNKING,
        new LingpipeSentenceSplitter(), false).launch();
try {
    // Create NLP
    NLP nlp = new NLP(parser);

    // Create dictionary matchers
    List<String> dictionary1Lines = FileUtils.readLines(new File(dictionary1File));
    Dictionary dictionary1 = VariantMatcherLoader.loadDictionaryFromLines(dictionary1Lines);
    List<String> dictionary2Lines = FileUtils.readLines(new File(dictionary2File));
    Dictionary dictionary2 = VariantMatcherLoader.loadDictionaryFromLines(dictionary2Lines);
    DictionaryHybrid dictionaryMatcher1 = new DictionaryHybrid(dictionary1);
    DictionaryHybrid dictionaryMatcher2 = new DictionaryHybrid(dictionary2);

    // Create machine-learning model matcher
    MLModel model = new MLModel("prge", new File(modelFile));
    model.initialize();
    MLHybrid mlModelMatcher = new MLHybrid(model.getCrf(), "prge");

    // Create Writer
    Writer writer = new A1Writer();

    // Set document stream; closed in the finally block below so the file
    // handle is released even if the pipeline throws.
    InputStream documentStream = new FileInputStream(documentFile);
    try {
        // Run pipeline to get annotations
        Pipeline pipeline = new DefaultPipeline()
                .add(reader)
                .add(nlp)
                .add(dictionaryMatcher1)
                .add(dictionaryMatcher2)
                .add(mlModelMatcher)
                .add(writer);
        OutputStream outputStream = pipeline.run(documentStream).get(0);
        try {
            // Write annotations to output file.
            // NOTE(review): toString() and writeStringToFile(File, String) both
            // rely on the platform default charset - confirm this is intended.
            FileUtils.writeStringToFile(new File(outputFile), outputStream.toString());
        } finally {
            outputStream.close();
        }
    } finally {
        // Close input stream
        documentStream.close();
    }
} finally {
    // Close parser
    parser.close();
}
To annotate a set of documents, users can also develop a pipeline processor and process the documents using the batch helper. First, a processor taking advantage of the pipeline features must be implemented. Then, this pipeline processor must be used to perform batch processing of documents available in a directory.
The next snippets show how to build a complete pipeline processor that receives a specific context, an input document and the output documents in its constructor. They also show how a context is created from the dictionaries and models directories, and how a batch executor is created and run.
/**
 * Pipeline processor that annotates a single input file and writes the
 * result to every configured output file.
 *
 * <p>The pipeline modules are built by {@code instantiateModules} from the
 * dictionaries and models held by the processor's {@link Context}.
 */
public class ExampleProcessor extends BaseProcessor {

    private final InputFile inputFile;
    private final List<OutputFile> outputFiles;
    private final boolean addAnnotationsWithoutIDs;

    /**
     * Creates a processor for one input file.
     *
     * @param context                  context supplying dictionaries, models and processors
     * @param inputFile                file providing the corpus and the input stream
     * @param outputFiles              files whose output streams receive the pipeline results
     * @param addAnnotationsWithoutIDs flag forwarded unchanged to {@code instantiateModules}
     */
    public ExampleProcessor(Context context, InputFile inputFile, List<OutputFile> outputFiles, boolean addAnnotationsWithoutIDs) {
        super(context);
        this.inputFile = inputFile;
        this.outputFiles = outputFiles;
        this.addAnnotationsWithoutIDs = addAnnotationsWithoutIDs;
    }

    /**
     * Builds the pipeline, runs it over the input file and returns the
     * borrowed context processors to the context.
     *
     * @throws RuntimeException wrapping any exception raised while processing
     */
    @Override
    public void run() {
        try {
            // Take context and context processors
            Context context = getContext();
            ContextProcessors cp = context.take();
            try {
                // Get corpus
                Corpus corpus = inputFile.getCorpus();

                // Create pipeline
                Pipeline pipeline = new DefaultPipeline(corpus);
                instantiateModules(context.getDictionaries(), context.getModels(),
                        cp, context, pipeline, null, addAnnotationsWithoutIDs);

                // Run pipeline to process file
                List<OutputStream> outputStreams = new ArrayList<OutputStream>();
                for (OutputFile outFile : outputFiles) {
                    outputStreams.add(outFile.getOutStream());
                }
                pipeline.run(inputFile.getInStream(), outputStreams);

                pipeline.clear();
            } finally {
                // Return processors even when the pipeline fails; otherwise the
                // processors taken above would never be handed back to the context.
                context.put(cp);
            }
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }
}
// Processes every document in a directory with ExampleProcessor, using a
// context built from the dictionaries and models directories.

// Set files
String documentsDirectory = "example/annotate/in/";
String outputDirectory = "example/annotate/out/";

// Set resources
String dictionariesDirectory = "example/dictionaries/";
String modelsDirectory = "example/models/";

// Set input and output formats
InputFormat inputFormat = InputFormat.RAW;
List<OutputFormat> outputFormats = new ArrayList<OutputFormat>();
outputFormats.add(OutputFormat.A1);

// Create context
ContextConfiguration config = new ContextConfiguration.Builder()
        .withInputFormat(inputFormat)
        .withOutputFormats(outputFormats)
        .withParserTool(ParserTool.GDEP)
        .withParserLanguage(ParserLanguage.ENGLISH)
        .withParserLevel(ParserLevel.CHUNKING)
        .build();
Context context = new Context(config, modelsDirectory, dictionariesDirectory);

// Create batch executor
boolean compressed = false;
int numThreads = 1;
// NOTE(review): the meaning of the two trailing boolean arguments is not
// visible here - check the FileBatchExecutor constructor before changing them.
BatchExecutor batch = new FileBatchExecutor(documentsDirectory, outputDirectory, compressed, numThreads, false, true);

// Get processor class (parameterized to avoid a raw Class reference)
Class<ExampleProcessor> processorClass = ExampleProcessor.class;

// Run batch processing
batch.run(processorClass, context);