
SDK Annotate


If users do not want to take advantage of the Annotation CLI tool, it is also straightforward to annotate a document programmatically.

The following source code snippet shows how to annotate a document by creating a processing pipeline, using the data provided in the "example" folder.

// Set files
String documentFile = "example/annotate/in/22528326.txt";
String outputFile = "example/annotate/out/22528326.a1";
     
// Set resources
String dictionary1File = "example/dictionaries/Body_Part_Organ_or_Organ_Component_T023_ANAT.tsv";
String dictionary2File = "example/dictionaries/Disease_or_Syndrome_T047_DISO.tsv";
String modelFile = "example/models/prge/prge.properties";

// Create reader
Reader reader = new RawReader();
      
// Create parser
Parser parser = new GDepParser(ParserLanguage.ENGLISH, ParserLevel.CHUNKING,
        new LingpipeSentenceSplitter(), false).launch();
       
// Create NLP        
NLP nlp = new NLP(parser);
        
// Create dictionary matchers
List<String> dictionary1Lines = FileUtils.readLines(new File(dictionary1File));
Dictionary dictionary1 = VariantMatcherLoader.loadDictionaryFromLines(dictionary1Lines);
List<String> dictionary2Lines = FileUtils.readLines(new File(dictionary2File));
Dictionary dictionary2 = VariantMatcherLoader.loadDictionaryFromLines(dictionary2Lines);
        
DictionaryHybrid dictionaryMatcher1 = new DictionaryHybrid(dictionary1);
DictionaryHybrid dictionaryMatcher2 = new DictionaryHybrid(dictionary2);
        
// Create machine-learning model matcher
MLModel model = new MLModel("prge", new File(modelFile));
model.initialize();
MLHybrid mlModelMatcher = new MLHybrid(model.getCrf(), "prge");
        
// Create writer
Writer writer = new A1Writer();

// Set document stream
InputStream documentStream = new FileInputStream(documentFile);

// Run pipeline to get annotations
Pipeline pipeline = new DefaultPipeline()
        .add(reader)
        .add(nlp)
        .add(dictionaryMatcher1)
        .add(dictionaryMatcher2)
        .add(mlModelMatcher)
        .add(writer);

OutputStream outputStream = pipeline.run(documentStream).get(0);

// Write annotations to output file
FileUtils.writeStringToFile(new File(outputFile), outputStream.toString());
        
// Close streams
documentStream.close();
outputStream.close();
        
// Close parser
parser.close();
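
Running this snippet writes the annotations for document 22528326 to example/annotate/out/22528326.a1, in the A1 format produced by the A1Writer.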

To annotate a set of documents, users can also develop a pipeline processor and process the documents using the batch helper. First, a processor that takes advantage of the pipeline features must be implemented. Then, this pipeline processor is used to perform batch processing of the documents available in a directory.

The next snippets show how to build a complete pipeline processor that receives a specific context and the input and output documents in its constructor, how to create a context from the input dictionaries and models directories, and how to create and run a batch.

public class ExampleProcessor extends BaseProcessor {

    private InputFile inputFile;
    private List<OutputFile> outputFiles;
    private boolean addAnnotationsWithoutIDs;
    
    public ExampleProcessor(Context context, InputFile inputFile, List<OutputFile> outputFiles, boolean addAnnotationsWithoutIDs) {
        super(context);
        this.inputFile = inputFile;
        this.outputFiles = outputFiles;
        this.addAnnotationsWithoutIDs = addAnnotationsWithoutIDs;
    }
    
    @Override
    public void run() {
        
        try {
            
            // Take context and context processors
            Context context = getContext();
            ContextProcessors cp = getContext().take();
        
            // Get corpus
            Corpus corpus = inputFile.getCorpus();

            // Create pipeline
            Pipeline pipeline = new DefaultPipeline(corpus);
            instantiateModules(context.getDictionaries(), context.getModels(), 
                    cp, context, pipeline, null, addAnnotationsWithoutIDs);

            // Run pipeline to process file
            List<OutputStream> outputStreams = new ArrayList<>();
            for (OutputFile outFile : outputFiles) {
                outputStreams.add(outFile.getOutStream());
            }
            
            pipeline.run(inputFile.getInStream(), outputStreams);

            // Return processors
            context.put(cp);
            pipeline.clear();
        
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }       
    }
}
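
With the processor implemented, the following snippet creates the context from the dictionaries and models directories, configures the batch executor, and runs the processing over all documents in the input directory.
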
// Set files
String documentsDirectory = "example/annotate/in/";
String outputDirectory = "example/annotate/out/";
        
// Set resources
String dictionariesDirectory = "example/dictionaries/";
String modelsDirectory = "example/models/";
       
// Set input and output formats
InputFormat inputFormat = InputFormat.RAW;
List<OutputFormat> outputFormats = new ArrayList<>();
outputFormats.add(OutputFormat.A1);
        
// Create context
ContextConfiguration config = new ContextConfiguration.Builder()
         .withInputFormat(inputFormat)
         .withOutputFormats(outputFormats)
         .withParserTool(ParserTool.GDEP)
         .withParserLanguage(ParserLanguage.ENGLISH)
         .withParserLevel(ParserLevel.CHUNKING)
         .build();
        
Context context = new Context(config, modelsDirectory, dictionariesDirectory);
        
// Create batch executor
boolean compressed = false;
int numThreads = 1;
BatchExecutor batch = new FileBatchExecutor(documentsDirectory, outputDirectory, compressed, numThreads, false, true);
  
// Get processor class
Class c = ExampleProcessor.class;
       
// Run batch processing
batch.run(c, context);
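
When the batch finishes, the annotated documents are written to example/annotate/out/ in the configured output formats (A1 in this example). Increasing numThreads allows several documents to be processed in parallel.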