Commit

Merge 9ad1faa into 46f2e1a
KoconJan committed Jun 6, 2019
2 parents 46f2e1a + 9ad1faa commit 34126a8
Showing 30 changed files with 869 additions and 166 deletions.
1 change: 1 addition & 0 deletions 2018-05-15-kpwr-event/run_all.sh
@@ -0,0 +1 @@
~/projects/Liner2/liner2-cli pipe -m /home/kotu/Downloads/2018-05-15-kpwr-event/events_model_full/cfg.ini -f /home/kotu/Downloads/2018-05-15-kpwr-event/test2.xml -t /home/kotu/Downloads/2018-05-15-kpwr-event/test2.out.xml -i ccl -o cclrel -v
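For readers less familiar with the Liner2 CLI, the one-liner above breaks down as follows. This is a sketch with placeholder paths; the flag meanings are inferred from this commit's scripts and sources, not from official documentation:

```bash
# Sketch of the same pipe invocation with placeholder paths.
#   -m  model configuration (cfg.ini of the trained event model)
#   -f  input document            -t  output document
#   -i  input format (ccl)        -o  output format (cclrel)
#   -v  verbose output
~/projects/Liner2/liner2-cli pipe \
    -m events_model_full/cfg.ini \
    -f input.xml -t output.xml \
    -i ccl -o cclrel -v
```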
6 changes: 6 additions & 0 deletions 2018-05-15-kpwr-event/train_models.sh
@@ -0,0 +1,6 @@
~/projects/Liner2/liner2-cli train -m /home/kotu/Downloads/2018-05-15-kpwr-event/model_modality_train/cfg.ini
~/projects/Liner2/liner2-cli train -m /home/kotu/Downloads/2018-05-15-kpwr-event/model_generality_train/cfg.ini -v
~/projects/Liner2/liner2-cli train -m /home/kotu/Downloads/2018-05-15-kpwr-event/model_polarity_train/cfg.ini -v
~/projects/Liner2/liner2-cli train-rel -m /home/kotu/Downloads/2018-05-15-kpwr-event/model_event_relations/cfg.ini --mode train -f /home/kotu/Downloads/2018-05-15-kpwr-event/2018-05-15-kpwr-event_all/train.txt -t /home/kotu/Downloads/2018-05-15-kpwr-event/model_event_relations/eventrelations -v --relations alink,slink,null --content
#test last model
~/projects/Liner2/liner2-cli train-rel -m /home/kotu/Downloads/2018-05-15-kpwr-event/2018-05-15-kpwr-event_all/cfg.ini --mode test -f /home/kotu/Downloads/2018-05-15-kpwr-event/2018-05-15-kpwr-event_all/test.txt -t /home/kotu/Downloads/2018-05-15-kpwr-event/model_event_relations/eventrelations -v --relations slink,alink
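The last two commands use the new train-rel action added in this commit (see ActionTrainRelations further down). A commented sketch of its options, with placeholder paths:

```bash
# Sketch of the train-rel invocation; option meanings are taken from the
# ActionTrainRelations source added below.
#   -m           model configuration (cfg.ini with the feature setup)
#   --mode       train or test
#   -f           input corpus (batch:cclrel list of documents)
#   -t           output prefix (<prefix>.train.txt / <prefix>.model / <prefix>.test.txt)
#   --relations  relation types to keep, e.g. alink,slink,null
#   --content    include the text between the two annotations
#   -v           verbose output
~/projects/Liner2/liner2-cli train-rel -m model_event_relations/cfg.ini \
    --mode train -f train.txt -t eventrelations \
    --relations alink,slink,null --content -v
```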
21 changes: 19 additions & 2 deletions README.md
@@ -36,6 +36,16 @@ Citing

### System architecture and KPWr NER models

* Java 8
* C++ compiler (gcc 3.0 or higher) for CRF++ (https://taku910.github.io/crfpp/)
* set JAVA_HOME variable:
```bash
export JAVA_HOME=/usr/lib/jvm/default-java
```
* install dh-autoreconf:
```bash
sudo apt-get install dh-autoreconf
```
Marcińczuk, Michał; Kocoń, Jan; Oleksy, Marcin.
_Liner2 — a Generic Framework for Named Entity Recognition_
In: Proceedings of the 6th Workshop on Balto-Slavic Natural Language Processing,
@@ -225,8 +235,6 @@ Expected output:
</chunkList>
```


Complete installation
------------

### Requirements
@@ -235,6 +243,14 @@ Complete installation

* Java 8
* C++ compiler (gcc 3.0 or higher) for [CRF++](https://taku910.github.io/crfpp/)
* set JAVA_HOME variable:
```bash
export JAVA_HOME=/usr/lib/jvm/default-java
```
* install dh-autoreconf:
```bash
sudo apt-get install dh-autoreconf
```
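The dh-autoreconf requirement above is presumably there so that CRF++'s autotools build scripts can be regenerated before compiling. A minimal build sketch, assuming the CRF++ sources (here CRF++-0.58) have already been downloaded and unpacked:

```bash
# Illustrative CRF++ build; the directory name and the exact steps are
# assumptions, not taken from the README.
cd CRF++-0.58
autoreconf -i        # autoconf/automake come in via the dh-autoreconf install
./configure
make
sudo make install
sudo ldconfig        # refresh the linker cache so libcrfpp is found
```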

#### Runtime

@@ -408,6 +424,7 @@ Expected output:
Service mode (using RabbitMQ)
============


Introduction
------------

5 changes: 5 additions & 0 deletions download-model-events.sh
@@ -0,0 +1,5 @@
#!/usr/bin/env bash
if [ ! -d "./2018-05-15-kpwr-event" ]; then
wget https://clarin-pl.eu/dspace/bitstream/handle/11321/668/2018-05-15-kpwr-event.tar.gz
tar -xzf 2018-05-15-kpwr-event.tar.gz
fi
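A typical sequence for using this script together with the batch scripts added above might look as follows (illustrative only; the scripts contain absolute /home/kotu/... paths that must be adjusted first):

```bash
# Download and unpack the KPWr event models/data (skipped if the directory exists),
# then train the models and run the tagging pipeline.
bash download-model-events.sh
bash 2018-05-15-kpwr-event/train_models.sh
bash 2018-05-15-kpwr-event/run_all.sh
```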
2 changes: 1 addition & 1 deletion g419-corpus/build.gradle
@@ -10,4 +10,4 @@ dependencies {
compile 'commons-io:commons-io:2.4'
compile 'org.apache.commons:commons-csv:1.2'
testCompile 'com.cedarsoftware:java-util:1.13.3'
}
}
@@ -45,7 +45,7 @@ protected void init() throws DataFormatException, IOException {
}
if (!header.startsWith(Iob.IOB_HEADER_PREFIX)) {
throw new DataFormatException(
String.format("First line does not contain attributes definition, i.e. '%s a1 a2 a3'", Iob.IOB_HEADER_PREFIX));
String.format("First line does not contain attributes definition, i.e. '%s a1 a2 a3'", Iob.IOB_HEADER_PREFIX));
}
parseFileHeader(header).stream().forEach(attributeIndex::addAttribute);
nextFileId = goToNextFileBlock();
@@ -131,8 +131,8 @@ private Document readNextDocument() throws DataFormatException, IOException {
private void createAnnotations(final Sentence sentence, final List<String> labels) throws DataFormatException {
final List<Pair<String, Set<Integer>>> groups = labelsToAnnotations(labels);
groups.stream()
.map(p -> new Annotation(p.getRight(), p.getLeft(), sentence))
.forEach(sentence::addChunk);
.map(p -> new Annotation(p.getRight(), p.getLeft(), sentence))
.forEach(sentence::addChunk);
}

private List<Pair<String, Set<Integer>>> labelsToAnnotations(final List<String> labels) throws DataFormatException {
@@ -144,21 +144,26 @@ private List<Pair<String, Set<Integer>>> labelsToAnnotations(final List<String>
for (final String label : labelsCopy) {
if (label.equals("O")) {
annsByType.entrySet().stream()
.map(p -> new ImmutablePair<>(p.getKey(), p.getValue()))
.forEach(groups::add);
.map(p -> new ImmutablePair<>(p.getKey(), p.getValue()))
.forEach(groups::add);
annsByType = Maps.newHashMap();
} else {
final Matcher m = Iob.IOB_LABEL_PATTERN.matcher(label);
while (m.find()) {
final String annType = m.group(2);
switch (m.group(1)) {
case "B":
System.out.println("B- OK");
annsByType.put(annType, Sets.newHashSet(tokenIndex));
break;
case "I":
System.out.println("I- OK");
if (annsByType.containsKey(annType)) {
annsByType.get(annType).add(tokenIndex);
} else {

System.out.println("NOT OK");
System.out.println(annType);
getLogger().error("Invalid sequence of labels in: " + String.join(" ", labels));
//throw new DataFormatException("Invalid sequence of labels");
}
@@ -173,7 +178,7 @@ private List<Pair<String, Set<Integer>>> labelsToAnnotations(final List<String>
return groups;
}

private Pair<Token, String> parseToken(final String line, final TokenAttributeIndex index) throws DataFormatException {
private Pair<Token, String> parseToken(final String line, final TokenAttributeIndex index) throws DataFormatException{
final String[] cols = line.split(Iob.IOB_COLUMN_SEPARATOR);
final String[] attrs = Arrays.copyOfRange(cols, 0, cols.length - 1);
final String labels = cols[cols.length - 1];
@@ -185,7 +190,7 @@ private Token createToken(final String[] attrs, final TokenAttributeIndex index)
final Token token = new Token(index);
if (attrs.length != index.getLength()) {
throw new DataFormatException("Invalid number of attributes: " + StringUtils.join(attrs)
+ ". Expecting " + index.getLength());
+ ". Expecting " + index.getLength());
}
for (int i = 0; i < attrs.length; i++) {
token.setAttributeValue(i, attrs[i]);
1 change: 1 addition & 0 deletions g419-liner2-cli/build.gradle
@@ -41,4 +41,5 @@ dependencies {
compile 'commons-cli:commons-cli:1.2'
compile 'commons-io:commons-io:2.4'
compile files('../lib/PolemJava.jar')
compile files('../lib/fasttext.jar')
}
@@ -23,8 +23,7 @@
import java.util.regex.Pattern;

/**
* TODO
*
* Calculates a learning curve for the given dataset
* @author Jan Kocoń
*/
public class ActionLearningCurve extends Action {
@@ -22,9 +22,9 @@
import java.util.stream.Collectors;

/**
* Chunking in pipe mode.
* Evaluation of the normalisation of temporal expressions.
*
* @author Maciej Janicki, Michał Marcińczuk
* @author Jan Kocoń
*/
public class ActionNormalizerEval3 extends Action {

@@ -34,7 +34,6 @@ public class ActionNormalizerEval3 extends Action {
private String point_what = null;
private String point_how = null;


public static final String OPTION_CONFIGURATION = "c";
public static final String OPTION_CONFIGURATION_LONG = "configuration";

@@ -5,6 +5,7 @@
import g419.corpus.io.writer.AbstractDocumentWriter;
import g419.corpus.io.writer.WriterFactory;
import g419.corpus.structure.Document;
import g419.corpus.structure.Relation;
import g419.corpus.structure.RelationSet;
import g419.lib.cli.Action;
import g419.lib.cli.CommonOptions;
@@ -74,7 +75,8 @@ public void run() throws Exception {
final ChunkerManager cm = new ChunkerManager(LinerOptions.getGlobal());
cm.loadChunkers();

final Chunker chunker = cm.getChunkerByName(LinerOptions.getGlobal().getOptionUse());
Chunker chunker = cm.getChunkerByName(LinerOptions.getGlobal().getOptionUse());
chunker.setFeatureGenerator(gen);

try (final AbstractDocumentReader reader = getInputReader();
final AbstractDocumentWriter writer = getOutputWriter()
@@ -0,0 +1,178 @@
package g419.liner2.cli.action;

import fasttext.Args;
import fasttext.Pair;
import g419.corpus.io.reader.AbstractDocumentReader;
import g419.corpus.io.reader.ReaderFactory;
import g419.corpus.io.writer.AbstractDocumentWriter;
import g419.corpus.io.writer.WriterFactory;
import g419.corpus.structure.*;
import g419.lib.cli.Action;
import g419.lib.cli.CommonOptions;
import g419.lib.cli.ParameterException;
import g419.liner2.core.LinerOptions;
import g419.liner2.core.chunker.Chunker;
import g419.liner2.core.chunker.FastTextRelationChunker;
import g419.liner2.core.chunker.factory.ChunkerManager;
import g419.liner2.core.features.TokenFeatureGenerator;
import org.apache.commons.cli.CommandLine;

import java.io.File;
import java.io.FileInputStream;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import fasttext.FastText;
import org.apache.commons.cli.Option;


/**
* Training model with in-sentence relations like slink, alink
*
* @author Jan Kocoń
*/
public class ActionTrainRelations extends Action {

private String input_file = null;
private String input_format = "batch:cclrel";
private String output_prefix = null;
private String mode = null;
private Set<String> chosenRelations = null;
private boolean content = false;

public ActionTrainRelations() {
super("train-rel");
this.setDescription("processes data with given model");

this.options.addOption(CommonOptions.getInputFileNameOption());
this.options.addOption(CommonOptions.getOutputFileNameOption());
this.options.addOption(CommonOptions.getModelFileOption());
this.options.addOption(Option.builder("mode").longOpt("mode")
.required()
.hasArg().argName("mode").desc("choose mode (train, test)").build());
this.options.addOption(Option.builder("relations")
.longOpt("relations")
.hasArg().argName("relations").desc("define relation subset, e.g.: alink,slink,null").build());
this.options.addOption(Option.builder("content")
.longOpt("content")
.desc("include content between annotations in training/testing data").build());

}

protected ActionTrainRelations(final String name) {
super(name);
}

@Override
public void parseOptions(final CommandLine line) throws Exception {
this.output_prefix = line.getOptionValue(CommonOptions.OPTION_OUTPUT_FILE);
this.input_file = line.getOptionValue(CommonOptions.OPTION_INPUT_FILE);
this.mode = line.getOptionValue("mode");
this.content = line.hasOption("content");
if (!this.mode.equals("train") && !this.mode.equals("test") )
throw new Exception("mode must be 'train' or 'test'!");
this.chosenRelations = new HashSet<String>(Arrays.asList(line.getOptionValue("relations").split(",")));
LinerOptions.getGlobal().parseModelIni(line.getOptionValue(CommonOptions.OPTION_MODEL));
}



public String getRepresentation(Annotation annotationFrom, Annotation annotationTo, String type) throws IllegalArgumentException{
String representation = FastTextRelationChunker.getRepresentation(annotationFrom, annotationTo, this.content);
if (representation != null)
return "__label__" + type + " " + representation;
return null;
}

/**
* Module entry function.
*/
public void run() throws Exception {


AbstractDocumentReader reader = ReaderFactory.get().getStreamReader(this.input_file, this.input_format);

TokenFeatureGenerator gen = null;

if (!LinerOptions.getGlobal().features.isEmpty()) {
gen = new TokenFeatureGenerator(LinerOptions.getGlobal().features);
}

PrintWriter writer = new PrintWriter(this.output_prefix + "." + this.mode + ".txt", "UTF-8");
Document ps = reader.nextDocument();
while (ps != null) {
if (gen != null)
gen.generateFeatures(ps);
Set<Relation> relations = ps.getRelationsSet();

Map<Map.Entry<Annotation, Annotation>, String> relationAnnotationTypes = new HashMap<>();
Set<Map.Entry<Annotation, Annotation>> relationAnnotations = new HashSet<>();
for (Relation relation : relations) {
Annotation annotationFrom = relation.getAnnotationFrom();
Annotation annotationTo = relation.getAnnotationTo();
String type = relation.getType();
if (this.chosenRelations.contains(type)) {
Map.Entry<Annotation, Annotation> entry = new AbstractMap.SimpleEntry<>(annotationFrom, annotationTo);
relationAnnotations.add(entry);
relationAnnotationTypes.put(entry, type);
}
}


for (Map.Entry<Sentence, AnnotationSet> entry : ps.getChunkings().entrySet()) {
Sentence sentence = entry.getKey();
LinkedHashSet<Annotation> annotationSet = entry.getValue().chunkSet();
if (annotationSet.size() > 1)
for (Annotation annotationFrom : annotationSet)
for (Annotation annotationTo : annotationSet)
if (!annotationFrom.equals(annotationTo)) {
Map.Entry<Annotation, Annotation> annotationEntry = new AbstractMap.SimpleEntry<>(annotationFrom, annotationTo);
String representation = null;
String relationType = "null";
if (relationAnnotations.contains(annotationEntry))
relationType = relationAnnotationTypes.get(annotationEntry);
if (chosenRelations == null || chosenRelations.contains(relationType))
representation = getRepresentation(annotationFrom, annotationTo, relationType);
if (representation != null)
writer.println(representation);
}
}

ps = reader.nextDocument();
}

reader.close();
writer.close();

if (this.mode.equals("train")) {
FastText fasttext = new FastText();
Args a = new Args();
a.parseArgs(new String[]{
"supervised",
"-input", this.output_prefix + ".train.txt",
"-output", this.output_prefix + ".model",
"-dim", "50",
"-epoch", "100",
"-ws", "5",
"-wordNgrams", "2",
"-minn", "0",
"-maxn", "3",
"-lr", "0.1",
"-loss", "softmax",
"-thread", "12",
"-label", "__label__"
});
fasttext.train(a);
}
else {
FastText fasttext = new FastText();
fasttext.loadModel(this.output_prefix + ".model.bin");
fasttext.test(new FileInputStream(new File(this.output_prefix + ".test.txt")), 1);
}

}


}
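The run() method above writes one fastText supervised example per candidate annotation pair: each line starts with `__label__<relation type>` followed by the textual representation returned by FastTextRelationChunker.getRepresentation. A quick way to inspect the generated training file; the commented lines only indicate the expected shape, not real output:

```bash
# Peek at the generated training data; the file name follows the -t output prefix.
head -n 3 eventrelations.train.txt
# __label__slink <representation of the annotation pair>
# __label__alink <representation of the annotation pair>
# __label__null  <representation of the annotation pair>
```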
4 changes: 3 additions & 1 deletion g419-liner2-core/build.gradle
@@ -16,4 +16,6 @@ dependencies {
compile "joda-time:joda-time:2.8.1"
compile 'com.googlecode.json-simple:json-simple:1.1.1'
runtime 'org.slf4j:slf4j-log4j12:1.7.2'
}
compile files('../lib/PolemJava.jar')
compile files('../lib/fasttext.jar')
}