Commit

Merge 9ad1faa into 46f2e1a
KoconJan committed Jun 6, 2019
2 parents 46f2e1a + 9ad1faa commit 34126a8
Showing 30 changed files with 869 additions and 166 deletions.
1 change: 1 addition & 0 deletions 2018-05-15-kpwr-event/run_all.sh
@@ -0,0 +1 @@
~/projects/Liner2/liner2-cli pipe -m /home/kotu/Downloads/2018-05-15-kpwr-event/events_model_full/cfg.ini -f /home/kotu/Downloads/2018-05-15-kpwr-event/test2.xml -t /home/kotu/Downloads/2018-05-15-kpwr-event/test2.out.xml -i ccl -o cclrel -v
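For readers less familiar with the Liner2 CLI, the one-liner above breaks down as follows. This is a sketch with placeholder paths; the flag meanings are inferred from this commit's scripts and sources, not from official documentation:

```bash
# Sketch of the same pipe invocation with placeholder paths.
#   -m  model configuration (cfg.ini of the trained event model)
#   -f  input document            -t  output document
#   -i  input format (ccl)        -o  output format (cclrel)
#   -v  verbose output
~/projects/Liner2/liner2-cli pipe \
    -m events_model_full/cfg.ini \
    -f input.xml -t output.xml \
    -i ccl -o cclrel -v
```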
6 changes: 6 additions & 0 deletions 2018-05-15-kpwr-event/train_models.sh
@@ -0,0 +1,6 @@
~/projects/Liner2/liner2-cli train -m /home/kotu/Downloads/2018-05-15-kpwr-event/model_modality_train/cfg.ini
~/projects/Liner2/liner2-cli train -m /home/kotu/Downloads/2018-05-15-kpwr-event/model_generality_train/cfg.ini -v
~/projects/Liner2/liner2-cli train -m /home/kotu/Downloads/2018-05-15-kpwr-event/model_polarity_train/cfg.ini -v
~/projects/Liner2/liner2-cli train-rel -m /home/kotu/Downloads/2018-05-15-kpwr-event/model_event_relations/cfg.ini --mode train -f /home/kotu/Downloads/2018-05-15-kpwr-event/2018-05-15-kpwr-event_all/train.txt -t /home/kotu/Downloads/2018-05-15-kpwr-event/model_event_relations/eventrelations -v --relations alink,slink,null --content
#test last model
~/projects/Liner2/liner2-cli train-rel -m /home/kotu/Downloads/2018-05-15-kpwr-event/2018-05-15-kpwr-event_all/cfg.ini --mode test -f /home/kotu/Downloads/2018-05-15-kpwr-event/2018-05-15-kpwr-event_all/test.txt -t /home/kotu/Downloads/2018-05-15-kpwr-event/model_event_relations/eventrelations -v --relations slink,alink
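The last two commands use the new train-rel action added in this commit (see ActionTrainRelations further down). A commented sketch of its options, with placeholder paths:

```bash
# Sketch of the train-rel invocation; option meanings are taken from the
# ActionTrainRelations source added below.
#   -m           model configuration (cfg.ini with the feature setup)
#   --mode       train or test
#   -f           input corpus (batch:cclrel list of documents)
#   -t           output prefix (<prefix>.train.txt / <prefix>.model / <prefix>.test.txt)
#   --relations  relation types to keep, e.g. alink,slink,null
#   --content    include the text between the two annotations
#   -v           verbose output
~/projects/Liner2/liner2-cli train-rel -m model_event_relations/cfg.ini \
    --mode train -f train.txt -t eventrelations \
    --relations alink,slink,null --content -v
```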
21 changes: 19 additions & 2 deletions README.md
@@ -36,6 +36,16 @@ Citing

### System architecture and KPWr NER models

* Java 8
* C++ compiler (gcc 3.0 or higher) for CRF++ (https://taku910.github.io/crfpp/)
* set JAVA_HOME variable:
```bash
export JAVA_HOME=/usr/lib/jvm/default-java
```
* install dh-autoreconf:
```bash
sudo apt-get install dh-autoreconf
```
Marcińczuk, Michał; Kocoń, Jan; Oleksy, Marcin.
_Liner2 — a Generic Framework for Named Entity Recognition_
In: Proceedings of the 6th Workshop on Balto-Slavic Natural Language Processing,
@@ -225,8 +235,6 @@ Expected output:
</chunkList>
```


Complete installation
------------

### Requirements
@@ -235,6 +243,14 @@ Complete installation

* Java 8
* C++ compiler (gcc 3.0 or higher) for [CRF++](https://taku910.github.io/crfpp/)
* set JAVA_HOME variable:
```bash
export JAVA_HOME=/usr/lib/jvm/default-java
```
* install dh-autoreconf:
```bash
sudo apt-get install dh-autoreconf
```
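The dh-autoreconf requirement above is presumably there so that CRF++'s autotools build scripts can be regenerated before compiling. A minimal build sketch, assuming the CRF++ sources (here CRF++-0.58) have already been downloaded and unpacked:

```bash
# Illustrative CRF++ build; the directory name and the exact steps are
# assumptions, not taken from the README.
cd CRF++-0.58
autoreconf -i        # autoconf/automake come in via the dh-autoreconf install
./configure
make
sudo make install
sudo ldconfig        # refresh the linker cache so libcrfpp is found
```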

#### Runtime

@@ -408,6 +424,7 @@ Expected output:
Service mode (using RabbitMQ)
============


Introduction
------------

5 changes: 5 additions & 0 deletions download-model-events.sh
@@ -0,0 +1,5 @@
#!/usr/bin/env bash
if [ ! -d "./2018-05-15-kpwr-event" ]; then
wget https://clarin-pl.eu/dspace/bitstream/handle/11321/668/2018-05-15-kpwr-event.tar.gz
tar -xzf 2018-05-15-kpwr-event.tar.gz
fi
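A typical sequence for using this script together with the batch scripts added above might look as follows (illustrative only; the scripts contain absolute /home/kotu/... paths that must be adjusted first):

```bash
# Download and unpack the KPWr event models/data (skipped if the directory exists),
# then train the models and run the tagging pipeline.
bash download-model-events.sh
bash 2018-05-15-kpwr-event/train_models.sh
bash 2018-05-15-kpwr-event/run_all.sh
```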
2 changes: 1 addition & 1 deletion g419-corpus/build.gradle
@@ -10,4 +10,4 @@ dependencies {
compile 'commons-io:commons-io:2.4'
compile 'org.apache.commons:commons-csv:1.2'
testCompile 'com.cedarsoftware:java-util:1.13.3'
}
}
@@ -45,7 +45,7 @@ protected void init() throws DataFormatException, IOException {
}
if (!header.startsWith(Iob.IOB_HEADER_PREFIX)) {
throw new DataFormatException(
String.format("First line does not contain attributes definition, i.e. '%s a1 a2 a3'", Iob.IOB_HEADER_PREFIX));
String.format("First line does not contain attributes definition, i.e. '%s a1 a2 a3'", Iob.IOB_HEADER_PREFIX));
}
parseFileHeader(header).stream().forEach(attributeIndex::addAttribute);
nextFileId = goToNextFileBlock();
@@ -131,8 +131,8 @@ private Document readNextDocument() throws DataFormatException, IOException {
private void createAnnotations(final Sentence sentence, final List<String> labels) throws DataFormatException {
final List<Pair<String, Set<Integer>>> groups = labelsToAnnotations(labels);
groups.stream()
.map(p -> new Annotation(p.getRight(), p.getLeft(), sentence))
.forEach(sentence::addChunk);
.map(p -> new Annotation(p.getRight(), p.getLeft(), sentence))
.forEach(sentence::addChunk);
}

private List<Pair<String, Set<Integer>>> labelsToAnnotations(final List<String> labels) throws DataFormatException {
@@ -144,21 +144,26 @@ private List<Pair<String, Set<Integer>>> labelsToAnnotations(final List<String>
for (final String label : labelsCopy) {
if (label.equals("O")) {
annsByType.entrySet().stream()
.map(p -> new ImmutablePair<>(p.getKey(), p.getValue()))
.forEach(groups::add);
.map(p -> new ImmutablePair<>(p.getKey(), p.getValue()))
.forEach(groups::add);
annsByType = Maps.newHashMap();
} else {
final Matcher m = Iob.IOB_LABEL_PATTERN.matcher(label);
while (m.find()) {
final String annType = m.group(2);
switch (m.group(1)) {
case "B":
System.out.println("B- OK");
annsByType.put(annType, Sets.newHashSet(tokenIndex));
break;
case "I":
System.out.println("I- OK");
if (annsByType.containsKey(annType)) {
annsByType.get(annType).add(tokenIndex);
} else {

System.out.println("NOT OK");
System.out.println(annType);
getLogger().error("Invalid sequence of labels in: " + String.join(" ", labels));
//throw new DataFormatException("Invalid sequence of labels");
}
@@ -173,7 +178,7 @@ private List<Pair<String, Set<Integer>>> labelsToAnnotations(final List<String>
return groups;
}

private Pair<Token, String> parseToken(final String line, final TokenAttributeIndex index) throws DataFormatException {
private Pair<Token, String> parseToken(final String line, final TokenAttributeIndex index) throws DataFormatException{
final String[] cols = line.split(Iob.IOB_COLUMN_SEPARATOR);
final String[] attrs = Arrays.copyOfRange(cols, 0, cols.length - 1);
final String labels = cols[cols.length - 1];
@@ -185,7 +190,7 @@ private Token createToken(final String[] attrs, final TokenAttributeIndex index)
final Token token = new Token(index);
if (attrs.length != index.getLength()) {
throw new DataFormatException("Invalid number of attributes: " + StringUtils.join(attrs)
+ ". Expecting " + index.getLength());
+ ". Expecting " + index.getLength());
}
for (int i = 0; i < attrs.length; i++) {
token.setAttributeValue(i, attrs[i]);
1 change: 1 addition & 0 deletions g419-liner2-cli/build.gradle
@@ -41,4 +41,5 @@ dependencies {
compile 'commons-cli:commons-cli:1.2'
compile 'commons-io:commons-io:2.4'
compile files('../lib/PolemJava.jar')
compile files('../lib/fasttext.jar')
}
@@ -23,8 +23,7 @@
import java.util.regex.Pattern;

/**
* TODO
*
* Calculates a learning curve for the given dataset
* @author Jan Kocoń
*/
public class ActionLearningCurve extends Action {
@@ -22,9 +22,9 @@
import java.util.stream.Collectors;

/**
* Chunking in pipe mode.
* Evaluation of the normalisation of temporal expressions.
*
* @author Maciej Janicki, Michał Marcińczuk
* @author Jan Kocoń
*/
public class ActionNormalizerEval3 extends Action {

@@ -34,7 +34,6 @@ public class ActionNormalizerEval3 extends Action {
private String point_what = null;
private String point_how = null;


public static final String OPTION_CONFIGURATION = "c";
public static final String OPTION_CONFIGURATION_LONG = "configuration";

@@ -5,6 +5,7 @@
import g419.corpus.io.writer.AbstractDocumentWriter;
import g419.corpus.io.writer.WriterFactory;
import g419.corpus.structure.Document;
import g419.corpus.structure.Relation;
import g419.corpus.structure.RelationSet;
import g419.lib.cli.Action;
import g419.lib.cli.CommonOptions;
@@ -74,7 +75,8 @@ public void run() throws Exception {
final ChunkerManager cm = new ChunkerManager(LinerOptions.getGlobal());
cm.loadChunkers();

final Chunker chunker = cm.getChunkerByName(LinerOptions.getGlobal().getOptionUse());
Chunker chunker = cm.getChunkerByName(LinerOptions.getGlobal().getOptionUse());
chunker.setFeatureGenerator(gen);

try (final AbstractDocumentReader reader = getInputReader();
final AbstractDocumentWriter writer = getOutputWriter()
@@ -0,0 +1,178 @@
package g419.liner2.cli.action;

import fasttext.Args;
import fasttext.Pair;
import g419.corpus.io.reader.AbstractDocumentReader;
import g419.corpus.io.reader.ReaderFactory;
import g419.corpus.io.writer.AbstractDocumentWriter;
import g419.corpus.io.writer.WriterFactory;
import g419.corpus.structure.*;
import g419.lib.cli.Action;
import g419.lib.cli.CommonOptions;
import g419.lib.cli.ParameterException;
import g419.liner2.core.LinerOptions;
import g419.liner2.core.chunker.Chunker;
import g419.liner2.core.chunker.FastTextRelationChunker;
import g419.liner2.core.chunker.factory.ChunkerManager;
import g419.liner2.core.features.TokenFeatureGenerator;
import org.apache.commons.cli.CommandLine;

import java.io.File;
import java.io.FileInputStream;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import fasttext.FastText;
import org.apache.commons.cli.Option;


/**
* Training model with in-sentence relations like slink, alink
*
* @author Jan Kocoń
*/
public class ActionTrainRelations extends Action {

private String input_file = null;
private String input_format = "batch:cclrel";
private String output_prefix = null;
private String mode = null;
private Set<String> chosenRelations = null;
private boolean content = false;

public ActionTrainRelations() {
super("train-rel");
this.setDescription("processes data with given model");

this.options.addOption(CommonOptions.getInputFileNameOption());
this.options.addOption(CommonOptions.getOutputFileNameOption());
this.options.addOption(CommonOptions.getModelFileOption());
this.options.addOption(Option.builder("mode").longOpt("mode")
.required()
.hasArg().argName("mode").desc("choose mode (train, test)").build());
this.options.addOption(Option.builder("relations")
.longOpt("relations")
.hasArg().argName("relations").desc("define relation subset, e.g.: alink,slink,null").build());
this.options.addOption(Option.builder("content")
.longOpt("content")
.desc("include content between annotations in training/testing data").build());

}

protected ActionTrainRelations(final String name) {
super(name);
}

@Override
public void parseOptions(final CommandLine line) throws Exception {
this.output_prefix = line.getOptionValue(CommonOptions.OPTION_OUTPUT_FILE);
this.input_file = line.getOptionValue(CommonOptions.OPTION_INPUT_FILE);
this.mode = line.getOptionValue("mode");
this.content = line.hasOption("content");
if (!this.mode.equals("train") && !this.mode.equals("test") )
throw new Exception("mode must be 'train' or 'test'!");
this.chosenRelations = new HashSet<String>(Arrays.asList(line.getOptionValue("relations").split(",")));
LinerOptions.getGlobal().parseModelIni(line.getOptionValue(CommonOptions.OPTION_MODEL));
}



public String getRepresentation(Annotation annotationFrom, Annotation annotationTo, String type) throws IllegalArgumentException{
String representation = FastTextRelationChunker.getRepresentation(annotationFrom, annotationTo, this.content);
if (representation != null)
return "__label__" + type + " " + representation;
return null;
}

/**
* Module entry function.
*/
public void run() throws Exception {


AbstractDocumentReader reader = ReaderFactory.get().getStreamReader(this.input_file, this.input_format);

TokenFeatureGenerator gen = null;

if (!LinerOptions.getGlobal().features.isEmpty()) {
gen = new TokenFeatureGenerator(LinerOptions.getGlobal().features);
}

PrintWriter writer = new PrintWriter(this.output_prefix + "." + this.mode + ".txt", "UTF-8");
Document ps = reader.nextDocument();
while (ps != null) {
if (gen != null)
gen.generateFeatures(ps);
Set<Relation> relations = ps.getRelationsSet();

Map<Map.Entry<Annotation, Annotation>, String> relationAnnotationTypes = new HashMap<>();
Set<Map.Entry<Annotation, Annotation>> relationAnnotations = new HashSet<>();
for (Relation relation : relations) {
Annotation annotationFrom = relation.getAnnotationFrom();
Annotation annotationTo = relation.getAnnotationTo();
String type = relation.getType();
if (this.chosenRelations.contains(type)) {
Map.Entry<Annotation, Annotation> entry = new AbstractMap.SimpleEntry<>(annotationFrom, annotationTo);
relationAnnotations.add(entry);
relationAnnotationTypes.put(entry, type);
}
}


for (Map.Entry<Sentence, AnnotationSet> entry : ps.getChunkings().entrySet()) {
Sentence sentence = entry.getKey();
LinkedHashSet<Annotation> annotationSet = entry.getValue().chunkSet();
if (annotationSet.size() > 1)
for (Annotation annotationFrom : annotationSet)
for (Annotation annotationTo : annotationSet)
if (!annotationFrom.equals(annotationTo)) {
Map.Entry<Annotation, Annotation> annotationEntry = new AbstractMap.SimpleEntry<>(annotationFrom, annotationTo);
String representation = null;
String relationType = "null";
if (relationAnnotations.contains(annotationEntry))
relationType = relationAnnotationTypes.get(annotationEntry);
if (chosenRelations == null || chosenRelations.contains(relationType))
representation = getRepresentation(annotationFrom, annotationTo, relationType);
if (representation != null)
writer.println(representation);
}
}

ps = reader.nextDocument();
}

reader.close();
writer.close();

if (this.mode.equals("train")) {
FastText fasttext = new FastText();
Args a = new Args();
a.parseArgs(new String[]{
"supervised",
"-input", this.output_prefix + ".train.txt",
"-output", this.output_prefix + ".model",
"-dim", "50",
"-epoch", "100",
"-ws", "5",
"-wordNgrams", "2",
"-minn", "0",
"-maxn", "3",
"-lr", "0.1",
"-loss", "softmax",
"-thread", "12",
"-label", "__label__"
});
fasttext.train(a);
}
else {
FastText fasttext = new FastText();
fasttext.loadModel(this.output_prefix + ".model.bin");
fasttext.test(new FileInputStream(new File(this.output_prefix + ".test.txt")), 1);
}

}


}
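The run() method above writes one fastText supervised example per candidate annotation pair: each line starts with `__label__<relation type>` followed by the textual representation returned by FastTextRelationChunker.getRepresentation. A quick way to inspect the generated training file; the commented lines only indicate the expected shape, not real output:

```bash
# Peek at the generated training data; the file name follows the -t output prefix.
head -n 3 eventrelations.train.txt
# __label__slink <representation of the annotation pair>
# __label__alink <representation of the annotation pair>
# __label__null  <representation of the annotation pair>
```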
4 changes: 3 additions & 1 deletion g419-liner2-core/build.gradle
@@ -16,4 +16,6 @@ dependencies {
compile "joda-time:joda-time:2.8.1"
compile 'com.googlecode.json-simple:json-simple:1.1.1'
runtime 'org.slf4j:slf4j-log4j12:1.7.2'
}
compile files('../lib/PolemJava.jar')
compile files('../lib/fasttext.jar')
}