Anonymize action

CLARIN-PL · Mar 28, 2019 · fc61ca6 · fc61ca6
1 parent ace9176
commit fc61ca6
Show file tree

Hide file tree

Showing 3 changed files with 221 additions and 24 deletions.
diff --git a/g419-corpus/src/main/java/g419/corpus/io/reader/parser/CclSaxParser.java b/g419-corpus/src/main/java/g419/corpus/io/reader/parser/CclSaxParser.java
@@ -168,7 +168,7 @@ public void endElement(final String s, final String s1, final String element) th
                     break;
                 }
             }
-            if (!foundDisamb) {
+            if (!foundDisamb && tags.size() > 0) {
                 currentToken.setAttributeValue(attributeIndex.getIndex("base"), tags.get(0).getBase());
                 currentToken.setAttributeValue(attributeIndex.getIndex("ctag"), tags.get(0).getCtag());
             }

diff --git a/g419-corpus/src/main/java/g419/corpus/structure/Token.java b/g419-corpus/src/main/java/g419/corpus/structure/Token.java
@@ -36,16 +36,16 @@ public Token(final String orth, final Tag firstTag, final TokenAttributeIndex at
         if (index == -1) {
             throw new Error("TokenAttribute Index does not contain the 'orth' attribute");
         }
-        this.setAttributeValue(index, orth);
-        this.addTag(firstTag);
+        setAttributeValue(index, orth);
+        addTag(firstTag);
     }
 
     public void clearAttributes() {
-        this.attributes = new ArrayList<>();
+        attributes = new ArrayList<>();
     }
 
     public void removeAttribute(final int attrIdx) {
-        this.attributes.remove(attrIdx);
+        attributes.remove(attrIdx);
     }
 
     /**
@@ -60,7 +60,7 @@ public String getAttributeValue(final int index) {
     }
 
     public String getAttributeValue(final String attr) {
-        final int index = this.attrIdx.getIndex(attr);
+        final int index = attrIdx.getIndex(attr);
         return getAttributeValue(index);
     }
 
@@ -69,11 +69,11 @@ public int getNumAttributes() {
     }
 
     public Map<String, String> getProps() {
-        return this.props;
+        return props;
     }
 
     public void setProp(final String name, final String value) {
-        this.props.put(name, value);
+        props.put(name, value);
     }
 
     /**
@@ -87,6 +87,10 @@ public String getOrth() {
         return attributes.get(attrIdx.getIndex("orth"));
     }
 
+    /**
+     * Replaces the value stored under the "orth" attribute of this token.
+     *
+     * @param orth the new orthographic form
+     */
+    public void setOrth(final String orth) {
+        final int orthIndex = attrIdx.getIndex("orth");
+        attributes.set(orthIndex, orth);
+    }
+
     /**
      * Gets the element.
      *
@@ -98,16 +102,16 @@ public String getElement(final String key) {
     }
 
     public boolean getNoSpaceAfter() {
-        return this.noSpaceAfter;
+        return noSpaceAfter;
     }
 
     public void addTag(final Tag tag) {
         tags.add(tag);
         if (attrIdx.getIndex("base") != -1 && attributes.get(attrIdx.getIndex("base")) == null) {
-            this.setAttributeValue(this.attrIdx.getIndex("base"), tag.getBase());
+            setAttributeValue(attrIdx.getIndex("base"), tag.getBase());
         }
         if (attrIdx.getIndex("ctag") != -1 && attributes.get(attrIdx.getIndex("ctag")) == null) {
-            this.setAttributeValue(this.attrIdx.getIndex("ctag"), tag.getCtag());
+            setAttributeValue(attrIdx.getIndex("ctag"), tag.getCtag());
         }
     }
 
@@ -117,7 +121,7 @@ public ArrayList<Tag> getTags() {
 
     public Set<String> getDisambBases() {
         final Set<String> bases = new HashSet<>();
-        for (final Tag tag : this.tags) {
+        for (final Tag tag : tags) {
             if (tag.getDisamb()) {
                 bases.add(tag.getBase());
             }
@@ -137,13 +141,13 @@ public String toString() {
     }
 
     public Tag getDisambTag() {
-        for (final Tag tag : this.tags) {
+        for (final Tag tag : tags) {
             if (tag.getDisamb()) {
                 return tag;
             }
         }
-        if (this.tags.size() > 0) {
-            return this.tags.get(0);
+        if (tags.size() > 0) {
+            return tags.get(0);
         }
         return null;
     }
@@ -173,7 +177,7 @@ public void setAttributeValue(final int index, final String value) {
     }
 
     public void setAttributeValue(final String attr, final String value) {
-        final int index = this.attrIdx.getIndex(attr);
+        final int index = attrIdx.getIndex(attr);
         setAttributeValue(index, value);
     }
 
@@ -183,19 +187,19 @@ public void setNoSpaceAfter(final boolean noSpaceAfter) {
 
     public String getAttributesAsString() {
         final StringBuilder sb = new StringBuilder();
-        for (final String attr : this.attributes) {
+        for (final String attr : attributes) {
             sb.append((sb.length() == 0 ? "" : ", ") + attr);
         }
         return sb.toString();
     }
 
     @Override
     public Token clone() {
-        final Token cloned = new Token(this.attrIdx.clone());
-        cloned.tags = new ArrayList<>(this.tags);
-        cloned.attributes = new ArrayList<>(this.attributes);
+        final Token cloned = new Token(attrIdx.clone());
+        cloned.tags = new ArrayList<>(tags);
+        cloned.attributes = new ArrayList<>(attributes);
         cloned.id = id;
-        cloned.noSpaceAfter = this.noSpaceAfter;
+        cloned.noSpaceAfter = noSpaceAfter;
         return cloned;
     }
 
@@ -210,11 +214,11 @@ public void setAttributeIndex(final TokenAttributeIndex newAttrIdx) {
     }
 
     public boolean isWrapped() {
-        return this.getClass().isInstance(WrappedToken.class);
+        // Class.isInstance(Object) tests its ARGUMENT against the receiver class, so the
+        // receiver has to be WrappedToken.class. The previous form asked whether the Class
+        // object WrappedToken.class was an instance of this token's class, which is never true.
+        return WrappedToken.class.isInstance(this);
     }
 
     public TokenAttributeIndex getAttributeIndex() {
-        return this.attrIdx;
+        return attrIdx;
     }
 
     /**
@@ -225,7 +229,7 @@ public TokenAttributeIndex getAttributeIndex() {
      * @return true, if successful
      */
     public boolean hasBase(final String base, final boolean disambOnly) {
-        for (final Tag tag : this.tags) {
+        for (final Tag tag : tags) {
             if (tag.getBase().equals(base) && (disambOnly == false || tag.getDisamb() == true)) {
                 return true;
             }

diff --git a/g419-liner2-cli/src/main/java/g419/liner2/cli/action/ActionAnonimization.java b/g419-liner2-cli/src/main/java/g419/liner2/cli/action/ActionAnonimization.java
@@ -0,0 +1,193 @@
+package g419.liner2.cli.action;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+import g419.corpus.io.reader.AbstractDocumentReader;
+import g419.corpus.io.reader.ReaderFactory;
+import g419.corpus.io.writer.AbstractDocumentWriter;
+import g419.corpus.io.writer.WriterFactory;
+import g419.corpus.structure.Document;
+import g419.corpus.structure.Sentence;
+import g419.corpus.structure.Token;
+import g419.lib.cli.Action;
+import g419.lib.cli.CommonOptions;
+import io.vavr.collection.Stream;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Option;
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVParser;
+
+import java.io.FileReader;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+public class ActionAnonimization extends Action {
+
+    /**
+     * A dictionary entry: the phrase to look for and the type that is used
+     * to build the label the phrase is replaced with.
+     */
+    class PhraseType {
+
+        /** Text of the phrase as it is expected to appear in the document. */
+        private String phrase;
+
+        /** Type of the phrase; the replacement label is "@" followed by this value. */
+        private String type;
+
+        /**
+         * @param phrase text of the phrase to anonymize
+         * @param type   type used to build the replacement label
+         */
+        public PhraseType(final String phrase, final String type) {
+            this.type = type;
+            this.phrase = phrase;
+        }
+
+        // --- accessors ---
+
+        public String getPhrase() {
+            return this.phrase;
+        }
+
+        public String getType() {
+            return this.type;
+        }
+
+        // --- mutators ---
+
+        public void setPhrase(final String phrase) {
+            this.phrase = phrase;
+        }
+
+        public void setType(final String type) {
+            this.type = type;
+        }
+    }
+
+    /** Short name of the command-line option that points to the phrase dictionary. */
+    private static final String OPTION_DICTIONARY_FILE = "d";
+
+    // Values of the command-line options; assigned in parseOptions.
+    private String inputFile = null;
+    private String inputFormat = null;
+    private String outputFolder = null;
+    private String outputFormat = null;
+    private String dictFile = null;
+
+    /**
+     * Registers the action under the name "anonim" together with the
+     * input, output and dictionary options it accepts.
+     */
+    public ActionAnonimization() {
+        super("anonim");
+        setDescription("anonimize according to given dictionary");
+        // Registration order is kept: input format, input file, output file, output format, dictionary.
+        final Option[] supported = {
+                CommonOptions.getInputFileFormatOption(),
+                CommonOptions.getInputFileNameOption(),
+                CommonOptions.getOutputFileNameOption(),
+                CommonOptions.getOutputFileFormatOption(),
+                getDictionaryFile()
+        };
+        for (final Option option : supported) {
+            options.addOption(option);
+        }
+    }
+
+    /**
+     * Copies the option values from the parsed command line into the fields of this action.
+     * Both formats fall back to "ccl" when they are not given.
+     *
+     * @param line the parsed command line
+     */
+    @Override
+    public void parseOptions(final CommandLine line) throws Exception {
+        // Dictionary of phrases.
+        dictFile = line.getOptionValue(OPTION_DICTIONARY_FILE);
+
+        // Input side.
+        inputFormat = line.getOptionValue(CommonOptions.OPTION_INPUT_FORMAT, "ccl");
+        inputFile = line.getOptionValue(CommonOptions.OPTION_INPUT_FILE);
+
+        // Output side.
+        outputFormat = line.getOptionValue(CommonOptions.OPTION_OUTPUT_FORMAT, "ccl");
+        outputFolder = line.getOptionValue(CommonOptions.OPTION_OUTPUT_FILE);
+    }
+
+    /**
+     * Builds the "-d" / "--dict" option, which takes one argument named "path".
+     *
+     * @return the option describing the dictionary file
+     */
+    public Option getDictionaryFile() {
+        final Option.Builder builder = Option.builder(OPTION_DICTIONARY_FILE);
+        builder.longOpt("dict");
+        builder.hasArg();
+        builder.argName("path");
+        builder.desc("dictionary of phrases per document");
+        return builder.build();
+    }
+
+
+    /**
+     * Loads the phrase dictionary, then reads the input documents one by one,
+     * anonymizes the phrases listed for each document and writes the document out.
+     *
+     * @throws IllegalArgumentException if no dictionary file was given
+     * @throws Exception                if the dictionary, the input or the output cannot be processed
+     */
+    @Override
+    public void run() throws Exception {
+        if (dictFile == null) {
+            // Fail with a readable message instead of the bare NullPointerException from Paths.get(null).
+            throw new IllegalArgumentException(
+                    "Dictionary file not set (option -" + OPTION_DICTIONARY_FILE + ")");
+        }
+        final Map<String, List<PhraseType>> phrases = loadPhrases(Paths.get(dictFile));
+        getLogger().info("Number of documents with phrases to anonymize: {}", phrases.size());
+
+        // The writer is opened with the requested OUTPUT format; it used to receive the input
+        // format, which left the output-format option without any effect.
+        try (final AbstractDocumentReader reader = ReaderFactory.get().getStreamReader(inputFile, inputFormat);
+             final AbstractDocumentWriter writer = WriterFactory.get().getStreamWriter(outputFolder, outputFormat)) {
+            reader.forEachRemaining(document -> {
+                final String docId = document.getName();
+                // Documents without a dictionary entry get an empty phrase list.
+                final List<PhraseType> documentPhrases = phrases.computeIfAbsent(docId, d -> Lists.newArrayList());
+                anonymizePhrases(document, documentPhrases);
+                writer.writeDocument(document);
+            });
+        }
+    }
+
+    /**
+     * Anonymizes every listed phrase in the document and logs the outcome per phrase.
+     * A phrase text that was already replaced earlier in the list is reported as an
+     * error and skipped; a phrase with no occurrence is reported as an error as well.
+     *
+     * @param d               document to modify
+     * @param documentPhrases phrases listed for this document
+     */
+    private void anonymizePhrases(final Document d, final List<PhraseType> documentPhrases) {
+        getLogger().info("[{}] Number of phrases: {}", d.getName(), documentPhrases.size());
+        final Set<String> alreadyReplaced = Sets.newHashSet();
+        for (final PhraseType entry : documentPhrases) {
+            final String text = entry.getPhrase();
+            if (alreadyReplaced.contains(text)) {
+                getLogger().error("[{}] PHRASE '{}' already replaced", d.getName(), text);
+                continue;
+            }
+            final int hits = anonymizePhrase(d, entry);
+            if (hits <= 0) {
+                getLogger().error("[{}] PHRASE '{}' not found", d.getName(), text);
+                continue;
+            }
+            getLogger().info("[{}] PHRASE '{}' found {} times and replaced with '@{}'",
+                    d.getName(), text, hits, entry.getType());
+            alreadyReplaced.add(text);
+        }
+    }
+
+    /**
+     * Replaces every occurrence of the phrase in the document with the label
+     * "@" followed by the phrase type.
+     *
+     * @param d      document to modify
+     * @param phrase phrase to find together with its type
+     * @return number of occurrences reported by the matcher
+     */
+    private int anonymizePhrase(final Document d, final PhraseType phrase) {
+        final List<List<Token>> occurrences = matchPhrase(d, phrase.getPhrase());
+        for (final List<Token> occurrence : occurrences) {
+            anonymizePhrase(occurrence, "@" + phrase.getType());
+        }
+        return occurrences.size();
+    }
+
+    /**
+     * Overwrites the orth of each token and the base of each of its tags with the label.
+     *
+     * @param tokens tokens forming one occurrence of a phrase
+     * @param label  replacement text
+     */
+    private void anonymizePhrase(final List<Token> tokens, final String label) {
+        for (final Token token : tokens) {
+            token.setOrth(label);
+            token.getTags().forEach(tag -> tag.setBase(label));
+        }
+    }
+
+    /**
+     * Collects the occurrences of the phrase from all sentences of the document,
+     * in sentence order.
+     *
+     * @param d      document to search
+     * @param phrase phrase text to find
+     * @return one token list per occurrence
+     */
+    private List<List<Token>> matchPhrase(final Document d, final String phrase) {
+        final List<List<Token>> found = Lists.newArrayList();
+        d.getSentences()
+                .stream()
+                .forEachOrdered(sentence -> found.addAll(matchPhrase(sentence, phrase)));
+        return found;
+    }
+
+
+    /**
+     * Scans the sentence left to right and collects non-overlapping occurrences of the phrase.
+     *
+     * @param s      sentence to search
+     * @param phrase phrase text to find
+     * @return one token list per occurrence, in order of appearance
+     */
+    private List<List<Token>> matchPhrase(final Sentence s, final String phrase) {
+        final List<Token> tokens = s.getTokens();
+        final List<List<Token>> matches = Lists.newArrayList();
+        int n = 0;
+        while (n < tokens.size()) {
+            final List<Token> match = matchPhraseAtPos(tokens, phrase, n);
+            if (match == null) {
+                n++;
+            } else {
+                matches.add(match);
+                // Continue with the token right after the match. The previous code advanced one
+                // extra position, so an occurrence starting immediately after a match was missed.
+                // max(1, ...) guarantees progress even if the matcher returns an empty list.
+                n += Math.max(1, match.size());
+            }
+        }
+        return matches;
+    }
+
+    /**
+     * Checks whether the phrase starts at the given token. Token orths are joined
+     * with a single space unless the preceding token is flagged as having no space
+     * after it, and the joined text must equal the phrase exactly.
+     *
+     * @param tokens tokens of one sentence
+     * @param phrase phrase text to find
+     * @param pos    index of the token at which the phrase has to start
+     * @return the tokens from {@code pos} to the last matched token, or {@code null} when
+     *         the phrase does not start at {@code pos}
+     */
+    private List<Token> matchPhraseAtPos(final List<Token> tokens, final String phrase, final int pos) {
+        final StringBuilder concat = new StringBuilder(tokens.get(pos).getOrth());
+        int last = pos;
+        // Extend the candidate while it is still shorter than the phrase; a shorter text can
+        // never equal the phrase, so no separate equality test is needed in the condition.
+        while (concat.length() < phrase.length() && last + 1 < tokens.size()) {
+            if (!tokens.get(last).getNoSpaceAfter()) {
+                concat.append(' ');
+            }
+            concat.append(tokens.get(++last).getOrth());
+        }
+        if (!phrase.contentEquals(concat)) {
+            return null;
+        }
+        // Return the tokens pos..last inclusive. The previous range ran from the last index
+        // down to pos, which is empty whenever the match spans more than one token.
+        return Lists.newArrayList(tokens.subList(pos, last + 1));
+    }
+
+    /**
+     * Reads the phrase dictionary from a tab-delimited file. For each record, column 0 is
+     * used as the document id, column 3 as the phrase type and column 4 as the phrase text.
+     *
+     * @param path location of the dictionary file
+     * @return phrases grouped by document id, in file order within each document
+     * @throws IOException if the file cannot be opened or read
+     */
+    public Map<String, List<PhraseType>> loadPhrases(final Path path) throws IOException {
+        final Map<String, List<PhraseType>> phrases = Maps.newHashMap();
+        // Both the reader and the parser are closed when loading ends, also on failure.
+        // NOTE(review): FileReader decodes with the platform default charset - confirm the
+        // encoding the dictionary files are expected to use.
+        try (final FileReader reader = new FileReader(path.toFile());
+             final CSVParser csv = new CSVParser(reader, CSVFormat.TDF)) {
+            csv.forEach(record -> {
+                final String docId = record.get(0);
+                final PhraseType phraseType = new PhraseType(record.get(4), record.get(3));
+                phrases.computeIfAbsent(docId, d -> Lists.newArrayList()).add(phraseType);
+            });
+        }
+        return phrases;
+    }
+
+}