Skip to content

Commit

Permalink
Anonymize action
Browse files Browse the repository at this point in the history
  • Loading branch information
mczuk committed Mar 28, 2019
1 parent ace9176 commit fc61ca6
Show file tree
Hide file tree
Showing 3 changed files with 221 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ public void endElement(final String s, final String s1, final String element) th
break;
}
}
if (!foundDisamb) {
if (!foundDisamb && tags.size() > 0) {
currentToken.setAttributeValue(attributeIndex.getIndex("base"), tags.get(0).getBase());
currentToken.setAttributeValue(attributeIndex.getIndex("ctag"), tags.get(0).getCtag());
}
Expand Down
50 changes: 27 additions & 23 deletions g419-corpus/src/main/java/g419/corpus/structure/Token.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,16 @@ public Token(final String orth, final Tag firstTag, final TokenAttributeIndex at
if (index == -1) {
throw new Error("TokenAttribute Index does not contain the 'orth' attribute");
}
this.setAttributeValue(index, orth);
this.addTag(firstTag);
setAttributeValue(index, orth);
addTag(firstTag);
}

public void clearAttributes() {
this.attributes = new ArrayList<>();
attributes = new ArrayList<>();
}

public void removeAttribute(final int attrIdx) {
this.attributes.remove(attrIdx);
attributes.remove(attrIdx);
}

/**
Expand All @@ -60,7 +60,7 @@ public String getAttributeValue(final int index) {
}

public String getAttributeValue(final String attr) {
final int index = this.attrIdx.getIndex(attr);
final int index = attrIdx.getIndex(attr);
return getAttributeValue(index);
}

Expand All @@ -69,11 +69,11 @@ public int getNumAttributes() {
}

public Map<String, String> getProps() {
return this.props;
return props;
}

public void setProp(final String name, final String value) {
this.props.put(name, value);
props.put(name, value);
}

/**
Expand All @@ -87,6 +87,10 @@ public String getOrth() {
return attributes.get(attrIdx.getIndex("orth"));
}

public void setOrth(final String orth) {
attributes.set(attrIdx.getIndex("orth"), orth);
}

/**
* Gets the element.
*
Expand All @@ -98,16 +102,16 @@ public String getElement(final String key) {
}

public boolean getNoSpaceAfter() {
return this.noSpaceAfter;
return noSpaceAfter;
}

public void addTag(final Tag tag) {
tags.add(tag);
if (attrIdx.getIndex("base") != -1 && attributes.get(attrIdx.getIndex("base")) == null) {
this.setAttributeValue(this.attrIdx.getIndex("base"), tag.getBase());
setAttributeValue(attrIdx.getIndex("base"), tag.getBase());
}
if (attrIdx.getIndex("ctag") != -1 && attributes.get(attrIdx.getIndex("ctag")) == null) {
this.setAttributeValue(this.attrIdx.getIndex("ctag"), tag.getCtag());
setAttributeValue(attrIdx.getIndex("ctag"), tag.getCtag());
}
}

Expand All @@ -117,7 +121,7 @@ public ArrayList<Tag> getTags() {

public Set<String> getDisambBases() {
final Set<String> bases = new HashSet<>();
for (final Tag tag : this.tags) {
for (final Tag tag : tags) {
if (tag.getDisamb()) {
bases.add(tag.getBase());
}
Expand All @@ -137,13 +141,13 @@ public String toString() {
}

public Tag getDisambTag() {
for (final Tag tag : this.tags) {
for (final Tag tag : tags) {
if (tag.getDisamb()) {
return tag;
}
}
if (this.tags.size() > 0) {
return this.tags.get(0);
if (tags.size() > 0) {
return tags.get(0);
}
return null;
}
Expand Down Expand Up @@ -173,7 +177,7 @@ public void setAttributeValue(final int index, final String value) {
}

public void setAttributeValue(final String attr, final String value) {
final int index = this.attrIdx.getIndex(attr);
final int index = attrIdx.getIndex(attr);
setAttributeValue(index, value);
}

Expand All @@ -183,19 +187,19 @@ public void setNoSpaceAfter(final boolean noSpaceAfter) {

public String getAttributesAsString() {
final StringBuilder sb = new StringBuilder();
for (final String attr : this.attributes) {
for (final String attr : attributes) {
sb.append((sb.length() == 0 ? "" : ", ") + attr);
}
return sb.toString();
}

@Override
public Token clone() {
final Token cloned = new Token(this.attrIdx.clone());
cloned.tags = new ArrayList<>(this.tags);
cloned.attributes = new ArrayList<>(this.attributes);
final Token cloned = new Token(attrIdx.clone());
cloned.tags = new ArrayList<>(tags);
cloned.attributes = new ArrayList<>(attributes);
cloned.id = id;
cloned.noSpaceAfter = this.noSpaceAfter;
cloned.noSpaceAfter = noSpaceAfter;
return cloned;
}

Expand All @@ -210,11 +214,11 @@ public void setAttributeIndex(final TokenAttributeIndex newAttrIdx) {
}

public boolean isWrapped() {
return this.getClass().isInstance(WrappedToken.class);
return getClass().isInstance(WrappedToken.class);
}

public TokenAttributeIndex getAttributeIndex() {
return this.attrIdx;
return attrIdx;
}

/**
Expand All @@ -225,7 +229,7 @@ public TokenAttributeIndex getAttributeIndex() {
* @return true, if successful
*/
public boolean hasBase(final String base, final boolean disambOnly) {
for (final Tag tag : this.tags) {
for (final Tag tag : tags) {
if (tag.getBase().equals(base) && (disambOnly == false || tag.getDisamb() == true)) {
return true;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
package g419.liner2.cli.action;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import g419.corpus.io.reader.AbstractDocumentReader;
import g419.corpus.io.reader.ReaderFactory;
import g419.corpus.io.writer.AbstractDocumentWriter;
import g419.corpus.io.writer.WriterFactory;
import g419.corpus.structure.Document;
import g419.corpus.structure.Sentence;
import g419.corpus.structure.Token;
import g419.lib.cli.Action;
import g419.lib.cli.CommonOptions;
import io.vavr.collection.Stream;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;

import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * CLI action that replaces dictionary phrases found in documents with a
 * placeholder of the form {@code @<type>}.
 *
 * <p>The dictionary is a tab-separated file read by {@link #loadPhrases(Path)}:
 * column 0 holds the document id, column 3 the phrase type and column 4 the
 * phrase text. For every document read from the input, the phrases registered
 * under the document's name are searched sentence by sentence and each token of
 * a match gets its orth and the base of all its tags overwritten with the
 * placeholder.
 */
public class ActionAnonimization extends Action {

  /** A phrase to anonymize together with the type used to build its placeholder. */
  class PhraseType {
    private String phrase;
    private String type;

    public PhraseType(final String phrase, final String type) {
      this.phrase = phrase;
      this.type = type;
    }

    public String getPhrase() {
      return phrase;
    }

    public void setPhrase(final String phrase) {
      this.phrase = phrase;
    }

    public String getType() {
      return type;
    }

    public void setType(final String type) {
      this.type = type;
    }
  }

  /** Short name of the command line option carrying the dictionary path. */
  private static final String OPTION_DICTIONARY_FILE = "d";

  private String inputFile = null;
  private String inputFormat = null;
  private String outputFolder = null;
  private String outputFormat = null;
  private String dictFile = null;

  /** Registers the action under the name {@code anonim} with its command line options. */
  public ActionAnonimization() {
    super("anonim");
    setDescription("anonimize according to given dictionary");
    options.addOption(CommonOptions.getInputFileFormatOption());
    options.addOption(CommonOptions.getInputFileNameOption());
    options.addOption(CommonOptions.getOutputFileNameOption());
    options.addOption(CommonOptions.getOutputFileFormatOption());
    options.addOption(getDictionaryFile());
  }

  /**
   * Copies the option values from the parsed command line into the fields.
   * Both formats default to {@code "ccl"} when not given.
   *
   * @param line the parsed command line
   */
  @Override
  public void parseOptions(final CommandLine line) throws Exception {
    inputFile = line.getOptionValue(CommonOptions.OPTION_INPUT_FILE);
    inputFormat = line.getOptionValue(CommonOptions.OPTION_INPUT_FORMAT, "ccl");
    outputFolder = line.getOptionValue(CommonOptions.OPTION_OUTPUT_FILE);
    outputFormat = line.getOptionValue(CommonOptions.OPTION_OUTPUT_FORMAT, "ccl");
    dictFile = line.getOptionValue(OPTION_DICTIONARY_FILE);
  }

  /**
   * Builds the {@code -d/--dict} option that points to the phrase dictionary.
   *
   * @return the option definition
   */
  public Option getDictionaryFile() {
    return Option.builder(OPTION_DICTIONARY_FILE).longOpt("dict")
        .hasArg().argName("path").desc("dictionary of phrases per document").build();
  }

  /**
   * Loads the dictionary, then reads every document from the input, anonymizes
   * it and writes it out in the requested output format.
   *
   * @throws IllegalArgumentException if no dictionary file was given
   */
  @Override
  public void run() throws Exception {
    if (dictFile == null) {
      // Fail with a readable message instead of the NPE thrown by Paths.get(null).
      throw new IllegalArgumentException(
          "Dictionary file not specified (option -" + OPTION_DICTIONARY_FILE + ")");
    }
    final Map<String, List<PhraseType>> phrases = loadPhrases(Paths.get(dictFile));
    getLogger().info("Number of documents with phrases to anonymize: {}", phrases.size());

    // The writer must honour the requested output format, not the input one.
    try (final AbstractDocumentReader reader = ReaderFactory.get().getStreamReader(inputFile, inputFormat);
         final AbstractDocumentWriter writer = WriterFactory.get().getStreamWriter(outputFolder, outputFormat)) {
      reader.forEachRemaining(document -> {
        final String docId = document.getName();
        final List<PhraseType> documentPhrases = phrases.computeIfAbsent(docId, d -> Lists.newArrayList());
        anonymizePhrases(document, documentPhrases);
        writer.writeDocument(document);
      });
    }
  }

  /**
   * Anonymizes all given phrases in one document and logs the outcome for each.
   * A phrase text that was already replaced in this document is reported and
   * not processed again.
   */
  private void anonymizePhrases(final Document d, final List<PhraseType> documentPhrases) {
    getLogger().info("[{}] Number of phrases: {}", d.getName(), documentPhrases.size());
    final Set<String> replaced = Sets.newHashSet();
    documentPhrases.forEach(phrase -> {
      if (replaced.contains(phrase.getPhrase())) {
        getLogger().error("[{}] PHRASE '{}' already replaced", d.getName(), phrase.getPhrase());
      } else {
        final int replaceCount = anonymizePhrase(d, phrase);
        if (replaceCount > 0) {
          getLogger().info("[{}] PHRASE '{}' found {} times and replaced with '@{}'",
              d.getName(), phrase.getPhrase(), replaceCount, phrase.getType());
          replaced.add(phrase.getPhrase());
        } else {
          getLogger().error("[{}] PHRASE '{}' not found", d.getName(), phrase.getPhrase());
        }
      }
    });
  }

  /**
   * Replaces every occurrence of the phrase in the document.
   *
   * @return the number of occurrences found
   */
  private int anonymizePhrase(final Document d, final PhraseType phrase) {
    final List<List<Token>> matches = matchPhrase(d, phrase.getPhrase());
    matches.forEach(m -> anonymizePhrase(m, "@" + phrase.getType()));
    return matches.size();
  }

  /** Overwrites the orth of each token and the base of each of its tags with the label. */
  private void anonymizePhrase(final List<Token> tokens, final String label) {
    tokens.forEach(t -> {
      t.setOrth(label);
      t.getTags().forEach(tag -> tag.setBase(label));
    });
  }

  /** Collects the matches of the phrase from all sentences of the document. */
  private List<List<Token>> matchPhrase(final Document d, final String phrase) {
    return d.getSentences()
        .stream()
        .map(s -> matchPhrase(s, phrase))
        .flatMap(Collection::stream)
        .collect(Collectors.toList());
  }

  /**
   * Scans the sentence left to right for non-overlapping occurrences of the phrase.
   *
   * @return one token list per occurrence, in sentence order
   */
  private List<List<Token>> matchPhrase(final Sentence s, final String phrase) {
    final List<Token> tokens = s.getTokens();
    final List<List<Token>> matches = Lists.newArrayList();
    int n = 0;
    while (n < tokens.size()) {
      final List<Token> match = matchPhraseAtPos(tokens, phrase, n);
      if (match != null) {
        matches.add(match);
        // Resume right after the match; always advance by at least one token.
        n += Math.max(1, match.size());
      } else {
        n++;
      }
    }
    return matches;
  }

  /**
   * Tries to match the phrase against the token sequence starting at {@code pos}.
   * Token orths are joined with a single space unless the preceding token is
   * flagged as having no space after it.
   *
   * @return the tokens from {@code pos} to the last matched token (inclusive),
   *     or {@code null} when the phrase does not start at {@code pos}
   */
  private List<Token> matchPhraseAtPos(final List<Token> tokens, final String phrase, final int pos) {
    String concat = tokens.get(pos).getOrth();
    int n = pos;
    // Extend the candidate token by token while it is still shorter than the phrase.
    while (!concat.equals(phrase) && n + 1 < tokens.size() && concat.length() < phrase.length()) {
      if (!tokens.get(n).getNoSpaceAfter()) {
        concat += " ";
      }
      concat += tokens.get(++n).getOrth();
    }
    if (concat.equals(phrase)) {
      // Copy the matched span [pos, n] so the result is independent of the sentence list.
      return Lists.newArrayList(tokens.subList(pos, n + 1));
    } else {
      return null;
    }
  }

  /**
   * Reads the tab-separated dictionary and groups its phrases by document id.
   * Column 0 is the document id, column 3 the type and column 4 the phrase.
   *
   * @param path location of the dictionary file
   * @return phrases keyed by document id, in file order within each document
   * @throws IOException if the file cannot be read
   */
  public Map<String, List<PhraseType>> loadPhrases(final Path path) throws IOException {
    final Map<String, List<PhraseType>> phrases = Maps.newHashMap();
    // NOTE(review): FileReader decodes with the platform default charset — confirm the
    // dictionary encoding matches, or switch to an explicit charset.
    try (final FileReader reader = new FileReader(path.toFile());
         final CSVParser csv = new CSVParser(reader, CSVFormat.TDF)) {
      csv.forEach(record -> {
        final String docId = record.get(0);
        final PhraseType phraseType = new PhraseType(record.get(4), record.get(3));
        phrases.computeIfAbsent(docId, d -> Lists.newArrayList()).add(phraseType);
      });
    }
    return phrases;
  }

}

0 comments on commit fc61ca6

Please sign in to comment.