5
5
import edu .stanford .nlp .trees .tregex .TregexPattern ;
6
6
7
7
import java .io .*;
8
+ import java .nio .charset .StandardCharsets ;
8
9
import java .util .*;
9
10
import java .util .regex .*;
10
11
12
+ /** @author Jason Bolton */
11
13
public class GermanTreebankUDUpdater {
12
14
13
- public static LabeledScoredTreeFactory factory = new LabeledScoredTreeFactory ();
15
+ private static final LabeledScoredTreeFactory factory = new LabeledScoredTreeFactory ();
14
16
15
- public static HashMap <String ,String > wordToSplit = new HashMap <>();
17
+ private static final HashMap <String ,String > wordToSplit = new HashMap <>();
16
18
17
- public static String taggerPath = "edu/stanford/nlp/models/pos-tagger/german-ud.tagger" ;
19
+ private static final String taggerPath = "edu/stanford/nlp/models/pos-tagger/german-ud.tagger" ;
18
20
19
- public static String hyphenatedWordPatternString = "[ÄÖÜäöüẞßA-Za-z]+\\ -[ÄÖÜäöüẞßA-Za-z]+" ;
20
- public static Pattern hyphenatedWordPattern = Pattern .compile (hyphenatedWordPatternString );
21
+ private static final String hyphenatedWordPatternString = "[ÄÖÜäöüẞßA-Za-z]+-[ÄÖÜäöüẞßA-Za-z]+" ;
22
+ private static final Pattern hyphenatedWordPattern = Pattern .compile (hyphenatedWordPatternString );
21
23
22
24
static {
23
25
wordToSplit .put ("am" , "an,dem" );
@@ -70,18 +72,19 @@ public static void splitHyphenatedToken(Tree tree) {
70
72
/**
 * Builds a two-level subtree: a node labelled {@code tag} whose only child is a
 * leaf labelled {@code word}.
 *
 * <p>Both nodes are created through the class-level {@code factory}, and each
 * node's value is set explicitly after it is created.
 *
 * @param tag  label assigned to the parent node
 * @param word label assigned to the leaf node
 * @return the parent node, with the leaf as its single child
 */
public static Tree createTagAndWordNode (String tag , String word ) {
71
73
// Create the leaf that carries the word, then set its value to the same string.
Tree wordNode = factory .newLeaf (word );
72
74
wordNode .setValue (word );
73
// The removed/added pair below differs only in how the one-element child list
// is built (Arrays.asList vs. Collections.singletonList).
- Tree tagNode = factory .newTreeNode (tag , Arrays . asList (wordNode ));
75
+ Tree tagNode = factory .newTreeNode (tag , Collections . singletonList (wordNode ));
74
76
// Set the parent node's value to the tag string.
tagNode .setValue (tag );
75
77
return tagNode ;
76
78
}
77
79
78
80
public static void main (String [] args ) throws IOException {
79
- Reader r = new BufferedReader (new InputStreamReader (new FileInputStream (args [0 ]), "UTF-8" ));
80
- TreeReader tr = new PennTreeReader ( r , factory );
81
+ Reader r = new BufferedReader (new InputStreamReader (new FileInputStream (args [0 ]), StandardCharsets . UTF_8 ));
82
+
81
83
TreebankTagUpdater tagUpdater = new TreebankTagUpdater (taggerPath );
82
- /** iterate through trees **/
83
- Tree fullTree = tr .readTree ();
84
- while (fullTree != null ) {
84
+
85
+ /* iterate through trees */
86
+ TreeReader tr = new PennTreeReader (r , factory );
87
+ for (Tree fullTree ; (fullTree = tr .readTree ()) != null ; ) {
85
88
TregexPattern pattern ;
86
89
TregexMatcher matcher ;
87
90
// split hyphenated token
@@ -102,7 +105,7 @@ public static void main(String[] args) throws IOException {
102
105
for (int i = 0 ; i < childrenList .size () ; i ++) {
103
106
if (childrenList .get (i ).value ().equals ("APPRART-AC" )) {
104
107
String mwtWord = childrenList .get (i ).getLeaves ().get (0 ).value ();
105
- if (wordToSplit .keySet (). contains (mwtWord )) {
108
+ if (wordToSplit .containsKey (mwtWord )) {
106
109
matchTree .removeChild (i );
107
110
Tree artNKNode = createTagAndWordNode ("ART-NK" , wordToSplit .get (mwtWord ).split ("," )[1 ]);
108
111
matchTree .addChild (i ,artNKNode );
@@ -116,9 +119,6 @@ public static void main(String[] args) throws IOException {
116
119
// print updated tree
117
120
tagUpdater .tagTree (fullTree );
118
121
System .out .println (fullTree );
119
-
120
- // update to next tree
121
- fullTree = tr .readTree ();
122
122
}
123
123
}
124
124
0 commit comments