2 files changed, +67 -0 lines changed.

The first file, a Makefile of NER training targets, adds a hungarian target alongside the existing German ones:

@@ -21,6 +21,12 @@ german.distsim.crf.ser.gz:
 	java -mx10g edu.stanford.nlp.ie.crf.CRFClassifier -prop german.distsim.prop -serializeTo $@ \
 		> $(addsuffix .out, $(basename $(basename $(basename $@)))) 2>&1
 
+hungarian: hungarian.crf.ser.gz
+
+hungarian.crf.ser.gz:
+	java -mx20g edu.stanford.nlp.ie.crf.CRFClassifier -prop hungarian.prop -serializeTo $@ > hungarian.out 2>&1
+
+
 # currently we exclude enp_DE.sbb.io, as the data has too many issues, but we could work to include it....
 # ,/u/nlp/data/german/ner/2016/Europeana-Newspapers-data/ner-corpora/enp_DE.sbb.bio/enp_DE.sbb.io
 
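For reference, a minimal end-to-end sketch. The make invocation is exactly what this diff adds; applying the resulting model uses the standard CRFClassifier flags -loadClassifier and -textFile, but the jar on the classpath and the input file are assumptions for illustration:

    # build the model; training output is logged to hungarian.out
    make hungarian

    # tag plain text with the serialized model
    # (stanford-ner.jar and sample.txt are hypothetical names; adjust to your setup)
    java -cp stanford-ner.jar edu.stanford.nlp.ie.crf.CRFClassifier \
        -loadClassifier hungarian.crf.ser.gz -textFile sample.txt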
The second file is new: evidently the hungarian.prop that the target above passes with -prop. Two notes follow the listing, a sketch of the two-column data format it expects and the meaning of sigma.

+trainFileList = /u/nlp/data/ner/hungarian/hu_combined.train.io
+testFile = /u/nlp/data/ner/hungarian/hu_combined.test.io
+serializeTo = hungarian.crf.ser.gz
+
+type=crf
+
+# distSimLexicon = /u/nlp/data/german/ner/hgc_175m_600
+# distSimLexicon = /u/nlp/data/german/ner/2016/hgc-175M-600
+# right options for the new hgc_175m_600
+useDistSim = false
+
+# Now using stripped 2-column files so we can add extra datasets!
+map = word=0,answer=1
+
+encoding = utf-8
+# saveFeatureIndexToDisk = true # now buggy but unnecessary
+mergeTags = false
+useTitle = false
+
+useClassFeature=true
+useWord=true
+useNGrams=true
+noMidNGrams=true
+# Having no maxNGramLeng seemed to work marginally better, but we cap it for efficiency
+maxNGramLeng=6
+usePrev=true
+useNext=true
+useLongSequences=true
+useSequences=true
+usePrevSequences=true
+useTypeSeqs=true
+useTypeSeqs2=true
+useTypeySequences=true
+# Including useOccurrencePatterns increased scores only marginally (the gain could even disappear now that we have weaker regularization)
+useOccurrencePatterns=true
+useLastRealWord=true
+useNextRealWord=true
+normalize=true
+# Using chris4 instead hurt in the most recent experiment. Earlier, an experiment had seemed to show the opposite.
+wordShape=chris2useLC
+useDisjunctive=true
+# Width 5 works a little better than 4
+disjunctionWidth=5
+
+maxLeft=1
+readerAndWriter=edu.stanford.nlp.sequences.ColumnDocumentReaderAndWriter
+useObservedSequencesOnly=true
+useQN = true
+QNsize = 15
+# sigma 20 works better than sigma 5, which is MUCH better than sigma 1; that was the limit of hyperparameter optimization
+# On the basic CoNLL dataset (no distsim, no extra data), sigma=50 is a bit better still (by 0.13 F1)
+sigma = 20
+
+# For making the model faster (fewer features); lowering this to 0.025 doesn't improve performance
+featureDiffThresh=0.05
+
+# evaluateIOB=true
+
+# other notes
+# even though useTaggySequences would use distsim rather than POS sequences here, turning it on didn't help
+# adding useWordPairs doesn't seem to help. (We get them anyway in an edge feature.)
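On the data format: with map = word=0,answer=1 and ColumnDocumentReaderAndWriter, training files are two tab-separated columns, one token per line with its label, and a blank line between sentences; the .io suffix indicates IO label encoding (bare entity tags, no B-/I- prefixes). A hypothetical fragment in that shape (invented Hungarian tokens and labels, purely for illustration):

    Kovács	PER
    János	PER
    Budapesten	LOC
    él	O
    .	O

    A	O
    MÁV	ORG
    közleménye	O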
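On sigma: in CRFClassifier it is the standard deviation of the Gaussian prior over the feature weights, so a larger sigma means weaker L2 regularization, which is what the useOccurrencePatterns comment alludes to. Assuming the usual convention, training minimizes roughly

    $-\log p(\mathbf{y} \mid \mathbf{x}; \theta) + \sum_j \frac{\theta_j^2}{2\sigma^2}$

so raising sigma from 5 to 20 shrinks the penalty on each weight by a factor of 16.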