
Commit 68f72aa

Add a Hungarian NER model

1 parent cfe8ee1

File tree: 2 files changed, 67 insertions(+), 0 deletions(-)

  scripts/ner/Makefile
  scripts/ner/hungarian.prop

scripts/ner/Makefile

Lines changed: 6 additions & 0 deletions
@@ -21,6 +21,12 @@ german.distsim.crf.ser.gz:
 	java -mx10g edu.stanford.nlp.ie.crf.CRFClassifier -prop german.distsim.prop -serializeTo $@ \
 		> $(addsuffix .out, $(basename $(basename $(basename $@)))) 2>&1
 
+hungarian: hungarian.crf.ser.gz
+
+hungarian.crf.ser.gz:
+	java -mx20g edu.stanford.nlp.ie.crf.CRFClassifier -prop hungarian.prop -serializeTo $@ > hungarian.out 2>&1
+
+
 # currently we exclude enp_DE.sbb.io, as the data has too many issues, but we could work to include it....
 # ,/u/nlp/data/german/ner/2016/Europeana-Newspapers-data/ner-corpora/enp_DE.sbb.bio/enp_DE.sbb.io
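
For reference, a minimal sketch of tagging text with the model once the "hungarian" target has produced hungarian.crf.ser.gz. CRFClassifier.getClassifier and classifyToString are the standard Stanford NER loading and tagging calls; the demo class name and the sample sentence are hypothetical, and the actual entity label set depends on the training data.

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;

public class HungarianNerDemo {
  public static void main(String[] args) throws Exception {
    // Load the serialized CRF produced by the "hungarian" make target above.
    AbstractSequenceClassifier<CoreLabel> classifier =
        CRFClassifier.getClassifier("hungarian.crf.ser.gz");

    // Illustrative Hungarian sentence; replace with real input text.
    String text = "Kovács János Budapesten dolgozik.";

    // classifyToString prints tokens with inline tags, e.g. word/TAG.
    System.out.println(classifier.classifyToString(text));
  }
}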

scripts/ner/hungarian.prop

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+trainFileList = /u/nlp/data/ner/hungarian/hu_combined.train.io
+testFile = /u/nlp/data/ner/hungarian/hu_combined.test.io
+serializeTo = hungarian.crf.ser.gz
+
+type=crf
+
+# distSimLexicon = /u/nlp/data/german/ner/hgc_175m_600
+# distSimLexicon = /u/nlp/data/german/ner/2016/hgc-175M-600
+# right options for new hgc_175m_600
+useDistSim = false
+
+# Now using stripped 2-column files, so extra datasets can be added!
+map = word=0,answer=1
+
+encoding = utf-8
+# saveFeatureIndexToDisk = true # now buggy but unnecessary
+mergeTags = false
+useTitle = false
+
+useClassFeature=true
+useWord=true
+useNGrams=true
+noMidNGrams=true
+# Having no maxNGramLeng seemed to work marginally better, but a limit is kept for efficiency
+maxNGramLeng=6
+usePrev=true
+useNext=true
+useLongSequences=true
+useSequences=true
+usePrevSequences=true
+useTypeSeqs=true
+useTypeSeqs2=true
+useTypeySequences=true
+# Including useOccurrencePatterns increased scores only marginally (the gain could even disappear now that we have weaker regularization)
+useOccurrencePatterns=true
+useLastRealWord=true
+useNextRealWord=true
+normalize=true
+# Using chris4 instead hurts in the most recent experiment. Earlier, an experiment had seemed to show the opposite.
+wordShape=chris2useLC
+useDisjunctive=true
+# Width 5 works a little better than 4
+disjunctionWidth=5
+
+maxLeft=1
+readerAndWriter=edu.stanford.nlp.sequences.ColumnDocumentReaderAndWriter
+useObservedSequencesOnly=true
+useQN = true
+QNsize = 15
+# sigma 20 works better than sigma 5, which is MUCH better than sigma 1; that was the limit of hyperparameter optimization
+# On the basic CoNLL dataset (no distsim, no extra data), sigma=50 is a bit better still (by 0.13 F1)
+sigma = 20
+
+# For making training faster (fewer features); changing this to 0.025 doesn't improve performance
+featureDiffThresh=0.05
+
+# evaluateIOB=true
+
+# other notes
+# even though useTaggySequences will use distsim rather than POS sequences, turning it on didn't help
+# adding useWordPairs doesn't seem to help. (Getting them anyway in an edge feature.)
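
The map = word=0,answer=1 setting above reads two-column training files (token in column 0, gold label in column 1, one token per line, blank lines between sentences), the layout ColumnDocumentReaderAndWriter expects. A hypothetical fragment of what hu_combined.train.io might look like, with invented tokens and an assumed label set in IO encoding (no B-/I- prefixes, matching the .io suffix):

Kovács	PER
János	PER
Budapesten	LOC
dolgozik	O
.	O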
