Commit e05f3b1

heeyounglee authored and Stanford NLP committed

Merge branch 'master' into heeyoung

1 parent: 0f2b31a
389 files changed: +198870 -90362 lines

JavaNLP-core.eml (+9 -4)
@@ -57,10 +57,10 @@
     <lib name="AppleJavaExtensions.jar" scope="COMPILE">
       <relative-module-cls project-related="jar://$PROJECT_DIR$/projects/core/lib/AppleJavaExtensions.jar!/"/>
     </lib>
-    <lib name="JFlex.jar" scope="COMPILE">
-      <srcroot url="jar://$MODULE_DIR$/libsrc/jflex-1.4.3-src.zip!/"/>
-      <relative-module-src project-related="jar://$PROJECT_DIR$/projects/core/libsrc/jflex-1.4.3-src.zip!/"/>
-      <relative-module-cls project-related="jar://$PROJECT_DIR$/projects/core/lib/JFlex.jar!/"/>
+    <lib name="jflex-1.5.1.jar" scope="COMPILE">
+      <srcroot url="jar://$MODULE_DIR$/libsrc/jflex-1.5.1-src.zip!/"/>
+      <relative-module-src project-related="jar://$PROJECT_DIR$/projects/core/libsrc/jflex-1.5.1-src.zip!/"/>
+      <relative-module-cls project-related="jar://$PROJECT_DIR$/projects/core/lib/jflex-1.5.1.jar!/"/>
     </lib>
     <lib name="xom-1.2.10.jar" scope="COMPILE">
       <srcroot url="file://libsrc/xom-1.2.10-src.zip"/>
@@ -99,4 +99,9 @@
     <lib name="javacc.jar" scope="COMPILE">
      <relative-module-cls project-related="jar://$PROJECT_DIR$/projects/core/lib/javacc.jar!/"/>
     </lib>
+    <lib name="javax.json.jar" scope="COMPILE">
+      <srcroot url="jar://$MODULE_DIR$/libsrc/javax.json-api-1.0-sources.jar!/"/>
+      <relative-module-src project-related="jar://$PROJECT_DIR$/projects/core/libsrc/javax.json-api-1.0-sources.jar!/"/>
+      <relative-module-cls project-related="jar://$PROJECT_DIR$/projects/core/lib/javax.json.jar!/"/>
+    </lib>
   </component>
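
The new javax.json.jar entry is the JSR 353 "Java API for JSON Processing", presumably added to support JSON input/output elsewhere in the codebase. A minimal, self-contained sketch of what the library provides (the JSON string and class name are illustrative, not from this commit):

import java.io.StringReader;
import javax.json.Json;
import javax.json.JsonObject;

public class JsonExample {
  public static void main(String[] args) {
    // Parse a small JSON document with the javax.json object-model API.
    String doc = "{\"word\": \"Stanford\", \"tag\": \"NNP\"}";
    JsonObject obj = Json.createReader(new StringReader(doc)).readObject();
    System.out.println(obj.getString("word") + "/" + obj.getString("tag")); // Stanford/NNP
  }
}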

build.xml (+2 -2)
@@ -63,9 +63,9 @@
     </classpath>
   </taskdef>

-  <taskdef classname="JFlex.anttask.JFlexTask" name="jflex">
+  <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
     <classpath>
-      <pathelement location="${project.core}/lib/JFlex.jar"/>
+      <pathelement location="${project.core}/lib/jflex-1.5.1.jar"/>
     </classpath>
   </taskdef>
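
Both hunks reflect the upgrade from JFlex 1.4.3 to 1.5.1: the 1.5 series renamed the top-level package from JFlex to lowercase jflex, so the ant taskdef classname changes along with the jar. A sketch of invoking the generator directly from Java instead of through ant, assuming jflex-1.5.1.jar is on the classpath (the Lexer.flex path and output directory are hypothetical):

public class GenerateLexer {
  public static void main(String[] args) throws Exception {
    // jflex.Main is the JFlex 1.5 command-line entry point (note the
    // lowercase package); -d sets the directory for the generated scanner.
    jflex.Main.main(new String[]{"-d", "src/generated", "Lexer.flex"});
  }
}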

data/edu/stanford/nlp/dcoref/expected.txt (+7 -7)
@@ -2,13 +2,13 @@ CONLL EVAL SUMMARY (Before COREF)
 Identification of Mentions: Recall: (12405 / 14291) 86.8% Precision: (12405 / 34910) 35.53% F1: 50.42%

 CONLL EVAL SUMMARY (After COREF)
-METRIC muc:Coreference: Recall: (6254 / 10539) 59.34% Precision: (6254 / 10071) 62.09% F1: 60.68%
-METRIC bcub:Coreference: Recall: (12453.83 / 18378) 67.76% Precision: (13630.5 / 18378) 74.16% F1: 70.82%
-METRIC ceafm:Coreference: Recall: (10925 / 18378) 59.44% Precision: (10925 / 18378) 59.44% F1: 59.44%
-METRIC ceafe:Coreference: Recall: (3831.96 / 7839) 48.88% Precision: (3831.96 / 8307) 46.12% F1: 47.46%
-METRIC blanc:Coreference links: Recall: (25243 / 54427) 46.37% Precision: (25243 / 40582) 62.2% F1: 53.13%
-Non-coreference links: Recall: (931338 / 946677) 98.37% Precision: (931338 / 960522) 96.96% F1: 97.66%
-BLANC: Recall: (0.72 / 1) 72.37% Precision: (0.8 / 1) 79.58% F1: 75.4%
+METRIC muc:Coreference: Recall: (6253 / 10539) 59.33% Precision: (6253 / 10073) 62.07% F1: 60.67%
+METRIC bcub:Coreference: Recall: (12457.63 / 18383) 67.76% Precision: (13632.3 / 18383) 74.15% F1: 70.81%
+METRIC ceafm:Coreference: Recall: (10927 / 18383) 59.44% Precision: (10927 / 18383) 59.44% F1: 59.44%
+METRIC ceafe:Coreference: Recall: (3833.81 / 7844) 48.87% Precision: (3833.81 / 8310) 46.13% F1: 47.46%
+METRIC blanc:Coreference links: Recall: (25241 / 54427) 46.37% Precision: (25241 / 40586) 62.19% F1: 53.13%
+Non-coreference links: Recall: (931826 / 947171) 98.37% Precision: (931826 / 961012) 96.96% F1: 97.66%
+BLANC: Recall: (0.72 / 1) 72.37% Precision: (0.8 / 1) 79.57% F1: 75.39%

 Final conll score ((muc+bcub+ceafe)/3) = 59.65
 Final score (pairwise) Precision = 0.57
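
The "Final conll score" line is the arithmetic mean of the MUC, B-cubed, and CEAF-e F1 values, so the updated metrics still round to the same 59.65. A standalone check (not code from this commit):

public class ConllScoreCheck {
  public static void main(String[] args) {
    // F1 values from the "After COREF" summary above.
    double muc = 60.67, bcub = 70.81, ceafe = 47.46;
    // (60.67 + 70.81 + 47.46) / 3 = 59.6466..., printed as 59.65.
    System.out.printf("Final conll score = %.2f%n", (muc + bcub + ceafe) / 3);
  }
}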
Four new single-line Arabic data files follow (their file names are not shown in this view). Each pair appears to be a clitic-segmented version of a sentence (clitic boundaries marked with #) followed by the corresponding raw text.

@@ -0,0 +1 @@
+عندما كنت اطالع المواضيع الخاصة ب# جماعة الاخوان المسلمين . . وجدت شئ غريب . . وجدت المدافعين عن الجماعة . . و# ل# الاسف . . يتوجهون الى الهجوم عندما لا يستطيعون الدفاع . .

@@ -0,0 +1 @@
+عندما كنت اطالع المواضيع الخاصه بجماعة الاخوان المسلمين . . وجدت شئ غريب . . وجدت المدافعين عن الجماعه . . وللاسف . . يتوجهون الى الهجوم عندما لا يستطيعون الدفاع . .

(English gloss of the first pair: "When I was browsing the topics about the Muslim Brotherhood group .. I found something strange .. I found the group's defenders .. and unfortunately .. they turn to attacking when they cannot defend ..")

@@ -0,0 +1 @@
+و# اعلن مدير شرطة عمان ان #ه اثناء اقامة الحفل و# بناء على خلفية ثارية , اقدم خالد الكحلوت ( 22 عاما ) ب# اطلاق النار على سعيد الحرازين ( 30 عاما ) فاردا #ه . و# صادف اثناء اطلاق النار وجود عادل الحرازين شقيق القتيل الذي قام ب# اطلاق النار على الجاني ف# قتل #ه .

@@ -0,0 +1 @@
+واعلن مدير شرطة عمان انه اثناء اقامة الحفل وبناء على خلفية ثأرية ، اقدم خالد الكحلوت ( 22 عاما ) باطلاق النار على سعيد الحرازين ( 30 عاما ) فارداه . وصادف اثناء اطلاق النار وجود عادل الحرازين شقيق القتيل الذي قام باطلاق النار على الجاني فقتله .

(English gloss of the second pair: "The Amman police director announced that during the celebration, and against the background of a blood feud, Khaled al-Kahlout (22 years old) opened fire on Saeed al-Harazin (30 years old), killing him on the spot. During the shooting, Adel al-Harazin, the victim's brother, happened to be present; he shot the perpetrator and killed him.")
(new file; name not shown in this view: a 274-line sample properties configuration for the pattern-based entity learning system)

@@ -0,0 +1,274 @@ (entire file added)

#### NOTE: for all flags and their descriptions, see the javadoc. Important parameters (in our experience) that you should tune for your dataset are marked with ***

#Name for the files saved as output of the system (useful for comparing results of different experiments with different variables, etc.)
identifier=useNERRestriction

#Directory where this code lives
DIR=projects/core/data/edu/stanford/nlp/patterns/surface

outDir=SPIEDPatternsout

#Number of threads available on the machine
numThreads=1
#Use these options if you are limited by memory
batchProcessSents = false
numMaxSentencesPerBatchFile=100
saveInvertedIndexDir=${outDir}/invertedIndex
#loadInvertedIndexDir=${outDir}/invertedIndex

### Example for running it on presidents' biographies. For more data examples, see the bottom of this file

#Can be text; the code will tokenize it.
fileFormat=text
#Input file(s) (default assumed to be text). Can be one or more of (concatenated by comma or semicolon): file, directory, files with a regex in the filename (for example: "mydir/health-.*-processed.txt")
file=${DIR}/presidents.txt

#To save the serialized sentences into a file: the text split into sentences, processed using NER, parse, etc. (depending on the flags), and labeled with the seed set
#saveSentencesSerDir=${DIR}

#If you use the flag above to save the file, you can use the saved file like this:
#fileFormat=ser
#file= ${DIR}/presidents_sents.ser

#We are learning names of presidential candidates, places, and other names
seedWordsFiles=NAME,${DIR}/names.txt;PLACE,${DIR}/places.txt;OTHER,${DIR}/otherpeople.txt

#You can evaluate in two ways; both are shown here.
evaluate=true
goldEntitiesEvalFiles=NAME,${DIR}/goldnames.txt;PLACE,${DIR}/goldplaces.txt
#evalFileWithGoldLabels=${DIR}/presidents_eval.txt


#SAVE and LOAD the model (patterns and phrases) options
patternsWordsDir=${outDir}/${identifier}/model/
#Save the learned patterns and learned words in the patternsWordsDir directory
savePatternsWordsDir=true
#Load the patterns and words from the patternsWordsDir directory
#loadSavedPatternsWordsDir=true

#False if you just want to process the text into sentences without doing anything with them, or if you want to use the loadSavedPatternsWordsDir option. Useful for batch processing: saving text as serialized objects, then running the learning system on all the serialized objects (see the saveSentences* and saveEvalSent* flags), or for domain adaptation.
learn=true


#posModelPath=<if you want to use a different POS tagger released by the Stanford NLP group, e.g. the caseless model>

######## creating patterns flags ##########
#***Use context on the left
usePreviousContext=true

#***Use context on the right
useNextContext = true

#***The context should be at least this long
minWindow4Pattern = 2

#***The context can be at most this long
maxWindow4Pattern = 4

#If the context consists of only stop words, add the pattern only if it has more than this many stop words
numMinStopWordsToAdd = 3

#***Use a POS tag restriction for the target phrase
usePOS4Pattern = true

#Ignore the words {a, an, the} while matching the patterns to text (advisable: true)
useFillerWordsInPat = false

#***Specific allowed tag initials for the target phrase for each label while creating the patterns (if not specified, every tag is acceptable for creating a pattern). Tag initials can be written as N or NN or J or N,J etc. E.g.: NAME,N,J;PLACE,N.
targetAllowedTagsInitialsStr=NAME,N;OTHER,N

#You can save all possible patterns for all tokens to the file given by allPatternsFile so you don't need to compute them every time.
computeAllPatterns = true

#Save to here (or read from here, if computeAllPatterns is false)
allPatternsFile= ${DIR}/${identifier}_allpatterns.ser

#***Maximum number of words allowed in the target phrase
numWordsCompound = 3

#***Also consider patterns without the POS restriction on the target phrase
addPatWithoutPOS = true

#Ignore common stop words occurring just before the target phrase
useStopWordsBeforeTerm=false

#Use lemmas instead of words for the context tokens
useLemmaContextTokens=true

#Make context matching lowercase (advisable)
matchLowerCaseContext=true

#***Use a named entity tag restriction (predicted using StanfordCoreNLP NER) on the target phrase
useTargetNERRestriction=true

#***If useTargetNERRestriction is true, you can give the NER tags that the target phrase can take. Leave this out if you don't want any specific restriction
targetAllowedNERs=NAME,PERSON;PLACE,LOCATION;OTHER,PERSON

#Use named entity tag restrictions for the context tokens
useContextNERRestriction=false

#***Use the parse tag of the grandparent node as a restriction (note that the parent node is the POS tag)
useTargetParserParentRestriction=false

#Do not extract a phrase in which any word is labeled with another class (for example, you don't want to extract 'HIV patients' as a disease)
doNotExtractPhraseAnyWordLabeledOtherClass = true

#### matching patterns to text ######

#You can mostly ignore this flag and leave it true. For those who care: for each token, we use the phrase that originally matched that token instead of the token's word
useMatchingPhrase=true

#Use only the tokens that get matched by a pattern (advisable: false)
restrictToMatched = false

#Label the learned words in the text (advisable: true)
usePatternResultAsLabel=true

#Remove common stop words from phrases to get clean phrases (for example, "disease" instead of "some disease")
removeStopWordsFromSelectedPhrases = true

#Do not learn phrases that contain any stop word
removePhrasesWithStopWords = false


### evaluating candidate patterns

#***Minimum number of positive phrases a candidate pattern should extract
minPosPhraseSupportForPat = 1

##### thresholds for selecting patterns and words #####

#***Threshold for learning a phrase
thresholdWordExtract=0.01

#***Threshold for learning a pattern
thresholdSelectPattern = 0.01

#Keep lowering the threshold to 0.8*threshold whenever the system doesn't learn any new patterns and phrases
tuneThresholdKeepRunning=false

#***Discard phrases that are not extracted by at least this many patterns
thresholdNumPatternsApplied = 1

#***Max number of words to extract in each iteration
numWordsToAdd = 5

#***Max number of patterns to extract in each iteration
numPatterns = 5

#***Max number of iterations
numIterationsForPatterns = 8

#Consider words belonging to other labels as negative (advisable: true)
useOtherLabelsWordsasNegative=true

#***Pattern scoring measure. For more details, see the paper. The options are PhEvalInPatLogP, PhEvalInPat, PosNegUnlabOdds, RlogF, RlogFPosNeg, YanGarber02, PosNegOdds, LOGREG, LOGREGlogP, RatioAll, SqrtAllRatio
patternScoring=RatioAll

#Class to be used to score phrases. The valid options are edu.stanford.nlp.patterns.surface.ScorePhrasesAverageFeatures and edu.stanford.nlp.patterns.surface.ScorePhrasesLearnFeatWt
phraseScorerClass=edu.stanford.nlp.patterns.surface.ScorePhrasesAverageFeatures


#If you want to take the square root of the pattern score
sqrtPatScore = false

#Phrase scoring measure; ignore.
wordScoring=WEIGHTEDNORM

#For scoring phrases that are OOV, the score is the average of the scores of the individual words (instead of the min, which is the default)
useAvgInsteadofMinPhraseScoring=true

#***Which features to use to evaluate phrases. See the paper for more details on each
#Only if wordClassClusterFile is provided
usePhraseEvalWordClass=false

#tf-idf scoring w.r.t. the domain
usePhraseEvalDomainNgram=false

#Use pattern weights when scoring the phrases they extract, if usePhraseEvalPatWtByFreq is true; otherwise it's just a tf-idf-like score
usePatternWeights=true

#Basically patwt/log(freq), where patwt = 1 if usePatternWeights is false
usePhraseEvalPatWtByFreq=true

#If using multiple label dictionaries etc., the frequency of the phrase in the label's dictionary vs. the other dictionaries
usePhraseEvalSemanticOdds=true

#Edit distance from the positive entities
usePhraseEvalEditDistSame=true

#Edit distance from the negative entities
usePhraseEvalEditDistOther=true

#If you have Google Ngrams, you can use Google-Ngrams tf-idf scoring.
usePhraseEvalGoogleNgram=false

#% of positively labeled words with the same word shape (see the word shape functions, e.g. chris2, for more details)
usePhraseEvalWordShape=true


#These flags are only valid if patternScoring is PhEvalInPat*. They have meanings similar to the phrase flags above
usePatternEvalWordClass=false
usePatternEvalGoogleNgram=false
usePatternEvalSemanticOdds=true
usePatternEvalEditDistSame=true
usePatternEvalEditDistOther=true
usePatternEvalDomainNgram=false
usePatternEvalWordShape=true

#Options are LOG, NONE, or SQRT
wordFreqNorm = NONE

###### For logging

#4 to print out every single thing happening in the system, 3 for a fair amount of debug messages and justification, 2 for some debug messages, 1 for only necessary messages, and 0 for (almost) no messages
debug = 3

#If you want output in which each labeled phrase has <label> </label> around it
#markedOutputTextFile=markedtext.txt


#Stop words file
stopWordsPatternFiles=${DIR}/stopwords.txt

englishWordsFiles=${stopWordsPatternFiles}
commonWordsPatternFiles= ${stopWordsPatternFiles}
#You can give some common words like this:
#commonWordsPatternFiles =${DIR}/lists/commonEngWords1k.txt

#If you are using the Google Ngrams tf-idf feature
#googleNGramsFile=/u/nlp/scr/google-ngrams/1gms/vocab
#weightDomainFreq=10

#Below is optional: comma-separated files with lists of phrases that definitely do not belong to any of the labels
#otherSemanticClassesFiles=${DIR}/nondiseases.txt

#The flags below are used when either LOGREG is used for patternScoring or the ScorePhrasesLearnFeatWt class is used for phrase scoring
#% of unlabeled tokens selected as negative
#perSelectRand=0.5
#% of negative tokens selected as negative
#perSelectNeg=1


### Example for running the code on BioMed articles and the NCBI corpus (instead of the toy example above)

#fileFormat=text
#file=${DIR}/BioMedSample
#saveSentencesSerFile=${DIR}/biomed_sents.ser

#evalFileWithGoldLabels=${DIR}/NCBI_corpus_testing_processed.txt
#saveEvalSentencesSerFile=${DIR}/ncbi_corpus_testing_sents.ser
#addEvalSentsToTrain=true

#seedWordsFiles=disease,${DIR}/diseases.txt;nondisease,${DIR}/nondiseases.txt

#evaluate=true

#Default is true; false if you want scores per token
evalPerEntity=true

#wordClassClusterFile=${DIR}/ncbi_disease_brownclusters_200_min5.txt

#externalFeatureWeightsFile = ${DIR}/out/wordclass_weights
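
A sketch of how a properties file like this would be used to launch the learner. The driver class name is an assumption based on the scorer classes referenced above (edu.stanford.nlp.patterns.surface.*), and spied.properties is a hypothetical file name, so treat this as illustrative rather than the commit's documented entry point:

import edu.stanford.nlp.patterns.surface.GetPatternsFromDataMultiClass;

public class RunSPIED {
  public static void main(String[] args) throws Exception {
    // Assumption: like other Stanford NLP tools, the driver reads the flags
    // above from a properties file passed with -props.
    GetPatternsFromDataMultiClass.main(new String[]{"-props", "spied.properties"});
  }
}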