#### NOTE: for all flags and their descriptions, see the javadoc. Important parameters (in our experience) that you should tune for your dataset are marked with ***
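#A rough sketch of how a properties file like this is typically consumed (illustrative,
#not definitive; the exact main class package, classpath, and memory setting depend on
#your CoreNLP version):
#  java -mx8g -cp "stanford-corenlp-*.jar:*" edu.stanford.nlp.patterns.surface.GetPatternsFromDataMultiClass -props <path to this properties file>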

#Name for the saved output files of the system (useful for comparing results of different experiments with different variables, etc.)
identifier=useNERRestriction

#Directory where this code lives
DIR=projects/core/data/edu/stanford/nlp/patterns/surface

outDir=SPIEDPatternsout

#Number of threads available on the machine
numThreads=1
#Use these options if you are limited by memory
batchProcessSents = false
numMaxSentencesPerBatchFile=100
saveInvertedIndexDir=${outDir}/invertedIndex
#loadInvertedIndexDir=${outDir}/invertedIndex

### Example for running it on presidents' biographies. For more data examples, see the bottom of this file

#Can be text; the code will tokenize it.
fileFormat=text
#Input file(s) (assumed to be text by default). Can be one or more of the following (concatenated by comma or semicolon): a file, a directory, or files with a regex in the filename (for example: "mydir/health-.*-processed.txt")
file=${DIR}/presidents.txt
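#For example, several inputs can be given at once using the comma-separated syntax
#described above (the second path is hypothetical, shown only for illustration):
#file=${DIR}/presidents.txt,${DIR}/moreBiographies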
| 25 | + |
| 26 | +#to save the serialized sentences into a file - text split into sentences, processed using ner, parse etc (depending on the flags) and labeled with seed set |
| 27 | +#saveSentencesSerDir=${DIR} |
| 28 | + |
| 29 | +#if you use the flag above to save the file, you can use the saved file like this |
| 30 | +#fileFormat=ser |
| 31 | +#file= ${DIR}/presidents_sents.ser |
| 32 | + |
| 33 | +#We are learning names of presidential candidates, places, and other names |
| 34 | +seedWordsFiles=NAME,${DIR}/names.txt;PLACE,${DIR}/places.txt;OTHER,${DIR}/otherpeople.txt |
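#Each seed file is expected to be a plain-text list with one seed phrase per line. An
#illustrative (not actual) snippet of what names.txt might contain:
#  George Washington
#  Abraham Lincoln
#  Barack Obama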
| 35 | + |
| 36 | +#You can evaluate two ways; both presented here. |
| 37 | +evaluate=true |
| 38 | +goldEntitiesEvalFiles=NAME,${DIR}/goldnames.txt;PLACE,${DIR}/goldplaces.txt |
| 39 | +#evalFileWithGoldLabels=${DIR}/presidents_eval.txt |
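#A sketch of the expected formats (check the javadoc for your version): the
#goldEntitiesEvalFiles are plain-text lists of gold entities for each label, one phrase
#per line, while evalFileWithGoldLabels points to evaluation text in which the gold
#entities are already marked with their labels.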


#Options to SAVE and LOAD the model (patterns and phrases)
patternsWordsDir=${outDir}/${identifier}/model/
#Save the learned patterns and learned words in the patternsWordsDir directory
savePatternsWordsDir=true
#Load the patterns and words from the patternsWordsDir directory
#loadSavedPatternsWordsDir=true

#Set to false if you just want to process the text into sentences but not do anything with them, or if you want to use the loadSavedPatternsWordsDir option. Useful for batch processing and saving text as serialized objects, then running the learning system on all the serialized objects (see the saveSentences* and saveEvalSent* flags), or for domain adaptation.
learn=true
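#For instance, a later "label only" run that applies a previously learned model to new
#text might combine the flags above roughly like this (a sketch using only flags defined
#in this file; adjust paths and values for your setup):
#loadSavedPatternsWordsDir=true
#savePatternsWordsDir=false
#learn=false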


#posModelPath=<if you want to use a different POS tagger released by the Stanford NLP group, e.g. a caseless model>

######## creating patterns flags ##########
#***use context on the left
usePreviousContext=true

#***use context on the right
useNextContext = true

#***the context should be at least this long
minWindow4Pattern = 2

#***the context can be at most this long
maxWindow4Pattern = 4
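#As a rough illustration (not the exact internal representation): with the window sizes
#above, a surface pattern for NAME might use 2 to 4 context tokens on one side of the
#target, e.g. the left context "was succeeded by" followed by a target phrase restricted
#by POS/NER as configured by the flags below.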

#If the context consists of only stop words, add the pattern only if it has more than this many stop words
numMinStopWordsToAdd = 3

#***use POS tag restriction for the target phrase
usePOS4Pattern = true

#Ignore words {a, an, the} while matching the patterns to text (advisable true)
useFillerWordsInPat = false

#***Specific allowed tag initials for the target phrase for each label while creating the patterns (if not specified, every tag is acceptable to create a pattern). Tag initials can be written as N or NN or J or N,J etc. E.g.: NAME,N,J;PLACE,N.
targetAllowedTagsInitialsStr=NAME,N;OTHER,N

#You can save all possible patterns for all tokens in the allPatternsFile flag so you don't need to compute them every time.
computeAllPatterns = true

#Save to, or read from (if computeAllPatterns is false), this file
allPatternsFile=${DIR}/${identifier}_allpatterns.ser
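#A typical reuse of this cache (sketch): leave computeAllPatterns=true for the first run
#so the patterns are computed and written to allPatternsFile, then on later runs over the
#same data switch it off to read the cached patterns back:
#computeAllPatterns = false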

#***maximum number of words allowed in the target phrase
numWordsCompound = 3

#***consider patterns without the POS restriction on the target phrase
addPatWithoutPOS = true

#Ignore common stop words occurring just before the target phrase
useStopWordsBeforeTerm=false

#Use lemmas instead of words for the context tokens
useLemmaContextTokens=true

#Make context matching lowercase (advisable)
matchLowerCaseContext=true

#***use named entity tag (predicted using StanfordCoreNLP NER) restriction on the target phrase
useTargetNERRestriction=true

#***If useTargetNERRestriction is true, you can give the NER tags that the target phrase can take. Leave it unspecified if you don't want any specific restriction
targetAllowedNERs=NAME,PERSON;PLACE,LOCATION;OTHER,PERSON

#Use named entity tag restrictions for the context tokens
useContextNERRestriction=false

#***use the parse tag of the grandparent node as a restriction (note that the parent node is the POS tag)
useTargetParserParentRestriction=false

#Do not extract a phrase in which any word is labeled with another class (for example, you don't want to extract 'HIV patients' as a disease)
doNotExtractPhraseAnyWordLabeledOtherClass = true

#### matching patterns to text ######

#You can mostly ignore this flag and leave it as true. For those who care: for each token, we use the phrase that originally matched that token instead of the token's word
useMatchingPhrase=true

#Use only the tokens that get matched by a pattern (advisable as false)
restrictToMatched = false

#Label the learned words in the text (advisable as true)
usePatternResultAsLabel=true

#Remove common stop words from phrases to get clean phrases (for example, "disease" instead of "some disease")
removeStopWordsFromSelectedPhrases = true

#Do not learn phrases that have any stop word
removePhrasesWithStopWords = false


### evaluating candidate patterns

#***Minimum number of positive phrases a candidate pattern should extract
minPosPhraseSupportForPat = 1

##### thresholds for selecting patterns and words #####

#***threshold for learning a phrase
thresholdWordExtract=0.01

#***threshold for learning a pattern
thresholdSelectPattern = 0.01

#Keep lowering the threshold to 0.8*threshold whenever the system doesn't learn any new patterns or phrases
tuneThresholdKeepRunning=false

#***discard phrases that are not extracted by at least this many patterns
thresholdNumPatternsApplied = 1

#***max number of words to extract in each iteration
numWordsToAdd = 5

#***max number of patterns to extract in each iteration
numPatterns = 5

#***max number of iterations
numIterationsForPatterns = 8

#Consider words belonging to other labels as negative (advisable as true)
useOtherLabelsWordsasNegative=true

#***Pattern scoring measure. For more details, see the paper. The options are PhEvalInPatLogP, PhEvalInPat, PosNegUnlabOdds, RlogF, RlogFPosNeg, YanGarber02, PosNegOdds, LOGREG, LOGREGlogP, RatioAll, SqrtAllRatio
patternScoring=RatioAll

#Class to be used to score phrases. The valid options are edu.stanford.nlp.patterns.surface.ScorePhrasesAverageFeatures and edu.stanford.nlp.patterns.surface.ScorePhrasesLearnFeatWt
phraseScorerClass=edu.stanford.nlp.patterns.surface.ScorePhrasesAverageFeatures
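#To use the learned-feature-weight scorer listed above instead (which, as noted near the
#bottom of this file, also involves the perSelectRand/perSelectNeg flags), you would swap in:
#phraseScorerClass=edu.stanford.nlp.patterns.surface.ScorePhrasesLearnFeatWt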


#If you want to take the square root of the pattern score
sqrtPatScore = false

#Phrase scoring measure; ignore.
wordScoring=WEIGHTEDNORM

#For scoring phrases that are OOV, the score is the average of the scores of the individual words (instead of the min, which is the default)
useAvgInsteadofMinPhraseScoring=true

#***which features to use to evaluate phrases. See the paper for more details on each
#Only if wordClassClusterFile is provided
usePhraseEvalWordClass=false

#tf-idf scoring w.r.t. the domain
usePhraseEvalDomainNgram=false

#Use pattern weights when scoring the phrases they extract, if usePhraseEvalPatWtByFreq is true; otherwise it's just a tf-idf-like score
usePatternWeights=true

#Basically patwt/log(freq), where patwt = 1 if usePatternWeights is false
usePhraseEvalPatWtByFreq=true

#If using multiple label dictionaries etc., frequency of the phrase in the label dictionary vs. the other dictionaries
usePhraseEvalSemanticOdds=true

#Edit distance from the positive entities
usePhraseEvalEditDistSame=true

#Edit distance from the negative entities
usePhraseEvalEditDistOther=true

#If you have Google Ngrams, you can use Google Ngrams tf-idf scoring.
usePhraseEvalGoogleNgram=false

#% of positively labeled words with the same word class (see WordClassClassifier and chris2 for more details)
usePhraseEvalWordShape=true


#These flags are not valid if patternScoring is not PhEvalInPat*. They have similar meanings to the phrase ones above
usePatternEvalWordClass=false
usePatternEvalGoogleNgram=false
usePatternEvalSemanticOdds=true
usePatternEvalEditDistSame=true
usePatternEvalEditDistOther=true
usePatternEvalDomainNgram=false
usePatternEvalWordShape=true

#Options are LOG, NONE or SQRT
wordFreqNorm = NONE

###### For logging

#4 if you want to print out every single thing happening in the system, 3 if you want a fair amount of debug messages and justification, 2 means some debug msgs, 1 means only necessary msgs, and 0 means (almost) no msgs
debug = 3

#If you want output in which each labeled phrase has <label> </label> around it
#markedOutputTextFile=markedtext.txt
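#An illustrative line of such marked output for the presidents example (a sketch of the
#<label> </label> convention described above; the exact text depends on your input):
#  <NAME> George Washington </NAME> was born in <PLACE> Virginia </PLACE>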


#stop words file
stopWordsPatternFiles=${DIR}/stopwords.txt
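#Like the seed files, the stop-word and common-word lists below are expected to be plain
#text with one entry per line, e.g. (illustrative):
#  the
#  of
#  and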

englishWordsFiles=${stopWordsPatternFiles}
commonWordsPatternFiles=${stopWordsPatternFiles}
#You can give some common words like this
#commonWordsPatternFiles=${DIR}/lists/commonEngWords1k.txt

#If you are using the Google Ngrams TF-IDF feature
#googleNGramsFile=/u/nlp/scr/google-ngrams/1gms/vocab
#weightDomainFreq=10

#Below is optional: comma-separated files with lists of phrases that definitely do not belong to any of the labels
#otherSemanticClassesFiles=${DIR}/nondiseases.txt

#The flags below are used when either LOGREG is used for patternScoring or the ScorePhrasesLearnFeatWt class is used for phrase scoring
#% unlabeled tokens selected as negative
#perSelectRand=0.5
#% negative tokens selected as negative
#perSelectNeg=1


### Example for running the code on BioMed articles and the NCBI corpus (instead of the toy example above)

#fileFormat=text
#file=${DIR}/BioMedSample
#saveSentencesSerFile=${DIR}/biomed_sents.ser

#evalFileWithGoldLabels=${DIR}/NCBI_corpus_testing_processed.txt
#saveEvalSentencesSerFile=${DIR}/ncbi_corpus_testing_sents.ser
#addEvalSentsToTrain=true

#seedWordsFiles=disease,${DIR}/diseases.txt;nondisease,${DIR}/nondiseases.txt

#evaluate=true

#Default is true; set to false if you want scores per token
evalPerEntity=true

#wordClassClusterFile=${DIR}/ncbi_disease_brownclusters_200_min5.txt

#externalFeatureWeightsFile = ${DIR}/out/wordclass_weights

