diff --git a/README b/README index 5fd27cb..fc5abfb 100644 --- a/README +++ b/README @@ -8,9 +8,7 @@ Unzip the distribution archive into GATE/plugins or to the directory of your cho Usage The plugins contains 3 Processing resources : -- TrainingCorpusCreator : generates a lexicon + raw file in the specified directory. See https://code.google.com/p/textclassification/ for instructions on -how to generate a vector file and model from a raw file. +- TrainingCorpusCreator : generates a lexicon + raw file + training vector in the specified directory. See https://code.google.com/p/textclassification/ for instructions on +how to generate a model file using the base text classification API, or http://www.csie.ntu.edu.tw/~cjlin/libsvm/ for how to use the libsvm tools directly on the vector file. - ClassifierPR : takes a model and lexicon to classify the annotations specified in textAnnotationType - NGram maker : generates ngrams that can be used as input for the corpus generation or classification - - diff --git a/build.properties b/build.properties index b5e2871..061c32c 100644 --- a/build.properties +++ b/build.properties @@ -1,4 +1,4 @@ -version=1.1-dev +version=1.2-dev organisation=com.digitalpebble plugin.name=TextClassificationPlugin dist.dir=distrib diff --git a/creole.xml b/creole.xml index 99cdc66..0c6e12b 100644 --- a/creole.xml +++ b/creole.xml @@ -5,8 +5,8 @@ TrainingCorpusCreator com.digitalpebble.gate.textclassification.TrainingCorpusCreatorPR - TextClassificationPlugin-1.1-dev.jar - lib/textclassification-1.4.1-SNAPSHOT.jar + TextClassificationPlugin-1.2-dev.jar + lib/textclassification-1.6.1-SNAPSHOT.jar lib/liblinear-1.51-with-deps.jar gate.Document java.lang.String @@ -17,6 +17,10 @@ java.lang.String java.net.URL java.lang.Boolean + java.lang.Integer + java.lang.Integer + java.lang.Integer + java.lang.Boolean /dipe.png @@ -25,8 +29,8 @@ Classifier com.digitalpebble.gate.textclassification.ClassifierPR - TextClassificationPlugin-1.1-dev.jar - 
lib/textclassification-1.4.1-SNAPSHOT.jar + TextClassificationPlugin-1.2-dev.jar + lib/textclassification-1.6.1-SNAPSHOT.jar lib/liblinear-1.51-with-deps.jar gate.Document java.lang.String @@ -43,8 +47,8 @@ NGram maker com.digitalpebble.gate.textclassification.NGram - TextClassificationPlugin-1.1-dev.jar - lib/textclassification-1.4.1-SNAPSHOT.jar + TextClassificationPlugin-1.2-dev.jar + lib/textclassification-1.6.1-SNAPSHOT.jar lib/liblinear-1.51-with-deps.jar gate.Document java.lang.String diff --git a/lib/libsvm-3.0.jar b/lib/libsvm-3.0.jar new file mode 100644 index 0000000..6ed1a25 Binary files /dev/null and b/lib/libsvm-3.0.jar differ diff --git a/lib/textclassification-1.4.1-SNAPSHOT.jar b/lib/textclassification-1.4.1-SNAPSHOT.jar deleted file mode 100644 index 3337b99..0000000 Binary files a/lib/textclassification-1.4.1-SNAPSHOT.jar and /dev/null differ diff --git a/lib/textclassification-1.6.1-SNAPSHOT.jar b/lib/textclassification-1.6.1-SNAPSHOT.jar new file mode 100644 index 0000000..620e18a Binary files /dev/null and b/lib/textclassification-1.6.1-SNAPSHOT.jar differ diff --git a/src/com/digitalpebble/gate/textclassification/TrainingCorpusCreatorPR.java b/src/com/digitalpebble/gate/textclassification/TrainingCorpusCreatorPR.java index 4b9350f..d735e5e 100644 --- a/src/com/digitalpebble/gate/textclassification/TrainingCorpusCreatorPR.java +++ b/src/com/digitalpebble/gate/textclassification/TrainingCorpusCreatorPR.java @@ -32,13 +32,20 @@ import java.util.Collections; import java.util.Iterator; import java.util.List; +import java.util.Map; import com.digitalpebble.classification.Document; import com.digitalpebble.classification.FileTrainingCorpus; import com.digitalpebble.classification.Learner; +import com.digitalpebble.classification.Lexicon; import com.digitalpebble.classification.Parameters; +import com.digitalpebble.classification.TrainingCorpus; import com.digitalpebble.classification.Parameters.WeightingMethod; import 
com.digitalpebble.classification.RAMTrainingCorpus; +import com.digitalpebble.classification.util.CorpusUtils; +import com.digitalpebble.classification.util.scorers.AttributeScorer; +import com.digitalpebble.classification.util.scorers.logLikelihoodAttributeScorer; +import com.digitalpebble.classification.libsvm.Utils; public class TrainingCorpusCreatorPR extends AbstractLanguageAnalyser implements ProcessingResource { @@ -66,11 +73,32 @@ public class TrainingCorpusCreatorPR extends AbstractLanguageAnalyser * ComponentAnnotationValue (e.g. form) The feature value used for the ML attributes. */ private String attributeAnnotationValue; + /** + * Directory where lexicon, vector and raw model will be saved + */ private URL directory; - private FileTrainingCorpus trainingcorpus; + /** + * Feature weighting scheme + */ private String weightingScheme; + + private Integer minFreq=1; + private Integer maxFreq=Integer.MAX_VALUE; + /** + * Number of best attributes to keep, applied after pruning according to min and max freq (only when greater than 0) + */ + private Integer keepNBestAttributes =0; + /** + * Compact the lexicon after pruning + */ + Boolean compactLexicon =true; + private Boolean reinitCorpus = true; + + private FileTrainingCorpus trainingcorpus; private String implementation = Learner.LibSVMModelCreator; + String pathDirectory; + private String libsvmVectorPath; /* * this method gets called whenever an object of this class is created @@ -92,8 +120,12 @@ public Resource init() throws ResourceInstantiationException { } // initializes the modelCreator - String pathDirectory = new File(URI.create(directory.toExternalForm())) + pathDirectory = new File(URI.create(directory.toExternalForm())) .getAbsolutePath(); + if(libsvmVectorPath == null || libsvmVectorPath.isEmpty()){ + libsvmVectorPath = pathDirectory+File.separator+"vector"; + } + try { this.creator = Learner.getLearner(pathDirectory, implementation, @@ -199,7 +231,28 @@ public void execute() throws ExecutionException { .methodFromString(getWeightingScheme()); 
this.creator.setMethod(method); trainingcorpus.close(); + Lexicon lexicon = creator.getLexicon(); creator.saveLexicon(); + //prune by frequency + lexicon.pruneTermsDocFreq(minFreq, maxFreq); + //further keep only the N best attributes + if (keepNBestAttributes >0) { + AttributeScorer scorer = logLikelihoodAttributeScorer.getScorer( + trainingcorpus, lexicon); + lexicon.setAttributeScorer(scorer); + lexicon.applyAttributeFilter(scorer, keepNBestAttributes); + } + // change the indices of the attributes to remove + // gaps between them + Map equiv = null; + if (compactLexicon){ + // create a new Lexicon object + equiv = lexicon.compact(); + } + // save the modified lexicon file inside the output directory (pathDirectory has no trailing separator) + lexicon.saveToFile(this.pathDirectory+File.separator+"lexicon.compact"); + Utils.writeExamples(trainingcorpus,lexicon, + this.libsvmVectorPath, equiv); } catch (Exception e) { throw new ExecutionException(e); } finally { @@ -304,4 +357,44 @@ public void setImplementation(String implementation) { this.implementation = implementation; } + public Learner getCreator() { + return creator; + } + + public void setCreator(Learner creator) { + this.creator = creator; + } + + public Integer getMinFreq() { + return minFreq; + } + + public void setMinFreq(Integer minFreq) { + this.minFreq = minFreq; + } + + public Integer getMaxFreq() { + return maxFreq; + } + + public void setMaxFreq(Integer maxFreq) { + this.maxFreq = maxFreq; + } + + public Integer getKeepNBestAttributes() { + return keepNBestAttributes; + } + + public void setKeepNBestAttributes(Integer keepNBestAttributes) { + this.keepNBestAttributes = keepNBestAttributes; + } + + public Boolean getCompactLexicon() { + return compactLexicon; + } + + public void setCompactLexicon(Boolean compactLexicon) { + this.compactLexicon = compactLexicon; + } + } \ No newline at end of file