diff --git a/README b/README
index 5fd27cb..fc5abfb 100644
--- a/README
+++ b/README
@@ -8,9 +8,7 @@ Unzip the distribution archive into GATE/plugins or to the directory of your cho
Usage
The plugins contains 3 Processing resources :
-- TrainingCorpusCreator : generates a lexicon + raw file in the specified directory. See https://code.google.com/p/textclassification/ for instructions on
-how to generate a vector file and model from a raw file.
+- TrainingCorpusCreator : generates a lexicon + raw file + training vector in the specified directory. See https://code.google.com/p/textclassification/ for instructions on
+how to generate a model file using the base text classification API, or http://www.csie.ntu.edu.tw/~cjlin/libsvm/ for how to run the libsvm tools directly on the vector file.
- ClassifierPR : takes a model and lexicon to classify the annotations specified in textAnnotationType
- NGram maker : generates ngrams that can be used as input for the corpus generation or classification
-
-
diff --git a/build.properties b/build.properties
index b5e2871..061c32c 100644
--- a/build.properties
+++ b/build.properties
@@ -1,4 +1,4 @@
-version=1.1-dev
+version=1.2-dev
organisation=com.digitalpebble
plugin.name=TextClassificationPlugin
dist.dir=distrib
diff --git a/creole.xml b/creole.xml
index 99cdc66..0c6e12b 100644
--- a/creole.xml
+++ b/creole.xml
@@ -5,8 +5,8 @@
TrainingCorpusCreator
com.digitalpebble.gate.textclassification.TrainingCorpusCreatorPR
- TextClassificationPlugin-1.1-dev.jar
- lib/textclassification-1.4.1-SNAPSHOT.jar
+ TextClassificationPlugin-1.2-dev.jar
+ lib/textclassification-1.6.1-SNAPSHOT.jar
lib/liblinear-1.51-with-deps.jar
gate.Document
java.lang.String
@@ -17,6 +17,10 @@
java.lang.String
java.net.URL
java.lang.Boolean
+ java.lang.Integer
+ java.lang.Integer
+ java.lang.Integer
+ java.lang.Boolean
/dipe.png
@@ -25,8 +29,8 @@
Classifier
com.digitalpebble.gate.textclassification.ClassifierPR
- TextClassificationPlugin-1.1-dev.jar
- lib/textclassification-1.4.1-SNAPSHOT.jar
+ TextClassificationPlugin-1.2-dev.jar
+ lib/textclassification-1.6.1-SNAPSHOT.jar
lib/liblinear-1.51-with-deps.jar
gate.Document
java.lang.String
@@ -43,8 +47,8 @@
NGram maker
com.digitalpebble.gate.textclassification.NGram
- TextClassificationPlugin-1.1-dev.jar
- lib/textclassification-1.4.1-SNAPSHOT.jar
+ TextClassificationPlugin-1.2-dev.jar
+ lib/textclassification-1.6.1-SNAPSHOT.jar
lib/liblinear-1.51-with-deps.jar
gate.Document
java.lang.String
diff --git a/lib/libsvm-3.0.jar b/lib/libsvm-3.0.jar
new file mode 100644
index 0000000..6ed1a25
Binary files /dev/null and b/lib/libsvm-3.0.jar differ
diff --git a/lib/textclassification-1.4.1-SNAPSHOT.jar b/lib/textclassification-1.4.1-SNAPSHOT.jar
deleted file mode 100644
index 3337b99..0000000
Binary files a/lib/textclassification-1.4.1-SNAPSHOT.jar and /dev/null differ
diff --git a/lib/textclassification-1.6.1-SNAPSHOT.jar b/lib/textclassification-1.6.1-SNAPSHOT.jar
new file mode 100644
index 0000000..620e18a
Binary files /dev/null and b/lib/textclassification-1.6.1-SNAPSHOT.jar differ
diff --git a/src/com/digitalpebble/gate/textclassification/TrainingCorpusCreatorPR.java b/src/com/digitalpebble/gate/textclassification/TrainingCorpusCreatorPR.java
index 4b9350f..d735e5e 100644
--- a/src/com/digitalpebble/gate/textclassification/TrainingCorpusCreatorPR.java
+++ b/src/com/digitalpebble/gate/textclassification/TrainingCorpusCreatorPR.java
@@ -32,13 +32,20 @@
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
+import java.util.Map;
import com.digitalpebble.classification.Document;
import com.digitalpebble.classification.FileTrainingCorpus;
import com.digitalpebble.classification.Learner;
+import com.digitalpebble.classification.Lexicon;
import com.digitalpebble.classification.Parameters;
+import com.digitalpebble.classification.TrainingCorpus;
import com.digitalpebble.classification.Parameters.WeightingMethod;
import com.digitalpebble.classification.RAMTrainingCorpus;
+import com.digitalpebble.classification.util.CorpusUtils;
+import com.digitalpebble.classification.util.scorers.AttributeScorer;
+import com.digitalpebble.classification.util.scorers.logLikelihoodAttributeScorer;
+import com.digitalpebble.classification.libsvm.Utils;
public class TrainingCorpusCreatorPR extends AbstractLanguageAnalyser
implements ProcessingResource {
@@ -66,11 +73,32 @@ public class TrainingCorpusCreatorPR extends AbstractLanguageAnalyser
* ComponentAnnotationValue (e.g. form) The feature value used for the ML attributes.
*/
private String attributeAnnotationValue;
+ /**
+ * Directory where lexicon, vector and raw model will be saved
+ */
private URL directory;
- private FileTrainingCorpus trainingcorpus;
+ /**
+ * Feature weighting scheme
+ */
private String weightingScheme;
+
+ private Integer minFreq=1;
+ private Integer maxFreq=Integer.MAX_VALUE;
+ /**
+ * Number of best-scoring attributes to keep; applied after pruning by min and max frequency, and only when greater than 0
+ */
+ private Integer keepNBestAttributes =0;
+ /**
+ * Compact the lexicon after pruning (renumbers attribute indices to remove gaps)
+ */
+ Boolean compactLexicon =true;
+
private Boolean reinitCorpus = true;
+
+ private FileTrainingCorpus trainingcorpus;
private String implementation = Learner.LibSVMModelCreator;
+ String pathDirectory;
+ private String libsvmVectorPath;
/*
* this method gets called whenever an object of this class is created
@@ -92,8 +120,12 @@ public Resource init() throws ResourceInstantiationException {
}
// initializes the modelCreator
- String pathDirectory = new File(URI.create(directory.toExternalForm()))
+ pathDirectory = new File(URI.create(directory.toExternalForm()))
.getAbsolutePath();
+ if(libsvmVectorPath == null || libsvmVectorPath.isEmpty()){
+ libsvmVectorPath = pathDirectory+File.separator+"vector";
+ }
+
try {
this.creator = Learner.getLearner(pathDirectory, implementation,
@@ -199,7 +231,28 @@ public void execute() throws ExecutionException {
.methodFromString(getWeightingScheme());
this.creator.setMethod(method);
trainingcorpus.close();
+ Lexicon lexicon = creator.getLexicon();
creator.saveLexicon();
+ //prune by frequency
+ lexicon.pruneTermsDocFreq(minFreq, maxFreq);
+ //further keep only the N best attributes
+ if (keepNBestAttributes >0) {
+ AttributeScorer scorer = logLikelihoodAttributeScorer.getScorer(
+ trainingcorpus, lexicon);
+ lexicon.setAttributeScorer(scorer);
+ lexicon.applyAttributeFilter(scorer, keepNBestAttributes);
+ }
+ // change the indices of the attributes to remove
+ // gaps between them
+ Map equiv = null;
+ if (compactLexicon){
+ // create a new Lexicon object
+ equiv = lexicon.compact();
+ }
+ // save the modified lexicon inside the output directory; pathDirectory comes from File.getAbsolutePath() and has no trailing separator
+ lexicon.saveToFile(this.pathDirectory+File.separator+"lexicon.compact");
+ Utils.writeExamples(trainingcorpus,lexicon,
+ this.libsvmVectorPath, equiv);
} catch (Exception e) {
throw new ExecutionException(e);
} finally {
@@ -304,4 +357,44 @@ public void setImplementation(String implementation) {
this.implementation = implementation;
}
+ public Learner getCreator() {
+ return creator;
+ }
+
+ public void setCreator(Learner creator) {
+ this.creator = creator;
+ }
+
+ public Integer getMinFreq() {
+ return minFreq;
+ }
+
+ public void setMinFreq(Integer minFreq) {
+ this.minFreq = minFreq;
+ }
+
+ public Integer getMaxFreq() {
+ return maxFreq;
+ }
+
+ public void setMaxFreq(Integer maxFreq) {
+ this.maxFreq = maxFreq;
+ }
+
+ public Integer getKeepNBestAttributes() {
+ return keepNBestAttributes;
+ }
+
+ public void setKeepNBestAttributes(Integer keepNBestAttributes) {
+ this.keepNBestAttributes = keepNBestAttributes;
+ }
+
+ public Boolean getCompactLexicon() {
+ return compactLexicon;
+ }
+
+ public void setCompactLexicon(Boolean compactLexicon) {
+ this.compactLexicon = compactLexicon;
+ }
+
}
\ No newline at end of file