In [1]:

    var SparkConf = require('eclairjs/SparkConf');
    var SparkContext = require('eclairjs/SparkContext');
    // Name the application in the Spark configuration, then create the Spark
    // context (`sc`) that the later cells use.
    var sparkConf = new SparkConf().setAppName("JavaScript SimpleTextClassificationPipeline");
    var sc = new SparkContext(sparkConf);


In [2]:
    var SQLContext = require('eclairjs/sql/SQLContext');


In [3]:
    var Tokenizer = require('eclairjs/ml/feature/Tokenizer');
    var HashingTF = require('eclairjs/ml/feature/HashingTF');
    var LogisticRegression = require('eclairjs/ml/classification/LogisticRegression');
    var Pipeline = require('eclairjs/ml/Pipeline');
    var PipelineModel = require('eclairjs/ml/PipelineModel');
    var PipelineStage = require('eclairjs/ml/PipelineStage');
    var StructType = require('eclairjs/sql/types/StructType');
    var StructField = require('eclairjs/sql/types/StructField');
    var DataTypes = require('eclairjs/sql/types').DataTypes;
    var Metadata = require('eclairjs/sql/types/Metadata');
    var RowFactory = require('eclairjs/sql/RowFactory');


In [4]:
    // SQL context wrapping the Spark context; used below to build the
    // training and test DataFrames.
    var sqlContext = new SQLContext(sc);


In [5]:
/**
 * Training record: a piece of text together with its known class label.
 * Must be invoked with `new`.
 *
 * @param {number} id - Identifier of the document.
 * @param {string} text - Raw text of the document.
 * @param {number} label - Class label attached to the document.
 * @constructor
 */
function LabeledDocument(id, text, label) {
    // Assignment order fixes the property enumeration order: id, text, label.
    this.id = id;
    this.text = text;
    this.label = label;
}


In [6]:
/**
 * Unlabeled record: a piece of text with an identifier and no class label.
 * Must be invoked with `new`.
 *
 * @param {number} id - Identifier of the document.
 * @param {string} text - Raw text of the document.
 * @constructor
 */
function Document(id, text) {
    // Assignment order fixes the property enumeration order: id, text.
    this.id = id;
    this.text = text;
}


 Prepare training documents, which are labeled.


In [7]:
    // Four labeled in-memory documents (label 1.0 or 0.0) used to fit the pipeline.
    var localTraining = [
        new LabeledDocument(0, "a b c d e spark", 1.0),
        new LabeledDocument(1, "b d", 0.0),
        new LabeledDocument(2, "spark f g h", 1.0),
        new LabeledDocument(3, "hadoop mapreduce", 0.0)
    ];

    // Distribute the local array, then describe each field's type so the
    // records can be turned into a DataFrame.
    var trainingRDD = sc.parallelize(localTraining);
    var trainingSchema = { id: "Integer", text: "String", label: "Double" };
    var training = sqlContext.createDataFrameFromJson(trainingRDD, trainingSchema);


 Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.


In [8]:
    // Stage 1: tokenizer that reads the "text" column and writes its output
    // to the "words" column.
    var tokenizer = new Tokenizer()
      .setInputCol("text")
     .setOutputCol("words");


In [9]:
    // Stage 2: HashingTF configured with 1000 features; it consumes whatever
    // column the tokenizer writes to and emits the "features" column.
    var hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol())
      .setOutputCol("features");
    // Stage 3: logistic regression with max iterations set to 10 and the
    // regularization parameter set to 0.001.
    var lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.001);
    // Chain the three stages in order: tokenizer -> hashingTF -> lr.
    var pipeline = new Pipeline()
      .setStages([ tokenizer, hashingTF, lr]);


 Fit the pipeline to training documents.


In [10]:
    // Fit the pipeline on the training DataFrame; the result is used below
    // to transform the test DataFrame.
    var model = pipeline.fit(training);


 Prepare test documents, which are unlabeled.


In [11]:
    // Four unlabeled in-memory documents to score with the fitted model.
    // Declared with `var` (like `localTraining`) so the assignment does not
    // create an implicit global, which would throw under strict mode.
    var localTest = [
      new Document(4, "spark i j k"),
      new Document(5, "l m n"),
      new Document(6, "spark hadoop spark"),
      new Document(7, "apache hadoop")];
    // Distribute the local array and build a DataFrame from it, giving the
    // type of each field.
    var test = sqlContext.createDataFrameFromJson(sc.parallelize(localTest), {
        id:"Integer",
        text:"String"
    });


 Make predictions on test documents.


In [12]:
    // Apply the fitted pipeline to the test DataFrame and collect the four
    // columns that are reported below.
    var predictions = model.transform(test);
    var rows = predictions
        .select("id", "text", "probability", "prediction")
        .collect();
    var result = rows;

    // Print one line per document:
    //   (<id>, <text>) --> prob=<probability>, prediction=<prediction>
    for (var idx = 0; idx < result.length; idx += 1) {
        var row = result[idx];
        var docPart = "(" + row.get(0) + ", " + row.get(1) + ")";
        var scorePart = " --> prob=" + row.get(2) + ", prediction=" + row.get(3);
        print(docPart + scorePart);
    }


 Stop the Spark context now that the example is finished.


In [13]:
    // Stop the Spark context created in the first cell.
    sc.stop();
