In [1]:
    // Spark entry-point classes.
    var SparkConf = require('eclairjs/SparkConf');
    var SparkContext = require('eclairjs/SparkContext');

    // Create the Spark context used by the rest of the notebook; the
    // application name is set to "Example".
    var sparkConf = new SparkConf().setAppName("Example");
    var sc = new SparkContext(sparkConf);
    // SQL classes used below to build rows, schemas and DataFrames.
    var SQLContext = require('eclairjs/sql/SQLContext');
    var RowFactory = require('eclairjs/sql/RowFactory');
    var StructType = require("eclairjs/sql/types/StructType");
    var StructField = require("eclairjs/sql/types/StructField");
    var DataTypes = require("eclairjs/sql/types/DataTypes");
    var Metadata = require("eclairjs/sql/types/Metadata");
    // ML pipeline stages and the Pipeline class that chains them.
    var Tokenizer = require("eclairjs/ml/feature/Tokenizer");
    var HashingTF = require("eclairjs/ml/feature/HashingTF");
    var LogisticRegression = require("eclairjs/ml/classification/LogisticRegression");
    var Pipeline = require("eclairjs/ml/Pipeline");


In [2]:
    // SQL entry point bound to the Spark context created above.
    var sqlContext = new SQLContext(sc);

    // Schema of the labeled training rows. Each [name, type] pair becomes a
    // StructField whose third argument is false and whose metadata is empty.
    var schema = new StructType([
        ["id", DataTypes.IntegerType],
        ["text", DataTypes.StringType],
        ["label", DataTypes.DoubleType]
    ].map(function (column) {
        return new StructField(column[0], column[1], false, Metadata.empty());
    }));


 Prepare training documents, which are labeled.


In [3]:
    // Labeled training set: one (id, text, label) row per document, turned
    // into a DataFrame using `schema`.
    var training = (function () {
        var labeledRows = [
            RowFactory.create(0, "a b c d e spark", 1.0),
            RowFactory.create(1, "b d", 0.0),
            RowFactory.create(2, "spark f g h", 1.0),
            RowFactory.create(3, "hadoop mapreduce", 0.0)
        ];
        return sqlContext.createDataFrame(labeledRows, schema);
    }());


 Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.


In [4]:
    // Stage 1: tokenizer, configured to read the "text" column and write "words".
    var tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words");

    // Stage 2: hashingTF, fed from the tokenizer's output column and writing
    // "features", with the number of features set to 1000.
    var hashingTF = new HashingTF().setNumFeatures(1000)
        .setInputCol(tokenizer.getOutputCol()).setOutputCol("features");

    // Stage 3: logistic regression with maxIter = 10 and regParam = 0.01.
    var lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01);

    // Chain the three stages in the order they are listed.
    var pipeline = new Pipeline().setStages([tokenizer, hashingTF, lr]);


 Fit the pipeline to training documents.


In [5]:
    // Fit the configured pipeline on the labeled training DataFrame.
    var model = pipeline.fit(training);

    // Schema for the unlabeled test rows: the same "id" and "text" columns as
    // the training schema, without "label".
    var schema2 = new StructType([
        ["id", DataTypes.IntegerType],
        ["text", DataTypes.StringType]
    ].map(function (column) {
        return new StructField(column[0], column[1], false, Metadata.empty());
    }));


 Prepare test documents, which are unlabeled.


In [6]:
    // Unlabeled test set: one (id, text) row per document, turned into a
    // DataFrame using `schema2`.
    var test = (function () {
        var unlabeledRows = [
            RowFactory.create(4, "spark i j k"),
            RowFactory.create(5, "l m n"),
            RowFactory.create(6, "mapreduce spark"),
            RowFactory.create(7, "apache hadoop")
        ];
        return sqlContext.createDataFrame(unlabeledRows, schema2);
    }());


 Make predictions on test documents.


In [7]:
    // Run the fitted pipeline model over the test DataFrame.
    var predictions = model.transform(test);

    // Keep only the four columns that are printed later and collect them.
    var rows = predictions
        .select("id", "text", "probability", "prediction")
        .collect();


In [8]:
var result = rows;
    // Print one line per collected prediction row in the form
    // "(id, text) --> prob=<probability>, prediction=<prediction>".
    // The try/finally guarantees the Spark context is stopped even when
    // printing a row throws, so a failure here does not leave it running.
    try {
        result.forEach(function (r) {
            print("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
                + ", prediction=" + r.get(3));
        });
    } finally {
        sc.stop();
    }
