In [1]:
    // Spark bootstrap: load the configuration and context constructors.
    var SparkConf = require('eclairjs/SparkConf');
    var SparkContext = require('eclairjs/SparkContext');

    // Create the Spark context used by the later cells, with the application name "Example".
    var sparkConf = new SparkConf().setAppName("Example");
    var sc = new SparkContext(sparkConf);
    // SQL entry point, followed by the ML feature transformers, classifier,
    // evaluator and Pipeline constructors referenced in the cells below.
    var SQLContext = require('eclairjs/sql/SQLContext');
    var StringIndexer = require("eclairjs/ml/feature/StringIndexer");
    var VectorIndexer = require("eclairjs/ml/feature/VectorIndexer");
    var IndexToString = require("eclairjs/ml/feature/IndexToString");
    var RandomForestClassifier = require("eclairjs/ml/classification/RandomForestClassifier");
    var MulticlassClassificationEvaluator = require("eclairjs/ml/evaluation/MulticlassClassificationEvaluator");
    var Pipeline = require("eclairjs/ml/Pipeline");


In [2]:
    // Build the SQL context on top of the existing Spark context.
    var sqlContext = new SQLContext(sc);

    // Read the sample data set, which is stored in "libsvm" format.
    var data = sqlContext
        .read()
        .format("libsvm")
        .load("examples/data/mllib/sample_libsvm_data.txt");


 Index labels, adding metadata to the label column.
 Fit on whole dataset to include all labels in index.


In [3]:
    // Configure the indexer ("label" -> "indexedLabel") first ...
    var labelIndexerEstimator = new StringIndexer()
        .setInputCol("label")
        .setOutputCol("indexedLabel");
    // ... then fit it on the complete data set.
    var labelIndexer = labelIndexerEstimator.fit(data);


 Automatically identify categorical features, and index them.
 Set maxCategories so features with > 4 distinct values are treated as continuous.


In [4]:
    // Configure the vector indexer ("features" -> "indexedFeatures") with a
    // category limit of 4 ...
    var featureIndexerEstimator = new VectorIndexer()
        .setInputCol("features")
        .setOutputCol("indexedFeatures")
        .setMaxCategories(4);
    // ... then fit it on the complete data set.
    var featureIndexer = featureIndexerEstimator.fit(data);


 Split the data into training and test sets (30% held out for testing).


In [5]:
    // Weights for the two partitions: first entry is training, second is test.
    var splitWeights = [0.7, 0.3];
    var splits = data.randomSplit(splitWeights);

    var trainingData = splits[0];
    var testData = splits[1];


 Train a RandomForest model.


In [6]:
    // Build the classifier step by step; each setter's return value is kept,
    // exactly as a fluent chain would do.
    var rf = new RandomForestClassifier();
    rf = rf.setLabelCol("indexedLabel");
    rf = rf.setFeaturesCol("indexedFeatures");


 Convert indexed labels back to original labels.


In [7]:
    // Map the "prediction" column to "predictedLabel" using the labels
    // recorded by the fitted label indexer.
    var labelConverter = new IndexToString();
    labelConverter = labelConverter.setInputCol("prediction");
    labelConverter = labelConverter.setOutputCol("predictedLabel");
    labelConverter = labelConverter.setLabels(labelIndexer.labels());


 Chain indexers and forest in a Pipeline.


In [8]:
    // Stage order matters: label indexer, feature indexer, classifier, then
    // the converter back to the original labels.
    var pipeline = new Pipeline();
    pipeline = pipeline.setStages([
        labelIndexer,
        featureIndexer,
        rf,
        labelConverter
    ]);


 Train model. This also runs the indexers.


In [9]:
    // Fit the pipeline on the training split only; the resulting `model` is
    // what the following cells call transform() and stages() on.
    var model = pipeline.fit(trainingData);


 Make predictions.


In [10]:
    // Apply the fitted pipeline to the held-out test split.
    var predictions = model.transform(testData);

    // Container for the results collected by the following cells
    // (predictionsDF, accuracy, model).
    var ret = {};


 Select example rows to display.


In [11]:
    // Keep only the predicted label, the "label" column and the "features"
    // column for display; rows are printed later via show(5).
    ret.predictionsDF = predictions.select("predictedLabel", "label", "features");


 Select (prediction, true label) and compute the test error.


In [12]:
    // Evaluate the predictions, report the results, and always shut the Spark
    // context down afterwards. Everything that can throw sits inside the try
    // block so that sc.stop() in the finally clause runs on failure as well.
    try {
        // Compare the "prediction" column against "indexedLabel"; the value
        // returned for the "precision" metric is used below as the accuracy.
        var evaluator = new MulticlassClassificationEvaluator()
            .setLabelCol("indexedLabel")
            .setPredictionCol("prediction")
            .setMetricName("precision");
        var accuracy = evaluator.evaluate(predictions);

        // NOTE: despite its name, ret.accuracy stores the test error
        // (1 - accuracy). The key is kept unchanged so that anything reading
        // it continues to work; it is printed below as "Test Error".
        ret.accuracy = 1.0 - accuracy;
        // Index 2 matches the position of `rf` in the stage list given to the
        // Pipeline: [labelIndexer, featureIndexer, rf, labelConverter].
        ret.model = model.stages()[2];

        var result = ret;
        result.predictionsDF.show(5);
        print("Test Error = " + result.accuracy);
        print("Learned classification forest model:\n" + result.model.toDebugString());
    } finally {
        // Release the Spark context even when evaluation or printing throws.
        sc.stop();
    }
