In [1]:

    // Spark entry points: build the application configuration and start the
    // Spark context that the remaining cells use.
    var SparkConf = require('eclairjs/SparkConf');
    var SparkContext = require('eclairjs/SparkContext');
    var sparkConf = new SparkConf().setAppName("JavaScript Decision Tree Classification Example");
    var sc = new SparkContext(sparkConf);

    // SQL context class plus the ML feature transformers, classifier,
    // evaluator and pipeline classes referenced by the later cells.
    var SQLContext = require('eclairjs/sql/SQLContext');
    var StringIndexer = require('eclairjs/ml/feature/StringIndexer');
    var IndexToString = require('eclairjs/ml/feature/IndexToString');
    var VectorIndexer = require('eclairjs/ml/feature/VectorIndexer');
    var DecisionTreeClassifier = require('eclairjs/ml/classification/DecisionTreeClassifier');
    var MulticlassClassificationEvaluator = require('eclairjs/ml/evaluation/MulticlassClassificationEvaluator');
    var Pipeline = require('eclairjs/ml/Pipeline');

    // SQL context built on top of the Spark context; it is used to read the
    // input data set.
    var sqlContext = new SQLContext(sc);


 Load the data stored in LIBSVM format as a DataFrame.


In [2]:
    // Configure a reader for the "libsvm" format, then load the sample file.
    var libsvmReader = sqlContext.read().format("libsvm");
    var data = libsvmReader.load("examples/data/mllib/sample_libsvm_data.txt");


 Index labels, adding metadata to the label column.
 Fit on whole dataset to include all labels in index.


In [3]:
    // Describe the indexing step first ("label" -> "indexedLabel") ...
    var labelIndexerStage = new StringIndexer()
        .setInputCol("label")
        .setOutputCol("indexedLabel");
    // ... then fit it on the full data set, not just the training split.
    var labelIndexer = labelIndexerStage.fit(data);


 Automatically identify categorical features, and index them.


In [4]:
    // Describe the indexing step ("features" -> "indexedFeatures").
    // Features with > 4 distinct values are treated as continuous.
    var featureIndexerStage = new VectorIndexer()
        .setInputCol("features")
        .setOutputCol("indexedFeatures")
        .setMaxCategories(4);
    // Fit the indexer on the full data set.
    var featureIndexer = featureIndexerStage.fit(data);


 Split the data into training and test sets (30% held out for testing).


In [5]:
    // Relative weights of the two splits: first training, then test.
    // No seed is passed to randomSplit here.
    var splitWeights = [0.7, 0.3];
    var splits = data.randomSplit(splitWeights);
    var trainingData = splits[0];
    var testData = splits[1];


 Train a DecisionTree model.


In [6]:
    // Configure the classifier one setting at a time; each setter's return
    // value is carried forward, exactly as in a chained call.
    var dt = new DecisionTreeClassifier();
    dt = dt.setLabelCol("indexedLabel");      // column produced by the label indexer
    dt = dt.setFeaturesCol("indexedFeatures"); // column produced by the feature indexer


 Convert indexed labels back to original labels.


In [7]:
    // Labels taken from the fitted label indexer; they are handed to the
    // converter that maps "prediction" to "predictedLabel".
    var originalLabels = labelIndexer.labels();
    var labelConverter = new IndexToString()
        .setInputCol("prediction")
        .setOutputCol("predictedLabel")
        .setLabels(originalLabels);


 Chain the indexers, the decision tree, and the label converter in a Pipeline.


In [8]:
    // Stage order matters: the decision tree sits at index 2, which a later
    // cell relies on when it reads model.stages()[2].
    var pipelineStages = [labelIndexer, featureIndexer, dt, labelConverter];
    var pipeline = new Pipeline().setStages(pipelineStages);


 Train the model. This also runs the indexers.


In [9]:
    // Fit the whole pipeline on the training split only; the resulting model
    // is applied to the held-out test split in the next cell.
    var model = pipeline.fit(trainingData);


 Make predictions.


In [10]:
    // Apply the fitted pipeline to the held-out test split.
    var predictions = model.transform(testData);

    // Container for the outputs that the following cells fill in and read.
    var ret = { predictions: predictions };


 Compare (prediction, true label) pairs to compute the accuracy (the test error is reported below as 1 - accuracy), and keep the trained tree model.


In [11]:
    // Score the predictions by comparing the "prediction" column against
    // "indexedLabel" with the evaluator's "precision" metric.
    // NOTE(review): the score is stored as `accuracy`; newer Spark releases
    // expose this metric under the name "accuracy" - confirm against the
    // Spark version this notebook targets before changing it.
    var evaluator = new MulticlassClassificationEvaluator()
        .setLabelCol("indexedLabel")
        .setPredictionCol("prediction")
        .setMetricName("precision");
    ret.accuracy = evaluator.evaluate(predictions);
    // Index 2 is the position of the decision tree in the pipeline's stage list.
    ret.treeModel = model.stages()[2];
    // Expose the collected outputs under the name the final cell reads.
    var result = ret;


 Select example rows to display, then print the test error and the learned tree model before stopping the Spark context.


In [12]:
    // Display 5 rows of the predicted label next to the true label and features.
    result.predictions.select("predictedLabel", "label", "features").show(5);
    // The test error is reported as the complement of the evaluator score.
    print("Test Error = " + (1.0 - result.accuracy));
    print("Learned classification tree model:\n" + result.treeModel.toDebugString());
    // Stop the Spark context now that all results have been printed.
    sc.stop();
