In [1]:
    // Spark configuration and context classes provided by EclairJS.
    var SparkConf = require('eclairjs/SparkConf');
    var SparkContext = require('eclairjs/SparkContext');

    // Build the Spark context used by every later cell; the application is named "Example".
    var sparkConf = new SparkConf().setAppName("Example");
    var sc = new SparkContext(sparkConf);
    // SQL context class plus the ML feature, regression, evaluation and pipeline classes
    // that the cells below instantiate.
    var SQLContext = require('eclairjs/sql/SQLContext');
    var VectorIndexer = require("eclairjs/ml/feature/VectorIndexer");
   var RandomForestRegressor = require("eclairjs/ml/regression/RandomForestRegressor");
   var RegressionEvaluator = require("eclairjs/ml/evaluation/RegressionEvaluator");
    var Pipeline = require("eclairjs/ml/Pipeline");


In [2]:
    // SQL entry point built on the Spark context created above; used below to read the input data.
    var sqlContext = new SQLContext(sc);


 Load and parse the data file, converting it to a DataFrame.


In [3]:
    // Read the sample data set with the "libsvm" reader format.
    // NOTE(review): the path is relative, so it presumably resolves against the
    // working directory of the Spark process -- confirm for your deployment.
    var data =
        sqlContext.read().format("libsvm").load("examples/data/mllib/sample_libsvm_data.txt");


 Automatically identify categorical features, and index them.
 Set maxCategories so features with > 4 distinct values are treated as continuous.


In [4]:
    // Fit a VectorIndexer on the full data set: reads the "features" column and
    // writes "indexedFeatures", with the category limit set to 4.
    // featureIndexer holds the result of fit(), which is later used as the first pipeline stage.
    var featureIndexer = new VectorIndexer()
        .setInputCol("features")
        .setOutputCol("indexedFeatures")
        .setMaxCategories(4)
        .fit(data);


 Split the data into training and test sets (30% held out for testing).


In [5]:
    // Split with weights 0.7 / 0.3: the first part is used for training, the second for testing.
    // No seed is passed here, so the split presumably differs from run to run -- TODO confirm.
    var splits = data.randomSplit([0.7, 0.3]);
    var trainingData = splits[0];
    var testData = splits[1];


 Configure a RandomForest regressor. It is trained later, when the pipeline is fit.


In [6]:
    // Configure the regressor only; nothing is fitted in this cell.
    // It reads labels from "label" and features from "indexedFeatures",
    // the output column configured on the VectorIndexer.
    var rf = new RandomForestRegressor()
        .setLabelCol("label")
        .setFeaturesCol("indexedFeatures");


 Chain indexer and forest in a Pipeline.


In [7]:
    // Two-stage pipeline: the feature indexer first, then the random forest regressor.
    // The stage order matters later, when the forest stage is looked up by index.
    var pipeline = new Pipeline()
        .setStages([featureIndexer, rf]);


 Train model.  This also runs the indexer.


In [8]:
    // Fit the whole pipeline on the training split only; the test split is not used here.
    var model = pipeline.fit(trainingData);


 Make predictions.


In [9]:
    // Apply the fitted pipeline to the held-out test split.
    var predictions = model.transform(testData);

    // Accumulator object: later cells attach predictionsDF, rmse and rfModel to it.
    var ret = {};


 Select the columns to display for the example rows.


In [10]:
    // Keep only the "prediction", "label" and "features" columns for display in the final cell.
    ret.predictionsDF =  predictions.select("prediction", "label", "features");


 Compare each prediction with its true label and compute the test error (RMSE).


In [11]:
    // Evaluator configured with the "rmse" metric, comparing the "prediction"
    // column against the "label" column.
    var evaluator = new RegressionEvaluator()
        .setLabelCol("label")
        .setPredictionCol("prediction")
        .setMetricName("rmse");
    ret.rmse = evaluator.evaluate(predictions);

    // Index 1 is the second pipeline stage; the pipeline was built as
    // [featureIndexer, rf], so this is the stage corresponding to the forest.
    ret.rfModel = model.stages()[1];

// `result` is just another name for the same `ret` object (no copy is made).
var result = ret;
    // show(5) presumably prints the first 5 rows -- confirm against the DataFrame API.
    result.predictionsDF.show(5);
    print("Root Mean Squared Error (RMSE) on test data = " + result.rmse);
    print("Learned regression forest model:\n" + result.rfModel.toDebugString());

    // Stop the Spark context; earlier cells that use `sc` must be re-run after this.
    sc.stop();
