In [1]:
# Spark entry points shared by the rest of the notebook.
import pyspark
from pyspark.sql import SQLContext

# getOrCreate() reuses the context that is already running, so re-executing
# this cell does not fail with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
sql = SQLContext(sc)

In [2]:
# Random forest regression

from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Fixed seed so the train/test split and the forest are repeatable across runs.
SEED = 42

# Load and parse the data file, converting it to a DataFrame.
data = sql.read.format("libsvm").load("sample_libsvm_data.txt")

# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(
    inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# Split the data into training and test sets (30% held out for testing).
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=SEED)

# Train a RandomForest model.
rf = RandomForestRegressor(featuresCol="indexedFeatures", seed=SEED)

# Chain indexer and forest in a Pipeline.
pipeline = Pipeline(stages=[featureIndexer, rf])

# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
preview = predictions.select("prediction", "label", "features")
preview.show()

# Function-call form of print, consistent with the calls below; with a single
# argument it prints the same text under Python 2 and Python 3.
print(preview.head(3))

# Select (prediction, true label) and compute test error.
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# The fitted forest is the second pipeline stage (index 0 is the indexer).
rfModel = model.stages[1]
print(rfModel)  # summary only

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[95,96,97,12...|
|       0.0|  0.0|(692,[98,99,100,1...|
|       0.0|  0.0|(692,[121,122,123...|
|       0.1|  0.0|(692,[125,126,127...|
|       0.2|  0.0|(692,[126,127,128...|
|      0.15|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.1|  0.0|(692,[127,128,129...|
|       0.0|  0.0|(692,[128,129,130...|
|      0.05|  0.0|(692,[152,153,154...|
|      0.05|  0.0|(692,[152,153,154...|
|       0.0|  0.0|(692,[153,154,155...|
|       0.4|  0.0|(692,[154,155,156...|
|       0.5|  0.0|(692,[154,155,156...|
|      0.15|  0.0|(692,[234,235,237...|
|       1.0|  1.0|(692,[125,126,127...|
|       1.0|  1.0|(692,[127,128,155...|
|       1.0|  1.0|(692,[128,129,130...|
|       1.0|  1.0|(692,[128,129,130...|
+----------+-----+--------------------+
only showing top 20 rows

[Row(predictio