In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session for this notebook: getOrCreate() returns an
# already-running session if one exists, otherwise it starts a new one
# registered under the application name 'GBT'.
spark = (
    SparkSession.builder
    .appName('GBT')
    .getOrCreate()
)

In [3]:
# Fixed seed shared by the train/test split and the classifier so that a
# fresh "Restart Kernel -> Run All" reproduces the same partition, the same
# fitted model, and therefore the same predictions and accuracy.
SEED = 42

# Load the LibSVM-formatted dataset; the reader yields a DataFrame with the
# "label" and "features" columns used below.
data = spark.read.format("libsvm").load("sample_data.txt")

# Hold out roughly 30% of the rows for evaluation. The weights are relative
# proportions, so the actual split sizes are approximate. Without an explicit
# seed the partition would differ on every run.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=SEED)

# Gradient-boosted tree classifier; maxIter caps the number of boosting
# iterations. The seed is set explicitly so training is repeatable.
gbt = GBTClassifier(
    labelCol="label",
    featuresCol="features",
    maxIter=10,
    seed=SEED,
)

# Fit on the training portion only, then score the held-out rows.
model = gbt.fit(trainingData)

predictions = model.transform(testData)

# Preview predicted vs. true labels (show() prints the first 20 rows by default).
predictions.select("prediction", "label", "features").show()

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[121,122,123...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[127,128,129...|
|       0.0|  0.0|(692,[129,130,131...|
|       0.0|  0.0|(692,[150,151,152...|
|       0.0|  0.0|(692,[152,153,154...|
|       0.0|  0.0|(692,[153,154,155...|
|       0.0|  0.0|(692,[153,154,155...|
|       1.0|  0.0|(692,[154,155,156...|
|       0.0|  0.0|(692,[155,156,180...|
|       0.0|  0.0|(692,[181,182,183...|
|       0.0|  1.0|(692,[99,100,101,...|
|       1.0|  1.0|(692,[124,125,126...|
|       1.0|  1.0|(692,[124,125,126...|
|       1.0|  1.0|(692,[124,125,126...|
+----------+-----+--------------------+
only showing top 20 rows



In [4]:
# Compare the "prediction" column against the "label" column of the scored
# test set and report the "accuracy" metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

print("Model Accuracy = %g" % accuracy)

Model Accuracy = 0.909091
