In [2]:
from pyspark.sql import SparkSession

In [3]:
# Connect to the Spark master at spark-master:7077 under the application
# name 'ml'; getOrCreate() reuses an already-running session if there is one.
spark = (
    SparkSession.builder
    .master('spark://spark-master:7077')
    .appName('ml')
    .getOrCreate()
)

Dataset: `iris.scale`, downloaded from the LIBSVM multiclass collection at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html
- Note: the directory holding the file must be accessible from both the notebook server and the Spark workers.

In [9]:
# Read the LIBSVM-formatted Iris file into a DataFrame.
dataset = spark.read.load("/opt/spark-data/iris.scale", format="libsvm")

In [14]:
# Confirm what kind of object the reader returned.
type(dataset)

pyspark.sql.dataframe.DataFrame

In [12]:
# Preview the data: show() prints the top 20 rows and truncates long cell values.
dataset.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,2,3],[-0.77...|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-1....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
+-----+--------------------+
only showing top 20 rows



In [19]:
# Split the dataset into train (~80%) and test (~20%) subsets.
# randomSplit assigns rows at random, so the weights are only approximate.
# A fixed seed makes the split repeatable across re-runs (given the same
# input partitioning); without it every run yields different train/test rows
# and therefore different downstream metrics.
train, test = dataset.randomSplit([0.8, 0.2], seed=42)

In [31]:
# Report how many rows ended up in each split.
n_train = train.count()
n_test = test.count()
print("train rows {0} test rows {1}".format(n_train, n_test))

train rows 113 test rows 37


In [32]:
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [42]:
# Base classifier, wrapped in OneVsRest so it can be used on the
# multiclass labels of this dataset.
logistic_regression = LogisticRegression(
    maxIter=10,
    tol=1e-6,
    fitIntercept=True,
)
ovr = OneVsRest(classifier=logistic_regression)

In [36]:
# Fit the one-vs-rest estimator on the training split.
ovrModel = ovr.fit(train)

In [37]:
# Apply the fitted model to the held-out test split.
predictions = ovrModel.transform(test)

In [38]:
# Score the test-set predictions by accuracy, then report the
# complementary classification error (1 - accuracy).
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

test_error = 1.0 - accuracy
print("Test Error = %g" % test_error)

Test Error = 0.027027


# Hyperparameter Tuning

In [39]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [51]:
# Build the parameter grid that cross-validation will search over.

In [43]:
# Candidate values of the logistic-regression regParam to compare.
paramGrid = (
    ParamGridBuilder()
    .addGrid(logistic_regression.regParam, [0.1, 0.01])
    .build()
)

In [44]:
# 2-fold cross-validation of the one-vs-rest estimator over paramGrid,
# comparing candidates by accuracy.
# A fixed seed makes the assignment of rows to folds repeatable between
# runs; without it the selected parameters can vary from run to run.
crossval = CrossValidator(
    estimator=ovr,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    numFolds=2,
    seed=42,
)

In [46]:
# Run the cross-validated search on the training split.
cvModel = crossval.fit(train)

In [47]:
# Apply the cross-validated model to the held-out test split.
# NOTE(review): the result is stored as `prediction` (singular), separate from
# the earlier `predictions`, and is not evaluated in the cells visible here.
prediction = cvModel.transform(test)

In [50]:
# Display the fitted cross-validation model object.
cvModel

CrossValidatorModel_ccbd18c55554