In [1]:
# Run findspark's initialisation before anything from pyspark is imported
# (the pyspark imports live in the following cells).
# NOTE(review): presumably this is what makes the local Spark installation
# importable in this kernel — confirm against the environment setup.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression

# Obtain the notebook's session through getOrCreate(); later cells read
# their input via `spark.read`.
spark = (
    SparkSession.builder
    .appName('Test Spark - ML Algorithms')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Read the sample points (a libsvm-format file) into a DataFrame.
dataset = spark.read.load("./sample_kmeans_data.txt", format="libsvm")

# Fit k-means with k=2 and a fixed seed of 1.
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(dataset)

# Run the fitted model over the same rows to obtain a `prediction` column.
predictions = model.transform(dataset)

# Score the clustering with a default-configured ClusteringEvaluator
# and report the resulting Silhouette value.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette with squared euclidean distance = {silhouette}")

# List the fitted cluster centers, one per line.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Silhouette with squared euclidean distance = 0.9997530305375207
Cluster Centers: 
[9.1 9.1 9.1]
[0.1 0.1 0.1]


In [5]:
# Print the rows together with their assigned cluster; truncate=False keeps
# the full `features` vectors visible instead of cutting long cell values.
predictions.show(truncate=False)

+-----+-------------------------+----------+
|label|features                 |prediction|
+-----+-------------------------+----------+
|0.0  |(3,[],[])                |1         |
|1.0  |(3,[0,1,2],[0.1,0.1,0.1])|1         |
|2.0  |(3,[0,1,2],[0.2,0.2,0.2])|1         |
|3.0  |(3,[0,1,2],[9.0,9.0,9.0])|0         |
|4.0  |(3,[0,1,2],[9.1,9.1,9.1])|0         |
|5.0  |(3,[0,1,2],[9.2,9.2,9.2])|0         |
+-----+-------------------------+----------+

