# Import libraries

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("KMeans Cluster Model").getOrCreate()

In [2]:
from pyspark.ml.clustering import KMeans, LDA, BisectingKMeans, GaussianMixture, PowerIterationClustering
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import ClusteringEvaluator

# Load and verify data

In [3]:
data = spark.read.csv('seeds_dataset.csv',header = True, inferSchema = True)

In [4]:
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [5]:
data.head(3)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22),
 Row(area=14.88, perimeter=14.57, compactness=0.8811, length_of_kernel=5.553999999999999, width_of_kernel=3.333, asymmetry_coefficient=1.018, length_of_groove=4.956),
 Row(area=14.29, perimeter=14.09, compactness=0.905, length_of_kernel=5.291, width_of_kernel=3.3369999999999997, asymmetry_coefficient=2.699, length_of_groove=4.825)]

In [6]:
for item in data.head(1)[0]:
    print(item)

15.26
14.84
0.871
5.763
3.312
2.221
5.22


In [7]:
data.describe().show()

+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+
|summary|              area|         perimeter|         compactness|   length_of_kernel|   width_of_kernel|asymmetry_coefficient|   length_of_groove|
+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+
|  count|               210|               210|                 210|                210|               210|                  210|                210|
|   mean|14.847523809523816|14.559285714285718|  0.8709985714285714|  5.628533333333335| 3.258604761904762|   3.7001999999999997|  5.408071428571429|
| stddev|2.9096994306873647|1.3059587265640225|0.023629416583846364|0.44306347772644983|0.3777144449065867|   1.5035589702547392|0.49148049910240543|
|    min|             10.59|             12.41|              0.8081|              4.899|            

In [8]:
data.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

# Data Preprocessing

In [9]:
assembler = VectorAssembler(inputCols =data.columns,outputCol='features')
final_data = assembler.transform(data)

In [10]:
scaler = StandardScaler(inputCol='features',outputCol='scaledfeatures')
final_data = scaler.fit(final_data).transform(final_data)
final_data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledfeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

# Train and Test data

In [11]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

In [12]:
train_data.show(2)

+-----+---------+-----------+----------------+---------------+---------------------+----------------+--------------------+--------------------+
| area|perimeter|compactness|length_of_kernel|width_of_kernel|asymmetry_coefficient|length_of_groove|            features|      scaledfeatures|
+-----+---------+-----------+----------------+---------------+---------------------+----------------+--------------------+--------------------+
|10.59|    12.41|     0.8648|           4.899|          2.787|                4.975|           4.794|[10.59,12.41,0.86...|[3.63955118123602...|
|10.74|    12.73|     0.8329|           5.145|          2.642|                4.702|           4.963|[10.74,12.73,0.83...|[3.69110289768413...|
+-----+---------+-----------+----------------+---------------+---------------------+----------------+--------------------+--------------------+
only showing top 2 rows



In [13]:
test_data.show(2)

+-----+---------+-----------+----------------+---------------+---------------------+------------------+--------------------+--------------------+
| area|perimeter|compactness|length_of_kernel|width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|      scaledfeatures|
+-----+---------+-----------+----------------+---------------+---------------------+------------------+--------------------+--------------------+
|10.79|    12.93|     0.8107|           5.317|          2.648|    5.462000000000001|             5.194|[10.79,12.93,0.81...|[3.70828680316683...|
|10.82|    12.83|     0.8256|            5.18|           2.63|                4.853|5.0889999999999995|[10.82,12.83,0.82...|[3.71859714645645...|
+-----+---------+-----------+----------------+---------------+---------------------+------------------+--------------------+--------------------+
only showing top 2 rows



# Build and Evaluate Model 

### K-means

In [14]:
classifier = KMeans(k=2,featuresCol='scaledfeatures')
model = classifier.fit(train_data)

In [15]:
# Make predictions
predictions = model.transform(test_data)

In [16]:
# Evaluate clustering by computing Silhouette score
print(ClusteringEvaluator().evaluate(predictions))

0.7168901012226508


In [17]:
# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:    
    print(center)

Cluster Centers: 
[ 6.07778107 12.12679731 37.42676327 13.62169192  9.57951868  2.25435015
 11.87888827]
[ 4.35285703 10.4043414  36.38447091 11.98785792  7.88634525  2.648241
 10.33561417]


In [18]:
predictions.show(3)

+-----+---------+------------------+----------------+---------------+---------------------+------------------+--------------------+--------------------+----------+
| area|perimeter|       compactness|length_of_kernel|width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|      scaledfeatures|prediction|
+-----+---------+------------------+----------------+---------------+---------------------+------------------+--------------------+--------------------+----------+
|10.79|    12.93|            0.8107|           5.317|          2.648|    5.462000000000001|             5.194|[10.79,12.93,0.81...|[3.70828680316683...|         1|
|10.82|    12.83|            0.8256|            5.18|           2.63|                4.853|5.0889999999999995|[10.82,12.83,0.82...|[3.71859714645645...|         1|
|10.93|     12.8|0.8390000000000001|           5.046|          2.717|                5.398|             5.045|[10.93,12.8,0.839...|[3.75640173851839...|         1|
+-----+---------

### LDA Model

In [19]:
# Build and Train Model
classifier = LDA(k=10, maxIter=10)
model = classifier.fit(train_data)
ll = model.logLikelihood(train_data)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))

The lower bound on the log likelihood of the entire corpus: -12983.336177573194


In [20]:
lp = model.logPerplexity(train_data)
print("The upper bound on perplexity: " + str(lp))

The upper bound on perplexity: 1.7880510707450954


In [21]:
# Describe topics.
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

The topics described by their top-weighted terms:
+-----+-----------+---------------------------------------------------------------+
|topic|termIndices|termWeights                                                    |
+-----+-----------+---------------------------------------------------------------+
|0    |[6, 3, 1]  |[0.1576947352898697, 0.1458773093660993, 0.14187695448498366]  |
|1    |[0, 2, 1]  |[0.16034343322698477, 0.15727103756293, 0.14469432194691803]   |
|2    |[5, 6, 3]  |[0.15774365475755073, 0.14844202089911707, 0.14456670522897427]|
|3    |[5, 3, 4]  |[0.15443179907826954, 0.1531448467053669, 0.15184239179404682] |
|4    |[0, 3, 5]  |[0.17107361749489125, 0.1616701989849892, 0.15071309176215092] |
|5    |[5, 2, 1]  |[0.16857747134947987, 0.149966958911465, 0.14589756820545954]  |
|6    |[5, 6, 4]  |[0.1630092932292176, 0.15521112245299357, 0.1505916844550948]  |
|7    |[6, 3, 0]  |[0.1737564327358855, 0.14753322053325083, 0.14133696674110088] |
|8    |[2, 4, 3]  |[0.1612

In [22]:
# Make predictions
predictions = model.transform(test_data)

In [23]:
predictions.show(3)

+-----+---------+------------------+----------------+---------------+---------------------+------------------+--------------------+--------------------+--------------------+
| area|perimeter|       compactness|length_of_kernel|width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|      scaledfeatures|   topicDistribution|
+-----+---------+------------------+----------------+---------------+---------------------+------------------+--------------------+--------------------+--------------------+
|10.79|    12.93|            0.8107|           5.317|          2.648|    5.462000000000001|             5.194|[10.79,12.93,0.81...|[3.70828680316683...|[0.00197952470464...|
|10.82|    12.83|            0.8256|            5.18|           2.63|                4.853|5.0889999999999995|[10.82,12.83,0.82...|[3.71859714645645...|[0.00202192477186...|
|10.93|     12.8|0.8390000000000001|           5.046|          2.717|                5.398|             5.045|[10.93,12.8,0.839...

### Bisecting k-means

In [24]:
# Build and Train Model
classifier = BisectingKMeans().setK(2).setSeed(1)
model = classifier.fit(train_data)

In [25]:
# Make predictions
predictions = model.transform(test_data)
# Evaluate clustering by computing Silhouette score
print(ClusteringEvaluator().evaluate(predictions))

0.6910492756132363


In [26]:
# Shows the result
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:    
    print(center)

Cluster Centers: 
[12.55529412 13.53458824  0.85921294  5.29488235  2.96549412  4.05595294
  5.07103529]
[17.52227273 15.76909091  0.88356364  6.01265152  3.59668182  3.32992424
  5.8035    ]


In [27]:
predictions.show(3)

+-----+---------+------------------+----------------+---------------+---------------------+------------------+--------------------+--------------------+----------+
| area|perimeter|       compactness|length_of_kernel|width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|      scaledfeatures|prediction|
+-----+---------+------------------+----------------+---------------+---------------------+------------------+--------------------+--------------------+----------+
|10.79|    12.93|            0.8107|           5.317|          2.648|    5.462000000000001|             5.194|[10.79,12.93,0.81...|[3.70828680316683...|         0|
|10.82|    12.83|            0.8256|            5.18|           2.63|                4.853|5.0889999999999995|[10.82,12.83,0.82...|[3.71859714645645...|         0|
|10.93|     12.8|0.8390000000000001|           5.046|          2.717|                5.398|             5.045|[10.93,12.8,0.839...|[3.75640173851839...|         0|
+-----+---------

### Gaussian Mixture Model (GMM)

In [28]:
# Build and Train Model
classifier = GaussianMixture().setK(2).setSeed(538009335)
model = classifier.fit(train_data)

In [29]:
# Make predictions
predictions = model.transform(test_data)
# Evaluate clustering by computing Silhouette score
print(ClusteringEvaluator().evaluate(predictions))

0.6169943751951565


In [30]:
predictions.show(3)

+-----+---------+------------------+----------------+---------------+---------------------+------------------+--------------------+--------------------+--------------------+----------+
| area|perimeter|       compactness|length_of_kernel|width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|      scaledfeatures|         probability|prediction|
+-----+---------+------------------+----------------+---------------+---------------------+------------------+--------------------+--------------------+--------------------+----------+
|10.79|    12.93|            0.8107|           5.317|          2.648|    5.462000000000001|             5.194|[10.79,12.93,0.81...|[3.70828680316683...|[1.0,5.8306706611...|         0|
|10.82|    12.83|            0.8256|            5.18|           2.63|                4.853|5.0889999999999995|[10.82,12.83,0.82...|[3.71859714645645...|[1.0,1.4456517535...|         0|
|10.93|     12.8|0.8390000000000001|           5.046|          2.717|      

### Power Iteration Clustering (PIC)

In [31]:
df = spark.createDataFrame([
    (0, 1, 1.0),
    (0, 2, 1.0),
    (1, 2, 1.0),
    (3, 4, 1.0),
    (4, 0, 0.1)
], ["src", "dst", "weight"])

In [32]:
# Build and Train Model
classifier = PowerIterationClustering(k=2, maxIter=20, initMode="degree", weightCol="weight")
# Shows the cluster assignment
classifier.assignClusters(df).show()

+---+-------+
| id|cluster|
+---+-------+
|  0|      0|
|  1|      0|
|  2|      0|
|  3|      1|
|  4|      1|
+---+-------+

