In [4]:
pip install spark

Note: you may need to restart the kernel to use updated packages.


In [5]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [6]:
from pyspark import SparkContext
sc =SparkContext.getOrCreate()

In [7]:
# Importing the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [8]:
# Load the iris dataset
output_df = spark.read.format("csv").option("header","true").load("/Users/rashidesai/Downloads/iris.csv")

In [9]:
output_df.show(5)

+-----------+----------+-----------+----------+-----------+
|PetalLength|PetalWidth|SepalLength|SepalWidth|    Species|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|         3|        1.4|       0.2|Iris-setosa|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|
|          5|       3.6|        1.4|       0.2|Iris-setosa|
+-----------+----------+-----------+----------+-----------+
only showing top 5 rows



In [10]:
from pyspark.sql.types import IntegerType
output_df = output_df.withColumn("PetalLength",output_df["PetalLength"].cast(IntegerType()))
output_df.select("PetalLength").dtypes

[('PetalLength', 'int')]

In [11]:
output_df = output_df.withColumn("PetalWidth",output_df["PetalWidth"].cast(IntegerType()))
output_df.select("PetalWidth").dtypes

[('PetalWidth', 'int')]

In [12]:
output_df = output_df.withColumn("SepalLength",output_df["SepalLength"].cast(IntegerType()))
output_df.select("SepalLength").dtypes

[('SepalLength', 'int')]

In [13]:
output_df = output_df.withColumn("SepalWidth",output_df["SepalWidth"].cast(IntegerType()))
output_df.select("SepalWidth").dtypes

[('SepalWidth', 'int')]

In [15]:
output_df.printSchema()

root
 |-- PetalLength: integer (nullable = true)
 |-- PetalWidth: integer (nullable = true)
 |-- SepalLength: integer (nullable = true)
 |-- SepalWidth: integer (nullable = true)
 |-- Species: string (nullable = true)



In [18]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols = ["PetalLength", "PetalWidth", "SepalLength", "SepalWidth"], outputCol="features") 
# it makes a vector with 4 parameters mentioned in inputCols and name it as outputCol.

irisFeatures = assembler.transform(output_df)

In [19]:
# Creating and training a k-means model
kmeans = KMeans().setK(2).setSeed(1)

In [21]:
model = kmeans.fit(irisFeatures)

In [22]:
# Make predictions
predictions = model.transform(irisFeatures)

#### We are desiring a high value of Silhouette score as it indicates that the point is placed in the correct cluster.

In [23]:
# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with k = 2 is " + str(silhouette))

Silhouette with k = 2 is 0.8293686673914576


In [24]:
# Shows the result
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[5.78 2.45 4.46 1.29]
[4.6  3.04 1.   0.  ]


### Trying with different values of K 

In [25]:
# k = 3
kmeans = KMeans().setK(3).setSeed(1)
model = kmeans.fit(irisFeatures)

predictions = model.transform(irisFeatures)

evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette score with k = 3 is " + str(silhouette))

Silhouette score with k = 3 is 0.6972148699719254
