In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
pip install pyspark 

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 54kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 19.4MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=d3e7983a1ba1e1113d3c58c3f6ccf18fa12a1947a676afe88a117d52bce86680
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Clustering-using K-Means').getOrCreate()
data_customer=spark.read.csv('buddymove_holidayiq.csv', header=True, inferSchema=True)
data_customer.printSchema()

root
 |-- User Id: string (nullable = true)
 |-- Sports: integer (nullable = true)
 |-- Religious: integer (nullable = true)
 |-- Nature: integer (nullable = true)
 |-- Theatre: integer (nullable = true)
 |-- Shopping: integer (nullable = true)
 |-- Picnic: integer (nullable = true)



In [4]:
data_customer.columns

['User Id', 'Sports', 'Religious', 'Nature', 'Theatre', 'Shopping', 'Picnic']

In [5]:
data_customer.toPandas().isna().sum()

User Id      0
Sports       0
Religious    0
Nature       0
Theatre      0
Shopping     0
Picnic       0
dtype: int64

In [7]:
from pyspark.ml.feature import VectorAssembler

feautre_columns=data_customer.columns 

In [14]:
assemble=VectorAssembler(inputCols=[ 'Sports', 'Religious', 'Nature', 'Theatre', 'Shopping', 'Picnic'], outputCol='features')

In [15]:
assembled_data=assemble.transform(data_customer)
assembled_data.show(2)

+-------+------+---------+------+-------+--------+------+--------------------+
|User Id|Sports|Religious|Nature|Theatre|Shopping|Picnic|            features|
+-------+------+---------+------+-------+--------+------+--------------------+
| User 1|     2|       77|    79|     69|      68|    95|[2.0,77.0,79.0,69...|
| User 2|     2|       62|    76|     76|      69|    68|[2.0,62.0,76.0,76...|
+-------+------+---------+------+-------+--------+------+--------------------+
only showing top 2 rows



In [16]:
from pyspark.ml.feature import StandardScaler

scale=StandardScaler(inputCol='features',outputCol='standardized')
data_scale=scale.fit(assembled_data)
data_scale_output=data_scale.transform(assembled_data)
data_scale_output.show(2)


+-------+------+---------+------+-------+--------+------+--------------------+--------------------+
|User Id|Sports|Religious|Nature|Theatre|Shopping|Picnic|            features|        standardized|
+-------+------+---------+------+-------+--------+------+--------------------+--------------------+
| User 1|     2|       77|    79|     69|      68|    95|[2.0,77.0,79.0,69...|[0.30227458951867...|
| User 2|     2|       62|    76|     76|      69|    68|[2.0,62.0,76.0,76...|[0.30227458951867...|
+-------+------+---------+------+-------+--------+------+--------------------+--------------------+
only showing top 2 rows



In [18]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

silhouette_score=[]

In [27]:
evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='standardized', metricName='silhouette', distanceMeasure='squaredEuclidean')

for i in range(2,20):
    KMeans_algo=KMeans(featuresCol='standardized', k=i)
    KMeans_fit=KMeans_algo.fit(data_scale_output)
    output=KMeans_fit.transform(data_scale_output)
    score=evaluator.evaluate(output)
    silhouette_score.append(score)
    print("The k means is",i)
    print("Silhouette Score:",score)

The k means is 2
Silhouette Score: 0.5388830964228156
The k means is 3
Silhouette Score: 0.3213140930327642
The k means is 4
Silhouette Score: 0.4234278330413438
The k means is 5
Silhouette Score: 0.40750983438081945
The k means is 6
Silhouette Score: 0.3851829380508911
The k means is 7
Silhouette Score: 0.42994362387035634
The k means is 8
Silhouette Score: 0.3644471862365092
The k means is 9
Silhouette Score: 0.4036006261322713
The k means is 10
Silhouette Score: 0.4267379533129867
The k means is 11
Silhouette Score: 0.4197134089962329
The k means is 12
Silhouette Score: 0.4383436448646076
The k means is 13
Silhouette Score: 0.4459221911988548
The k means is 14
Silhouette Score: 0.4361292728984629
The k means is 15
Silhouette Score: 0.467147832743543
The k means is 16
Silhouette Score: 0.4491479961919911
The k means is 17
Silhouette Score: 0.4270071213955102
The k means is 18
Silhouette Score: 0.4043541778822182
The k means is 19
Silhouette Score: 0.44911737530478774


In [29]:
KMeans_algo=KMeans(featuresCol='standardized', k=2)
KMeans_fit=KMeans_algo.fit(data_scale_output)
prediction=KMeans_fit.transform(data_scale_output)

In [31]:
centers = KMeans_fit.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[2.76169057 4.14795421 3.15138587 4.12239413 3.41169746 4.51462448]
[1.0601357  2.77691451 2.39349064 3.22560805 2.15483464 3.03657325]


In [33]:
output=prediction.toPandas()

In [34]:
output.head()

Unnamed: 0,User Id,Sports,Religious,Nature,Theatre,Shopping,Picnic,features,standardized,prediction
0,User 1,2,77,79,69,68,95,"[2.0, 77.0, 79.0, 69.0, 68.0, 95.0]","[0.3022745895186752, 2.3725805084939395, 1.730...",1
1,User 2,2,62,76,76,69,68,"[2.0, 62.0, 76.0, 76.0, 69.0, 68.0]","[0.3022745895186752, 1.9103895003457696, 1.665...",1
2,User 3,2,50,97,87,50,75,"[2.0, 50.0, 97.0, 87.0, 50.0, 75.0]","[0.3022745895186752, 1.5406366938272336, 2.125...",1
3,User 4,2,68,77,95,76,61,"[2.0, 68.0, 77.0, 95.0, 76.0, 61.0]","[0.3022745895186752, 2.095265903605038, 1.6871...",1
4,User 5,2,98,54,59,95,86,"[2.0, 98.0, 54.0, 59.0, 95.0, 86.0]","[0.3022745895186752, 3.0196479199013777, 1.183...",1


In [35]:
output['prediction'].value_counts()

1    139
0    110
Name: prediction, dtype: int64