In [1]:
from pyspark.sql import SQLContext
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

In [2]:
# `sc` is not defined anywhere in this notebook; presumably the PySpark kernel
# provides the SparkContext -- confirm before running elsewhere.
sqlContext = SQLContext(sc)

# Read the CSV through the spark-csv data source: the first row is used as the
# header and column types are inferred from the data.
data = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('file:///home/cloudera/combined-data-custom4.csv')
)

In [3]:
# Sanity check: number of rows read from the CSV.
data.count()

4619

In [4]:
# Keep only the rows whose platformType column equals 'android'.
dataAndroid = data.where(data['platformType'] == 'android')

In [5]:
# Number of rows left after restricting to the 'android' platform.
dataAndroid.count()

1635

In [6]:
# Columns used for clustering. Their order here fixes the order of the
# coordinates in every assembled feature vector.
featuresUsed = [
    'count_gameclicks',
    'hits_percent',
    'count_buyId',
]

# Pack the selected columns into a single vector column, "features_unscaled".
assembler = VectorAssembler(
    outputCol="features_unscaled",
    inputCols=featuresUsed,
)
assembled = assembler.transform(dataAndroid)

In [7]:
# Standardize "features_unscaled" into "features": withMean=True centers each
# feature and withStd=True scales it to unit standard deviation.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features_unscaled",
    outputCol="features",
)

# Fit the scaling statistics on the assembled rows, then apply them to the
# same rows.
scalerModel = scaler.fit(assembled)
scaledData = scalerModel.transform(assembled)

In [8]:
# Drop everything except the standardized feature vector and mark the result
# for caching. persist() returns the same DataFrame, so it can be chained.
scaledData = scaledData.select("features").persist()
scaledData

DataFrame[features: vector]

In [9]:
# k-means with 4 clusters over the "features" column (the estimator's default
# input column, spelled out here); seed=1 fixes the random initialization.
kmeans = KMeans(featuresCol="features", k=4, seed=1)
model = kmeans.fit(scaledData)

# Apply the fitted model to the same rows it was trained on.
transformed = model.transform(scaledData)

In [10]:
# One center per cluster. The model was fit on the standardized "features"
# column, so these coordinates are in scaled units, not raw column values.
# Coordinate order should follow featuresUsed -- verify before interpreting.
centers = model.clusterCenters()
centers

[array([-0.35624929,  0.79109196,  0.96405876]),
 array([-0.41529147, -0.4799667 , -0.42412705]),
 array([ 0.89450579,  0.06376746, -0.33349948]),
 array([ 3.51560549, -0.08735088, -0.26947378])]