In [1]:
from pyspark.sql import SQLContext
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

In [2]:
# Read the combined game-data CSV from the local filesystem into a Spark
# DataFrame via the spark-csv data source, treating the first row as a header
# and letting the reader infer column types.
sqlContext = SQLContext(sc)
csv_path = 'file:///home/cloudera/combined-data-custom4.csv'
csv_reader = (sqlContext.read
              .format('com.databricks.spark.csv')
              .options(header='true', inferSchema='true'))
data = csv_reader.load(csv_path)

In [3]:
# Total number of rows loaded from the CSV.
data.count()

4619

In [4]:
# Keep only the rows whose platformType is 'android'.
is_android = data['platformType'] == 'android'
dataAndroid = data.where(is_android)

In [5]:
# Number of rows that remain after the Android filter.
dataAndroid.count()

1635

In [8]:
# Bare expression: shows the DataFrame's column names and types.
dataAndroid

DataFrame[userId: int, userSessionId: int, teamLevel: int, platformType: string, count_gameclicks: int, gameclicks_category: string, count_hits: int, hits_percent: int, hits_percent_category: string, count_buyId: int, avg_price: double, total_buy: double, buy_category: string]

In [9]:
# Preview the first 5 Android rows.
dataAndroid.show(5)

+------+-------------+---------+------------+----------------+-------------------+----------+------------+---------------------+-----------+---------+---------+---------------+
|userId|userSessionId|teamLevel|platformType|count_gameclicks|gameclicks_category|count_hits|hits_percent|hits_percent_category|count_buyId|avg_price|total_buy|   buy_category|
+------+-------------+---------+------------+----------------+-------------------+----------+------------+---------------------+-----------+---------+---------+---------------+
|   812|         5648|        1|     android|              69|                Low|         8|          12|               Medium|          0|      0.0|      0.0|    NonSpenders|
|  1863|         5651|        1|     android|              35|                Low|         4|          11|               Medium|          0|      0.0|      0.0|    NonSpenders|
|   937|         5652|        1|     android|              39|                Low|         0|           0|         

In [11]:
# Project down to the three integer count columns that feed the clustering.
dataAndroid_cluster = dataAndroid.select('count_gameclicks', 'count_hits', 'count_buyId')

In [12]:
# Bare expression: confirms the projection kept only the three count columns.
dataAndroid_cluster

DataFrame[count_gameclicks: int, count_hits: int, count_buyId: int]

In [14]:
# Preview the first 5 rows of the clustering input.
dataAndroid_cluster.show(5)

+----------------+----------+-----------+
|count_gameclicks|count_hits|count_buyId|
+----------------+----------+-----------+
|              69|         8|          0|
|              35|         4|          0|
|              39|         0|          1|
|              36|         5|          0|
|              68|         6|          0|
+----------------+----------+-----------+
only showing top 5 rows



In [21]:
# Row count of the clustering input (selecting columns does not drop rows).
dataAndroid_cluster.count()

1635

In [22]:
# Combine the clustering columns into a single vector column named
# "features_unscaled"; the order in featuresUsed fixes the order of the
# vector components.
featuresUsed = ['count_gameclicks', 'count_hits', 'count_buyId']
assembler = VectorAssembler(
    inputCols=featuresUsed,
    outputCol="features_unscaled"
)
assembled = assembler.transform(dataAndroid_cluster)

In [23]:
# Standardize the assembled vectors (centering with withMean, scaling with
# withStd) so the features are on a comparable scale before clustering.
# The scaler is fitted on, and then applied to, the same assembled data.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features_unscaled",
    outputCol="features"
)
scalerModel = scaler.fit(assembled)
scaledData = scalerModel.transform(assembled)

In [24]:
# Drop everything except the scaled "features" column and mark the result
# for caching so the steps that follow can reuse it.
scaledData = scaledData.select("features").persist()
# Bare expression: display the cached DataFrame's schema.
scaledData

DataFrame[features: vector]

In [26]:
# Fit k-means with 3 clusters and a fixed seed, then apply the fitted model
# to the same scaled data it was trained on.
kmeans = KMeans().setK(3).setSeed(1)
model = kmeans.fit(scaledData)
transformed = model.transform(scaledData)

In [27]:
# Retrieve the fitted cluster centers. The model was trained on the
# standardized "features" column, so each center is in scaled units, with
# components in featuresUsed order: count_gameclicks, count_hits, count_buyId.
centers = model.clusterCenters()
centers

[array([-0.3138125 , -0.316014  , -0.34749399]),
 array([-0.2368066 , -0.20691153,  2.14100964]),
 array([ 2.1939005 ,  2.17502569, -0.18019905])]