# Clustering: grouping users by their item ratings with Spark MLlib K-Means

In [1]:
from pyspark.sql import SparkSession
# Create the SparkSession for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('241213_01_MLlib_Clustering')
    .getOrCreate()
)

24/12/13 13:39:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## 데이터 준비

In [4]:
# Column names, in the same order as the fields of each tuple in `data`.
columns = ["user_id", "item_id", "rating"]

# Toy ratings table: one (user_id, item_id, rating) tuple per rating.
data = [
    (0, 0, 4.0), (0, 1, 2.0),  # user 0
    (1, 1, 3.0), (1, 2, 1.0),  # user 1
    (2, 0, 5.0), (2, 2, 4.0),  # user 2
]

In [5]:
# Turn the in-memory tuples into a Spark DataFrame, using `columns` as the
# column names, then print it.
rating_df = spark.createDataFrame(data, schema=columns)
rating_df.show()

                                                                                

+-------+-------+------+
|user_id|item_id|rating|
+-------+-------+------+
|      0|      0|   4.0|
|      0|      1|   2.0|
|      1|      1|   3.0|
|      1|      2|   1.0|
|      2|      0|   5.0|
|      2|      2|   4.0|
+-------+-------+------+



## 전처리

In [11]:
# Build the user_id x item_id matrix: one row per user_id, one column per
# item_id, each cell holding the average rating. Cells with no rating are
# filled with 0.
user_item_matrix = (
    rating_df
    .groupBy("user_id")
    .pivot("item_id")
    .avg('rating')
    .fillna(0)
)
user_item_matrix.show()

                                                                                

+-------+---+---+---+
|user_id|  0|  1|  2|
+-------+---+---+---+
|      0|4.0|2.0|0.0|
|      1|0.0|3.0|1.0|
|      2|5.0|0.0|4.0|
+-------+---+---+---+



## Feature Vector

In [12]:
from pyspark.ml.feature import VectorAssembler

In [30]:
# Build the feature-vector assembler.
# The item columns are taken from the pivoted matrix (every column except
# "user_id") instead of being hard-coded as "0", "1", "2", so the assembler
# keeps working when the ratings contain a different set of item_ids.
item_cols = [c for c in user_item_matrix.columns if c != "user_id"]
assembler = VectorAssembler(inputCols=item_cols, outputCol="features")

In [31]:
# Append the assembled vector column to the user-item matrix.
user_features = assembler.transform(user_item_matrix)

In [32]:
# Print the assembled rows so the new vector column can be inspected.
user_features.show()

                                                                                

+-------+---+---+---+-------------+
|user_id|  0|  1|  2|     features|
+-------+---+---+---+-------------+
|      0|4.0|2.0|0.0|[4.0,2.0,0.0]|
|      1|0.0|3.0|1.0|[0.0,3.0,1.0]|
|      2|5.0|0.0|4.0|[5.0,0.0,4.0]|
+-------+---+---+---+-------------+



## Model 생성

In [36]:
from pyspark.ml.clustering import KMeans

# Configure the K-Means estimator.
#   k    - number of clusters
#   seed - random seed, fixed at 1
kmeans = KMeans(
    featuresCol="features",
    predictionCol="cluster",
    k=2,
    seed=1,
)

# Train the model on the assembled user feature vectors.
model = kmeans.fit(user_features)

                                                                                

In [37]:
# Predict: apply the fitted model to the same rows it was trained on; the
# assignment is written to the "cluster" column configured on the estimator.
clusters = model.transform(user_features)

In [38]:
# Print every user row together with its predicted cluster.
clusters.show()

+-------+---+---+---+-------------+-------+
|user_id|  0|  1|  2|     features|cluster|
+-------+---+---+---+-------------+-------+
|      0|4.0|2.0|0.0|[4.0,2.0,0.0]|      0|
|      1|0.0|3.0|1.0|[0.0,3.0,1.0]|      0|
|      2|5.0|0.0|4.0|[5.0,0.0,4.0]|      1|
+-------+---+---+---+-------------+-------+



In [40]:
# Shut down the SparkSession now that the notebook is finished.
spark.stop()