In [1]:
import pyspark
import pyspark.sql.functions as F

In [55]:
# 로컬 모드로 이 코드를 실행하려면 로컬 모드에 적합한 셔플 파티션 수를 설정해 주는 것이 좋다.
# 기본 200이지만, 로컬 모드에서는 많은 익스큐터가 필요하지 않기 때문에 5로 설정한다.
spark.conf.set("spark.sql.shuffle.partitions", "5")

staticDataFrame = spark.read.format("csv")\
.option("header", "true")\
.option("inferSchema", "true")\
.load("../../assets/exercises/week03/all/online-retail-dataset.csv")

## 학습 데이터셋 준비

In [56]:
# 날짜 데이터 변환
preppedDataFrame = staticDataFrame\
.na.fill(0)\
.withColumn("day_of_week", F.date_format(F.to_timestamp(F.col("InvoiceDate"), "M/d/yyyy H:mm"), "EEEE"))\
.coalesce(5)

In [57]:
preppedDataFrame.take(3)

[Row(InvoiceNo='536365', StockCode='85123A', Description='WHITE HANGING HEART T-LIGHT HOLDER', Quantity=6, InvoiceDate='12/1/2010 8:26', UnitPrice=2.55, CustomerID=17850, Country='United Kingdom', day_of_week='Wednesday'),
 Row(InvoiceNo='536365', StockCode='71053', Description='WHITE METAL LANTERN', Quantity=6, InvoiceDate='12/1/2010 8:26', UnitPrice=3.39, CustomerID=17850, Country='United Kingdom', day_of_week='Wednesday'),
 Row(InvoiceNo='536365', StockCode='84406B', Description='CREAM CUPID HEARTS COAT HANGER', Quantity=8, InvoiceDate='12/1/2010 8:26', UnitPrice=2.75, CustomerID=17850, Country='United Kingdom', day_of_week='Wednesday')]

In [58]:
preppedDataFrame\
.withColumn("date", F.to_date(F.to_timestamp(F.col("InvoiceDate"), "M/d/yyyy H:mm")))\
.select(F.max("date")).take(1)

[Row(max(date)=datetime.date(2011, 12, 9))]

In [59]:
preppedDataFrame\
.withColumn("date", F.to_date(F.to_timestamp(F.col("InvoiceDate"), "M/d/yyyy H:mm")))\
.select(F.min("date")).take(1)

[Row(min(date)=datetime.date(2010, 12, 1))]

In [60]:
# 학습 데이터셋과 테스트 데이터셋 분리
trainDataFrame = preppedDataFrame\
.where("InvoiceDate < '2011-11-05'")
       
testDataFrame = preppedDataFrame\
       .where("InvoiceDate >= '2011-11-05'")

In [61]:
print(f'train Dataset: {trainDataFrame.count()}')
print(f'test Dataset: {testDataFrame.count()}')

train Dataset: 276313
test Dataset: 265596


In [62]:
# MLlib은 일번적인 트랜스포이션을 자동화하는 다양한 트랜스포메이션 제공
# 요일을 수치형으로 반환
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer()\
.setInputCol("day_of_week")\
.setOutputCol("day_of_week_index")

In [63]:
# onehotencoding
from pyspark.ml.feature import OneHotEncoder

encoder = OneHotEncoder()\
.setInputCol("day_of_week_index")\
.setOutputCol("day_of_week_encoded")

In [64]:
# 벡터화

from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler()\
.setInputCols(["UnitPrice", "Quantity", "day_of_week_encoded"])\
.setOutputCol("features")

In [65]:
# 변환자를 사용해서 데이터셋에 변환자를 fit 시킨다.
from pyspark.ml import Pipeline

transformationPipeline = Pipeline()\
.setStages([indexer, encoder, vectorAssembler])

fittedPipeline = transformationPipeline.fit(trainDataFrame)

In [66]:
# 위의 과정에서 학습을 위한 맞춤 파이프라인이 준비되면, 이것을 사용해서 일관되고 반복적인 방식으로 모든 데이터 변환이 가능하다.
transformedTraning = fittedPipeline.transform(trainDataFrame)

## 모델 학습
1. 아직 학습되지 않은 모델을 초기화
2. 해당 모델을 학습

In [44]:
# https://spark.apache.org/docs/latest/ml-clustering.html
# 캐싱
transformedTraning.cache()

# 모델 학습
from pyspark.ml.clustering import KMeans

kmeans = KMeans().setK(20).setSeed(1)
kmModel = kmeans.fit(transformedTraning)

In [50]:
kmModel

KMeansModel: uid=KMeans_7aed4674187c, k=20, distanceMeasure=euclidean, numFeatures=7

## 모델 평가

In [68]:
transformedTest = fittedPipeline.transform(testDataFrame)
# kmModel.computeCost(transformedTest)
# ---------------------------------------------------------------------------
# AttributeError                            Traceback (most recent call last)
# <ipython-input-49-1a60a70baffb> in <module>
#       1 transformedTest = fittedPipeline.transform(testDataFrame)
# ----> 2 kmModel.computeCost(transformedTest)

# AttributeError: 'KMeansModel' object has no attribute 'computeCost'

####### ####### ####### #######
# "Deprecated in 3.0.0. It will be removed in future versions. Use "
#             "ClusteringEvaluator instead. You can also get the cost on the training "
#             "dataset in the summary."
####### ####### ####### #######


from pyspark.ml.evaluation import ClusteringEvaluator

pdt = kmModel.transform(transformedTest)
# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(pdt)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.9570197876949359
