In [None]:
# Bootstrap PySpark for this kernel. Order matters: findspark.init() has to
# run before `import pyspark`, because it is what locates the local Spark
# installation and makes the pyspark package importable.
import findspark
findspark.init()
import pyspark

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Spark entry point used by every later cell. getOrCreate() returns the
# already-running session when there is one, so re-running this cell is safe.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [None]:
# Load the pre-processed credit-card dataset.
#   header=True      -> the first CSV row supplies the column names
#   inferSchema=True -> column types are inferred from the data rather than
#                       every column being read as a string
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('cc_data_preprocessed.csv')
)

In [None]:
# Inspect the inferred column names and types before modelling; this is
# where the non-numeric CUST_ID identifier column shows up.
df.printSchema()

root
 |-- CUST_ID: string (nullable = true)
 |-- BALANCE: double (nullable = true)
 |-- BALANCE_FREQUENCY: double (nullable = true)
 |-- PURCHASES: double (nullable = true)
 |-- ONEOFF_PURCHASES: double (nullable = true)
 |-- INSTALLMENTS_PURCHASES: double (nullable = true)
 |-- CASH_ADVANCE: double (nullable = true)
 |-- PURCHASES_FREQUENCY: double (nullable = true)
 |-- ONEOFF_PURCHASES_FREQUENCY: double (nullable = true)
 |-- PURCHASES_INSTALLMENTS_FREQUENCY: double (nullable = true)
 |-- CASH_ADVANCE_FREQUENCY: double (nullable = true)
 |-- CASH_ADVANCE_TRX: double (nullable = true)
 |-- PURCHASES_TRX: double (nullable = true)
 |-- CREDIT_LIMIT: double (nullable = true)
 |-- PAYMENTS: double (nullable = true)
 |-- MINIMUM_PAYMENTS: double (nullable = true)
 |-- PRC_FULL_PAYMENT: double (nullable = true)
 |-- TENURE: integer (nullable = true)



In [None]:
# CUST_ID is a string identifier (see the schema above), not a numeric
# feature, so remove it before the columns are assembled into a feature
# vector. NOTE: this rebinds `df`; later cells see the frame without CUST_ID.
df = df.drop('CUST_ID')

In [None]:
# Hold out roughly 20% of the rows for evaluation and keep roughly 80% for
# fitting. The fixed seed makes the split repeatable between runs.
trainDF, testDF = df.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)

In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
# Pack the model inputs into a single vector column named 'features'.
#
# CUST_ID is excluded by name instead of relying on the earlier
# `df = df.drop('CUST_ID')` cell having already been run. If `df` is re-read
# or cells are executed out of order, passing `df.columns` straight through
# would hand the string CUST_ID column to VectorAssembler, which only accepts
# numeric/boolean/vector inputs, and the pipeline fit would fail.
# When CUST_ID has already been dropped, the list is identical to df.columns.
vecAssembler = VectorAssembler(
    inputCols=[col for col in df.columns if col != 'CUST_ID'],
    outputCol='features',
)

In [None]:
from pyspark.ml.clustering import KMeans

In [None]:
# K-means estimator: 7 clusters, reading the assembled 'features' vector and
# writing each row's cluster index to 'prediction'. The seed fixes the
# random centroid initialisation so repeated fits are comparable.
kmeans = KMeans(
    k=7,
    featuresCol='features',
    predictionCol='prediction',
    seed=1,
)

In [None]:
# Pipeline stages in execution order.
myStages = [
    vecAssembler,  # 1. build the 'features' vector column
    kmeans,        # 2. cluster rows on that vector
]

In [None]:
from pyspark.ml import Pipeline

# Chain the stages into one estimator so that feature assembly and
# clustering are always fitted and applied together.
pipeline = Pipeline().setStages(myStages)

In [None]:
# Fit the whole pipeline (feature assembly, then K-means) on the training
# split only; the test split stays unseen until evaluation.
pipelineModel = pipeline.fit(trainDF)

In [None]:
# Apply the fitted pipeline to the training rows; the result carries the
# original columns plus the 'features' and 'prediction' columns.
predDF_train = pipelineModel.transform(trainDF)

In [None]:
# Apply the same fitted pipeline to the held-out rows (no re-fitting), so
# test rows are assigned to the clusters learned from the training split.
predDF_test = pipelineModel.transform(testDF)

In [None]:
# Confirm the pipeline output: the input columns should now be followed by
# 'features' (vector) and 'prediction' (integer cluster index).
predDF_train.printSchema()

root
 |-- BALANCE: double (nullable = true)
 |-- BALANCE_FREQUENCY: double (nullable = true)
 |-- PURCHASES: double (nullable = true)
 |-- ONEOFF_PURCHASES: double (nullable = true)
 |-- INSTALLMENTS_PURCHASES: double (nullable = true)
 |-- CASH_ADVANCE: double (nullable = true)
 |-- PURCHASES_FREQUENCY: double (nullable = true)
 |-- ONEOFF_PURCHASES_FREQUENCY: double (nullable = true)
 |-- PURCHASES_INSTALLMENTS_FREQUENCY: double (nullable = true)
 |-- CASH_ADVANCE_FREQUENCY: double (nullable = true)
 |-- CASH_ADVANCE_TRX: double (nullable = true)
 |-- PURCHASES_TRX: double (nullable = true)
 |-- CREDIT_LIMIT: double (nullable = true)
 |-- PAYMENTS: double (nullable = true)
 |-- MINIMUM_PAYMENTS: double (nullable = true)
 |-- PRC_FULL_PAYMENT: double (nullable = true)
 |-- TENURE: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: integer (nullable = false)



In [None]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [None]:
# Evaluator that scores cluster quality from the 'features' vectors and the
# 'prediction' cluster assignments. No metricName is given, so the
# evaluator's default metric applies (presumably silhouette, as the variable
# name suggests -- confirm for the PySpark version in use).
silscore = ClusteringEvaluator(
    featuresCol='features',
    predictionCol='prediction',
)

In [None]:
# Clustering score on the rows the model was fitted on (baseline for the
# held-out comparison below).
silscore.evaluate(predDF_train)

0.5581551956984339

In [None]:
# Clustering score on the held-out rows; a value close to the training score
# indicates the cluster structure carries over to unseen data.
silscore.evaluate(predDF_test)

0.5456400212584801