In [32]:
#Loading Libraries

#import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean, col, split, col, regexp_extract, when, lit
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer, IndexToString, VectorAssembler, VectorIndexer
from pyspark.ml.feature import QuantileDiscretizer, OneHotEncoder

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

from datetime import datetime
from dateutil import parser
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [33]:
# Start (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("Spark ML applied on Churn POC dataset")
    .getOrCreate()
)

In [34]:
# Get Data
# Location of the churn POC extract, in the form
# abfss://<container>@<account>.dfs.core.windows.net/<path inside the container>
poc_data_path = 'abfss://synapsedatalake@synapseaiadadls.dfs.core.windows.net/pocdata.csv'

# The first row holds the column names; column types are inferred from the data.
poc_df = (
    spark.read
    .option("header", "True")
    .option("inferSchema", "True")
    .csv(poc_data_path)
)

In [35]:
# Print the column names and the types Spark inferred while reading the CSV.
poc_df.printSchema()

In [36]:
# Peek at the first five raw rows.
poc_df.show(n=5)

In [37]:
# Persist the raw data as a table in the devtest database, replacing any earlier copy.
spark.sql("CREATE DATABASE IF NOT EXISTS devtest")
poc_df.write.saveAsTable("devtest.Mlchurndata", mode="overwrite")

In [38]:
# Number of customers (rows) in the dataframe

customers_count = poc_df.count()
# Message typo fixed: "cutomers" -> "customers".
print("Number of customers is {}".format(customers_count))

In [39]:
# Class balance: number of rows for each value of the churn column.
groupBy_customers = poc_df.groupBy("churn").count()

groupBy_customers.show()

In [40]:
# Render the churn counts with the notebook's rich display.
# NOTE(review): `display` is not imported in the visible cells — presumably provided by the notebook runtime; confirm.
display(groupBy_customers)

In [41]:
# Summary statistics for the numeric variables.
# The previous filter kept only columns typed exactly 'int', so any numeric
# column inferred as double/bigint/etc. was silently left out of the summary.
numeric_type_names = ('tinyint', 'smallint', 'int', 'bigint', 'float', 'double')
numeric_columns = [
    name for name, dtype in poc_df.dtypes
    if dtype in numeric_type_names or dtype.startswith('decimal')
]
poc_df.describe(numeric_columns).show()

In [42]:
# Number of rows per customer segment.
segment_counts = poc_df.groupBy("segment").count()
display(segment_counts)

In [43]:
#Process Data
def get_dummy(df, categoricalCols, continuousCols, labelCol):
  """Build the model-ready dataframe and the fitted label indexer.

  Each categorical column is string-indexed and one-hot encoded; the encoded
  vectors are assembled with the continuous columns into a single `features`
  vector, and `labelCol` is string-indexed into `indexedLabel`.

  Args:
    df: input Spark DataFrame containing every listed column.
    categoricalCols: names of the columns to index and one-hot encode.
    continuousCols: names of the columns passed to the assembler unchanged.
    labelCol: name of the target column.

  Returns:
    A tuple `(data, label_indexer_model)` where `data` has the columns
    `features`, `indexedLabel` and `label` (a copy of `labelCol`), and
    `label_indexer_model` is the fitted StringIndexerModel that produced
    `indexedLabel`; its `.labels` maps indices back to original label values.
  """
  indexers = [StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)) for c in categoricalCols]

  encoders = [OneHotEncoder(inputCol=indexer.getOutputCol(),
                             outputCol="{0}_encoded".format(indexer.getOutputCol()))
              for indexer in indexers]

  assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders]
                              + continuousCols, outputCol="features")

  label_indexer = StringIndexer(inputCol=labelCol, outputCol='indexedLabel')

  # The label indexer is deliberately the last stage so it can be read back below.
  pipeline = Pipeline(stages=indexers + encoders + [assembler, label_indexer])

  model = pipeline.fit(df)
  data = model.transform(df).withColumn('label', col(labelCol))

  # Return the indexer that actually produced `indexedLabel` instead of fitting
  # a second StringIndexer on a copy of the same column: this guarantees the
  # returned labels line up with `indexedLabel` and avoids an extra pass over df.
  label_indexer_model = model.stages[-1]

  return data.select('features', 'indexedLabel', 'label'), label_indexer_model

In [44]:
# Transform Data: encode the categorical columns and assemble everything into one feature vector.
categoricalColumns = [
    'Gender', 'Age_Band', 'Tenure', 'Education', 'Marital_Status', 'Segment',
    'Occupation', 'F2', 'Recency', 'Freq_M1', 'Freq_M2', 'F3',
]
numericColumns = [
    'No_of_Accounts', 'GDP_in_Billions_of_USSD', 'Inflation', 'Population',
    'txn_amount_M1', 'txn_vol_M1', 'txn_amount_M2', 'txn_vol_M2',
    'txn_amount_M3', 'txn_vol_M3', 'F1',
]

# poc_df is rebound to the transformed frame (features / indexedLabel / label only),
# so this cell cannot be re-run without first re-loading the raw data.
poc_df, labelindexer = get_dummy(poc_df, categoricalColumns, numericColumns, 'CHURN')
poc_df.show(n=5)

In [45]:
# Identify categorical features inside the assembled vector and index them:
# features with at most 4 distinct values are treated as categorical.
vector_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)
featureIndexer = vector_indexer.fit(poc_df)

featureIndexer.transform(poc_df).show(n=5)

In [46]:
# Show the first five transformed rows without truncating the vector columns.
poc_df.show(n=5, truncate=False)

In [47]:
# Split into training (80%) and test (20%) sets with a fixed seed.
trainingData, testData = poc_df.randomSplit([0.8, 0.2], seed=10)
print("Training Dataset Count: {}".format(trainingData.count()))
print("Test Dataset Count: {}".format(testData.count()))

In [48]:
# Preview both splits with untruncated columns.
print("The first 5 samples of the Training Dataset:")
trainingData.show(n=5, truncate=False)
print("The first 5 samples of the Test Dataset:")
testData.show(n=5, truncate=False)

In [49]:
# Fit & Evaluate Logistic Regression
# The model is trained on the raw "features" vector; pass featuresCol="indexedFeatures"
# instead to train on the VectorIndexer output.
lr = LogisticRegression(featuresCol="features", labelCol="indexedLabel")

In [50]:
# Pipeline: index features, fit logistic regression, then map predicted indices
# back to the original label values.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelindexer.labels,
)

lr_stages = [featureIndexer, lr, labelConverter]
pipeline = Pipeline(stages=lr_stages)

lrModel = pipeline.fit(trainingData)

In [51]:
# Predict: score the held-out test data with the fitted pipeline.
# The logistic regression stage reads only the column named in its featuresCol parameter.
predictions = lrModel.transform(testData)

predictions.show(n=5)

In [52]:
# Inspect the model inputs and outputs side by side.
prediction_columns = ["features", "label", "probability", "predictedLabel"]
predictions.select(*prediction_columns).show(n=5)

In [53]:
# Row counts per actual label and per predicted label (inputs to the accuracy check).
cm = predictions.select("label", "predictedLabel")

actual_counts = cm.groupby('label').agg({'label': 'count'})
actual_counts.show()

predicted_counts = cm.groupby('predictedLabel').agg({'predictedLabel': 'count'})
predicted_counts.show()

In [54]:
# Cross-tabulate actual vs. predicted labels.
label_vs_prediction = predictions.groupBy('label', 'predictedLabel').count()
label_vs_prediction.show()

In [55]:
# Accuracy computed by hand: share of test rows whose predicted label equals the actual label.
correct_count = cm.filter(cm.label == cm.predictedLabel).count()
total_count = cm.count()
print("The Accuracy for test set is {}".format(correct_count/total_count))


In [56]:
# Same accuracy, this time computed by MulticlassClassificationEvaluator.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexedLabel",
    predictionCol="prediction",
)
test_accuracy = evaluator.evaluate(predictions)
print("The Accuracy for test set is {}".format(test_accuracy))

In [57]:
# Evaluation: test error is the complement of the accuracy on the test set.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="indexedLabel",
)
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print("Test Error = %g" % test_error)

In [60]:
#Create Confusion Matrix

from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics

# (prediction, label) pairs for the confusion matrix and per-class statistics.
predictionAndLabel = predictions.select("prediction", "indexedLabel").rdd

# BinaryClassificationMetrics expects (score, label) pairs. Feeding it the hard
# 0/1 predictions collapses the ROC/PR curves to a single threshold, so use the
# model's probability for indexed class 1.0 as the score instead.
scoreAndLabel = predictions.select("probability", "indexedLabel").rdd.map(
    lambda row: (float(row["probability"][1]), float(row["indexedLabel"])))

# Instantiate metrics objects
metricsMulti = MulticlassMetrics(predictionAndLabel)
metricsBinary = BinaryClassificationMetrics(scoreAndLabel)

# Statistics for indexed label 1.0
# NOTE(review): which original label maps to index 1.0 depends on label frequency — check labelindexer.labels.
confusionMatrix = metricsMulti.confusionMatrix()
precision = metricsMulti.precision(label=1)
recall = metricsMulti.recall(label=1)
# F1 (beta = 1.0) for the same class as precision/recall; it was previously computed for label 0.0.
f1Score = metricsMulti.fMeasure(1.0, 1.0)
print("Summary Stats")
print("Confusion Matrix = \n %s" % confusionMatrix)
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

# Area under precision-recall curve
print("Area under PR = %s" % metricsBinary.areaUnderPR)
# Area under ROC curve
print("Area under ROC = %s" % metricsBinary.areaUnderROC)

In [61]:
# Compute the area under ROC on the test set.
# The evaluator must rank rows by the model's raw scores; pointing it at the hard
# 0/1 "prediction" column reduces the ROC curve to a single threshold.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="indexedLabel")
print("The area under ROC for test set is {}".format(evaluator.evaluate(predictions)))

In [63]:
# Model Tuning: list every LogisticRegression parameter with its documentation and current value.
lr_param_docs = lr.explainParams()
print(lr_param_docs)

In [64]:
# Hyperparameter tuning using 5-fold cross validation.
# 3 values each for regParam, elasticNetParam and maxIter give
# 3 x 3 x 3 = 27 parameter settings for CrossValidator to choose from.

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10])
             .build())

# Model selection metric: area under ROC computed from the raw scores.
# Using the hard 0/1 "prediction" column here would rank candidate models by a
# single-threshold ROC and could select the wrong hyperparameters.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="indexedLabel")

In [65]:
# Create and run the 5-fold CrossValidator.
# The whole pipeline is the estimator, so every stage is re-fitted inside each fold.
pipeline = Pipeline(stages=[featureIndexer, lr, labelConverter])
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    parallelism=10,
    seed=100,
)
cvModel = cv.fit(trainingData)

In [66]:
# Score the held-out test data with the cross-validated model.
predictions = cvModel.transform(testData)

cv_output_columns = ["features", "label", "probability", "predictedLabel"]
predictions.select(*cv_output_columns).show(n=5)

In [67]:
# Evaluate the best model: accuracy on the test set.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexedLabel",
    predictionCol="prediction",
)
best_model_accuracy = evaluator.evaluate(predictions)
print("The Accuracy for test set is {}".format(best_model_accuracy))

In [69]:
# (prediction, label) pairs for the confusion matrix and per-class statistics.
predictionAndLabel = predictions.select("prediction", "indexedLabel").rdd

# BinaryClassificationMetrics expects (score, label) pairs. Feeding it the hard
# 0/1 predictions collapses the ROC/PR curves to a single threshold, so use the
# model's probability for indexed class 1.0 as the score instead.
scoreAndLabel = predictions.select("probability", "indexedLabel").rdd.map(
    lambda row: (float(row["probability"][1]), float(row["indexedLabel"])))

# Instantiate metrics objects
metricsMulti = MulticlassMetrics(predictionAndLabel)
metricsBinary = BinaryClassificationMetrics(scoreAndLabel)

# Statistics for indexed label 1.0
# NOTE(review): which original label maps to index 1.0 depends on label frequency — check labelindexer.labels.
confusionMatrix = metricsMulti.confusionMatrix()
precision = metricsMulti.precision(label=1)
recall = metricsMulti.recall(label=1)
# F1 (beta = 1.0) for the same class as precision/recall; it was previously computed for label 0.0.
f1Score = metricsMulti.fMeasure(1.0, 1.0)
print("Summary Stats")
print("Confusion Matrix = \n %s" % confusionMatrix)
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

# Area under precision-recall curve
print("Area under PR = %s" % metricsBinary.areaUnderPR)
# Area under ROC curve
print("Area under ROC = %s" % metricsBinary.areaUnderROC)

In [70]:
# Area under ROC for the cross-validated model.
# The evaluator must rank rows by the model's raw scores; pointing it at the hard
# 0/1 "prediction" column reduces the ROC curve to a single threshold.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="indexedLabel")
print("The area under ROC for test set is {}".format(evaluator.evaluate(predictions)))

In [72]:
# Decision Trees
# Fit a decision tree with default hyperparameters and evaluate it on the test set.

# Create initial Decision Tree Model
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="features")

# Train model with Training Data.
dtModel = dt.fit(trainingData)

# Make predictions on test data.
predictions = dtModel.transform(testData)

# Evaluate the model by computing the metrics.
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
print("The Accuracy for test set is {}".format(evaluator.evaluate(predictions)))

print("===============================================")

# (prediction, label) pairs for the confusion matrix and per-class statistics.
predictionAndLabel = predictions.select("prediction", "indexedLabel").rdd

# BinaryClassificationMetrics expects (score, label) pairs. Feeding it the hard
# 0/1 predictions collapses the ROC/PR curves to a single threshold, so use the
# model's probability for indexed class 1.0 as the score instead.
scoreAndLabel = predictions.select("probability", "indexedLabel").rdd.map(
    lambda row: (float(row["probability"][1]), float(row["indexedLabel"])))

# Instantiate metrics objects
metricsMulti = MulticlassMetrics(predictionAndLabel)
metricsBinary = BinaryClassificationMetrics(scoreAndLabel)

# Statistics for indexed label 1.0
# NOTE(review): which original label maps to index 1.0 depends on label frequency — check labelindexer.labels.
confusionMatrix = metricsMulti.confusionMatrix()
precision = metricsMulti.precision(label=1)
recall = metricsMulti.recall(label=1)
# F1 (beta = 1.0) for the same class as precision/recall; it was previously computed for label 0.0.
f1Score = metricsMulti.fMeasure(1.0, 1.0)
print("Summary Stats")
print("Confusion Matrix = \n %s" % confusionMatrix)
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

# Area under precision-recall curve
print("Area under PR = %s" % metricsBinary.areaUnderPR)
# Area under ROC curve
print("Area under ROC = %s" % metricsBinary.areaUnderROC)

print("===============================================")

# Area under ROC from the raw scores rather than the hard 0/1 "prediction" column.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="indexedLabel")
print("The area under ROC for test set is {}".format(evaluator.evaluate(predictions)))

In [73]:
# Hyperparameter tuning of the decision tree using 5-fold cross validation
# (4 maxDepth values x 3 maxBins values = 12 candidate settings).
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [1, 2, 6, 10])
             .addGrid(dt.maxBins, [20, 40, 80])
             .build())

# Model selection metric: area under ROC computed from the raw scores.
# Using the hard 0/1 "prediction" column here would rank candidate models by a
# single-threshold ROC and could select the wrong hyperparameters.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="indexedLabel")

pipeline = Pipeline(stages=[featureIndexer, dt, labelConverter])
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5, parallelism=10, seed=100)
cvModel = cv.fit(trainingData)

predictions = cvModel.transform(testData)

# Evaluate the best model
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
print("The Accuracy for test set is {}".format(evaluator.evaluate(predictions)))

print("===============================================")

# (prediction, label) pairs for the confusion matrix and per-class statistics.
predictionAndLabel = predictions.select("prediction", "indexedLabel").rdd

# BinaryClassificationMetrics expects (score, label) pairs. Feeding it the hard
# 0/1 predictions collapses the ROC/PR curves to a single threshold, so use the
# model's probability for indexed class 1.0 as the score instead.
scoreAndLabel = predictions.select("probability", "indexedLabel").rdd.map(
    lambda row: (float(row["probability"][1]), float(row["indexedLabel"])))

# Instantiate metrics objects
metricsMulti = MulticlassMetrics(predictionAndLabel)
metricsBinary = BinaryClassificationMetrics(scoreAndLabel)

# Statistics for indexed label 1.0
# NOTE(review): which original label maps to index 1.0 depends on label frequency — check labelindexer.labels.
confusionMatrix = metricsMulti.confusionMatrix()
precision = metricsMulti.precision(label=1)
recall = metricsMulti.recall(label=1)
# F1 (beta = 1.0) for the same class as precision/recall; it was previously computed for label 0.0.
f1Score = metricsMulti.fMeasure(1.0, 1.0)
print("Summary Stats")
print("Confusion Matrix = \n %s" % confusionMatrix)
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

# Area under precision-recall curve
print("Area under PR = %s" % metricsBinary.areaUnderPR)
# Area under ROC curve
print("Area under ROC = %s" % metricsBinary.areaUnderROC)

print("===============================================")

# Area under ROC from the raw scores rather than the hard 0/1 "prediction" column.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="indexedLabel")
print("The area under ROC for test set is {}".format(evaluator.evaluate(predictions)))

In [75]:
# Random Forest
# Fit a random forest with default hyperparameters and evaluate it on the test set.

# Create initial Random Forest Classifier
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features")

# Train model with Training Data.
rfModel = rf.fit(trainingData)

# Make predictions on test data.
predictions = rfModel.transform(testData)

# Evaluate the model by computing the metrics.
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
print("The Accuracy for test set is {}".format(evaluator.evaluate(predictions)))

print("===============================================")

# (prediction, label) pairs for the confusion matrix and per-class statistics.
predictionAndLabel = predictions.select("prediction", "indexedLabel").rdd

# BinaryClassificationMetrics expects (score, label) pairs. Feeding it the hard
# 0/1 predictions collapses the ROC/PR curves to a single threshold, so use the
# model's probability for indexed class 1.0 as the score instead.
scoreAndLabel = predictions.select("probability", "indexedLabel").rdd.map(
    lambda row: (float(row["probability"][1]), float(row["indexedLabel"])))

# Instantiate metrics objects
metricsMulti = MulticlassMetrics(predictionAndLabel)
metricsBinary = BinaryClassificationMetrics(scoreAndLabel)

# Statistics for indexed label 1.0
# NOTE(review): which original label maps to index 1.0 depends on label frequency — check labelindexer.labels.
confusionMatrix = metricsMulti.confusionMatrix()
precision = metricsMulti.precision(label=1)
recall = metricsMulti.recall(label=1)
# F1 (beta = 1.0) for the same class as precision/recall; it was previously computed for label 0.0.
f1Score = metricsMulti.fMeasure(1.0, 1.0)
print("Summary Stats")
print("Confusion Matrix = \n %s" % confusionMatrix)
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

# Area under precision-recall curve
print("Area under PR = %s" % metricsBinary.areaUnderPR)
# Area under ROC curve
print("Area under ROC = %s" % metricsBinary.areaUnderROC)

print("===============================================")

# Area under ROC from the raw scores rather than the hard 0/1 "prediction" column.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="indexedLabel")
print("The area under ROC for test set is {}".format(evaluator.evaluate(predictions)))

In [77]:
# Hyperparameter tuning of the random forest using 5-fold cross validation
# (3 maxDepth x 2 maxBins x 2 numTrees values = 12 candidate settings).
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [2, 4, 6])
             .addGrid(rf.maxBins, [20, 60])
             .addGrid(rf.numTrees, [5, 20])
             .build())

# Model selection metric: area under ROC computed from the raw scores.
# Using the hard 0/1 "prediction" column here would rank candidate models by a
# single-threshold ROC and could select the wrong hyperparameters.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="indexedLabel")

pipeline = Pipeline(stages=[featureIndexer, rf, labelConverter])
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5, parallelism=10, seed=100)
cvModel = cv.fit(trainingData)

predictions = cvModel.transform(testData)

# Evaluate the best model
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
print("The Accuracy for test set is {}".format(evaluator.evaluate(predictions)))

print("===============================================")

# (prediction, label) pairs for the confusion matrix and per-class statistics.
predictionAndLabel = predictions.select("prediction", "indexedLabel").rdd

# BinaryClassificationMetrics expects (score, label) pairs. Feeding it the hard
# 0/1 predictions collapses the ROC/PR curves to a single threshold, so use the
# model's probability for indexed class 1.0 as the score instead.
scoreAndLabel = predictions.select("probability", "indexedLabel").rdd.map(
    lambda row: (float(row["probability"][1]), float(row["indexedLabel"])))

# Instantiate metrics objects
metricsMulti = MulticlassMetrics(predictionAndLabel)
metricsBinary = BinaryClassificationMetrics(scoreAndLabel)

# Statistics for indexed label 1.0
# NOTE(review): which original label maps to index 1.0 depends on label frequency — check labelindexer.labels.
confusionMatrix = metricsMulti.confusionMatrix()
precision = metricsMulti.precision(label=1)
recall = metricsMulti.recall(label=1)
# F1 (beta = 1.0) for the same class as precision/recall; it was previously computed for label 0.0.
f1Score = metricsMulti.fMeasure(1.0, 1.0)
print("Summary Stats")
print("Confusion Matrix = \n %s" % confusionMatrix)
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

# Area under precision-recall curve
print("Area under PR = %s" % metricsBinary.areaUnderPR)
# Area under ROC curve
print("Area under ROC = %s" % metricsBinary.areaUnderROC)

print("===============================================")

# Area under ROC from the raw scores rather than the hard 0/1 "prediction" column.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="indexedLabel")
print("The area under ROC for test set is {}".format(evaluator.evaluate(predictions)))


In [None]:
# Convert a fitted Spark ML model to ONNX with onnxmltools.
# NOTE(review): this cell has no execution count and references `test_df`, `model`
# and `pipelineModel`, none of which are defined in the visible cells (the notebook
# defines `testData`, `lrModel`, `dtModel`, `rfModel` and `cvModel`). As written it
# raises NameError on a fresh kernel — confirm which objects were intended.
from onnxmltools import convert_sparkml
from onnxmltools.convert.sparkml.utils import buildInitialTypesSimple
# Derive the ONNX input types from the dataframe schema, excluding the "label" column.
initial_types = buildInitialTypesSimple(test_df.drop("label"))
onnx_model = convert_sparkml(model, 'Churn prediction model', initial_types, spark_session = spark)

# NOTE(review): second conversion repeats the import above, targets a different
# (also undefined) model variable and omits spark_session; `save_model` is imported
# but never called, so nothing is written to disk — presumably an unfinished draft.
from onnxmltools import convert_sparkml
from onnxmltools.utils import save_model
model_onnx = convert_sparkml(pipelineModel, 'churn prediction model', initial_types)
# Bare expression: shows the converted model as the cell output.
model_onnx