In [1]:
import matplotlib.pyplot as plt
from datetime import datetime
from dateutil import parser
from pyspark.sql.functions import unix_timestamp, date_format, col, when
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel
from pyspark.ml.feature import RFormula
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [2]:
import pandas as pd

In [3]:
# Load the POC dataset from the ADLS Gen2 data lake.
# The file's first row holds the column names (the later re-load of this same
# file reads it with header=true), so treat it as a header here as well.
# Without this option the header row is counted and displayed as a data row
# and the columns are named _c0, _c1, ...
df = spark.read.option("header", "true").csv('abfss://synapsedatalake@synapseaiadadls.dfs.core.windows.net/pocdata.csv')

In [4]:
# Sanity check: number of rows in the loaded DataFrame (runs a Spark job).
df.count()

In [5]:
# Render the loaded DataFrame with the notebook's rich display helper for a visual check.
display(df)

In [6]:
from pyspark.sql.functions import col, desc

# (Re)load the POC dataset, treating the first CSV row as column names.
# No schema inference is requested, so every column arrives as a string.
df = (
    spark.read
    .option("header", "true")
    .csv('abfss://synapsedatalake@synapseaiadadls.dfs.core.windows.net/pocdata.csv')
)

In [7]:
# Alias the loaded frame as `pocdata` for the modelling cells, and register it
# under the same name as a temp view so it can also be queried with Spark SQL.
pocdata = df
pocdata.createOrReplaceTempView("pocdata")

In [8]:
# Preview the first rows of the dataset. show() must be *called*: the bare
# attribute `pocdata.show` only echoes the bound-method repr and prints no data.
pocdata.show()

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Categorical encoding for Gender and Segment.
# Each column is first mapped to a numeric index (StringIndexer) and that index
# is then expanded into a one-hot vector (OneHotEncoder). dropLast=False keeps
# a vector slot for every category instead of dropping the last one.
sI1 = StringIndexer(inputCol="Gender", outputCol="GenderIndex")
sI2 = StringIndexer(inputCol="Segment", outputCol="SegmentIndex")
en1 = OneHotEncoder(inputCol="GenderIndex", outputCol="GenderVec", dropLast=False)
en2 = OneHotEncoder(inputCol="SegmentIndex", outputCol="SegmentVec", dropLast=False)

# Fit the four stages on the full dataset and append the Index/Vec columns to it.
encoded_final_df = (
    Pipeline(stages=[sI1, en1, sI2, en2])
    .fit(pocdata)
    .transform(pocdata)
)

In [11]:
# Hold-out configuration: 70% of the rows go to training, the remainder to
# testing. The fixed seed makes the random assignment repeatable.
seed = 1234
trainingFraction = 0.7
testingFraction = 1 - trainingFraction

# Randomly partition the encoded DataFrame into the training and testing sets.
train_data_pocdata, test_data_pocdata = encoded_final_df.randomSplit(
    weights=[trainingFraction, testingFraction],
    seed=seed,
)

In [12]:
## Train a logistic-regression churn model on the training split, persist it,
## and report area under the ROC curve on the testing split.

## Create a new logistic regression object for the model.
## The label column is "label": that is the column RFormula emits for the
## left-hand side of the formula (CHURN), and it is the same column the
## evaluation below reads. The raw CHURN column is loaded from CSV without
## schema inference, i.e. as a string, which LogisticRegression does not accept
## as a label.
## NOTE(review): for a string label RFormula indexes values by descending
## frequency, so label 0.0 is presumably the majority CHURN value -- confirm.
logReg = LogisticRegression(maxIter=10, regParam=0.3, labelCol='label')

## The formula for the model
classFormula = RFormula(formula="CHURN ~ CUSTID + GenderVec + Age_Band + State + LGA + Account_Open_Date + Tenure + Occupation + Education + SegmentVec + Marital_Status + No_of_Accounts + GDP_in_Billions_of_USSD + Inflation + Population + txn_amount_M1 + txn_vol_M1 + txn_amount_M2 + txn_vol_M2 + txn_amount_M3 + txn_vol_M3 + F1 + F2 + Latest_TxnDate + Recency + Freq_M1 + Freq_M2 + F3")

## Undertake training and create a logistic regression model.
## The training/testing frames are the ones produced by the randomSplit cell.
lrModel = Pipeline(stages=[classFormula, logReg]).fit(train_data_pocdata)

## Saving the model is optional, but it's another form of inter-session cache.
## The suffix is <month-day-year>-<epoch seconds>. The epoch part comes from
## datetime.timestamp() because the strftime '%s' directive is a non-portable
## platform extension that is not among Python's documented format codes.
now = datetime.now()
datestamp = "%s-%d" % (now.strftime('%m-%d-%Y'), int(now.timestamp()))
fileName = "lrModel_"  +  datestamp
logRegDirfilename = fileName
lrModel.save(logRegDirfilename)

## Predict churn 1/0 (yes/no) on the test dataset; evaluation using area under ROC.
## BinaryClassificationMetrics expects (score, label) pairs in that order, so
## the prediction column must come first.
predictions = lrModel.transform(test_data_pocdata)
predictionAndLabels = predictions.select("prediction","label").rdd
metrics = BinaryClassificationMetrics(predictionAndLabels)
print("Area under ROC = %s" % metrics.areaUnderROC)