In [1]:
from pyspark.sql import SparkSession

# Reuse an already-running session if there is one, otherwise start a new one
# registered under the application name "customer_churn".
spark = (
    SparkSession.builder
    .appName("customer_churn")
    .getOrCreate()
)

## Import data

In [2]:
# Load the labelled churn data: the first row is the header and column types
# are inferred from the file contents.
data = spark.read.csv(
    "customer_churn.csv",
    header=True,
    inferSchema=True,
)

In [3]:
# Inspect the inferred column types, then preview the first five rows.
data.printSchema()
data.show(5)

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)

+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|   Kevin Mueller|41.0|      1

## Add a column 'days_since_Onboard': number of days from onboarding until today

In [4]:
# Add a 'days_since_Onboard' column: whole days between Onboard_date and today.
# NOTE(review): current_date() is evaluated at run time, so this feature (and
# any model fitted on it) shifts every day the notebook is re-run.

from pyspark.sql.functions import datediff, current_date

days_since_onboard = datediff(current_date(), data['Onboard_date'])
# withColumn already names the resulting column, so no extra alias is needed.
data = data.withColumn("days_since_Onboard", days_since_onboard)

data.printSchema()
data.show(5)

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)
 |-- days_since_Onboard: integer (nullable = true)

+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+------------------+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|days_since_Onboard|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+------------------+
|Cameron Williams|42.0|       11066.8|              0| 7.22| 

## Select only columns with numeric data types

In [5]:
# Collect the names of every column whose Spark dtype string is 'double' or 'int'.
numeric_dtypes = ('double', 'int')
cols = [name for name, dtype in data.dtypes if dtype in numeric_dtypes]

In [6]:
# Project the DataFrame down to the numeric columns gathered in `cols`.
data_model = data.select(*cols)
data_model.show(5)

+----+--------------+---------------+-----+---------+-----+------------------+
| Age|Total_Purchase|Account_Manager|Years|Num_Sites|Churn|days_since_Onboard|
+----+--------------+---------------+-----+---------+-----+------------------+
|42.0|       11066.8|              0| 7.22|      8.0|    1|              1554|
|41.0|      11916.22|              0|  6.5|     11.0|    1|              1571|
|38.0|      12884.75|              0| 6.67|     12.0|    1|               520|
|42.0|       8010.76|              0| 6.71|     10.0|    1|              1319|
|37.0|       9191.58|              0| 5.56|      9.0|    1|               682|
+----+--------------+---------------+-----+---------+-----+------------------+
only showing top 5 rows



In [7]:
# Head-count overall and split by the Account_Manager flag (0 = none, 1 = assigned).
manager_flag = data_model['Account_Manager']
total_customers = data_model.count()
without_manager = data_model.filter(manager_flag == 0).count()
with_manager = data_model.filter(manager_flag == 1).count()

print('Total customers:', total_customers)
print('Customers with no account manager:', without_manager)
print('Customers with account manager:', with_manager)

Total customers: 900
Customers with no account manager: 467
Customers with account manager: 433


In [8]:
from pyspark.sql.functions import corr
# Correlation between the Churn label and the Account_Manager flag, computed
# over the full dataset (`data`, not the filtered `data_model`).
data.select(corr("Churn","Account_Manager")).show()

+----------------------------+
|corr(Churn, Account_Manager)|
+----------------------------+
|         0.07061077173214911|
+----------------------------+



## There is some positive correlation between 'Churn' and 'Account_Manager' (about 0.07), but it is small

## The goal is to predict whether a customer will churn without an account manager assigned
## Since account managers are randomly assigned, let's leave that column out of the model and select only customers with no account manager

In [9]:
# Restrict the modelling set to customers that have no account manager.
no_manager = data_model['Account_Manager'] == 0
data_model = data_model.where(no_manager)
print('Customer with no account manager to use in the model:', data_model.count())

data_model.show(5)

Customer with no account manager to use in the model: 467
+----+--------------+---------------+-----+---------+-----+------------------+
| Age|Total_Purchase|Account_Manager|Years|Num_Sites|Churn|days_since_Onboard|
+----+--------------+---------------+-----+---------+-----+------------------+
|42.0|       11066.8|              0| 7.22|      8.0|    1|              1554|
|41.0|      11916.22|              0|  6.5|     11.0|    1|              1571|
|38.0|      12884.75|              0| 6.67|     12.0|    1|               520|
|42.0|       8010.76|              0| 6.71|     10.0|    1|              1319|
|37.0|       9191.58|              0| 5.56|      9.0|    1|               682|
+----+--------------+---------------+-----+---------+-----+------------------+
only showing top 5 rows



In [10]:
# After the filter above every remaining row has Account_Manager == 0, so the
# column is constant; drop it before building features.
data_model = data_model.drop('Account_Manager')
data_model.show(5)

+----+--------------+-----+---------+-----+------------------+
| Age|Total_Purchase|Years|Num_Sites|Churn|days_since_Onboard|
+----+--------------+-----+---------+-----+------------------+
|42.0|       11066.8| 7.22|      8.0|    1|              1554|
|41.0|      11916.22|  6.5|     11.0|    1|              1571|
|38.0|      12884.75| 6.67|     12.0|    1|               520|
|42.0|       8010.76| 6.71|     10.0|    1|              1319|
|37.0|       9191.58| 5.56|      9.0|    1|               682|
+----+--------------+-----+---------+-----+------------------+
only showing top 5 rows



In [11]:
# Feature columns are every modelling column except the 'Churn' label.
vecCols = [name for name in data_model.columns if name != 'Churn']
vecCols

['Age', 'Total_Purchase', 'Years', 'Num_Sites', 'days_since_Onboard']

In [12]:
from pyspark.ml.feature import VectorAssembler

# Pack the predictor columns into a single vector column named 'features',
# then keep only the label and that vector for training.
assembler = VectorAssembler(inputCols=vecCols, outputCol='features')
assembled = assembler.transform(data_model)
data_feed = assembled.select('Churn', 'features')
data_feed.show(5)

+-----+--------------------+
|Churn|            features|
+-----+--------------------+
|    1|[42.0,11066.8,7.2...|
|    1|[41.0,11916.22,6....|
|    1|[38.0,12884.75,6....|
|    1|[42.0,8010.76,6.7...|
|    1|[37.0,9191.58,5.5...|
+-----+--------------------+
only showing top 5 rows



In [13]:
# 70/30 train/test split. The seed is fixed so the split, and therefore every
# metric reported below, is reproducible across kernel restarts.
train_data, test_data = data_feed.randomSplit([0.7, 0.3], seed=42)

## Build a model

In [14]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the assembled feature vector with 'Churn' as the label;
# no other hyper-parameters are set here.
logReg = LogisticRegression(
    labelCol='Churn',
    featuresCol='features',
)
logReg_trained = logReg.fit(train_data)

In [15]:
# Preview the predictions stored on the training summary (training rows only).
logReg_trained.summary.predictions.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|Churn|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|[25.0,9672.03,5.4...|[6.10755038453111...|[0.99777894708088...|       0.0|
|  0.0|[28.0,11204.23,3....|[2.57296956546668...|[0.92910155613014...|       0.0|
|  0.0|[28.0,11245.38,6....|[3.94298207931874...|[0.98097852714545...|       0.0|
|  0.0|[29.0,9378.24,4.9...|[4.96194947726990...|[0.99304937965899...|       0.0|
|  0.0|[29.0,9617.59,5.4...|[4.91974328684861...|[0.99275191344503...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [16]:
# Score the held-out rows; transform() returns a plain DataFrame with the
# rawPrediction / probability / prediction columns appended.
test_results = logReg_trained.transform(test_data)

In [17]:
# Preview the first five scored test rows.
test_results.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|Churn|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|    0|[26.0,8939.61,4.5...|[7.44290306906282...|[0.99941475996603...|       0.0|
|    0|[28.0,8670.98,3.9...|[8.70498946309901...|[0.99983427076339...|       0.0|
|    0|[30.0,6744.87,5.1...|[3.89790318797779...|[0.98011887717878...|       0.0|
|    0|[30.0,8874.83,5.5...|[4.34816101815511...|[0.98723449543560...|       0.0|
|    0|[30.0,12788.37,4....|[3.37787740922448...|[0.96700595009313...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



## Use .evaluate to get more complete info of evaluation on the test_data

In [18]:
# evaluate() scores the same held-out rows but returns a summary object
# (exposing .predictions and metrics such as .areaUnderROC) instead of a DataFrame.
test_results_2 = logReg_trained.evaluate(test_data)

In [19]:
# Preview the predictions DataFrame carried by the evaluation summary.
test_results_2.predictions.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|Churn|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|    0|[26.0,8939.61,4.5...|[7.44290306906282...|[0.99941475996603...|       0.0|
|    0|[28.0,8670.98,3.9...|[8.70498946309901...|[0.99983427076339...|       0.0|
|    0|[30.0,6744.87,5.1...|[3.89790318797779...|[0.98011887717878...|       0.0|
|    0|[30.0,8874.83,5.5...|[4.34816101815511...|[0.98723449543560...|       0.0|
|    0|[30.0,12788.37,4....|[3.37787740922448...|[0.96700595009313...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [20]:
# Compare what each route hands back: training summary, transform(), evaluate().
for result in (logReg_trained.summary, test_results, test_results_2):
    print(type(result))

<class 'pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary'>
<class 'pyspark.sql.dataframe.DataFrame'>
<class 'pyspark.ml.classification.BinaryLogisticRegressionSummary'>


## Evaluation

In [21]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [22]:
print("Area under ROC:")
# Score rawPrediction against the 'Churn' label on the held-out predictions.
bi_eval = BinaryClassificationEvaluator(
    labelCol='Churn',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderROC',
)
bi_eval.evaluate(test_results_2.predictions)

Area under ROC:


0.9125631313131312

In [23]:
print("Easier way to get Area under ROC:")
# The evaluation summary exposes the same metric directly as an attribute.
test_results_2.areaUnderROC

Easier way to get Area under ROC:


0.9125631313131312

In [24]:
print("Accuracy:")
# Fraction of held-out rows whose predicted class matches the 'Churn' label.
multi_eval = MulticlassClassificationEvaluator(
    labelCol='Churn',
    predictionCol="prediction",
    metricName='accuracy',
)
multi_eval.evaluate(test_results_2.predictions)

Accuracy:


0.8846153846153846

# Make predictions on new_customer.csv

## Re-train the model with all data

In [25]:
# Refit the same estimator on all of data_feed (train and test rows together)
# before scoring the new customers.
logReg_trained_new = logReg.fit(data_feed)

In [26]:
# Preview the retrained model's predictions on its own training rows.
logReg_trained_new.summary.predictions.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|Churn|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  1.0|[42.0,11066.8,7.2...|[2.23652653000433...|[0.90348198797847...|       0.0|
|  1.0|[41.0,11916.22,6....|[-0.5986565768745...|[0.35465110792153...|       1.0|
|  1.0|[38.0,12884.75,6....|[-1.7123122602926...|[0.15286404544922...|       1.0|
|  1.0|[42.0,8010.76,6.7...|[0.38729397854017...|[0.59563111035065...|       0.0|
|  1.0|[37.0,9191.58,5.5...|[2.73066510395521...|[0.93881205455145...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



## Prepare new customer data

In [27]:
# Load the unlabelled customers to score: the first row is the header and
# column types are inferred from the file contents.
new_cus = spark.read.csv(
    "new_customers.csv",
    header=True,
    inferSchema=True,
)

In [28]:
# Check the inferred schema; it should match the training data minus 'Churn'.
new_cus.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [29]:
# Preview the raw new-customer rows.
new_cus.show(5)

+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+
|         Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|         Company|
+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+
| Andrew Mccall|37.0|       9935.53|              1| 7.71|      8.0|2011-08-29 18:37:54|38612 Johnny Stra...|        King Ltd|
|Michele Wright|23.0|       7526.94|              1| 9.28|     15.0|2013-07-22 18:19:54|21083 Nicole Junc...|   Cannon-Benson|
|  Jeremy Chang|65.0|         100.0|              1|  1.0|     15.0|2006-12-11 07:48:13|085 Austin Views ...|Barron-Robertson|
|Megan Ferguson|32.0|        6487.5|              0|  9.4|     14.0|2016-10-28 05:32:13|922 Wright Branch...|   Sexton-Golden|
|  Taylor Young|32.0|      13147.71|              1| 10.0|      8.0|2012-03-20 00:36:46|Unit 0789 Box 073...|  

In [30]:
# Derive the same 'days_since_Onboard' feature that the training data carries.
# withColumn already names the resulting column, so no extra alias is needed.
new_cus_days = datediff(current_date(), new_cus['Onboard_date'])
new_cus = new_cus.withColumn("days_since_Onboard", new_cus_days)

In [31]:
# Confirm the derived column was appended.
new_cus.show(5)

+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+------------------+
|         Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|         Company|days_since_Onboard|
+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+------------------+
| Andrew Mccall|37.0|       9935.53|              1| 7.71|      8.0|2011-08-29 18:37:54|38612 Johnny Stra...|        King Ltd|              2286|
|Michele Wright|23.0|       7526.94|              1| 9.28|     15.0|2013-07-22 18:19:54|21083 Nicole Junc...|   Cannon-Benson|              1593|
|  Jeremy Chang|65.0|         100.0|              1|  1.0|     15.0|2006-12-11 07:48:13|085 Austin Views ...|Barron-Robertson|              4008|
|Megan Ferguson|32.0|        6487.5|              0|  9.4|     14.0|2016-10-28 05:32:13|922 Wright Branch...|   Sexton-Golde

In [32]:
from copy import deepcopy

# Start from a private copy of the numeric column list so `cols` itself is not
# mutated, then strip the label and the column excluded from the model.
new_cus_cols = deepcopy(cols)
for dropped in ('Churn', 'Account_Manager'):
    new_cus_cols.remove(dropped)

In [33]:
# Keep the 'Names' column alongside the feature columns so each prediction can
# be traced back to a customer.
# NOTE(review): the model was fitted only on rows with Account_Manager == 0, but
# no such filter is applied here — confirm that scoring managed customers is intended.
new_cus_data = new_cus.select(['Names'] + new_cus_cols)
new_cus_data.show(5)

+--------------+----+--------------+-----+---------+------------------+
|         Names| Age|Total_Purchase|Years|Num_Sites|days_since_Onboard|
+--------------+----+--------------+-----+---------+------------------+
| Andrew Mccall|37.0|       9935.53| 7.71|      8.0|              2286|
|Michele Wright|23.0|       7526.94| 9.28|     15.0|              1593|
|  Jeremy Chang|65.0|         100.0|  1.0|     15.0|              4008|
|Megan Ferguson|32.0|        6487.5|  9.4|     14.0|               399|
|  Taylor Young|32.0|      13147.71| 10.0|      8.0|              2082|
+--------------+----+--------------+-----+---------+------------------+
only showing top 5 rows



In [34]:
# Assemble the new customers' feature columns into a 'features' vector, keeping
# 'Names' as the identifier next to it.
new_cus_assembler = VectorAssembler(inputCols=new_cus_cols, outputCol='features')
new_cus_assembled = new_cus_assembler.transform(new_cus_data)
new_cus_feed = new_cus_assembled.select('Names', 'features')
new_cus_feed.show()

+--------------+--------------------+
|         Names|            features|
+--------------+--------------------+
| Andrew Mccall|[37.0,9935.53,7.7...|
|Michele Wright|[23.0,7526.94,9.2...|
|  Jeremy Chang|[65.0,100.0,1.0,1...|
|Megan Ferguson|[32.0,6487.5,9.4,...|
|  Taylor Young|[32.0,13147.71,10...|
| Jessica Drake|[22.0,8445.26,3.4...|
+--------------+--------------------+



## Make predictions

In [35]:
# Score the new customers with the model refitted on all labelled data.
new_cus_pred = logReg_trained_new.transform(new_cus_feed)

In [36]:
# Show every scored customer; prediction 1.0 means the model expects churn.
new_cus_pred.show()

+--------------+--------------------+--------------------+--------------------+----------+
|         Names|            features|       rawPrediction|         probability|prediction|
+--------------+--------------------+--------------------+--------------------+----------+
| Andrew Mccall|[37.0,9935.53,7.7...|[2.64005513006689...|[0.93339539185674...|       0.0|
|Michele Wright|[23.0,7526.94,9.2...|[-4.7916586352367...|[0.00823038017150...|       1.0|
|  Jeremy Chang|[65.0,100.0,1.0,1...|[-3.3301432046926...|[0.03455145299582...|       1.0|
|Megan Ferguson|[32.0,6487.5,9.4,...|[-4.9249814260398...|[0.00721049237351...|       1.0|
|  Taylor Young|[32.0,13147.71,10...|[1.50674699488292...|[0.81857861120005...|       0.0|
| Jessica Drake|[22.0,8445.26,3.4...|[0.51843421908199...|[0.62678156153494...|       0.0|
+--------------+--------------------+--------------------+--------------------+----------+



## Need to reconsider whether or not to drop 'Account_Manager' from the training data