In [44]:
from pyspark.sql import SparkSession

In [45]:
spark = SparkSession.builder.appName('customers').getOrCreate()

In [46]:
df = spark.read.csv('customer_churn.csv' , inferSchema= True , header= True)

In [47]:
df.show(2)

+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------+-----+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|   Company|Churn|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------+-----+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|Harvey LLC|    1|
|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|Wilson PLC|    1|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------+-----+
only showing top 2 rows



In [48]:
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [49]:
# selecting the needed columns 

df_new = df.select([
    'Age' , 
    'Total_Purchase' , 
    'Years' , 
    'Num_Sites' , 
    'Churn'
])

In [50]:
df_new.show(1)

+----+--------------+-----+---------+-----+
| Age|Total_Purchase|Years|Num_Sites|Churn|
+----+--------------+-----+---------+-----+
|42.0|       11066.8| 7.22|      8.0|    1|
+----+--------------+-----+---------+-----+
only showing top 1 row



In [51]:
df_new.columns

['Age', 'Total_Purchase', 'Years', 'Num_Sites', 'Churn']

In [52]:
from pyspark.sql.functions import isnan, when, count, col

In [53]:
# count of nulls 
df_new.select([count(when(isnan(c), c)).alias(c) for c in df_new.columns]).show()

+---+--------------+-----+---------+-----+
|Age|Total_Purchase|Years|Num_Sites|Churn|
+---+--------------+-----+---------+-----+
|  0|             0|    0|        0|    0|
+---+--------------+-----+---------+-----+



In [54]:
from pyspark.ml.feature import VectorAssembler

In [55]:
assumbler = VectorAssembler(inputCols=[
    'Age',
    'Total_Purchase', 
    'Years', 
    'Num_Sites'
] , outputCol= 'features')

In [56]:
output = assumbler.transform(df)

In [57]:
final_data = output.select('features' , 'Churn')

In [58]:
# splitting the data 

train_data , test_data = final_data.randomSplit([0.7 , 0.3]) 

In [59]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline 

In [60]:
lr_churn = LogisticRegression( featuresCol='features' , 
                                     labelCol= 'Churn')


In [61]:
fitted_churn_model = lr_churn.fit(train_data)

In [62]:
# show summary 

training_summary = fitted_churn_model.summary

In [63]:
training_summary.predictions.describe().show()



+-------+------------------+------------------+
|summary|             Churn|        prediction|
+-------+------------------+------------------+
|  count|               628|               628|
|   mean|0.1751592356687898|0.1321656050955414|
| stddev|0.3804062381494544|0.3389406867257665|
|    min|               0.0|               0.0|
|    max|               1.0|               1.0|
+-------+------------------+------------------+



In [64]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [65]:
pred_and_labels = fitted_churn_model.evaluate(test_data)

In [66]:
pred_and_labels.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|Churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[26.0,8787.39,5.4...|    1|[0.76972650037697...|[0.68346172730619...|       0.0|
|[27.0,8628.8,5.3,...|    0|[5.28218130862694...|[0.99494435469028...|       0.0|
|[28.0,11204.23,3....|    0|[1.47160163497859...|[0.81330070460468...|       0.0|
|[29.0,5900.78,5.5...|    0|[4.16612223056924...|[0.98472465886991...|       0.0|
|[29.0,12711.15,5....|    0|[4.54980268433604...|[0.98954125187394...|       0.0|
|[30.0,7960.64,2.7...|    1|[3.34871518275153...|[0.96606273764776...|       0.0|
|[30.0,12788.37,4....|    0|[1.97257162561516...|[0.87788706297148...|       0.0|
|[30.0,13473.35,3....|    0|[2.18392051969072...|[0.89879624664540...|       0.0|
|[31.0,5304.6,5.29...|    0|[3.16260075892582...|[0.95940236571893...|       0.0|
|[31.0,7073.61,5

In [67]:
churn_eval = BinaryClassificationEvaluator(rawPredictionCol= 'prediction' ,
                                            labelCol= 'Churn')

In [68]:
# Area Under the Curve 

AUC = churn_eval.evaluate(pred_and_labels.predictions)

In [69]:
AUC

0.7202586206896552

In [70]:
# Mimic applying the model to new data 

# First I need to apply the model to the whole dataset 

final_lr_model = lr_churn.fit(final_data)

In [71]:
new_customers = spark.read.csv('new_customers.csv' , inferSchema=True ,header= True)

In [72]:
# making sure it's same data & format 
new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [73]:
test_new_customers = assumbler.transform(new_customers)

In [74]:
final_result = final_lr_model.transform(test_new_customers)

In [75]:
final_result.select('Company' , 'prediction').show()

+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+

