# Research Question: Which Customers Will Churn?

## Import Spark Session

In [0]:
from pyspark.sql import SparkSession

# Entry point for all Spark work in this notebook. getOrCreate() is called on
# a builder configured with the application name "churn".
spark = (
    SparkSession.builder
    .appName("churn")
    .getOrCreate()
)

## Import Data

In [0]:
import pandas as pd

In [0]:
# The training CSV is parsed with pandas first and then converted into a
# Spark DataFrame.
local_file_path_1 = "customer_churn.csv"
pandas_df_1 = pd.read_csv(filepath_or_buffer=local_file_path_1)
df = spark.createDataFrame(data=pandas_df_1)

In [0]:
# Inspect the column names and types that Spark derived from the pandas frame.
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: long (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: long (nullable = true)



In [0]:
# Rename every column to its lowercase form so later cells can refer to the
# columns with one consistent spelling (e.g. 'Total_Purchase' -> 'total_purchase').
current_columns = df.columns
# toDF assigns the new names positionally in a single call, replacing a
# per-column withColumnRenamed loop. `df` itself is left unchanged.
df_renamed = df.toDF(*[name.lower() for name in current_columns])
print(df_renamed.columns)

['names', 'age', 'total_purchase', 'account_manager', 'years', 'num_sites', 'onboard_date', 'location', 'company', 'churn']


In [0]:
# Summary statistics for every column of the renamed training data.
df_renamed.describe().show()

+-------+-------------+-----------------+------------------+-------------------+-----------------+------------------+-------------------+--------------------+--------------------+-------------------+
|summary|        names|              age|    total_purchase|    account_manager|            years|         num_sites|       onboard_date|            location|             company|              churn|
+-------+-------------+-----------------+------------------+-------------------+-----------------+------------------+-------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|               900|                900|              900|               900|                900|                 900|                 900|                900|
|   mean|         NULL|41.81666666666667|10062.824033333332| 0.4811111111111111|5.273155555555555| 8.587777777777777|               NULL|                NULL|                NULL|0.16666666666666666|


### Transform into Vector-Type Column

In [0]:
# List the available columns before choosing which ones become model features.
df_renamed.columns

['names',
 'age',
 'total_purchase',
 'account_manager',
 'years',
 'num_sites',
 'onboard_date',
 'location',
 'company',
 'churn']

In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler 

In [0]:
# Pack the five numeric inputs into one vector column named 'features'.
# The string columns (names, onboard_date, location, company) are not included.
assembler = VectorAssembler(
    inputCols=[
        'age',
        'total_purchase',
        'account_manager',
        'years',
        'num_sites',
    ],
    outputCol='features',
)
output = assembler.transform(df_renamed)

In [0]:
# Keep only the assembled feature vector and the churn label for modelling.
final_df = output.select(['features', 'churn'])
# Preview a handful of rows rather than the full frame.
display(final_df.limit(5))

features,churn
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""42.0"",""11066.8"",""0.0"",""7.22"",""8.0""]}",1
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""41.0"",""11916.22"",""0.0"",""6.5"",""11.0""]}",1
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""38.0"",""12884.75"",""0.0"",""6.67"",""12.0""]}",1
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""42.0"",""8010.76"",""0.0"",""6.71"",""10.0""]}",1
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""37.0"",""9191.58"",""0.0"",""5.56"",""9.0""]}",1


## Develop and Train a Model

### Split Data

In [0]:
# Fixed seed so the train/test split (and the metrics computed from it) can be
# reproduced across runs.
SEED_VALUE = 42

# 80% of rows go to training and 20% to testing. The seed was previously
# defined but never passed, which made every run produce a different split.
train_data, test_data = final_df.randomSplit([0.8, 0.2], seed=SEED_VALUE)

### Develop, Train, and Evaluate a Model

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
# Configure the estimator with 'churn' as the label column. No featuresCol is
# passed, so the estimator's default feature-column setting applies.
lr_model = LogisticRegression(labelCol='churn')
lr_model_trained = lr_model.fit(dataset=train_data)

# Show the first rows of the predictions held by the fitted model's summary.
training_sum = lr_model_trained.summary
training_sum.predictions.show(n=10)

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[29.0,8688.17,1.0...|  1.0|[2.59714423106846...|[0.93067756092623...|       0.0|
|[30.0,7960.64,1.0...|  1.0|[2.95656069389549...|[0.95057265075862...|       0.0|
|[30.0,11575.37,1....|  1.0|[3.84413795872138...|[0.97904372045464...|       0.0|
|[32.0,8617.98,1.0...|  1.0|[0.98148452173516...|[0.72740267857100...|       0.0|
|[32.0,9885.12,1.0...|  1.0|[1.73382740116944...|[0.84990133237492...|       0.0|
|[34.0,9228.84,1.0...|  1.0|[-1.0748199518122...|[0.25448754376004...|       1.0|
|[35.0,9381.12,1.0...|  1.0|[-0.7209699959936...|[0.32717941897468...|       1.0|
|[35.0,12403.81,0....|  1.0|[0.70644887375413...|[0.66961601644314...|       0.0|
|[36.0,6447.99,1.0...|  1.0|[0.04893525271612...|[0.51223137244597...|       0.0|
|[36.0,9793.42,0

In [0]:
# Score the fitted model on the held-out test split.
evaluation = lr_model_trained.evaluate(test_data)

# Report the area under the ROC curve taken from the evaluation summary.
auc = evaluation.areaUnderROC
print(f"Test Data AUC: {auc}")

Test Data AUC: 0.9143695014662756


## Predict on New Data

In [0]:
# Imports this cell depends on. `col` and `DoubleType` were previously used
# below without being imported, which raises NameError on a fresh kernel.
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType

# Names applied to the columns read from the new-customer file.
actual_cols = ['Names', 'Age', 'Total_Purchase', 'Account_manager', 'Years', 'Num_Sites', 'Onboard_date', 'Location', 'Company']

# Read only the first nine columns; header=0 together with names= replaces
# the file's own header row with `actual_cols`.
local_file_path_2 = "new_customers.csv"
pandas_df_2 = pd.read_csv(local_file_path_2, usecols = range(9), names = actual_cols, header = 0)

df_new = spark.createDataFrame(pandas_df_2)

# Lowercase all column names in one step so they match the names the
# assembler expects.
current_columns = df_new.columns
df_new_renamed = df_new.toDF(*[name.lower() for name in current_columns])

# Cast num_sites to double, the type this column has in the training schema.
df_final = df_new_renamed.withColumn("num_sites", col("num_sites").cast(DoubleType()))

# Assemble the same five inputs, in the same order, as were used for training.
assembler = VectorAssembler(inputCols=['age', 'total_purchase', 'account_manager', 'years', 'num_sites'], outputCol='features')
output = assembler.transform(df_final)

# Apply the trained model and show each customer's churn probability and label.
final_predictions = lr_model_trained.transform(output)
final_predictions.select('names', 'probability', 'prediction').show(truncate=False)

+--------------+------------------------------------------+----------+
|names         |probability                               |prediction|
+--------------+------------------------------------------+----------+
|Andrew Mccall |[0.9050875113058656,0.09491248869413438]  |0.0       |
|Michele Wright|[0.0021961837462762526,0.9978038162537237]|1.0       |
|Jeremy Chang  |[0.028461741673922836,0.9715382583260772] |1.0       |
|Megan Ferguson|[0.006373130080246124,0.9936268699197539] |1.0       |
|Taylor Young  |[0.7566076189489592,0.24339238105104077]  |0.0       |
|Jessica Drake |[0.13644013994119716,0.8635598600588028]  |1.0       |
+--------------+------------------------------------------+----------+

