In [1]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('customerchurn').getOrCreate()

In [2]:
from google.colab import files
uploaded = files.upload()

Saving customer_churn.csv to customer_churn.csv


In [4]:
df = spark.read.csv('customer_churn.csv', inferSchema = True, header = True)

In [5]:
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [6]:
df.describe().show()

+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                 900|                 900|                900|
|   mean|         NULL|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|                NULL|                NULL|0.16666666666666666|
| stddev|         NULL|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.764835592035

# **Assess correlations between the features and the target**

In [8]:
from scipy.stats import shapiro

feature_cols = ['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites']

results = {}

for col in feature_cols:

    sampled_data = df.select(col).dropna().sample(fraction = 0.1, seed = 42).toPandas()[col]

    if len(sampled_data) < 3:
        print(f"{col}: data size is too small")
        continue

    stat, p_value = shapiro(sampled_data)
    results[col] = (stat, p_value)

    print(f"{col} - Shapiro-Wilk: {stat:.4f}, p-value: {p_value:.4f}")

    if p_value > 0.05:

        print(f"  => {col} follows a normal distribution.\n")

    else:

        print(f"  => {col} dows not follow a normal distribution.\n")

Age - Shapiro-Wilk: 0.9895, p-value: 0.6616
  => Age follows a normal distribution.

Total_Purchase - Shapiro-Wilk: 0.9668, p-value: 0.0164
  => Total_Purchase dows not follow a normal distribution.

Account_Manager - Shapiro-Wilk: 0.6344, p-value: 0.0000
  => Account_Manager dows not follow a normal distribution.

Years - Shapiro-Wilk: 0.9924, p-value: 0.8681
  => Years follows a normal distribution.

Num_Sites - Shapiro-Wilk: 0.9414, p-value: 0.0003
  => Num_Sites dows not follow a normal distribution.



In [11]:
from scipy.stats import pearsonr, spearmanr

feature_cols = ['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites']

for col in feature_cols:
    pdf = df.select(col, 'Churn').dropna().toPandas()

    pearson_corr, pearson_p = pearsonr(pdf[col], pdf['Churn'])
    spearman_corr, spearman_p = spearmanr(pdf[col], pdf['Churn'])

    print(f"{col}: Pearson={pearson_corr:.4f} (p={pearson_p:.4g}), Spearman={spearman_corr:.4f} (p={spearman_p:.4g})")

Age: Pearson=0.0859 (p=0.00991), Spearman=0.0800 (p=0.01632)
Total_Purchase: Pearson=0.0240 (p=0.4715), Spearman=0.0196 (p=0.5575)
Account_Manager: Pearson=0.0706 (p=0.03417), Spearman=0.0706 (p=0.03417)
Years: Pearson=0.2143 (p=8.218e-11), Spearman=0.2177 (p=4.054e-11)
Num_Sites: Pearson=0.5254 (p=5.181e-65), Spearman=0.4957 (p=5.537e-57)


# **Select predictors**

In [7]:
from pyspark.ml.feature import VectorAssembler

In [12]:
assembler = VectorAssembler(inputCols = ['Age', 'Account_Manager', 'Years', 'Num_Sites'], outputCol = 'features')

In [13]:
output = assembler.transform(df)

In [40]:
final_df = output.select('features', 'churn')

# **Develop a logistic regression model**

In [42]:
train_churn, test_churn = final_df.randomSplit([0.7, 0.3])

In [43]:
from pyspark.ml.classification import LogisticRegression

In [44]:
logistic_regression = LogisticRegression(labelCol = 'churn')

In [45]:
fitted_model = logistic_regression.fit(train_churn)

In [46]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [47]:
pred_and_labels = fitted_model.evaluate(test_churn)

In [48]:
pred_and_labels.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[26.0,1.0,5.42,11.0]|    1|[0.55584650580533...|[0.63549095827084...|       0.0|
|[28.0,0.0,3.67,11.0]|    0|[1.90662361473138...|[0.87063935314390...|       0.0|
| [28.0,0.0,6.72,8.0]|    0|[3.58184576019409...|[0.97292893929239...|       0.0|
| [29.0,1.0,4.43,8.0]|    0|[4.69001807612252...|[0.99089710384043...|       0.0|
| [29.0,1.0,4.89,8.0]|    0|[4.38724034778575...|[0.98771773179901...|       0.0|
|[30.0,0.0,3.84,10.0]|    0|[2.88883161675547...|[0.94729157437856...|       0.0|
| [32.0,0.0,4.26,9.0]|    0|[3.70648650555306...|[0.97602523225719...|       0.0|
|[32.0,0.0,5.35,10.0]|    0|[1.76144488961012...|[0.85339053007727...|       0.0|
|  [32.0,0.0,5.9,8.0]|    0|[3.85460812506254...|[0.97925746403062...|       0.0|
|[32.0,0.0,7.14,

In [49]:
print("AUC:", pred_and_labels.areaUnderROC)

AUC: 0.9083776595744705


In [50]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

precision_evaluator = MulticlassClassificationEvaluator(labelCol="churn", predictionCol="prediction", metricName="precisionByLabel")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="churn", predictionCol="prediction", metricName="recallByLabel")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="churn", predictionCol="prediction", metricName="f1")

precision = precision_evaluator.evaluate(predictions, {precision_evaluator.metricLabel: 1.0})
recall = recall_evaluator.evaluate(predictions, {recall_evaluator.metricLabel: 1.0})
f1 = f1_evaluator.evaluate(predictions)

print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

Precision: 0.7419
Recall:    0.5349
F1 Score:  0.8771


# **Predict on new data**

In [29]:
from google.colab import files
uploaded = files.upload()

Saving new_customers.csv to new_customers.csv


In [30]:
new_customers = spark.read.csv('new_customers.csv', inferSchema = True, header = True)

In [31]:
new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [32]:
assembler = VectorAssembler(inputCols = ['Age', 'Account_Manager', 'Years', 'Num_Sites'], outputCol = 'features')

In [33]:
test_new_customers = assembler.transform(new_customers)

In [34]:
test_new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)



In [55]:
final_results = fitted_model.transform(test_new_customers)

In [57]:
final_results.select('Names', 'prediction').show()

+--------------+----------+
|         Names|prediction|
+--------------+----------+
| Andrew Mccall|       0.0|
|Michele Wright|       1.0|
|  Jeremy Chang|       1.0|
|Megan Ferguson|       1.0|
|  Taylor Young|       0.0|
| Jessica Drake|       1.0|
+--------------+----------+

