In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('customerchurn').getOrCreate()

In [3]:
from google.colab import files
uploaded = files.upload()

Saving customer_churn.csv to customer_churn.csv


In [4]:
df = spark.read.csv('customer_churn.csv', inferSchema = True, header = True)

In [5]:
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [6]:
df.describe().show()

+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                 900|                 900|                900|
|   mean|         NULL|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|                NULL|                NULL|0.16666666666666666|
| stddev|         NULL|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.764835592035

# **Assess correlations between the features and the target**

In [7]:
from scipy.stats import shapiro

feature_cols = ['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites']

results = {}

for col in feature_cols:

    sampled_data = df.select(col).dropna().sample(fraction = 0.1, seed = 42).toPandas()[col]

    if len(sampled_data) < 3:
        print(f"{col}: data size is too small")
        continue

    stat, p_value = shapiro(sampled_data)
    results[col] = (stat, p_value)

    print(f"{col} - Shapiro-Wilk: {stat:.4f}, p-value: {p_value:.4f}")

    if p_value > 0.05:

        print(f"  => ✅ {col} follows a normal distribution.\n")

    else:

        print(f"  => {col} dows not follow a normal distribution.\n")

Age - Shapiro-Wilk: 0.9895, p-value: 0.6616
  => ✅ Age follows a normal distribution.

Total_Purchase - Shapiro-Wilk: 0.9668, p-value: 0.0164
  => Total_Purchase dows not follow a normal distribution.

Account_Manager - Shapiro-Wilk: 0.6344, p-value: 0.0000
  => Account_Manager dows not follow a normal distribution.

Years - Shapiro-Wilk: 0.9924, p-value: 0.8681
  => ✅ Years follows a normal distribution.

Num_Sites - Shapiro-Wilk: 0.9414, p-value: 0.0003
  => Num_Sites dows not follow a normal distribution.



In [10]:
from scipy.stats import ttest_ind, mannwhitneyu, shapiro

for col in feature_cols:
    pdf = df.select(col, 'Churn').dropna().toPandas()

    group_yes = pdf[pdf['Churn'] == 1][col]
    group_no = pdf[pdf['Churn'] == 0][col]

    pval_yes = shapiro(group_yes)[1]
    pval_no = shapiro(group_no)[1]

    if pval_yes > 0.05 and pval_no > 0.05:

        t_stat, p_val = ttest_ind(group_yes, group_no, equal_var=False)

        test_name = "t-test"

    else:

        u_stat, p_val = mannwhitneyu(group_yes, group_no, alternative='two-sided')

        test_name = "Mann-Whitney"

    print(f"{col}: {test_name} p-value = {p_val:.4g}")

Age: t-test p-value = 0.01234
Total_Purchase: t-test p-value = 0.4531
Account_Manager: Mann-Whitney p-value = 0.03426
Years: t-test p-value = 1.132e-10
Num_Sites: Mann-Whitney p-value = 5.791e-50


# **Select predictors**

In [28]:
from pyspark.ml.feature import VectorAssembler

In [29]:
assembler = VectorAssembler(inputCols = ['Age', 'Account_Manager', 'Years', 'Num_Sites'], outputCol = 'features')

In [30]:
output = assembler.transform(df)

In [31]:
final_df = output.select('features', 'churn')

# **Develop a logistic regression model**

In [32]:
train_churn, test_churn = final_df.randomSplit([0.7, 0.3])

In [33]:
from pyspark.ml.classification import LogisticRegression

In [34]:
logistic_regression = LogisticRegression(labelCol = 'churn', featuresCol='features')

In [35]:
fitted_model = logistic_regression.fit(train_churn)

In [19]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [36]:
predictions = fitted_model.transform(test_churn)

In [37]:
binary_evaluator = BinaryClassificationEvaluator(
    labelCol='churn',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderROC'
)
auc = binary_evaluator.evaluate(predictions)
print(f"AUC (Area Under ROC): {auc:.4f}")

AUC (Area Under ROC): 0.9435


In [38]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

precision_eval = MulticlassClassificationEvaluator(
    labelCol="churn", predictionCol="prediction", metricName="precisionByLabel")
recall_eval = MulticlassClassificationEvaluator(
    labelCol="churn", predictionCol="prediction", metricName="recallByLabel")
f1_eval = MulticlassClassificationEvaluator(
    labelCol="churn", predictionCol="prediction", metricName="f1")

precision = precision_eval.evaluate(predictions, {precision_eval.metricLabel: 1.0})
recall = recall_eval.evaluate(predictions, {recall_eval.metricLabel: 1.0})
f1 = f1_eval.evaluate(predictions)

print(f"Precision (class 1): {precision:.4f}")
print(f"Recall (class 1):    {recall:.4f}")
print(f"F1 Score:            {f1:.4f}")

Precision (class 1): 0.7353
Recall (class 1):    0.6944
F1 Score:            0.9187


# **Predict on new data**

In [39]:
from google.colab import files
uploaded = files.upload()

Saving new_customers.csv to new_customers.csv


In [40]:
new_customers = spark.read.csv('new_customers.csv', inferSchema = True, header = True)

In [41]:
new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [42]:
assembler = VectorAssembler(inputCols = ['Age', 'Account_Manager', 'Years', 'Num_Sites'], outputCol = 'features')

In [43]:
test_new_customers = assembler.transform(new_customers)

In [44]:
test_new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)



In [45]:
final_results = fitted_model.transform(test_new_customers)

In [46]:
final_results.select('Names', 'prediction').show()

+--------------+----------+
|         Names|prediction|
+--------------+----------+
| Andrew Mccall|       0.0|
|Michele Wright|       1.0|
|  Jeremy Chang|       1.0|
|Megan Ferguson|       1.0|
|  Taylor Young|       0.0|
| Jessica Drake|       1.0|
+--------------+----------+

