TELECOM CHURN CLASSIFICATION USING PYSPARK

In [38]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml import Pipeline

Create Spark session

In [39]:
spark = SparkSession.builder.appName("TelecomChurnClassification").getOrCreate()

Load dataset

In [40]:
df = spark.read.csv("Churn_Modelling.csv", header=True, inferSchema=True)
df.printSchema()
df.show(5)

root
 |-- RowNumber: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- HasCrCard: integer (nullable = true)
 |-- IsActiveMember: integer (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)

+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|RowNumber|CustomerId| Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+----

Drop unnecessary columns

In [41]:
cols_to_drop = ['RowNumber', 'CustomerId', 'Surname']
df = df.drop(*cols_to_drop)

Handle categorical variables

In [42]:
cat_cols = ['Geography', 'Gender']
indexers = [StringIndexer(inputCol=c, outputCol=c+"_index", handleInvalid="keep") for c in cat_cols]


Define feature columns

In [43]:
feature_cols = [c for c in df.columns if c not in ['Exited'] + cat_cols]
feature_cols += [c + "_index" for c in cat_cols]

Assemble features

In [44]:
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

Label encoding

In [45]:
label_indexer = StringIndexer(inputCol="Exited", outputCol="label")

Apply transformations

In [46]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=indexers + [label_indexer, assembler])
data = pipeline.fit(df).transform(df)

Split into train/test

In [47]:
train, test = data.randomSplit([0.8, 0.2], seed=42)

Train model

In [48]:
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100, maxDepth=6)
model = rf.fit(train)

Predictions

In [49]:
predictions = model.transform(test)
predictions.select("label", "prediction", "probability").show(10, truncate=False)


+-----+----------+-----------------------------------------+
|label|prediction|probability                              |
+-----+----------+-----------------------------------------+
|1.0  |0.0       |[0.6916937133050756,0.30830628669492444] |
|1.0  |1.0       |[0.056149042059588386,0.9438509579404116]|
|1.0  |1.0       |[0.11692263087057611,0.8830773691294239] |
|1.0  |1.0       |[0.20541409363707086,0.7945859063629291] |
|1.0  |1.0       |[0.28644586591908777,0.7135541340809122] |
|1.0  |0.0       |[0.8709601281711206,0.1290398718288795]  |
|0.0  |0.0       |[0.9196424674626228,0.08035753253737725] |
|0.0  |0.0       |[0.854767175484065,0.14523282451593492]  |
|0.0  |0.0       |[0.8546639873737968,0.14533601262620321] |
|0.0  |0.0       |[0.9218187139416035,0.07818128605839642] |
+-----+----------+-----------------------------------------+
only showing top 10 rows



Evaluate

In [50]:
multi_eval = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
binary_eval = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction")

accuracy = multi_eval.evaluate(predictions, {multi_eval.metricName: "accuracy"})
f1 = multi_eval.evaluate(predictions, {multi_eval.metricName: "f1"})
precision = multi_eval.evaluate(predictions, {multi_eval.metricName: "weightedPrecision"})
recall = multi_eval.evaluate(predictions, {multi_eval.metricName: "weightedRecall"})
auc = binary_eval.evaluate(predictions)

print("\nClassification Report:")
print(f"Accuracy :  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")
print(f"AUC-ROC  : {auc:.4f}")



Classification Report:
Accuracy :  0.8506
Precision: 0.8433
Recall   : 0.8506
F1 Score : 0.8310
AUC-ROC  : 0.8481


Stop Spark session

In [51]:
spark.stop()