In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=08c9ef44c869b348e8a3186af294c0acc112707c30cabfdbfc382fe5adbee1c4
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
# Loading Libraries

from pyspark.sql import SparkSession #
from pyspark.ml.feature import StringIndexer , VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression

Loading of Dataset which is Churn.csv


In [3]:
spark = SparkSession.builder.appName("churn classification").getOrCreate()

In [6]:
data = spark.read.csv('churn.csv',header = True,inferSchema = True)

In [8]:
data.printSchema()

root
 |-- customerID: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- SeniorCitizen: integer (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- CallService: string (nullable = true)
 |-- MultipleConnections: string (nullable = true)
 |-- InternetConnection: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtectionService: string (nullable = true)
 |-- TechnicalHelp: string (nullable = true)
 |-- OnlineTV: string (nullable = true)
 |-- OnlineMovies: string (nullable = true)
 |-- Agreement: string (nullable = true)
 |-- BillingMethod: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyServiceCharges: double (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- Churn: string (nullable = true)



Total no. of rows -- 12336

Total no. of columns -- 21(including churn)



Total no. of (churn = "No") -- 6728

Total no. of (churn = "yes") -- 5607




In [45]:
data.show(4)

+----------+------+-------------+-------+----------+------+-----------+-------------------+------------------+-------------------+-------------------+-----------------------+-------------------+-------------------+-------------------+--------------+-------------+--------------+---------------------+-----------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|CallService|MultipleConnections|InternetConnection|     OnlineSecurity|       OnlineBackup|DeviceProtectionService|      TechnicalHelp|           OnlineTV|       OnlineMovies|     Agreement|BillingMethod| PaymentMethod|MonthlyServiceCharges|TotalAmount|Churn|
+----------+------+-------------+-------+----------+------+-----------+-------------------+------------------+-------------------+-------------------+-----------------------+-------------------+-------------------+-------------------+--------------+-------------+--------------+---------------------+-----------+-----+
|2907-ILJBN|Female|            0|    Yes|  

* Explain the Datasets ?

1.  customerID -- Every value is unique.
2.  Gender-- No. of males = 6216 ,  No. of Females = 6119
3.  partner - Yes , NO
4.  PaymentMethod -- Mailed check , Credit card , Bank transfer
5.  MonthlyServiceCharges -- Amount charges on customer on monthly basis.
6.  TotalAmount -- Total Amount of each customer .
7.  Churn --  No = means Customer will stick to platform, while (yes) -- means customer will stop buying their service .






In [136]:
data.describe()
 # mostly data are in string format so we have transform data into numerical
 # numerical values . Because we have to apply machine learnig techinque
 # which will be applied on numerical data .




DataFrame[summary: string, customerID: string, gender: string, SeniorCitizen: string, Partner: string, Dependents: string, tenure: string, CallService: string, MultipleConnections: string, InternetConnection: string, OnlineSecurity: string, OnlineBackup: string, DeviceProtectionService: string, TechnicalHelp: string, OnlineTV: string, OnlineMovies: string, Agreement: string, BillingMethod: string, PaymentMethod: string, MonthlyServiceCharges: string, TotalAmount: string, Churn: string]

Data Preprocessing ----
1. StringIndexer - It will change categorical string columns  into numerical columns .
eg. male - 0, while Female - 1

2. VectorAssembler - It is used to combine multiple columns into a single vector column. This is important step
because it is used as a input in machine
learning models.

In [48]:
indexer = StringIndexer(inputCols=  ["gender","Partner","Dependents","CallService",
                                     "MultipleConnections","InternetConnection","OnlineSecurity",
                                     "DeviceProtectionService","TechnicalHelp","OnlineTV","OnlineMovies",
                                     "Agreement","BillingMethod","PaymentMethod",
                                     "Churn"],

                                     outputCols = ["gender_index","Partner_index","Dependents_index",
                                                   "CallService_index","MultipleConnections_index","InternetConnection_index",
                                                   "OnlineSecurity_index","DeviceProtectionService_index","TechnicalHelp_index","OnlineTV_index",
                                                   "OnlineMovies_index","Agreement_index","BillingMethod_index","PaymentMethod_index",
                                                    "Churn_index"])
indexed_data = indexer.fit(data).transform(data)

In [49]:
indexed_data.show(5)

+----------+------+-------------+-------+----------+------+-----------+-------------------+------------------+-------------------+-------------------+-----------------------+-------------------+-------------------+-------------------+--------------+-------------+----------------+---------------------+-----------+-----+------------+-------------+----------------+-----------------+-------------------------+------------------------+--------------------+-----------------------------+-------------------+--------------+------------------+---------------+-------------------+-------------------+-----------+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|CallService|MultipleConnections|InternetConnection|     OnlineSecurity|       OnlineBackup|DeviceProtectionService|      TechnicalHelp|           OnlineTV|       OnlineMovies|     Agreement|BillingMethod|   PaymentMethod|MonthlyServiceCharges|TotalAmount|Churn|gender_index|Partner_index|Dependents_index|CallService_index|MultipleCon

In [56]:
assembler = VectorAssembler(inputCols=["gender_index","Partner_index","Dependents_index",
                                      "CallService_index","MultipleConnections_index","InternetConnection_index",
                                      "OnlineSecurity_index","DeviceProtectionService_index","TechnicalHelp_index","OnlineTV_index",
                                      "OnlineMovies_index","Agreement_index","BillingMethod_index","PaymentMethod_index",
                                       "tenure","MonthlyServiceCharges","TotalAmount"],
                            outputCol = "features" )
final_data = assembler.transform(indexed_data)

In [57]:
selected_data = final_data.select("features","Churn_index")
# In the above churn is categorical Columns which we transform into numerical columns
# to apply further techniques.

In [58]:
selected_data.show(5)

+--------------------+-----------+
|            features|Churn_index|
+--------------------+-----------+
|[1.0,0.0,1.0,0.0,...|        0.0|
|[1.0,1.0,0.0,1.0,...|        0.0|
|[1.0,0.0,0.0,0.0,...|        0.0|
|[0.0,0.0,1.0,0.0,...|        0.0|
|(17,[0,1,4,9,10,1...|        0.0|
+--------------------+-----------+
only showing top 5 rows



Split the Data into Training and Test sets.
And also applying 70:30 rule of spliting data .

In [59]:
train_data,test_data = selected_data.randomSplit([0.7,0.3],seed =42)


Build and Training the Classification Model ----

we will use LogisticRegression here

In [69]:
lr = LogisticRegression(featuresCol = 'features',labelCol ="Churn_index")
model = lr.fit(train_data) # fitting the model

In [70]:
pred = model.transform(test_data) # prediction

Evalute the model ----

We will use BinaryClassificationEvaluator
from PySpark MLlib to compute the accuracy of the model .


In [71]:
evaluator = BinaryClassificationEvaluator(labelCol = "Churn_index")

In [80]:
accuracy = evaluator.evaluate(pred)

In [73]:
print(f"Model Accuracy : {accuracy}")

Model Accuracy : 0.8379817590132912


In [74]:
summary = model.summary

In [75]:
summary.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|        Churn_index|         prediction|
+-------+-------------------+-------------------+
|  count|               5521|               5521|
|   mean|0.12859989132403551|0.05596812171708024|
| stddev| 0.3347868879676453|  0.229880975184348|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



Again applying LogisticRegression to increase the accuracy


In [83]:
model1 = LogisticRegression(labelCol = "Churn_index")
model1 = model1.fit(selected_data)

In [84]:
results = model1.transform(selected_data)

In [91]:
final_accuracy = evaluator.evaluate(results)

In [92]:
print(final_accuracy) # final accuracy

0.8564944188541112


Churned Percentage ----

In [144]:
final_summary = model1.summary

In [145]:
final_summary.predictions.describe().show() #  final summary

+-------+-------------------+-------------------+
|summary|        Churn_index|         prediction|
+-------+-------------------+-------------------+
|  count|               7719|               7719|
|   mean|0.12838450576499547|0.05091333074232413|
| stddev|0.33453912068482916| 0.2198349934271054|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



In [139]:
total_count = results.count()

In [137]:
total_count # Total predictions by model

7719

In [120]:
Churned_count = results.filter(results.prediction == 1.0 ).count()

In [121]:
Churned_count

393

In [122]:
Churn_percentage = (Churned_count / total_count)*100

In [123]:
Churn_percentage

5.091333074232414

In [125]:
print(f"Percentage of people predicted to churn : {Churn_percentage : .2f}%")

Percentage of people predicted to churn :  5.09%


Note ----
1. Percentage of people predicted to churn is 5.09%
2. Accuracy of the model is around 85.64%