In [1]:
sc

In [2]:
spark

#### 1.Read the dataset

In [5]:
churn_data  = spark.read.csv("file:///home/hadoop/Downloads/Telco_Customer_Churn.csv",
                            header = True, inferSchema = True)
churn_data.count()

7043

#### 2.Data Exploration
    a. How many customer records are present in dataset?

In [6]:
churn_data.count()

7043

In [7]:
len(churn_data.columns)

21

In [8]:
churn_data.columns

['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

    b. What is the distribution of Gender among customers?

In [9]:
churn_data.printSchema()

root
 |-- customerID: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- SeniorCitizen: integer (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)
 |-- Contract: string (nullable = true)
 |-- PaperlessBilling: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: string (nullable = true)
 |-- Churn: string (nullable = true)



In [10]:
churn_data.groupBy(['gender']).count().show()

+------+-----+
|gender|count|
+------+-----+
|Female| 3488|
|  Male| 3555|
+------+-----+



    c. What is the Distribution of contract types among Customers?

In [11]:
churn_data.groupBy(['Contract']).count().show()

+--------------+-----+
|      Contract|count|
+--------------+-----+
|Month-to-month| 3875|
|      One year| 1473|
|      Two year| 1695|
+--------------+-----+



    d. What is the percentage of customers who churned ? 

In [13]:
churn_data.select(['Churn']).where("Churn = 'Yes'").count() / churn_data.count() * 100

26.536987079369588

#### 3. Data Preprocessing
    * Check for Missing values and handle missing data

In [14]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [16]:
churn_data.select([count(when(isnull(col), col)).alias(col) for col in churn_data.columns]).show()

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|         0|     0|            0|      0|         0|     0|           0|            0|              0|             0|           0|               0|          0|          0|              0|       0|               0| 

In [18]:
churn_data = churn_data.withColumn('TotalCharges', when(col('TotalCharges') == " ", None)\
                     .otherwise(col("TotalCharges")))

In [19]:
churn_data.select([count(when(isnull(col), col)).alias(col) for col in churn_data.columns]).show()

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|         0|     0|            0|      0|         0|     0|           0|            0|              0|             0|           0|               0|          0|          0|              0|       0|               0| 

In [20]:
churn_data1 = churn_data.na.drop()

In [23]:
from pyspark.sql.types import FloatType
churn_data1 = churn_data1.withColumn('TotalCharges', col('TotalCharges').cast(FloatType()))

#### 4.import Mllib
    f. Convert categorical variables into numerical format using one-hot encoding or label encoding

In [43]:
churn_data1= churn_data1.drop(col('customerID'))

In [44]:
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml import Pipeline

In [45]:
print(churn_data1.columns)

['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']


In [27]:
#StringIndexer converts string values to indices.
#OneHotEncoder will convert categories into column and rows and for each of them one value will only be 1.

In [46]:
categorical_cols = [field.name for field in churn_data1.schema.fields if isinstance(field.dataType, StringType)]
print(categorical_cols)

['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']


In [47]:
stages = []

for catcols in categorical_cols[:-1]:
    stringindexer = StringIndexer(inputCol = catcols, outputCol = catcols + "Index")
    onehotencoder = OneHotEncoderEstimator(inputCols = [stringindexer.getOutputCol()],
                                          outputCols = [catcols+"classVec"])
    stages += [stringindexer, onehotencoder]

In [48]:
numericalCols =  [field.name for field in churn_data1.schema.fields if not  isinstance(field.dataType, StringType)]
print(numericalCols)

['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']


In [49]:
assemblerInputs = [c + 'classVec' for c in categorical_cols[:-1]] + numericalCols
assembler = VectorAssembler(inputCols = assemblerInputs, outputCol = "features")
stages += [assembler]

In [50]:
label_stringIdx = StringIndexer(inputCol = "Churn", outputCol = "label")

In [51]:
stages += [label_stringIdx]

In [52]:
pipeline = Pipeline(stages = stages)

In [53]:
preprocessing = pipeline.fit(churn_data1)

In [55]:
churn_df = preprocessing.transform(churn_data1)

In [58]:
churn_df.select(['Contract','ContractIndex','ContractclassVec']).show()

+--------------+-------------+----------------+
|      Contract|ContractIndex|ContractclassVec|
+--------------+-------------+----------------+
|Month-to-month|          0.0|   (2,[0],[1.0])|
|      One year|          2.0|       (2,[],[])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|      One year|          2.0|       (2,[],[])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|      One year|          2.0|       (2,[],[])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|      Two year|          1.0|   (2,[1],[1.0])|
|      One year|          2.0|       (2,[],[])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|      Two year|          1.0|   (2,[1],[1.0])|
|      One year|          2.0|       (2,[],[])|
|      Two year|          1.0|   (2,[1],

In [61]:
churn_df1 = churn_df.select(['features','label'])
churn_df1.show(truncate = False)

+------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                                                        |label|
+------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|(30,[2,7,8,11,12,14,16,18,20,22,23,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,29.85,29.850000381469727])                        |0.0  |
|(30,[0,1,2,3,4,7,9,10,13,14,16,18,24,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,34.0,56.95,1889.5])                         |0.0  |
|(30,[0,1,2,3,4,7,9,11,12,14,16,18,20,22,24,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,53.85,108.1500015258789]) |1.0  |
|(30,[0,1,2,7,9,10,13,15,16,18,25,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1

#### Split dataset into train and test

In [62]:
train, test  = churn_df1.randomSplit([0.8, 0.2], seed = 2)

In [63]:
train.select(['features', 'label'])

DataFrame[features: vector, label: double]

#### Build Decision Tree ML model

In [64]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
tree = DecisionTreeClassifier(featuresCol='features', labelCol = 'label')
decision_model = tree.fit(train)

#### Evaluate the Model

In [66]:
predictions =  decision_model.transform(test)

In [67]:
predictions.select(['label','probability', 'prediction']).show(truncate=False)

+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|1.0  |[0.13440860215053763,0.8655913978494624]|1.0       |
|1.0  |[0.13440860215053763,0.8655913978494624]|1.0       |
|0.0  |[0.350076103500761,0.649923896499239]   |1.0       |
|1.0  |[0.13440860215053763,0.8655913978494624]|1.0       |
|0.0  |[0.350076103500761,0.649923896499239]   |1.0       |
|0.0  |[0.350076103500761,0.649923896499239]   |1.0       |
|1.0  |[0.13440860215053763,0.8655913978494624]|1.0       |
|1.0  |[0.13440860215053763,0.8655913978494624]|1.0       |
|0.0  |[0.350076103500761,0.649923896499239]   |1.0       |
|1.0  |[0.6071019473081328,0.3928980526918671] |0.0       |
|0.0  |[0.6071019473081328,0.3928980526918671] |0.0       |
|1.0  |[0.13440860215053763,0.8655913978494624]|1.0       |
|1.0  |[0.350076103500761,0.649923896499239]   |1.0       |
|1.0  |[0.13440860215053763,0.8655913978

In [69]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol = 'label', predictionCol = 'prediction',
                                             metricName = 'accuracy')
accuracy = evaluator.evaluate(predictions)
accuracy

0.7926093514328808

#### Build Random Forest Model

In [71]:
from pyspark.ml.classification import RandomForestClassifier
randomForest = RandomForestClassifier(featuresCol = 'features',labelCol = 'label')
rf_model= randomForest.fit(train)

In [72]:
predictions =  rf_model.transform(test)

In [74]:
predictions.select(['label','probability', 'prediction']).show(truncate=False)

+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|1.0  |[0.3176730601188934,0.6823269398811065] |1.0       |
|1.0  |[0.3176730601188934,0.6823269398811065] |1.0       |
|0.0  |[0.4137422580434661,0.5862577419565339] |1.0       |
|1.0  |[0.3373973382031844,0.6626026617968155] |1.0       |
|0.0  |[0.3760353877876359,0.6239646122123641] |1.0       |
|0.0  |[0.43723714074113973,0.5627628592588602]|1.0       |
|1.0  |[0.33893153812603594,0.6610684618739641]|1.0       |
|1.0  |[0.33893153812603594,0.6610684618739641]|1.0       |
|0.0  |[0.4827059064806817,0.5172940935193183] |1.0       |
|1.0  |[0.6562348165530641,0.34376518344693585]|0.0       |
|0.0  |[0.6749693449354799,0.32503065506452017]|0.0       |
|1.0  |[0.30600459653339485,0.6939954034666052]|1.0       |
|1.0  |[0.3633866573953637,0.6366133426046362] |1.0       |
|1.0  |[0.32726307454053727,0.6727369254

In [75]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol = 'label', predictionCol = 'prediction',
                                             metricName = 'accuracy')
accuracy = evaluator.evaluate(predictions)
accuracy

0.7941176470588235

#### Logistic Regression ML Model

In [73]:
from pyspark.ml.classification import LogisticRegression
logistic = LogisticRegression(featuresCol = 'features',labelCol = 'label')
logit_model= logistic.fit(train)

In [76]:
predictions =  logit_model.transform(test)

In [77]:
predictions.select(['label','probability', 'prediction']).show(truncate=False)

+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|1.0  |[0.29707083712442606,0.7029291628755739]|1.0       |
|1.0  |[0.299840507280267,0.700159492719733]   |1.0       |
|0.0  |[0.3298114042006738,0.6701885957993262] |1.0       |
|1.0  |[0.4022680279633068,0.5977319720366933] |1.0       |
|0.0  |[0.4067777874729466,0.5932222125270534] |1.0       |
|0.0  |[0.40622995954400115,0.5937700404559989]|1.0       |
|1.0  |[0.37565761124560076,0.6243423887543992]|1.0       |
|1.0  |[0.38125841120801496,0.618741588791985] |1.0       |
|0.0  |[0.5455246981098829,0.45447530189011703]|0.0       |
|1.0  |[0.625230253385939,0.3747697466140611]  |0.0       |
|0.0  |[0.8168379835185562,0.18316201648144387]|0.0       |
|1.0  |[0.25966028549346903,0.7403397145065309]|1.0       |
|1.0  |[0.3360738330216257,0.6639261669783743] |1.0       |
|1.0  |[0.34609089878034405,0.6539091012

In [78]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol = 'label', predictionCol = 'prediction',
                                             metricName = 'accuracy')
accuracy = evaluator.evaluate(predictions)
accuracy

0.8144796380090498