# Data Loading

In [2]:
# File location and type
file_location = "/FileStore/tables/TelcoCustomerChurn.csv"
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .option("nanValue", " ") \
  .option("nullValue", " ") \
  .load(file_location)

display(df)

customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,Yes,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [3]:
# Saving as a temporary table

df.createOrReplaceTempView("churn_analysis")

# Data Analysis

In [5]:
%sql

select * from churn_analysis

customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,Yes,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [6]:
df.printSchema()

Checking for Null

In [8]:
from pyspark.sql.functions import isnan, when, count, col

In [9]:
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

In [10]:
df.groupBy("Churn").count().show()

In [11]:
df.select("tenure", "TotalCharges", "MonthlyCharges").describe().show()

Checking if Gender has any affect on Churn

In [13]:
display(df.groupBy("gender", "churn").count())

gender,churn,count
Male,No,2625
Male,Yes,930
Female,No,2549
Female,Yes,939


Since this doesn't give much insight. we will take use of plots. We can make our own plots but DataBricks provides inbuilt support for plots. So we will use that.

In [15]:
display(df.groupBy("gender", "churn").count())

gender,churn,count
Male,No,2625
Male,Yes,930
Female,No,2549
Female,Yes,939


So as you can see. There is no significant effect of Gender on churn.

We will next see the same with seniorCitizens.

In [17]:
display(df.groupBy("SeniorCitizen", "churn"). count())

SeniorCitizen,churn,count
1,No,666
0,No,4508
0,Yes,1393
1,Yes,476


This plot shows that there is significant affect of SeniorCitizen column on the churn.
Eg., when the person is not SeniorCitizen, they are more likely to not Churn.

Now we will see the same for "tenure" column

In [19]:
display(df.groupBy("tenure", "churn").count().orderBy("tenure"))

tenure,churn,count
0,No,11
1,No,233
1,Yes,380
2,No,115
2,Yes,123
3,No,106
3,Yes,94
4,No,93
4,Yes,83
5,No,69


* As the tenure increases, number of churned customers decreases
* tenure has less affect on Customers not churning as on both sides we have big towers where as in between the heights of Bars are random and of almost same height
* This means people who stays will stay no matter what the tenure is. Whereas, people who leave tend to leave in initial time. A reason can be good offers for news customers.

Similarly we will see such behaviours with other variables too.

In [21]:
display(df.stat.crosstab("seniorcitizen", "internetservice"))

seniorcitizen_internetservice,DSL,Fiber optic,No
1,259,831,52
0,2162,2265,1474


In [22]:
display(df.groupBy("paperlessBilling", "Churn").count())

paperlessBilling,Churn,count
Yes,Yes,1400
No,No,2403
Yes,No,2771
No,Yes,469


In [23]:
display(df.groupBy("paymentMethod", "churn").count())

paymentMethod,churn,count
Credit card (automatic),No,1290
Bank transfer (automatic),No,1286
Mailed check,Yes,308
Credit card (automatic),Yes,232
Electronic check,No,1294
Electronic check,Yes,1071
Bank transfer (automatic),Yes,258
Mailed check,No,1304


# Model Building

In [25]:
# 0.7 and 0.3 are train and test percent resepctively
# 24 is random seed to reproduce results
train_data, test_data = df.randomSplit([0.7, 0.4], 24)

In [26]:
print("full data: ", df.count())
print("train: ", train_data.count())
print("test: ", test_data.count())

In [27]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

In [28]:
catColumns = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
              'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

Handling Categorical value

Click [here](https://spark.apache.org/docs/latest/ml-features#stringindexer) to know the working of StringIndexer.<br>
Click [here](https://spark.apache.org/docs/latest/ml-features#onehotencoderestimator) to know about OneHotEncoderEstimator.

In [31]:
stages = []

for catCol in catColumns:
  
  stringIndexer = StringIndexer(inputCol=catCol, outputCol=catCol + "Index")
  
  # It takes output of above stringIndexer as input column
  encoder = OneHotEncoderEstimator(inputCols = [stringIndexer.getOutputCol()], outputCols=[catCol + "catVec"])
  
  stages += [stringIndexer, encoder]

In [32]:
print(len(catColumns))
print(len(stages))

In [33]:
stages[:6]

In [34]:
from pyspark.ml.feature import Imputer

imputer = Imputer(inputCols=["TotalCharges"], outputCols=["out_TotalCharges"])
stages.append(imputer)

In [35]:
stages[-1]

In [36]:
# label is default name of target column. So if we name a column as "label"
# it will be treated as target column
label_indx = StringIndexer(inputCol="Churn", outputCol="label")
stages.append(label_indx)

In [37]:
# This step is just for showing how its done and not necessary here. Look at last two columns
# You will see transformation is applied
temp = label_indx.fit(train_data).transform(train_data)

In [38]:
display(temp)

customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,label
0004-TLHLJ,Male,0,No,No,4,Yes,No,Fiber optic,No,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,73.9,280.85,Yes,1.0
0013-EXCHZ,Female,1,Yes,No,3,Yes,No,Fiber optic,No,No,No,Yes,Yes,No,Month-to-month,Yes,Mailed check,83.9,267.4,Yes,1.0
0014-BMAQU,Male,0,Yes,No,63,Yes,Yes,Fiber optic,Yes,No,No,Yes,No,No,Two year,Yes,Credit card (automatic),84.65,5377.8,No,0.0
0015-UOCOJ,Female,1,No,No,7,Yes,No,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,48.2,340.35,No,0.0
0016-QLJIS,Female,0,Yes,Yes,65,Yes,Yes,DSL,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Mailed check,90.45,5957.9,No,0.0
0017-DINOC,Male,0,No,No,54,No,No phone service,DSL,Yes,No,No,Yes,Yes,No,Two year,No,Credit card (automatic),45.2,2460.55,No,0.0
0018-NYROU,Female,0,Yes,No,5,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,68.95,351.5,No,0.0
0020-INWCK,Female,0,Yes,Yes,71,Yes,Yes,Fiber optic,No,Yes,Yes,No,No,Yes,Two year,Yes,Credit card (automatic),95.75,6849.4,No,0.0
0021-IKXGC,Female,1,No,No,1,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,72.1,72.1,No,0.0
0022-TCJCI,Male,1,No,No,45,Yes,No,DSL,Yes,No,Yes,No,No,Yes,One year,No,Credit card (automatic),62.7,2791.5,Yes,1.0


In [39]:
df.stat.corr("TotalCharges", "MonthlyCharges")

In [40]:
display(df.groupBy("tenure", "churn").count().filter(col("churn") == "Yes").orderBy("tenure"))

tenure,churn,count
1,Yes,380
2,Yes,123
3,Yes,94
4,Yes,83
5,Yes,64
6,Yes,40
7,Yes,51
8,Yes,42
9,Yes,46
10,Yes,45


making bins of "tenure" column. this is because as you go down the data doesn't vary much. It remains in around average

In [42]:
from pyspark.ml.feature import QuantileDiscretizer

In [43]:
tenure_bin = QuantileDiscretizer(numBuckets=3, inputCol="tenure", outputCol="tenure_bin")
stages.append(tenure_bin)

Now we will create a vecorAssembler. this assembler takes all the values and create a numpy kind of array. And that is going to be the input to the model.

In [45]:
numericCols = ["tenure_bin", "out_TotalCharges", "MonthlyCharges"]

# Above while transforming categorical features, we added a "catVec" extension to
# that column name. So we are putting all the column names (modified categorical and numerical)
# into an assembler
assembleInputs = assemblerInputs = [c + "catVec" for c in catColumns] + numericCols

# "feature" is default name for all the features. We will not do anything with it. 
# We will keep that as it is
assembler = VectorAssembler(inputCols=assembleInputs, outputCol="features")
stages.append(assembler)

In [46]:
assembler

In [47]:
# Here you are creating a pipeline with all the stages
# defined earlier. We are fitting on train_data
pipeline = Pipeline().setStages(stages)
pipelineModel = pipeline.fit(train_data)

In [48]:
# Here we are transforming both test and train
# data using the same pipeline
trainprepDF = pipelineModel.transform(train_data) 
testprepDF = pipelineModel.transform(test_data)

In [49]:
display(trainprepDF)

customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,genderIndex,gendercatVec,SeniorCitizenIndex,SeniorCitizencatVec,PartnerIndex,PartnercatVec,DependentsIndex,DependentscatVec,PhoneServiceIndex,PhoneServicecatVec,MultipleLinesIndex,MultipleLinescatVec,InternetServiceIndex,InternetServicecatVec,OnlineSecurityIndex,OnlineSecuritycatVec,OnlineBackupIndex,OnlineBackupcatVec,DeviceProtectionIndex,DeviceProtectioncatVec,TechSupportIndex,TechSupportcatVec,StreamingTVIndex,StreamingTVcatVec,StreamingMoviesIndex,StreamingMoviescatVec,ContractIndex,ContractcatVec,PaperlessBillingIndex,PaperlessBillingcatVec,PaymentMethodIndex,PaymentMethodcatVec,out_TotalCharges,label,tenure_bin,features
0004-TLHLJ,Male,0,No,No,4,Yes,No,Fiber optic,No,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,73.9,280.85,Yes,1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",280.85,1.0,0.0,"List(0, 30, List(1, 2, 3, 4, 5, 7, 9, 11, 14, 15, 17, 20, 21, 23, 24, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 280.85, 73.9))"
0013-EXCHZ,Female,1,Yes,No,3,Yes,No,Fiber optic,No,No,No,Yes,Yes,No,Month-to-month,Yes,Mailed check,83.9,267.4,Yes,0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 3, List(1), List(1.0))",267.4,1.0,0.0,"List(0, 30, List(0, 3, 4, 5, 7, 9, 11, 13, 16, 18, 20, 21, 23, 25, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 267.4, 83.9))"
0014-BMAQU,Male,0,Yes,No,63,Yes,Yes,Fiber optic,Yes,No,No,Yes,No,No,Two year,Yes,Credit card (automatic),84.65,5377.8,No,1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",2.0,"List(0, 3, List(2), List(1.0))",5377.8,0.0,2.0,"List(0, 30, List(1, 3, 4, 6, 7, 10, 11, 13, 16, 17, 20, 22, 23, 26, 27, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 5377.8, 84.65))"
0015-UOCOJ,Female,1,No,No,7,Yes,No,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,48.2,340.35,No,0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",340.35,0.0,0.0,"List(0, 30, List(0, 2, 3, 4, 5, 8, 10, 11, 13, 15, 17, 20, 21, 23, 24, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 340.35, 48.2))"
0016-QLJIS,Female,0,Yes,Yes,65,Yes,Yes,DSL,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Mailed check,90.45,5957.9,No,0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 3, List(1), List(1.0))",5957.9,0.0,2.0,"List(0, 30, List(0, 1, 4, 6, 8, 10, 12, 14, 16, 18, 19, 22, 23, 25, 27, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 5957.9, 90.45))"
0017-DINOC,Male,0,No,No,54,No,No phone service,DSL,Yes,No,No,Yes,Yes,No,Two year,No,Credit card (automatic),45.2,2460.55,No,1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",2.0,"List(0, 2, List(), List())",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 1, List(), List())",2.0,"List(0, 3, List(2), List(1.0))",2460.55,0.0,2.0,"List(0, 30, List(1, 2, 3, 8, 10, 11, 13, 16, 18, 20, 22, 26, 27, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2460.55, 45.2))"
0018-NYROU,Female,0,Yes,No,5,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,68.95,351.5,No,0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",351.5,0.0,0.0,"List(0, 30, List(0, 1, 3, 4, 5, 7, 9, 11, 13, 15, 17, 20, 21, 23, 24, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 351.5, 68.95))"
0020-INWCK,Female,0,Yes,Yes,71,Yes,Yes,Fiber optic,No,Yes,Yes,No,No,Yes,Two year,Yes,Credit card (automatic),95.75,6849.4,No,0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",2.0,"List(0, 3, List(2), List(1.0))",6849.4,0.0,2.0,"List(0, 30, List(0, 1, 4, 6, 7, 9, 12, 14, 15, 17, 19, 22, 23, 26, 27, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 6849.4, 95.75))"
0021-IKXGC,Female,1,No,No,1,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,72.1,72.1,No,0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",72.1,0.0,0.0,"List(0, 30, List(0, 2, 3, 4, 6, 7, 9, 11, 13, 15, 17, 20, 21, 23, 24, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 72.1, 72.1))"
0022-TCJCI,Male,1,No,No,45,Yes,No,DSL,Yes,No,Yes,No,No,Yes,One year,No,Credit card (automatic),62.7,2791.5,Yes,1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",2.0,"List(0, 2, List(), List())",1.0,"List(0, 1, List(), List())",2.0,"List(0, 3, List(2), List(1.0))",2791.5,1.0,1.0,"List(0, 30, List(2, 3, 4, 5, 8, 10, 11, 14, 15, 17, 19, 26, 27, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2791.5, 62.7))"


In [50]:
# Here for onehotencoding sparse vectors are used Where only essential information is shown
trainprepDF.head(1)

Look at the features SparseVector at the end. It says there are 30 features. Wherever there is 0 it skips that feature to decrease memory utilization. wherever there is value, it keeps that as it is.

In [52]:
trainprepDF.select("tenure_bin").show(10)

We have kept the model (estimator) outside of pipeline, but we can keep that too inside

In [54]:
from pyspark.ml.classification import LogisticRegression

In [55]:
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
lr_model = lr.fit(trainprepDF)

In [56]:
lr_model.coefficients

In [57]:
lr_model.intercept

In [58]:
summary = lr_model.summary

In [59]:
accuracy = summary.accuracy
falsePositiveRate = summary.weightedFalsePositiveRate
truePositiveRate = summary.weightedTruePositiveRate
fmeasure = summary.weightedFMeasure()
precision = summary.weightedPrecision
recall = summary.weightedRecall
areaUnderROC = summary.areaUnderROC

In [60]:
print(f"accuracy: {accuracy}")
print(f"falsePositiveRate : {falsePositiveRate}")
print(f"truePositiveRate : {truePositiveRate}")
print(f"fmeasure : {fmeasure}")
print(f"precision : {precision}")
print(f"recall : {recall}")
print(f"areaUnderROC : {areaUnderROC}")

In [61]:
# ROC curve
display(lr_model, trainprepDF, "ROC")

False Positive Rate,True Positive Rate,Threshold
0.0,0.0,0.859182802963416
0.0,0.0384615384615384,0.859182802963416
0.0,0.0769230769230769,0.8194458162836044
0.0166666666666666,0.0769230769230769,0.8125221648632073
0.0333333333333333,0.0769230769230769,0.7960551743967972
0.0333333333333333,0.1153846153846153,0.7629990313951448
0.0333333333333333,0.1538461538461538,0.7470355110060449
0.0333333333333333,0.1923076923076923,0.7442911833052653
0.05,0.1923076923076923,0.71269917385917
0.05,0.2307692307692307,0.7063241937565392


In [62]:
display(lr_model, trainprepDF, "fittedVsResiduals")

fitted values,residuals
-3.3170982811620813,-0.0349892529901269
0.8736157850449944,0.2945024897435058
0.0909286424856857,-0.522716511067561
-1.1373967145525352,0.7572013520908487
0.2416415733950608,0.4398818505166803
-3.8338892845876886,-0.0211675883440982
-3.018160115846628,-0.0466121698296699
-4.28822698543631,-0.0135433064985229
1.512597358458875,0.1805541837163956
-2.1757360173293927,-0.1019506621079084


Now lets work on test data

In [64]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [65]:
predictions = lr_model.transform(testprepDF)
evaluatorLR = BinaryClassificationEvaluator(rawPredictionCol="prediction")
area_under_curve = evaluatorLR.evaluate(predictions)

# default evaluation is areaUnderROC
print(f"areaUnderROC: {areaUnderROC}")

print(evaluatorLR.getMetricName())

In [66]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics

In [67]:
results = predictions.select(["prediction", "label"])

# # prepare score-lable set
results_collect = results.collect()
results_list = [(float(i[0]), float(i[1])) for i in results_collect]

predictionAndLabels = sc.parallelize(results_list)

metrics = BinaryClassificationMetrics(predictionAndLabels)

# Area under precision-recall curve
print(f"Area under PR curve: {metrics.areaUnderPR}")

# Area under ROC curve
print(f"Area under ROC curve: {metrics.areaUnderROC}")


In [68]:
# since there is no inbuilt method for these metrics for
# binary classifier, we will calculate on our own
count = predictions.count()
correct = results.filter(results.prediction == results.label).count()
wrong = results.filter(results.prediction != results.label).count()

In [69]:
tp = results.filter(results.prediction == 1.0).filter(results.prediction == results.label).count()
fp = results.filter(results.prediction == 1.0).filter(results.prediction != results.label).count()
fn = results.filter(results.prediction == 0.0).filter(results.prediction != results.label).count()
tn = results.filter(results.prediction == 0.0).filter(results.prediction == results.label).count()

accuracy = (tp + tn) / count
precision = tp / (tp + fp)
recall = tp / (tp + fn)

print("correct: ", correct)
print("wrong: ", wrong)
print("tp: ", tp)
print("fp: ", fp)
print("fn: ", fn)
print("tn: ", tn)
print("accuracy: ", accuracy)
print("precision: ", precision)
print("recall: ", recall)

Now we will do cross validation. For that we will do Grid search first

In [71]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [72]:
# create paramGrid for cross-validation
# Hre we are setting paramers for logistic regression
# like regularization, maximum iteration
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [5, 10, 20])
             .build()
            )

In [73]:
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluatorLR, numFolds=5)

# Run cross valiadtion
cvModel = cv.fit(trainprepDF)

In [74]:
predictions = cvModel.bestModel.transform(testprepDF)

In [75]:
evaluatorLR.evaluate(predictions)

In [76]:
results = predictions.select(["prediction", "label"])
count = predictions.count()

correct = results.filter(results.prediction == results.label).count()
wrong = results.filter(results.prediction != results.label).count()

tp = results.filter(results.prediction == 1.0).filter(results.prediction == results.label).count()
fp = results.filter(results.prediction == 1.0).filter(results.prediction != results.label).count()
fn = results.filter(results.prediction == 0.0).filter(results.prediction != results.label).count()
tn = results.filter(results.prediction == 0.0).filter(results.prediction == results.label).count()

accuracy = (tp + tn) / count
precision = tp / (tp + fp)
recall = tp / (tp + fn)

print("correct: ", correct)
print("wrong: ", wrong)
print("tp: ", tp)
print("fp: ", fp)
print("fn: ", fn)
print("tn: ", tn)
print("accuracy: ", accuracy)
print("precision: ", precision)
print("recall: ", recall)

In [77]:
cvModel.explainParams()

In [78]:
from pyspark.ml.classification import RandomForestClassifier 

In [79]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features") \
  .setImpurity("gini") \
  .setMaxDepth(6) \
  .setNumTrees(50) \
  .setFeatureSubsetStrategy("auto") \
  .setSeed(1010)

rfModel = rf.fit(trainprepDF)

In [80]:
predictions = rfModel.transform(testprepDF)

In [81]:
results = predictions.select(["prediction", "label"])
count = predictions.count()

correct = results.filter(results.prediction == results.label).count()
wrong = results.filter(results.prediction != results.label).count()

tp = results.filter(results.prediction == 1.0).filter(results.prediction == results.label).count()
fp = results.filter(results.prediction == 1.0).filter(results.prediction != results.label).count()
fn = results.filter(results.prediction == 0.0).filter(results.prediction != results.label).count()
tn = results.filter(results.prediction == 0.0).filter(results.prediction == results.label).count()

accuracy = (tp + tn) / count
precision = tp / (tp + fp)
recall = tp / (tp + fn)

print("correct: ", correct)
print("wrong: ", wrong)
print("tp: ", tp)
print("fp: ", fp)
print("fn: ", fn)
print("tn: ", tn)
print("accuracy: ", accuracy)
print("precision: ", precision)
print("recall: ", recall)