In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

In [2]:
# Build a local-mode Spark session for this notebook; getOrCreate() reuses an
# already-active session instead of starting a second one.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Churn_LogRegres')
    .getOrCreate()
)

In [3]:
# List the working directory (IPython shell escape) to confirm the data folder is present.
!ls

data
LICENSE
README.md
requirements.txt
Untitled.ipynb


In [4]:
# Load the churn CSV: the first row supplies the column names and the column
# types are inferred from the data.
df = spark.read.csv(
    'data/Churn_Modelling.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the first five rows of the raw data.
df.show(n=5)

+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|RowNumber|CustomerId| Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|        1|  15634602|Hargrave|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|
|        2|  15647311|    Hill|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|
|        3|  15619304|    Onio|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|
|        4|  15701354|    Boni|        699|   France|Female| 39|     1|      0.0|            2|        0|             0|       93826.63|

In [8]:
# Drop the identifier columns that are not used as model features.
# The column list is named `cols_to_drop` (not `list`) so the built-in `list`
# type is not shadowed for the rest of the kernel session.
cols_to_drop = ['RowNumber', 'CustomerId', 'Surname']
df = df.drop(*cols_to_drop)

In [9]:
# Preview the first five rows after the identifier columns were removed.
df.show(n=5)

+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|
|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|
|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|
|        699|   France|Female| 39|     1|      0.0|            2|        0|             0|       93826.63|     0|
|        850|    Spain|Female| 43|     2|125510.82|            1|        1|             1|        79084.1|     0|
+-----------+---------+------+---+------+---------+-------------+---------+-------------

In [10]:
# Print each column's inferred type to see which columns are still strings.
df.printSchema()

root
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- HasCrCard: integer (nullable = true)
 |-- IsActiveMember: integer (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)



In [13]:
# Encode the string column Gender as a numeric label index with StringIndexer.
# The new column gender_cat holds doubles (e.g. 0.0 / 1.0), not integers.
indexer = StringIndexer(inputCol='Gender', outputCol='gender_cat')
gender_indexer_model = indexer.fit(df)
indexed = gender_indexer_model.transform(df)

In [14]:
# Encode the string column Geography the same way, adding geography_cat to the
# frame that already carries gender_cat.
# NOTE(review): the index is only a label id, not an ordinal quantity; a linear
# model will treat 0.0 < 1.0 < 2.0 as meaningful. Consider one-hot encoding it.
indexer = StringIndexer(inputCol='Geography', outputCol='geography_cat')
geography_indexer_model = indexer.fit(indexed)
indexed = geography_indexer_model.transform(indexed)

In [15]:
indexed.show(5)

+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+----------+-------------+
|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|gender_cat|geography_cat|
+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+----------+-------------+
|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|       1.0|          0.0|
|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|       1.0|          2.0|
|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|       1.0|          0.0|
|        699|   France|Female| 39|     1|      0.0|            2|        0|             0|       93826.63|     0|       1.0|          0.0|
|        850|    Spain|Fema

In [16]:
# Columns packed, in this order, into the single 'features' vector column.
feature_cols = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
    'gender_cat',
    'geography_cat',
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [17]:
# Apply the assembler to the indexed frame, adding the 'features' vector column.
output = assembler.transform(indexed)

In [18]:
# Keep only what the classifier needs: the feature vector and the Exited label.
model_columns = ['features', 'Exited']
churn_df_final = output.select(*model_columns)

In [19]:
# Show five rows without truncation so the full feature vectors are visible.
churn_df_final.show(n=5, truncate=False)

+-------------------------------------------------------+------+
|features                                               |Exited|
+-------------------------------------------------------+------+
|[619.0,42.0,2.0,0.0,1.0,1.0,1.0,101348.88,1.0,0.0]     |1     |
|[608.0,41.0,1.0,83807.86,1.0,0.0,1.0,112542.58,1.0,2.0]|0     |
|[502.0,42.0,8.0,159660.8,3.0,1.0,0.0,113931.57,1.0,0.0]|1     |
|[699.0,39.0,1.0,0.0,2.0,0.0,0.0,93826.63,1.0,0.0]      |0     |
|[850.0,43.0,2.0,125510.82,1.0,1.0,1.0,79084.1,1.0,2.0] |0     |
+-------------------------------------------------------+------+
only showing top 5 rows



In [21]:
# Randomly split the rows into training and test sets with 0.7 / 0.3 weights;
# the seed is fixed so the split can be repeated.
train, test = churn_df_final.randomSplit(weights=[0.7, 0.3], seed=9)

In [23]:
# Report the number of rows in each split.
# Two separate statements are used on purpose: joining the print() calls with a
# comma builds a tuple of their return values, which the notebook then echoes
# as a stray `(None, None)` cell output.
print(train.count())
print(test.count())

7023
2977


(None, None)

In [24]:
# Configure the logistic-regression estimator: it reads the assembled 'features'
# vector and learns to predict the 'Exited' label. Nothing is trained yet.
model = LogisticRegression(featuresCol='features', labelCol='Exited')

In [25]:
# Fit the estimator on the training split; the result is the trained model.
lr_model = model.fit(train)

In [26]:
lr_model  # the repr shows numClasses=2 and numFeatures=10

LogisticRegressionModel: uid=LogisticRegression_83b75109c650, numClasses=2, numFeatures=10

In [27]:
# Evaluate the fitted model on the training split and keep the per-row
# predictions (rawPrediction, probability, prediction) for inspection.
train_summary = lr_model.evaluate(train)
train_eval = train_summary.predictions

In [28]:
# Preview five training rows with their predictions.
train_eval.show(n=5)

+--------------------+------+--------------------+--------------------+----------+
|            features|Exited|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+----------+
|(10,[0,1,2,4,7],[...|     0|[2.38702256441885...|[0.91583234103214...|       0.0|
|(10,[0,1,2,4,7],[...|     0|[2.10568341180284...|[0.89145435033539...|       0.0|
|(10,[0,1,2,4,7],[...|     0|[2.15658894637445...|[0.89628288577846...|       0.0|
|(10,[0,1,2,4,7],[...|     0|[1.24609441996402...|[0.77662305444172...|       0.0|
|(10,[0,1,2,4,7],[...|     0|[1.99858761274397...|[0.88064870660154...|       0.0|
+--------------------+------+--------------------+--------------------+----------+
only showing top 5 rows



In [30]:
# Print the learned weights (one per assembled feature, in assembler order)
# and the intercept term.
print(f"Coefficients: {lr_model.coefficients!s}")
print(f"Intercept: {lr_model.intercept!s}")

Coefficients: [-0.0008282599182243841,0.07226529456955158,-0.018583090904753984,5.015519181836452e-06,-0.06582359615850605,-0.04937176785283148,-1.0787252259940088,8.07632868640747e-07,0.6005678843651715,0.07726249228574214]
Intercept: -3.927014289018742


In [78]:
# Evaluate the fitted model on the held-out test split and keep the per-row predictions.
test_summary = lr_model.evaluate(test)
test_eval = test_summary.predictions

In [79]:
# Preview five test rows with their predictions.
test_eval.show(n=5)

+--------------------+------+--------------------+--------------------+----------+
|            features|Exited|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+----------+
|(10,[0,1,2,4,7],[...|     0|[1.46087648530929...|[0.81166669432571...|       0.0|
|(10,[0,1,2,4,7],[...|     1|[2.22362963694228...|[0.90235148232728...|       0.0|
|(10,[0,1,2,4,7],[...|     0|[2.56070064774316...|[0.92828911275894...|       0.0|
|(10,[0,1,2,4,7],[...|     0|[1.49049439364358...|[0.81615246676426...|       0.0|
|(10,[0,1,2,4,7],[...|     0|[1.86406474074996...|[0.86577002357675...|       0.0|
+--------------------+------+--------------------+--------------------+----------+
only showing top 5 rows



In [82]:
# True positives: test rows whose label is Exited == 1 and whose prediction is also 1.
is_true_positive = (test_eval.Exited == 1) & (test_eval.prediction == 1)
TP = test_eval.filter(is_true_positive).count()
TP

103

In [83]:
# True negatives: test rows whose label is Exited == 0 and whose prediction is also 0.
is_true_negative = (test_eval.Exited == 0) & (test_eval.prediction == 0)
TN = test_eval.filter(is_true_negative).count()
TN

2284

In [88]:
# Accuracy = correctly classified test rows (TP + TN) / all test rows.
# True division of two ints already yields a float in Python 3.
total_test_rows = test_eval.count()
Accuracy = (TP + TN) / total_test_rows

In [90]:
# Display the test-set accuracy.
# NOTE(review): accuracy alone can hide poor recall on the churn class if the
# labels are imbalanced — consider also reporting precision/recall or AUC.
Accuracy

0.8018139066174