In [3]:
#Use findspark to locate a Spark installation that lives outside the Python environment.
#SPARK_HOME takes precedence when it is set, so the notebook runs on other machines;
#otherwise fall back to the original local installation path.
import os
import findspark
findspark.init(os.environ.get("SPARK_HOME", "/Users/afrochemist/Desktop/spark-2.3.0-bin-hadoop2.7"))

In [4]:
#Importing the pyspark library
import pyspark

In [5]:
#Spark Library for Logistic Regression
from pyspark.ml.classification import LogisticRegression
#Spark Library for evaluating the result after the model has been made
from pyspark.ml.evaluation import BinaryClassificationEvaluator
#Spark libraries for assembling the data columns into feature vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [7]:
#Start (or reuse) the Spark session that every later cell works through
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('logregchurn')
    .getOrCreate()
)

In [8]:
#Load the labeled churn dataset: the header row supplies the column names and
#the column types are inferred from the values.
#The path uses 'Desktop' (capital D), matching the other paths in this notebook,
#so it also resolves on case-sensitive filesystems.
data = spark.read.csv(
    '/Users/afrochemist/Desktop/datasets/customer_churn.csv',
    inferSchema=True,
    header=True,
)

In [9]:
#Inspect the inferred schema (column names, types, nullability) of the churn dataset
data.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [13]:
#Preview the first five rows of the churn dataset
data.show(n=5)

+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|   Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|  Cynthia Norton|37.0|    

In [14]:
#List the column names; the VectorAssembler inputs below are chosen from these
data.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [17]:
#Combine the numeric predictor columns into a single 'features' vector column
assembler = VectorAssembler(
    inputCols=['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites'],
    outputCol='features',
)

In [18]:
#Apply the assembler to the dataset, producing the 'features' vector column
output = assembler.transform(data)

In [19]:
#Keep only the assembled feature vector and the churn label for modelling
final_data = output.select(['features', 'churn'])

In [20]:
#Train/test split (70% / 30%).
#A fixed seed keeps the split, and therefore the fitted model and its
#predictions, repeatable when the notebook is re-run on the same data.
train_churn, test_churn = final_data.randomSplit([0.7, 0.3], seed=42)

In [21]:
#Logistic-regression estimator: the label is the 'churn' column and the inputs
#come from the 'features' column (spelled out here; it is also the default)
lr_churn = LogisticRegression(featuresCol='features', labelCol='churn')

In [22]:
#Fit the logistic-regression estimator on the training split only
fitted_churn_model = lr_churn.fit(train_churn)

In [24]:
#Keep a handle on the fitted model's training summary
#(not referenced again in the cells shown in this notebook)
training_sum = fitted_churn_model.summary

In [32]:
#Evaluate the fitted model on the held-out test split; the returned object
#exposes the scored rows through its `predictions` DataFrame (shown next)
pred_and_labels = fitted_churn_model.evaluate(test_churn)

In [33]:
#Preview the first 5 test-set predictions
#  prediction 1.0 -> the customer is predicted to churn
#  prediction 0.0 -> the customer is predicted not to churn
pred_and_labels.predictions.show(n=5)

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[25.0,9672.03,0.0...|    0|[4.57626030221560...|[0.98981155460358...|       0.0|
|[26.0,8787.39,1.0...|    1|[0.51465721220140...|[0.62589759753836...|       0.0|
|[26.0,8939.61,0.0...|    0|[6.47407379710803...|[0.99845944961778...|       0.0|
|[28.0,8670.98,0.0...|    0|[8.02499964256602...|[0.99967292686878...|       0.0|
|[29.0,8688.17,1.0...|    1|[2.65213433860002...|[0.93414241752189...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [36]:
#Refit the same estimator on ALL labeled data (train + test); this final model
#is the one used to score the new, unlabeled customers further down
final_lr_model = lr_churn.fit(final_data)

In [37]:
#Load the new customers that need churn predictions
new_customers = spark.read.csv(
    '/Users/afrochemist/Desktop/datasets/new_customers.csv',
    header=True,
    inferSchema=True,
)

In [38]:
#Inspect the schema of the new customers data
new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [40]:
#Reuse the same assembler so the new customers get a 'features' column built
#from the same input columns as the training data
test_new_customers = assembler.transform(new_customers)

In [42]:
#Check the schema of the assembled new-customer data (it should now include 'features')
test_new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)



In [43]:
#Score the new customers with the model that was fitted on all labeled data
final_results = final_lr_model.transform(test_new_customers)

In [44]:
#Show the predicted churn label next to the company name for the first 5 new customers
final_results.select(['Company', 'prediction']).show(n=5)

+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
+----------------+----------+
only showing top 5 rows

