In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session for this notebook: an already-active session is
# reused, otherwise a new one is started under the app name "mylogistic".
spark = (
    SparkSession.builder
    .appName("mylogistic")
    .getOrCreate()
)

In [3]:
# Load the customer-churn CSV. The first row supplies the column names and
# Spark infers each column's type from the data.
my_data = spark.read.csv(
    '/FileStore/tables/customer_churn.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Print the column names and the types Spark inferred while reading the CSV.
my_data.printSchema()

In [5]:
# Print the first rows of the loaded data as a quick visual sanity check
# (show() prints 20 rows by default).
my_data.show()

In [6]:
# List the available column names; the feature columns passed to the
# VectorAssembler are picked from this list.
my_data.columns

In [7]:
from pyspark.ml.feature import VectorAssembler

In [8]:
# Combine the numeric predictor columns into a single vector column named
# 'features', which is the input format Spark ML estimators consume.
assembler = VectorAssembler(
    inputCols=[
        'Age',
        'Total_Purchase',
        'Account_Manager',
        'Years',
        'Num_Sites',
    ],
    outputCol='features',
)

In [9]:
# Run the assembler over the loaded data; the result carries the original
# columns plus the assembled 'features' vector column.
output = assembler.transform(my_data)

In [10]:
# Keep only what the model needs: the assembled feature vector and the
# 'churn' column, which is later used as the label (labelCol = 'churn').
final_data = output.select('features', 'churn')

In [11]:
# Hold out roughly 30% of the rows for testing. randomSplit samples randomly,
# so without a seed every run produces a different split and therefore
# different fitted coefficients and metrics. A fixed seed keeps the split
# repeatable for the same input data and partitioning.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [12]:
from pyspark.ml.classification import LogisticRegression

In [13]:
# Logistic-regression estimator that reads its label from the 'churn' column.
# No other parameter is set here, so the library defaults apply (including
# the default input column name 'features').
lr_churn = LogisticRegression(labelCol='churn')

In [14]:
# Fit the estimator on the training split only; the test split stays unseen
# so it can be used for evaluation afterwards.
fit_lr_churn = lr_churn.fit(train_data)

In [15]:
# Grab the summary attached to the fitted model; its .predictions DataFrame
# is inspected in the next cell.
training_sum = fit_lr_churn.summary

In [16]:
# Print descriptive statistics for the columns of the training-summary
# predictions DataFrame.
training_sum.predictions.describe().show()

In [17]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [18]:
# Evaluate the fitted model on the held-out test split. The returned object
# exposes a .predictions DataFrame, which is shown and scored below.
pred_and_labels = fit_lr_churn.evaluate(test_data)

In [19]:
# Print the first rows of the test-set predictions for a visual check.
pred_and_labels.predictions.show()

In [20]:
# Evaluator for the churn classifier, comparing model scores with the 'churn'
# label. Area-under-ROC needs a continuous score to rank rows by: the hard 0/1
# 'prediction' column collapses the ROC curve to a single operating point and
# discards the model's ranking information. Point the evaluator at the
# 'rawPrediction' column that the logistic-regression model emits instead.
eval_churn = BinaryClassificationEvaluator(labelCol='churn',
                                           rawPredictionCol='rawPrediction')

In [21]:
# Score the test-set predictions with eval_churn. No metricName was set on the
# evaluator, so its default metric applies (areaUnderROC per the PySpark docs).
auc = eval_churn.evaluate(pred_and_labels.predictions)
# Bare expression so the notebook displays the value as the cell output.
auc

In [22]:
# Refit the same estimator on ALL labelled data (train + test) to obtain the
# model that will be used to predict churn for new, unlabelled customers.
final_lr_model = lr_churn.fit(final_data)

In [23]:
# Load the new customers to score. The first row supplies the column names
# and Spark infers each column's type from the data.
new_data = spark.read.csv(
    '/FileStore/tables/new_customers.csv',
    header=True,
    inferSchema=True,
)

In [24]:
# Build the 'features' vector for the new customers with the same assembler
# (same input columns, same order) that was applied to the training data.
test_new_data = assembler.transform(new_data)

In [25]:
# Apply the model fitted on all labelled data to the new customers; the
# result includes a 'prediction' column, which is selected further below.
new_results = final_lr_model.transform(test_new_data)

In [26]:
# Print the first rows of the scored new-customer data, all columns.
new_results.show()

In [27]:
# Condensed view: just the company name next to its predicted churn class.
(
    new_results
    .select('Company', 'prediction')
    .show()
)