In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session for this notebook (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName('logreg')
    .getOrCreate()
)

In [3]:
# Load every column of the customer_churn table into a DataFrame.
# NOTE(review): assumes customer_churn is already registered as a table/view
# in this Spark session — it is not created anywhere in this notebook; confirm.
df = spark.sql('select * from customer_churn')

In [4]:
# Preview the first 5 rows of the raw data.
df.show(5)

In [5]:
# Print the column names and types of the raw data.
df.printSchema()

In [6]:
# Keep only the four feature columns and the Churn label used for modelling.
df2 = df.select(
    'Age',
    'Total_Purchase',
    'Years',
    'Num_Sites',
    'Churn',
)

In [7]:
# Row count per Churn value, to see how the label is distributed.
(
    df2.groupBy('Churn')
    .count()
    .show()
)

In [8]:
# Preview the first 5 rows of the selected columns.
df2.show(5)

In [9]:
from pyspark.ml.feature import VectorAssembler

In [10]:
# Pack the four predictor columns into a single vector column named 'features'.
assembler = VectorAssembler(
    inputCols=['Age', 'Total_Purchase', 'Years', 'Num_Sites'],
    outputCol='features',
)

# Apply the assembler to the selected columns.
output = assembler.transform(df2)

In [11]:
# Preview the first 5 rows after assembling the 'features' column.
output.show(5)

In [12]:
# Modelling dataset: just the assembled feature vector and the Churn label.
final_data = output.select('features','Churn')

In [13]:
# 70/30 train/test split. A fixed seed keeps the split repeatable across
# Restart & Run All (given the same input data and partitioning), so the
# fitted model and the metrics below do not change from run to run.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [14]:
from pyspark.ml.classification import LogisticRegression

In [15]:
# Logistic regression estimator: reads the 'features' vector, learns the
# 'Churn' label, and writes its predicted label to 'prediction'.
log = LogisticRegression(
    labelCol='Churn',
    featuresCol='features',
    predictionCol='prediction',
)

In [16]:
# Fit the logistic regression on the training split.
log_model = log.fit(train)

In [17]:
# Summary object of the fitted model; the training metrics printed in the
# next cell are read from it.
log_model_summary = log_model.summary

In [18]:
# Print the training metrics from the model summary, one per line, in the
# order: false-positive rate, recall, AUC, accuracy, precision.
for metric_name in (
    'falsePositiveRateByLabel',
    'recallByLabel',
    'areaUnderROC',
    'accuracy',
    'precisionByLabel',
):
    print(getattr(log_model_summary, metric_name))

In [19]:
# Evaluate the fitted model on the held-out test split; the returned summary
# exposes the metrics and the `predictions` DataFrame used in the cells below.
pred = log_model.evaluate(test)

In [20]:
# Test-set metrics from the evaluation summary, printed one per line.
print(pred.falsePositiveRateByLabel)
print(pred.recallByLabel)
print(pred.areaUnderROC)
print(pred.accuracy)
print(pred.precisionByLabel)

# show() prints the table itself and returns None, so it must not be wrapped
# in print() — doing so emitted a stray "None" after the table.
pred.predictions.show()

In [21]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [22]:
# Score on the model's continuous 'rawPrediction' column, not the hard 0/1
# 'prediction' column. Feeding thresholded labels to a ROC-based metric
# collapses the curve to a single operating point and misreports the AUC.
# 'rawPrediction' is the column LogisticRegression writes by default.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Churn',
)

In [23]:
# Score the test-set predictions with the evaluator built in the previous cell;
# the bare `auc` on the last line displays the value as the cell output.
auc = evaluator.evaluate(pred.predictions)
auc

In [24]:
# Load the new customers that the final model will score.
# NOTE(review): assumes new_customers is already registered as a table/view
# in this Spark session — confirm.
new_data = spark.sql('select * from new_customers')

In [25]:
# Refit on the full labelled dataset (training and test rows together)
# before scoring the new customers.
log = LogisticRegression(
    featuresCol='features',
    labelCol='Churn',
)
final_model = log.fit(final_data)

In [26]:
# Reuse the assembler defined earlier: it builds 'features' only from the
# input columns it was configured with, not from every variable in new_data.
new_data2 = assembler.transform(new_data) 

In [27]:
# Score the assembled new-customer rows with the model fitted on all labelled data.
final_results = final_model.transform(new_data2)

In [28]:
# Display the scored new-customer rows.
final_results.show()