The following cell adds pyspark to `sys.path` at runtime, which is needed when pyspark is not on the system path by default. (`findspark.init()` itself prints nothing; call `findspark.find()` if you want to see the path of the Spark installation.)

In [None]:
import findspark
# Must run before `import pyspark` so that pyspark is importable in this kernel.
findspark.init()

In [None]:
import pyspark

Create a Spark Session

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) a Spark session named "Hyperparameter"
# that runs locally on three worker threads.
spark = (
    SparkSession.builder
    .appName("Hyperparameter")
    .master('local[3]')
    .getOrCreate()
)

Read the dataset into a dataframe.

In [None]:
# Load the churn CSV: the first row is the header, column types are inferred,
# and rows that cannot be parsed are dropped (DROPMALFORMED) instead of failing.
customer_churn = spark.read.csv(
    'customer_churn_exercise.csv',
    header=True,
    inferSchema=True,
    mode='DROPMALFORMED',
)

About the Dataset
This dataset comes from a marketing agency. It has 9 feature columns (listed below) and 1 target variable (`churn`).

Name: Name of the company whom the customer is tagged to
Age: Age of the Customer
Total_Purchase: Total Ads Purchased
Account_Manager: Binary 0=No manager, 1= Account manager assigned
Years: Total Years of customers using the company service
Num_Sites: Total number of websites that are using this service.
Onboard_date: Onboarding date of the latest contacted person.
Location: Headquarters address of the client
Company: Name of Client’s Company

In [None]:
# Peek at a single record, one field per line, without truncating long values.
customer_churn.show(n=1, truncate=False, vertical=True)

In [None]:
# Print each column's name and the data type Spark inferred for it.
customer_churn.printSchema()

Display summary statistics for the columns.

In [None]:
# summary() returns a new DataFrame of per-column statistics; .show() prints its
# rows. (In plain Jupyter, display() on a Spark DataFrame only renders the
# `DataFrame[col: type, ...]` repr, not the statistics themselves.)
customer_churn.summary().show()

Drop rows that contain null or NaN values.

In [None]:
# Report the row count before and after discarding rows with missing values.
print("rows: {}".format(customer_churn.count()))
# dropna() with no arguments removes every row that has a null/NaN in any column.
customer_churn = customer_churn.dropna()
# Use str.format with a placeholder, matching the first message (the original
# passed the builtin format(...) as a second print argument by mistake).
print("rows after dropna: {}".format(customer_churn.count()))

Import the pyspark modules required for pre-processing the data.

In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
# Input columns that get packed into the single 'features' vector column.
feature_columns = [
    'Age',
    'Total_Purchase',
    'Account_Manager',
    'Years',
    'Num_Sites',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [None]:
# Apply the assembler: returns the cleaned DataFrame with an added 'features' column.
output = assembler.transform(customer_churn)

In [None]:
# Keep only what the model needs: the assembled feature vector and the label column.
final_data = output.select('features','churn')

In [None]:
# Split roughly 70% / 30% into training and test sets. A fixed seed keeps the
# split (and therefore every downstream metric) stable across notebook re-runs.
train_churn, test_churn = final_data.randomSplit([0.7, 0.3], seed=42)

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
# Logistic-regression estimator that uses 'churn' as the label. featuresCol is not
# set, so it falls back to the library default ('features' per the PySpark docs).
lr_churn = LogisticRegression(labelCol='churn')

In [None]:
# Train the estimator on the training split; the result is the fitted model.
fitted_churn_model = lr_churn.fit(train_churn)

In [None]:
# Summary object the fitted model exposes for the data it was trained on.
training_sum = fitted_churn_model.summary

In [None]:
# Descriptive statistics of the model's predictions on the training data.
training_sum.predictions.describe().show()

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Evaluate the fitted model on the held-out test split. Despite the variable
# name, the result is a summary object; its `.predictions` attribute holds the
# scored rows used in the following cells.
pred_and_labels = fitted_churn_model.evaluate(test_churn)

In [None]:
# Show the first rows of the test-set predictions.
pred_and_labels.predictions.show()

In [None]:
# Score with the model's 'rawPrediction' column (continuous per-class scores).
# Passing the hard 0/1 'prediction' column instead collapses the ROC curve to a
# single threshold point and understates how well the model ranks customers.
churn_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='churn')

In [None]:
# Compute the evaluator's metric on the test-set predictions and print it.
# No metricName was set on the evaluator, so this is its default metric
# (areaUnderROC per the PySpark docs).
auc = churn_eval.evaluate(pred_and_labels.predictions)
print(auc)

In [None]:
# Shut down the Spark session; cells above cannot be re-run until it is recreated.
spark.stop()