In [1]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [2]:
# Create the Spark entry points, or reuse them if they already exist.
# Constructing SparkContext(...) directly raises an error when a context is
# already running in this kernel (e.g. when this cell is executed twice);
# getOrCreate() makes the cell safe to re-run.
sc = SparkContext.getOrCreate(conf=SparkConf())
# The builder attaches to the active SparkContext created/reused above.
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/28 19:59:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/28 19:59:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Logistic Regression with PySpark

## Import data

In [3]:
# Load the semicolon-delimited bank dataset: the first row is the header and
# column types are inferred from the values.
df = spark.read.csv(
    'data/SparkData/bank.csv',
    sep=";",
    inferSchema=True,
    header=True,
)
# Preview the first five rows without the day, month and poutcome columns.
df.drop('day', 'month', 'poutcome').show(n=5)

                                                                                

+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
|age|        job|marital|education|default|balance|housing|loan| contact|duration|campaign|pdays|previous|  y|
+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
| 30| unemployed|married|  primary|     no|   1787|     no|  no|cellular|      79|       1|   -1|       0| no|
| 33|   services|married|secondary|     no|   4789|    yes| yes|cellular|     220|       1|  339|       4| no|
| 35| management| single| tertiary|     no|   1350|    yes|  no|cellular|     185|       1|  330|       1| no|
| 30| management|married| tertiary|     no|   1476|    yes| yes| unknown|     199|       4|   -1|       0| no|
| 59|blue-collar|married|secondary|     no|      0|    yes|  no| unknown|     226|       1|   -1|       0| no|
+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
only showing top 5 rows

In [4]:
# Print the column names and the types inferred while reading the CSV.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



## Handle categorical data and assemble the features into a single vector

In [5]:
# Columns used by the model, grouped by how they are preprocessed.
# NOTE: `age`, `day` and `month` exist in the schema but appear in neither
# list, so they are not part of the feature vector.

# String-valued columns that get indexed and one-hot encoded.
cat_cols = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "poutcome",
]
# Integer-valued columns passed to the assembler unchanged.
num_cols = [
    "balance",
    "duration",
    "campaign",
    "pdays",
    "previous",
]
# Name of the target column in the raw data.
labelCol = "y"

## Process categorical columns
The following code does three things with a pipeline:
* **`StringIndexer`** all categorical columns
* **`OneHotEncoder`** all categorical index columns
* **`VectorAssembler`** all feature columns into one vector column

## Categorical columns

In [6]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

In [7]:
# categorical columns
# categorical columns (an alias: both names refer to the same list object)
categorical_columns = cat_cols

In [8]:
# convert categorical columns into numerical indices
# One StringIndexer per categorical column; each writes its numeric indices
# to a new column named "<column>_indexed".
indexers = [
    StringIndexer(
        inputCol=column_name,
        outputCol="{0}_indexed".format(column_name),
    )
    for column_name in categorical_columns
]

In [9]:
# convert indexed categorical features into one-hot encoded vectors
# One OneHotEncoder per indexer: it reads the indexer's output column and
# writes the encoded vector to "<indexed column>_encoded".
encoders = [
    OneHotEncoder(
        inputCol=string_indexer.getOutputCol(),
        outputCol="{0}_encoded".format(string_indexer.getOutputCol()),
    )
    for string_indexer in indexers
]

In [10]:
# combine feature columns into a single feature vector
# Concatenate every one-hot encoded column, followed by the numeric columns,
# into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=[one_hot.getOutputCol() for one_hot in encoders] + num_cols,
    outputCol="features",
)

In [11]:
# Stage order: index the categorical columns, one-hot encode the indices,
# then assemble everything into the "features" vector.
pipeline = Pipeline(stages=indexers + encoders + [assembler])
# Fit the pipeline on the full dataset df.
model = pipeline.fit(df)
# Apply the fitted pipeline, copy the target column into "label" and keep
# only the two columns needed downstream.
data = (
    model.transform(df)
    .withColumn("label", col(labelCol))
    .select("features", "label")
)
data.show(5, truncate=False)

25/02/28 19:59:36 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+---------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                 |label|
+---------------------------------------------------------------------------------------------------------+-----+
|(29,[8,11,15,16,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1787.0,79.0,1.0,-1.0])                |no   |
|(29,[4,11,13,16,17,19,22,24,25,26,27,28],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,4789.0,220.0,1.0,339.0,4.0])       |no   |
|(29,[0,12,14,16,17,18,19,22,24,25,26,27,28],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1350.0,185.0,1.0,330.0,1.0])|no   |
|(29,[0,11,14,16,17,20,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1476.0,199.0,4.0,-1.0])               |no   |
|(29,[1,11,13,16,17,18,20,21,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,226.0,1.0,-1.0])                  |no   |
+---------------------------------------------------------------------------------------

## The label is a string (`yes` or `no`), so it must be converted to a numeric index

## Build StringIndexer stages

In [12]:
# index labels, adding metadata to the label column
# Fit a StringIndexer on the string "label" column; the fitted indexer adds
# a numeric "indexedLabel" column (with label metadata) when applied.
labelIndexer = (
    StringIndexer(inputCol="label", outputCol="indexedLabel")
    .fit(data)
)
data = labelIndexer.transform(data)
data.show(n=5)

+--------------------+-----+------------+
|            features|label|indexedLabel|
+--------------------+-----+------------+
|(29,[8,11,15,16,1...|   no|         0.0|
|(29,[4,11,13,16,1...|   no|         0.0|
|(29,[0,12,14,16,1...|   no|         0.0|
|(29,[0,11,14,16,1...|   no|         0.0|
|(29,[1,11,13,16,1...|   no|         0.0|
+--------------------+-----+------------+
only showing top 5 rows



In [13]:
from pyspark.ml.feature import VectorIndexer

# Automatically identify categorical features inside the "features" vector
# and index them, updating the column metadata accordingly. With
# maxCategories=4, features with more than 4 distinct values are treated
# as continuous.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)

In [14]:
# Apply the fitted feature indexer, which adds the "indexedFeatures" column,
# then preview five rows.
data = featureIndexer.transform(data)
data.show(5)

+--------------------+-----+------------+--------------------+
|            features|label|indexedLabel|     indexedFeatures|
+--------------------+-----+------------+--------------------+
|(29,[8,11,15,16,1...|   no|         0.0|(29,[8,11,15,16,1...|
|(29,[4,11,13,16,1...|   no|         0.0|(29,[4,11,13,16,1...|
|(29,[0,12,14,16,1...|   no|         0.0|(29,[0,12,14,16,1...|
|(29,[0,11,14,16,1...|   no|         0.0|(29,[0,11,14,16,1...|
|(29,[1,11,13,16,1...|   no|         0.0|(29,[1,11,13,16,1...|
+--------------------+-----+------------+--------------------+
only showing top 5 rows



In [15]:
## Split the data into training and test sets

In [16]:
# split the data into training and test sets(40% held out for testing)
# Split the data into training and test sets (40% held out for testing).
# A fixed seed keeps the split, and therefore every metric computed further
# down, repeatable across Restart & Run All for the same input data.
# Without it each run draws a different random split.
(trainingData, testData) = data.randomSplit([0.6, 0.4], seed=42)
# Preview five untruncated rows of each split.
trainingData.show(5, False)
testData.show(5, False)

+-------------------------------------------------------------------------------------------------+-----+------------+-------------------------------------------------------------------------------------------------+
|features                                                                                         |label|indexedLabel|indexedFeatures                                                                                  |
+-------------------------------------------------------------------------------------------------+-----+------------+-------------------------------------------------------------------------------------------------+
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-588.0,81.0,4.0,-1.0]) |no   |0.0         |(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-588.0,81.0,4.0,-1.0]) |
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.0,104.0,3.0,-1.0])  |no   |0.0         |(29,[0,11,13,1

## Build cross-validation model

### Estimator

In [17]:
from pyspark.ml.classification import LogisticRegression
# Logistic regression estimator that reads the "indexedFeatures" vector as
# input and the numeric "indexedLabel" column as the target.
logr = LogisticRegression(featuresCol="indexedFeatures", labelCol="indexedLabel")

### Pipeline Architecture

In [18]:
# convert indexed labels back to original labels.
from pyspark.ml.feature import IndexToString

# Convert the numeric "prediction" column back to the original string
# labels, using the label order learned by labelIndexer.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

In [19]:
# chain the logistic regression estimator and the label converter in a Pipeline
# NOTE: this rebinds `pipeline`, which earlier held the feature-engineering pipeline
pipeline = Pipeline(stages=[logr, labelConverter])

In [20]:
# train the model by fitting the pipeline's stages on the training split
# (the label and feature indexers were already applied to `data` earlier;
# they are not part of this pipeline)
# NOTE: this rebinds `model`, which earlier held the fitted feature pipeline
model = pipeline.fit(trainingData)

25/02/28 19:59:58 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


### Make predictions

In [21]:
# Score the held-out test split with the fitted pipeline.
predictions = model.transform(testData)
# Display a handful of example rows, limited to the columns of interest.
(
    predictions
    .select("features", "label", "predictedLabel", "rawPrediction")
    .show(n=5)
)

+--------------------+-----+--------------+--------------------+
|            features|label|predictedLabel|       rawPrediction|
+--------------------+-----+--------------+--------------------+
|(29,[0,11,13,16,1...|   no|            no|[3.64520239761018...|
|(29,[0,11,13,16,1...|   no|            no|[0.66304084695874...|
|(29,[0,11,13,16,1...|   no|            no|[3.15871192900358...|
|(29,[0,11,13,16,1...|   no|            no|[3.64482052432358...|
|(29,[0,11,13,16,1...|   no|            no|[2.75409657899604...|
+--------------------+-----+--------------+--------------------+
only showing top 5 rows



### Evaluation

In [22]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compare the "prediction" column with the true "indexedLabel" column on the
# test-set predictions and report accuracy and its complement.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="indexedLabel",
)
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy = {accuracy}")
print("Test Error = %g" % (1.0 - accuracy))

Accuracy = 0.8987271721084671
Test Error = 0.101273


### Evaluate training model
- Area under ROC https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc
- Accuracy
- False positive rate by label
- True positive rate by label
- Precision by label
- Recall by label
- F-measure by label

In [23]:
# The fitted logistic regression is the first stage of the fitted pipeline
# (the pipeline was built as [logr, labelConverter]).
lrModel = model.stages[0]
# Summary object exposed by the fitted model; the metrics in the following
# cells are read from it.
trainingSummary = lrModel.summary

In [24]:
# Optional (kept disabled): print the objective value per iteration.
# objectiveHistory = trainingSummary.objectiveHistory
# print("objectiveHistory:")
# for objective in objectiveHistory:
#     print(objective)
# obtain the receiver-operating characteristic as a dataframe and areaUnderROC
trainingSummary.roc.show(5)
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))
print(f"accuracy: {str(trainingSummary.accuracy)}")

                                                                                

+--------------------+--------------------+
|                 FPR|                 TPR|
+--------------------+--------------------+
|                 0.0|                 0.0|
|4.151100041511000...|0.003278688524590164|
|8.302200083022001E-4|0.006557377049180328|
|8.302200083022001E-4|0.013114754098360656|
|8.302200083022001E-4|0.019672131147540985|
+--------------------+--------------------+
only showing top 5 rows

areaUnderROC: 0.8844054740079826
accuracy: 0.9034635224760501


In [25]:
# Show the test-set predictions with every column; no row count is passed,
# so show() uses its default number of rows.
predictions.show()

+--------------------+-----+------------+--------------------+--------------------+--------------------+----------+--------------+
|            features|label|indexedLabel|     indexedFeatures|       rawPrediction|         probability|prediction|predictedLabel|
+--------------------+-----+------------+--------------------+--------------------+--------------------+----------+--------------+
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[3.64520239761018...|[0.97454856923068...|       0.0|            no|
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[0.66304084695874...|[0.65994314136597...|       0.0|            no|
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[3.15871192900358...|[0.95925062697289...|       0.0|            no|
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[3.64482052432358...|[0.97453909566059...|       0.0|            no|
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[2.75409657899604...|

In [26]:
# for multiclass, we can inspect metrics on a per-label basis
print("False positive rate by label:")
# Read the per-label list from the summary once and enumerate it, instead of
# re-reading the summary attribute for every index lookup.
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label {}:{}".format(i, rate))

False positive rate by label:
label 0:0.6918032786885245
label 1:0.021170610211706103


In [27]:
print("True positive rate by label:")
# Read the per-label list from the summary once and enumerate it, instead of
# re-reading the summary attribute for every index lookup.
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label {}:{}".format(i, rate))

True positive rate by label:
label 0:0.9788293897882939
label 1:0.3081967213114754


In [28]:
print("Precision by label")
# Read the per-label list from the summary once and enumerate it, instead of
# re-reading the summary attribute for every index lookup.
for i, value in enumerate(trainingSummary.precisionByLabel):
    print("label {}:{}".format(i, value))

Precision by label
label 0:0.917866874270144
label 1:0.6482758620689655


In [29]:
print("Recall by label")
# Read the per-label list from the summary once and enumerate it, instead of
# re-reading the summary attribute for every index lookup.
for i, value in enumerate(trainingSummary.recallByLabel):
    print("label {}:{}".format(i, value))

Recall by label
label 0:0.9788293897882939
label 1:0.3081967213114754


In [30]:
print("F-measure by label")
# fMeasureByLabel() is a method call; invoke it once and enumerate the
# result rather than calling it again on every loop iteration.
for i, value in enumerate(trainingSummary.fMeasureByLabel()):
    print("label {}:{}".format(i, value))

F-measure by label
label 0:0.9473684210526316
label 1:0.4177777777777778


In [31]:
# Collect the overall accuracy and the summary's weighted* aggregates.
# NOTE: `accuracy` is rebound here; it previously held the test-set accuracy.
accuracy = trainingSummary.accuracy
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
fMeasure = trainingSummary.weightedFMeasure()
truePositiveRate = trainingSummary.weightedTruePositiveRate
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
# Print all six values, one per line.
print(
    "Accuracy: {0}\nFPR: {1}\nTPR: {2}\nF-measure: {3}\nPrecision: {4}\nRecall: {5}".format(
        accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall
    )
)

Accuracy: 0.9034635224760501
FPR: 0.6164374113762807
TPR: 0.9034635224760501
F-measure: 0.8878528918710434
Precision: 0.8875701687722224
Recall: 0.9034635224760501


### Here are the weights $w$ and the intercept $b$ of the linear model, $z = w \cdot x + b$

In [32]:
# Coefficient vector of the fitted logistic regression (one weight per feature).
lrModel.coefficients

DenseVector([-0.5862, -0.9179, -0.5628, -0.3719, -0.6432, 0.2584, 0.0449, -0.8041, -1.2865, -0.3527, 0.2347, -0.5099, -0.3418, 0.4983, 0.7957, 0.5328, -0.7769, -0.5416, 0.9179, 0.3879, -0.7224, -2.668, -2.4764, -1.7132, 0.0, 0.0039, -0.0642, -0.0002, -0.0158])

In [33]:
# Intercept (bias term) of the fitted logistic regression.
lrModel.intercept

-0.46827825766787523