## Multinomial Logistic Regression

Use multinomial logistic regression to predict multiple classes based on features provided.

In [1]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression

In [2]:
sc = SparkContext(conf=SparkConf())
spark = SparkSession(sparkContext=sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/03 18:30:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# load training data
data = spark.read.format("libsvm").load("data/SparkData/sample_multiclass_classification_data.txt")

25/03/03 18:30:33 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.
                                                                                

In [4]:
data.select('label').distinct().show()

+-----+
|label|
+-----+
|  0.0|
|  1.0|
|  2.0|
+-----+



In [5]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,1,2,3],[0.1...|
|  1.0|(4,[0,2,3],[-0.83...|
|  2.0|(4,[0,1,2,3],[-1....|
|  2.0|(4,[0,1,2,3],[-1....|
|  1.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,2,3],[0.611...|
|  0.0|(4,[0,1,2,3],[0.2...|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  2.0|(4,[0,1,2,3],[-0....|
|  2.0|(4,[0,1,2,3],[-0....|
|  2.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,2,3],[-0.94...|
|  2.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,1,2,3],[0.1...|
|  2.0|(4,[0,1,2,3],[-0....|
+-----+--------------------+
only showing top 20 rows



In [6]:
train_data, test_data = data.randomSplit([0.8, 0.2])

In [7]:
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, \
                        featuresCol='features', labelCol='label')

In [8]:
# fit the model
model = lr.fit(train_data)

25/03/03 18:30:49 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [9]:
# make predictions
predictions = model.transform(test_data)
# select example rows to display
predictions.show(5)
# predictions.show(5, False)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(4,[0,1,2,3],[-0....|[0.17671197056610...|[0.41386611807366...|       0.0|
|  0.0|(4,[0,1,2,3],[0.0...|[0.12642328579967...|[0.39292057646508...|       2.0|
|  0.0|(4,[0,1,2,3],[0.0...|[0.25214529944908...|[0.44635661507787...|       0.0|
|  0.0|(4,[0,1,2,3],[0.1...|[0.12642328579967...|[0.40898887447851...|       2.0|
|  0.0|(4,[0,1,2,3],[0.2...|[0.12642328579967...|[0.40710013779628...|       2.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [10]:
predictions.select('label', 'rawPrediction', 'probability', 'prediction').filter('label = 2.0').show(5, False)

+-----+---------------------------------------------------------------+------------------------------------------------------------+----------+
|label|rawPrediction                                                  |probability                                                 |prediction|
+-----+---------------------------------------------------------------+------------------------------------------------------------+----------+
|2.0  |[0.050989956916697915,-0.36966871193287,0.15870503324587906]   |[0.3609659300637143,0.237015350356587,0.4020187195796988]   |2.0       |
|2.0  |[-0.07473235846602755,-0.10868145817303719,0.15870503324587906]|[0.30964023440344984,0.2993046615709015,0.39105510402564864]|2.0       |
|2.0  |[0.02584540332015816,-0.3120415482160539,0.15870503324587906]  |[0.3502178507432775,0.24980195901951904,0.39998019023720355]|2.0       |
|2.0  |[0.12642328579967008,-0.542548199148243,0.15870503324587906]   |[0.39292057646508016,0.20126772983728808,0.4058116936976317]|2.0 

In [11]:
# print the coefficients and intercept for multinomial logistic regresssion
print("Coefficients: {}".format(model.coefficientMatrix))
print("Intercepts: {}".format(model.interceptVector))

Coefficients: 3 X 4 CSRMatrix
(0,3) 0.3017
(1,2) -0.8008
(1,3) -0.3658
Intercepts: [0.0007009704169446199,-0.15940600366282368,0.15870503324587906]


In [12]:
trainingSummary = model.summary

In [13]:
# for multiclass, we can inspect metrics on a per-label basis
print("False positive rate by label:")
for i in range(len(trainingSummary.falsePositiveRateByLabel)):
    print("label {}:{}".format(i, trainingSummary.falsePositiveRateByLabel[i]))

False positive rate by label:
label 0:0.0
label 1:0.011904761904761904
label 2:0.20253164556962025


In [15]:
print("True positive rate by label:")
for i in range(len(trainingSummary.truePositiveRateByLabel)):
    print("label {}: {}".format(i, trainingSummary.truePositiveRateByLabel[i]))

True positive rate by label:
label 0: 0.5897435897435898
label 1: 1.0
label 2: 0.9777777777777777


In [16]:
print("Precision by label:")
for i in range(len(trainingSummary.precisionByLabel)):
    print("label {}: {}".format(i, trainingSummary.precisionByLabel[i]))

Precision by label:
label 0: 1.0
label 1: 0.975609756097561
label 2: 0.7333333333333333


In [17]:
print("Recall by label:")
for i in range(len(trainingSummary.recallByLabel)):
    print("label {}: {}".format(i, trainingSummary.recallByLabel[i]))
# recall need improvement for label2(class 2)

Recall by label:
label 0: 0.5897435897435898
label 1: 1.0
label 2: 0.9777777777777777


In [18]:
print("F-measure by label:")
for i in range(len(trainingSummary.fMeasureByLabel())):
    print("label {}: {}".format(i, trainingSummary.fMeasureByLabel()[i]))

F-measure by label:
label 0: 0.7419354838709677
label 1: 0.9876543209876543
label 2: 0.838095238095238


In [19]:
accuracy = trainingSummary.accuracy

In [20]:
# False positive rate is the percent the model incorrectly predicts the class
# True positive rate is the percent the model correctly predicts the class
# Precision is the percent of the correctly prediction out of all times
# Recall is the percent that the model correctly identifies out of all the actual class instances)
# F-measure is the harmonic mean of precision and recall, balancing both metrics.
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: {0}\nFPR: {1}\nTPR: {2}\nF-measure: {3}\nPrecision: {4}\nRecall: {5}".format \
      (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Accuracy: 0.8629032258064516
FPR: 0.07733963328083378
TPR: 0.8629032258064517
F-measure: 0.8560963098770938
Precision: 0.8953579858379228
Recall: 0.8629032258064517


In [21]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy)) # test error means the percent the model misclassified of the test data

Test Error = 0.192308


## There are 3 lines determines 3 classes (3 predictive values), hence, 3 slopes and 3 intercepts.

In [22]:
# model.coefficientMatrix represents the weights for each feature for each of the 3 classes
# positive coefficient increases the likelihood of predicting the class while negative coefficient decreases it.
model.coefficientMatrix.toDense()
# for class 1, feature 2 has a coefficent of -0.7575, feature 3 has a coefficient of -0.292.
# for class 2, feature 3 has a coefficent of 0.3665.

DenseMatrix(3, 4, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.8008, 0.0, 0.3017, -0.3658, 0.0], False)

In [23]:
# intercepts for class 0, 1, and 2
model.interceptVector

DenseVector([0.0007, -0.1594, 0.1587])