## Linear Support Vector Classifier (SVC)

In [32]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [33]:
# one time run on the PC; need to run it everytime on colab and databrick notebooks
!pip install -U --quiet scikit-learn

distutils: /home/hadoop/.local/lib/python3.9/site-packages
sysconfig: /home/hadoop/.local/lib64/python3.9/site-packages[0m
user = True
home = None
root = None
prefix = None[0m


In [34]:
from pyspark.sql import SQLContext
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from sklearn.metrics import confusion_matrix

In [35]:
# Build (or reuse) the Spark session. getOrCreate() returns the session that is
# already active in this kernel instead of raising when the cell is re-run,
# which constructing SparkContext directly would do.
spark = SparkSession.builder.config(conf=SparkConf()).getOrCreate()
# Keep the underlying SparkContext available under its original name `sc`.
sc = spark.sparkContext

25/03/03 20:40:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Linear Support Vector Machine with pyspark
Import the data

In [36]:
# Load the semicolon-delimited bank dataset: the first row holds the column
# names and the column types are inferred from the values.
df = spark.read.csv(
    'data/SparkData/bank.csv',
    sep=";",
    header=True,
    inferSchema=True,
)
# Preview only: the result of drop() is not assigned, so df keeps every column.
df.drop(*('day', 'month', 'poutcome')).show(n=5)

+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
|age|        job|marital|education|default|balance|housing|loan| contact|duration|campaign|pdays|previous|  y|
+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
| 30| unemployed|married|  primary|     no|   1787|     no|  no|cellular|      79|       1|   -1|       0| no|
| 33|   services|married|secondary|     no|   4789|    yes| yes|cellular|     220|       1|  339|       4| no|
| 35| management| single| tertiary|     no|   1350|    yes|  no|cellular|     185|       1|  330|       1| no|
| 30| management|married| tertiary|     no|   1476|    yes| yes| unknown|     199|       4|   -1|       0| no|
| 59|blue-collar|married|secondary|     no|      0|    yes|  no| unknown|     226|       1|   -1|       0| no|
+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
only showing top 5 rows

In [37]:
# Print the column names and inferred types of the loaded DataFrame.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



## Deal with categorical data and convert the data to a single feature vector

In [38]:
# String-typed columns that will be indexed and one-hot encoded.
# NOTE(review): 'month' is also a string column in the schema but is not listed — confirm this is intentional.
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
# Integer columns passed into the feature vector unchanged.
# NOTE(review): 'age' and 'day' are integer columns in the schema but are not listed — confirm this is intentional.
num_cols = ['balance', 'duration', 'campaign', 'pdays', 'previous']
# Name of the target column (string-valued in the schema).
labelCol = 'y'

### Process categorical columns
The following code does three things with a Pipeline:<br>
* StringIndexer all categorical columns<br>
* OneHotEncoder all categorical index columns<br>
* VectorAssembler all feature columns into one vector column

Categorical columns

In [39]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# categorical columns: alias of cat_cols (same list object, not a copy)
categorical_columns = cat_cols

In [40]:
# Build one StringIndexer per categorical column; each one writes its numeric
# index to a new "<column>_indexed" column.
indexers = [
    StringIndexer(inputCol=name, outputCol="{0}_indexed".format(name))
    for name in categorical_columns
]

In [41]:
# Convert each indexed categorical column into a one-hot encoded sparse vector,
# written to "<indexed column>_encoded".
encoders = [
    OneHotEncoder(inputCol=indexed_col, outputCol="{0}_encoded".format(indexed_col))
    for indexed_col in (indexer.getOutputCol() for indexer in indexers)
]

In [42]:
# Combine the encoded categorical columns followed by the numeric columns
# into a single "features" vector used for training.
assembler = VectorAssembler(
    inputCols=[*(encoder.getOutputCol() for encoder in encoders), *num_cols],
    outputCol="features",
)

In [43]:
# Fit the preprocessing stages (indexers -> encoders -> assembler) on the full
# DataFrame and apply them to it.
pipeline = Pipeline(stages=[*indexers, *encoders, assembler])
model = pipeline.fit(df)
# Copy the target column to 'label' and keep only the features and the label.
data = (
    model.transform(df)
    .withColumn('label', col(labelCol))
    .select('features', 'label')
)
data.show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                 |label|
+---------------------------------------------------------------------------------------------------------+-----+
|(29,[8,11,15,16,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1787.0,79.0,1.0,-1.0])                |no   |
|(29,[4,11,13,16,17,19,22,24,25,26,27,28],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,4789.0,220.0,1.0,339.0,4.0])       |no   |
|(29,[0,12,14,16,17,18,19,22,24,25,26,27,28],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1350.0,185.0,1.0,330.0,1.0])|no   |
|(29,[0,11,14,16,17,20,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1476.0,199.0,4.0,-1.0])               |no   |
|(29,[1,11,13,16,17,18,20,21,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,226.0,1.0,-1.0])                  |no   |
+---------------------------------------------------------------------------------------

### The label is a string ("yes" or "no"), so we need to convert it to a numeric index

Build StringIndexer stages

In [44]:
# Fit a StringIndexer on the string labels, then append the numeric
# 'indexedLabel' column (with label metadata) to the data.
labelIndexer = StringIndexer(
    inputCol='label',
    outputCol='indexedLabel',
).fit(data)
data = labelIndexer.transform(data)

In [45]:
# Preview the first rows of the data after label indexing.
data.show(5)

+--------------------+-----+------------+
|            features|label|indexedLabel|
+--------------------+-----+------------+
|(29,[8,11,15,16,1...|   no|         0.0|
|(29,[4,11,13,16,1...|   no|         0.0|
|(29,[0,12,14,16,1...|   no|         0.0|
|(29,[0,11,14,16,1...|   no|         0.0|
|(29,[1,11,13,16,1...|   no|         0.0|
+--------------------+-----+------------+
only showing top 5 rows



In [46]:
from pyspark.ml.feature import VectorIndexer
# use VectorIndexer to automatically identify categorical features, and index them
# maxCategories=3: features with >3 distinct values are treated as continuous
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=3).fit(data)

In [47]:
# Append the 'indexedFeatures' column produced by the fitted VectorIndexer.
# NOTE(review): `data` is overwritten in place, so re-running this cell without
# re-running the earlier ones presumably fails because 'indexedFeatures' already exists — confirm.
data=featureIndexer.transform(data)
data.show(5)

+--------------------+-----+------------+--------------------+
|            features|label|indexedLabel|     indexedFeatures|
+--------------------+-----+------------+--------------------+
|(29,[8,11,15,16,1...|   no|         0.0|(29,[8,11,15,16,1...|
|(29,[4,11,13,16,1...|   no|         0.0|(29,[4,11,13,16,1...|
|(29,[0,12,14,16,1...|   no|         0.0|(29,[0,12,14,16,1...|
|(29,[0,11,14,16,1...|   no|         0.0|(29,[0,11,14,16,1...|
|(29,[1,11,13,16,1...|   no|         0.0|(29,[1,11,13,16,1...|
+--------------------+-----+------------+--------------------+
only showing top 5 rows



### Split the data into training and test data sets

In [48]:
# Split the data into training and test sets (60% training, 40% held out for testing).
# A fixed seed keeps the split, and therefore the reported metrics, repeatable
# between runs on the same input data.
(train_data, test_data) = data.randomSplit([0.6, 0.4], seed=42)
train_data.show(5, False)
test_data.show(5, False)

+------------------------------------------------------------------------------------------------+-----+------------+------------------------------------------------------------------------------------------------+
|features                                                                                        |label|indexedLabel|indexedFeatures                                                                                 |
+------------------------------------------------------------------------------------------------+-----+------------+------------------------------------------------------------------------------------------------+
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-588.0,81.0,4.0,-1.0])|no   |0.0         |(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-588.0,81.0,4.0,-1.0])|
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,117.0,635.0,1.0,-1.0])|no   |0.0         |(29,[0,11,13,16,17,18,1

### Build the LinearSVC model pipeline

In [49]:
# Linear support vector classifier that reads the indexed features and
# indexed labels, capped at 50 training iterations.
from pyspark.ml.classification import LinearSVC
lsvc = LinearSVC(
    maxIter=50,
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
)

In [50]:
from pyspark.ml.feature import IndexToString
# Map numeric predictions back to the original string labels, using the
# label list learned by labelIndexer.
labelConverter = IndexToString(
    labels=labelIndexer.labels,
    inputCol="prediction",
    outputCol="predictedLabel",
)

In [51]:
# chain the LinearSVC estimator and the label converter in a Pipeline
# (this rebinds the name `pipeline`, previously the preprocessing pipeline)
pipeline = Pipeline(stages=[lsvc, labelConverter])
# train the model on the training set
lsvcModel = pipeline.fit(train_data)

### Make predictions

In [52]:
# apply the fitted pipeline to the held-out test set to get predictions
predictions = lsvcModel.transform(test_data)
# display the first few prediction rows
predictions.show(5)

+--------------------+-----+------------+--------------------+--------------------+----------+--------------+
|            features|label|indexedLabel|     indexedFeatures|       rawPrediction|prediction|predictedLabel|
+--------------------+-----+------------+--------------------+--------------------+----------+--------------+
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[1.0650679352406,...|       0.0|            no|
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[1.05898663021228...|       0.0|            no|
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[0.96445812005862...|       0.0|            no|
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[1.05340940108253...|       0.0|            no|
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[1.03711121871714...|       0.0|            no|
+--------------------+-----+------------+--------------------+--------------------+----------+--------------+
only showing top 5 rows

### Evaluation

In [53]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Score the predictions against the true indexed labels using the accuracy metric.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexedLabel",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy = {accuracy}")
# Test error is the complement of accuracy.
print("Test error = %g" % (1.0 - accuracy))

Accuracy = 0.8961407491486947
Test error = 0.103859


### Get confusion matrix

In [54]:
# Collect labels and predictions in a single action so that each true label stays
# paired with its own prediction and the prediction job runs only once.
label_prediction_rows = predictions.select("indexedLabel", "prediction").collect()
# Unpack the Row objects into plain lists of floats.
y_orig = [row["indexedLabel"] for row in label_prediction_rows]
y_pred = [row["prediction"] for row in label_prediction_rows]

In [55]:
# generate the confusion matrix to evaluate the classification performance;
# true labels are passed first, predictions second
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix: ")
print(cm)

Confusion Matrix: 
[[1548   18]
 [ 165   31]]


### Here are the coefficients (weight vector) of the hyperplane

In [56]:
# Coefficient vector of the fitted LinearSVC (first stage of the fitted pipeline).
lsvcModel.stages[0].coefficients

DenseVector([-0.063, -0.059, -0.0497, -0.054, -0.0668, -0.0573, -0.0441, -0.0567, -0.0624, -0.0606, -0.0451, -0.0212, -0.0148, 0.0052, 0.0175, 0.0016, -0.0308, -0.0042, 0.0235, -0.0082, -0.0388, -2.0486, -1.9968, -2.0, 0.0, 0.0001, 0.0001, -0.0002, -0.0021])

### Here is the intercept of the hyperplane

In [57]:
# Intercept of the fitted LinearSVC (first stage of the fitted pipeline).
lsvcModel.stages[0].intercept

1.07398449000531

### Tear down machine learning pipeline

In [58]:
# Stop the SparkContext now that the analysis is finished.
sc.stop()