In [1]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LinearSVC
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import expr
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from helpers.helper_functions import translate_to_file_string
from pyspark.sql import DataFrameReader
from pyspark.sql import SparkSession
from pyspark.ml.feature import IndexToString, Normalizer, StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from helpers.helper_functions import translate_to_file_string
from pyspark.mllib.evaluation import MulticlassMetrics
from sklearn.metrics import roc_curve, auc
import seaborn as sns
import pandas as pd
import os
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [2]:
inputFile = translate_to_file_string("../data/heart_val.csv")

In [3]:
spark = (SparkSession
       .builder
       .appName("HeartDiseaseAnalSVM")
       .getOrCreate())

In [4]:
# load data file.
# create a DataFrame using an ifered Schema 
df = spark.read.option("header", "true") \
       .option("inferSchema", "true") \
       .option("delimiter", ";") \
       .csv(inputFile)
print(df.printSchema())

#Pandas df for visualization
dfp = df.toPandas()

root
 |-- age: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slope: integer (nullable = true)
 |-- ca: integer (nullable = true)
 |-- thal: integer (nullable = true)
 |-- target: string (nullable = true)

None


In [5]:
#transform labels
labelIndexer = StringIndexer().setInputCol("target").setOutputCol("label").fit(df)
sexIndexer = StringIndexer().setInputCol("sex").setOutputCol("sex_num").fit(df)

In [6]:
#feature columns
featureCols = df.columns.copy()
featureCols.remove("target")
featureCols.remove("sex")
featureCols = featureCols + ["sex_num"]

In [7]:
#vector assembler
assembler =  VectorAssembler(outputCol="features", inputCols=list(featureCols))

In [8]:
#Build feauture Indexer 
featureIndexer = VectorIndexer(inputCol="features",outputCol="indexedFeatures", maxCategories=6)

In [9]:
#Convert Indexed labels back to original labels
predConverter = IndexToString(inputCol="prediction",outputCol="predictedLabel",labels=labelIndexer.labels)

In [10]:
lsvc = LinearSVC(labelCol="label",aggregationDepth=2, featuresCol="features" )

In [11]:
# build a Parameter Grip for testing
paramGrid = ParamGridBuilder().addGrid(lsvc.maxIter, [100])\
                                 .addGrid(lsvc.regParam, [0.1, 0.001, 0.0001]) \
                                 .addGrid(lsvc.standardization, [True, False]) \
                                 .build()

In [12]:
#split data for testing

splits = df.randomSplit([0.8, 0.2 ], 1234)
train = splits[0]
test = splits[1]

In [13]:
#Pipelining of all steps 
pipeline = Pipeline(stages= [labelIndexer,sexIndexer,  assembler, featureIndexer, lsvc , predConverter])

In [15]:
#build evaluator 
evaluator =  BinaryClassificationEvaluator(labelCol="label",rawPredictionCol="rawPrediction", metricName="areaUnderROC")

In [16]:
#Cross validator
cvSVM = CrossValidator(estimator=pipeline, evaluator=evaluator,estimatorParamMaps=paramGrid,numFolds=5, parallelism=4)

In [17]:
#train model
cvSVMModel = cvSVM.fit(train)

In [18]:
#Find out the best model
linearSVCModel = cvSVMModel.bestModel.stages[4] # the stage at index 4 in the pipeline is the SVMModel
print("Best Params: \n", linearSVCModel.explainParams())
print("Param Map: \n", linearSVCModel.extractParamMap())

Best Params: 
 aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2, current: 2)
featuresCol: features column name. (default: features, current: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: label)
maxIter: max number of iterations (>= 0). (default: 100, current: 100)
predictionCol: prediction column name. (default: prediction)
rawPredictionCol: raw prediction (a.k.a. confidence) column name. (default: rawPrediction)
regParam: regularization parameter (>= 0). (default: 0.0, current: 0.1)
standardization: whether to standardize the training features before fitting the model. (default: True, current: True)
threshold: The threshold in binary classification applied to the linear model prediction.  This threshold can be any real number, where Inf will make all predictions 0.0 and -Inf will make all predictions 1.0. (default: 0.0)
tol: the convergence tolerance for iterative algorithms (>= 0

In [19]:
#test model
predictions = cvSVMModel.transform(test)
predictions.show()

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+-----+-------+--------------------+--------------------+--------------------+----------+--------------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|label|sex_num|            features|     indexedFeatures|       rawPrediction|prediction|predictedLabel|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+-----+-------+--------------------+--------------------+--------------------+----------+--------------+
| 34|  f|  1|     118| 210|  0|      1|    192|    0|    0.7|    2|  0|   2|     y|  0.0|    1.0|[34.0,1.0,118.0,2...|[34.0,1.0,118.0,2...|[2.30823258493557...|       0.0|             y|
| 35|  m|  0|     120| 198|  0|      1|    130|    1|    1.6|    1|  0|   3|     n|  1.0|    0.0|[35.0,0.0,120.0,1...|[35.0,0.0,120.0,1...|[-1.1022514592139...|       1.0|             n|
| 41|  f|  1|     130| 204|  0|      0|    172|    0|    1.4|    

In [20]:
accuracy = evaluator.evaluate(predictions)
print("Test Error = " ,(1.0 - accuracy))

Test Error =  0.09717607973421916


In [21]:
#confusion matrix

predictionAndLabels = predictions.select("prediction", "label").rdd.map(lambda p: [p[0], float(p[1])]) # Map to RDD prediction|label
metrics =  MulticlassMetrics(predictionAndLabels)

In [22]:
confusion = metrics.confusionMatrix()
print("Confusion matrix: \n" , confusion)

Confusion matrix: 
 DenseMatrix([[40.,  3.],
             [ 8., 20.]])


In [23]:
##statistics per label

labels = predictionAndLabels.map(lambda x: x[1]).distinct().collect()
print(labels)
for label in  labels:
  print("Class %f precision = %f\n" % (label , metrics.precision(label)))
  print("Class %f recall = %f\n" % (label, metrics.recall(label)))
  print("Class %f F1 score = %f\n" % (label, metrics.fMeasure( label)))

[0.0, 1.0]
Class 0.000000 precision = 0.833333

Class 0.000000 recall = 0.930233

Class 0.000000 F1 score = 0.879121

Class 1.000000 precision = 0.869565

Class 1.000000 recall = 0.714286

Class 1.000000 F1 score = 0.784314



In [24]:
#weighted stats
print("Weighted precision = %s\n" % metrics.weightedPrecision)
print("Weighted recall = %s\n" % metrics.weightedRecall)
print("Weighted false positive rate = %s\n" % metrics.weightedFalsePositiveRate)

Weighted precision = 0.8476219636660544

Weighted recall = 0.8450704225352113

Weighted false positive rate = 0.20055215010996208



In [25]:
#summary of stats
print(f"Recall = {metrics.recall(1.0)}")
print(f"Precision = {metrics.precision(1.0)}")
print(f"Accuracy = {metrics.accuracy}") 
print(f"F1 = {metrics.fMeasure(1.0)}")

Recall = 0.7142857142857143
Precision = 0.8695652173913043
Accuracy = 0.8450704225352113
F1 = 0.7843137254901961


In [26]:
spark.stop()