In [18]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LinearSVC
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import expr
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from helpers.helper_functions import translate_to_file_string
from pyspark.sql import DataFrameReader
from pyspark.sql import SparkSession
from pyspark.ml.feature import IndexToString, Normalizer, StringIndexer, VectorAssembler, VectorIndexer, StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from helpers.helper_functions import translate_to_file_string
from pyspark.ml.classification import MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier
from sklearn.metrics import roc_curve, auc
import seaborn as sns
import pandas as pd
import os
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [19]:
inputFile = translate_to_file_string("../data/heart_val.csv")

In [20]:
spark = (SparkSession
       .builder
       .appName("HeartDiseaseAnalNeuralNw")
       .getOrCreate())

In [21]:
# load data file.
# create a DataFrame using an ifered Schema 
df = spark.read.option("header", "true") \
       .option("inferSchema", "true") \
       .option("delimiter", ";") \
       .csv(inputFile)


In [22]:
#transform labels
labelIndexer = StringIndexer().setInputCol("target").setOutputCol("label").fit(df)
sexIndexer = StringIndexer().setInputCol("sex").setOutputCol("sex_num").fit(df)

In [23]:
#feature columns for evaluation
featureCols = df.columns.copy()
featureCols.remove("target")
featureCols.remove("sex")
featureCols = featureCols + ["sex_num"]

In [24]:
#create vector assembler of feature columns
assembler =  VectorAssembler(outputCol="features", inputCols=featureCols)

In [25]:
#Build feauture Indexer 
featureIndexer = VectorIndexer(inputCol="features",outputCol="indexedFeatures", maxCategories=6)

In [26]:
#Create scaler for scaled output
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

In [27]:
#Convert Indexed labels back to original labels
predConverter = IndexToString(inputCol="prediction",outputCol="predictedLabel",labels=labelIndexer.labels)

In [28]:
#create Classifier 
nn = MultilayerPerceptronClassifier(seed=1234, featuresCol="scaledFeatures")
   
# build network parameters grid
		

paramGrid =  ParamGridBuilder().addGrid(nn.layers, [[ len(featureCols), 10, 5, 2 ]]) \
				.addGrid(nn.blockSize,  [128 ]) \
                .addGrid(nn.maxIter,[ 100, 1000, 5000 ] )\
				.addGrid(nn.stepSize, [0.003, 0.03, 0.3 ])\
				.addGrid(nn.tol, [ 0.05, 0.1, 0.2 ]) \
				.build()

In [29]:
#split data for etsting

splits = df.randomSplit([0.7, 0.3 ], 5433)
train = splits[0]
test = splits[1]

In [30]:
#Pipelining of all steps 
pipeline = Pipeline(stages= [labelIndexer,sexIndexer,  assembler, featureIndexer, scaler, nn , predConverter])

In [32]:
#build evaluator 
evaluator =  BinaryClassificationEvaluator(labelCol="label",rawPredictionCol="rawPrediction", metricName="areaUnderROC")

In [33]:
#Cross validator
cv = CrossValidator(estimator=pipeline, evaluator=evaluator,estimatorParamMaps=paramGrid,numFolds=10, parallelism=2)

In [34]:
#train model
nwModel = cv.fit(train)

In [35]:
#Find out the best model
bestModel = nwModel.bestModel.stages[5]
print("Layers: " , bestModel.layers)
print(bestModel.explainParams())

Layers:  MultilayerPerceptronClassifier_b844c5104df1__layers
blockSize: block size for stacking input data in matrices. Data is stacked within partitions. If block size is more than remaining data in a partition then it is adjusted to the size of this data. (default: 128, current: 128)
featuresCol: features column name. (default: features, current: scaledFeatures)
initialWeights: The initial weights of the model. (undefined)
labelCol: label column name. (default: label)
layers: Sizes of layers from input layer to output layer E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 neurons and output layer of 10 neurons. (current: [13, 10, 5, 2])
maxIter: max number of iterations (>= 0). (default: 100, current: 100)
predictionCol: prediction column name. (default: prediction)
probabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not 

In [36]:
#test model
predictions = nwModel.transform(test)
predictions.show()

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+-----+-------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|label|sex_num|            features|     indexedFeatures|      scaledFeatures|       rawPrediction|         probability|prediction|predictedLabel|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+-----+-------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------+
| 35|  f|  0|     138| 183|  0|      1|    182|    0|    1.4|    2|  0|   2|     y|  0.0|    1.0|[35.0,0.0,138.0,1...|[35.0,0.0,138.0,1...|[3.90265392446968...|[0.75600387078402...|[0.56268825971894...|       0.0|             y|
| 35|  m|  0|     120| 198|  0|      1|    130|    1|    1.6|    1|  0|   3|     n| 

In [37]:
accuracy = evaluator.evaluate(predictions)
print("Test Error = " ,(1.0 - accuracy))

Test Error =  0.3524970963995354


In [38]:
spark.stop()