#### Spark ML + Spark Structured Streaming

In [None]:
import org.apache.spark.sql.types.{StructType,LongType}
import org.apache.spark.ml.feature.{OneHotEncoder, VectorAssembler, MinMaxScaler, StringIndexer}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression

// Explicit schema for the heart-disease CSV (column order must match the files).
// All columns are read as integers except `oldpeak`: in the standard heart
// dataset these column names correspond to, `oldpeak` is a decimal measurement
// (e.g. 2.3). Typing it as LongType makes the CSV reader treat such fields as
// malformed -- null under the default PERMISSIVE mode and dropped rows under
// DROPMALFORMED -- so it is declared as DoubleType here.
import org.apache.spark.sql.types.DoubleType

val schema = new StructType()
      .add("age",LongType,true)
      .add("sex",LongType,true)
      .add("cp",LongType,true)
      .add("trtbps",LongType,true)
      .add("chol",LongType,true)
      .add("fbs",LongType,true)
      .add("restecg",LongType,true)
      .add("thalachh",LongType,true)
      .add("exng",LongType,true)
      .add("oldpeak",DoubleType,true)
      .add("slp",LongType,true)
      .add("caa",LongType,true)
      .add("thall",LongType,true)
      .add("output",LongType,true)
      
// Batch-load the CSV files under /tmp/spark_ml using the explicit schema.
// The target column "output" is renamed to "label", the column name the
// LogisticRegression stage reads by default.
val heartdF = spark.read
      .schema(schema)
      .option("header", "true")
      .csv("file:///tmp/spark_ml")
      .withColumnRenamed("output","label")

// Sanity check: number of rows loaded and the resulting column types.
println(heartdF.count())
heartdF.printSchema()

In [None]:
// Number of loaded rows whose `oldpeak` value is null.
heartdF.where("oldpeak is null").count()

In [None]:
// 80/20 train/test split. The fixed seed keeps the split -- and therefore the
// fitted model and the exported test rows -- repeatable across runs on the
// same input.
val Array(trainDF, testDF) = heartdF.randomSplit(weights = Array(0.8, 0.2), seed = 42L)

In [None]:

// Logistic-regression classifier; reads the default "features" and "label" columns.
val lr = new LogisticRegression()
  .setMaxIter(10)
  .setRegParam(0.01)

// One-hot encode the categorical columns.
// handleInvalid = "keep": the encoder learns the category count from the
// training split only, and with the default ("error") transform() fails on any
// category value that did not occur during fit -- likely for rare categories
// after a random split, and for any new value arriving on the stream. With
// "keep" (and the default dropLast = true) such values become an all-zeros vector.
val oneHotEnc = new OneHotEncoder()
  .setInputCols(Array("sex", "cp", "fbs", "restecg", "exng", "slp", "caa","thall"))
  .setOutputCols(Array("SexOHE", "cpOHE", "fbsOHE", "restecgOHE", "exngOHE", "slpOHE", "caaOHE","thallOHE"))
  .setHandleInvalid("keep")

// Gather the continuous columns into one vector so they can be scaled together.
// "skip" drops rows that contain nulls in any of these columns.
val assemblerA = new VectorAssembler()
  .setInputCols(Array("age", "trtbps", "chol", "thalachh", "oldpeak"))
  .setOutputCol("features_scaled1")
  .setHandleInvalid("skip")

// Rescale each continuous feature with MinMaxScaler (default output range).
val scaler = new MinMaxScaler()
  .setInputCol("features_scaled1")
  .setOutputCol("features_scaled")

// Final feature vector: one-hot encoded categoricals followed by the scaled continuous features.
val assemblerB = new VectorAssembler()
  .setInputCols(Array("SexOHE", "cpOHE", "fbsOHE", "restecgOHE", "exngOHE", "slpOHE", "caaOHE","thallOHE", "features_scaled"))
  .setOutputCol("features")
  .setHandleInvalid("skip")

// Stage order matters: each stage consumes columns produced by earlier ones.
val modelStages = Array(assemblerA, scaler, oneHotEnc, assemblerB, lr)

val pipeline = new Pipeline()
  .setStages(modelStages)

// NOTE: this val shadows the imported org.apache.spark.ml.PipelineModel class
// name. The name is kept because later cells refer to the fitted model as
// `PipelineModel`.
val PipelineModel = pipeline.fit(trainDF)

// Predictions on the training data itself -- a smoke test of the pipeline,
// not an unbiased estimate of model quality.
val trainingPred = PipelineModel.transform(trainDF)

trainingPred.select("label","probability","prediction").show(truncate=false)


In [None]:
// Export the held-out split as CSV (with a header row), spread over 10
// partitions so the output directory contains multiple part files.
// Any previous contents of the target directory are overwritten.
testDF.repartition(10)
    .write
    .mode("overwrite")
    .option("header", true)
    .csv("file:///tmp/spark_ml_streaming/")


In [None]:

// File-based streaming source over the CSV files of the held-out test split.
// The path now matches the directory the test split is written to
// (file:///tmp/spark_ml_streaming/); the previous path, file:///tmp/HeartTest/,
// is not produced anywhere in this notebook.
//  - maxFilesPerTrigger = 1: one file per micro-batch, to simulate a trickle of data
//  - mode = dropMalformed:   rows that do not parse against `schema` are discarded
// As in the batch load, "output" is renamed to "label".
val streamingSource=spark
    .readStream
    .format("csv")
    .option("header",true)
    .schema(schema)
    .option("ignoreLeadingWhiteSpace",true)
    .option("mode","dropMalformed")
    .option("maxFilesPerTrigger",1)
    .load("file:///tmp/spark_ml_streaming/")
    .withColumnRenamed("output","label")

In [None]:
// Score the streamed rows with the fitted pipeline, keeping only the true
// label, the class probabilities and the predicted class.
val streamingHeart = PipelineModel
    .transform(streamingSource)
    .select("label","probability","prediction")

// Print the newly scored rows of each micro-batch, untruncated, to the console.
// awaitTermination() blocks this cell until the query stops or is interrupted.
streamingHeart.writeStream
    .format("console")
    .option("truncate", false)
    .outputMode("append")
    .start()
    .awaitTermination()

#### Model validation metrics

Calculate the true positive and true negative rates (the model's sensitivity and specificity, respectively).

In [None]:
import org.apache.spark.sql.functions.{col, count, sum, when}

// Per true label: the share of rows whose prediction equals the label, plus
// the number of rows seen so far. For label 1 this share is the true positive
// rate, for label 0 the true negative rate.
// `.otherwise(0)` makes wrong predictions count as 0 instead of null, so a
// label group without a single correct prediction reports 0 rather than null.
// Columns are referenced via col(...) instead of Symbol literals ('label),
// which are deprecated in newer Scala versions and need spark.implicits in scope.
val streamingRates = PipelineModel.transform(streamingSource)
    .groupBy(col("label"))
    .agg(
        (sum(when(col("prediction") === col("label"), 1).otherwise(0)) / count(col("label"))).alias("true prediction rate"),
        count(col("label")).alias("count")
        )

// Streaming aggregation: "complete" mode re-emits the full, updated result
// table on every trigger. awaitTermination() blocks until the query stops.
streamingRates.writeStream
    .outputMode("complete")
    .option("truncate", false)
    .format("console")
    .start()
    .awaitTermination()

