In [0]:
# Read the new data file (CSV with a header row, comma separated) into a DataFrame,
# letting Spark infer the column types
newstrokeDF = (
    spark.read.format('csv')
    .option("inferSchema", True)
    .option("header", True)
    .option("sep", ',')
    .load("/FileStore/tables/SparkMLib/newheartstroke.csv")
)

In [0]:
# Inspect the inferred schema, then preview the first 5 records without truncating values
newstrokeDF.printSchema()
newstrokeDF.show(n=5, truncate=False)

root
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- diabetes: double (nullable = true)
 |-- hypertension: double (nullable = true)
 |-- heart_disease: string (nullable = true)
 |-- smoking_history: string (nullable = true)
 |-- BMI: double (nullable = true)

+------+----+--------+------------+-------------+---------------+-----+
|gender|age |diabetes|hypertension|heart_disease|smoking_history|BMI  |
+------+----+--------+------------+-------------+---------------+-----+
|Female|28.0|0.0     |0.0         |No           |never          |28.48|
|Male  |5.0 |0.0     |0.0         |No           |never          |19.29|
|Female|50.0|1.0     |0.0         |No           |never          |35.91|
|Female|32.0|0.0     |0.0         |No           |never          |39.17|
|Female|22.0|0.0     |0.0         |No           |former         |22.48|
+------+----+--------+------------+-------------+---------------+-----+
only showing top 5 rows



In [0]:
# Note: the display() function is available only in Databricks; it renders the records in tabular form
display(newstrokeDF)

gender,age,diabetes,hypertension,heart_disease,smoking_history,BMI
Female,28.0,0.0,0.0,No,never,28.48
Male,5.0,0.0,0.0,No,never,19.29
Female,50.0,1.0,0.0,No,never,35.91
Female,32.0,0.0,0.0,No,never,39.17
Female,22.0,0.0,0.0,No,former,22.48
Female,43.0,0.0,0.0,No,current,23.79
Male,24.0,0.0,0.0,No,never,28.6
Female,67.0,1.0,0.0,Yes,former,32.31
Male,41.0,0.0,0.0,No,former,29.81
Female,52.0,0.0,0.0,No,never,23.63


In [0]:
# Count the number of records in the new data
newstrokeDF.count()

Out[4]: 14

In [0]:
# Get summary statistics (count, mean, stddev, min, max) for every column of the DataFrame
newstrokeDF.describe().show()

+-------+------+------------------+-------------------+------------+-------------+---------------+-----------------+
|summary|gender|               age|           diabetes|hypertension|heart_disease|smoking_history|              BMI|
+-------+------+------------------+-------------------+------------+-------------+---------------+-----------------+
|  count|    14|                14|                 14|          14|           14|             14|               14|
|   mean|  null|38.214285714285715|0.14285714285714285|         0.0|         null|           null|27.68142857142857|
| stddev|  null|16.655692357632205|0.36313651960128146|         0.0|         null|           null|5.455286578022428|
|    min|Female|               5.0|                0.0|         0.0|           No|        current|            19.29|
|    max|  Male|              67.0|                1.0|         0.0|          Yes|          never|            39.17|
+-------+------+------------------+-------------------+---------

In [0]:
# None of the columns contains null values (every column's count equals the 14 records).
# If any did, we would have to drop those records first.
# Now we can load the model and apply it to this new data.

In [0]:
# Load the saved pipeline model

# Import PipelineModel from the pyspark.ml package
from pyspark.ml import PipelineModel

# Load the model from the location where it is stored.
# The loaded model is called `pipelinemodel` from here on.
pipelinemodel = PipelineModel.load("/FileStore/tables/SparkMLib/dtcpipelinemodel")

In [0]:
# Use the PipelineModel object to perform prediction on the new data.
# Calling .transform() runs the new data through the pipeline and returns the result as `prediction`
prediction = pipelinemodel.transform(newstrokeDF)

In [0]:
# Inspect the schema of the transformed DataFrame to see which columns were added
prediction.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- diabetes: double (nullable = true)
 |-- hypertension: double (nullable = true)
 |-- heart_disease: string (nullable = true)
 |-- smoking_history: string (nullable = true)
 |-- BMI: double (nullable = true)
 |-- BodyType: double (nullable = true)
 |-- ageGroup: double (nullable = true)
 |-- gender_indexed: double (nullable = false)
 |-- heart_disease_indexed: double (nullable = false)
 |-- smoking_history_indexed: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [0]:
# As we can see, the newstroke DataFrame has been transformed by the pipeline model:
# the binarizer added the BodyType column
# the bucketizer added the ageGroup column
# the string indexers added the respective *_indexed columns
# the vector assembler added the features vector column
# finally, dtc added the rawPrediction, probability and prediction columns

In [0]:
# Preview the first 5 transformed records without truncating column values
prediction.show(n=5, truncate=False)

+------+----+--------+------------+-------------+---------------+-----+--------+--------+--------------+---------------------+-----------------------+-------------------------+---------------+-----------------------------------------+----------+
|gender|age |diabetes|hypertension|heart_disease|smoking_history|BMI  |BodyType|ageGroup|gender_indexed|heart_disease_indexed|smoking_history_indexed|features                 |rawPrediction  |probability                              |prediction|
+------+----+--------+------------+-------------+---------------+-----+--------+--------+--------------+---------------------+-----------------------+-------------------------+---------------+-----------------------------------------+----------+
|Female|28.0|0.0     |0.0         |No           |never          |28.48|0.0     |1.0     |0.0           |0.0                  |0.0                    |(7,[3],[1.0])            |[19717.0,53.0] |[0.9973191704602934,0.002680829539706626]|0.0       |
|Male  |5.0 |0.0

In [0]:
# Print only the model output columns, without truncating the vector values
(
    prediction
    .select('rawPrediction', 'probability', 'prediction')
    .show(truncate=False)
)

+---------------+-----------------------------------------+----------+
|rawPrediction  |probability                              |prediction|
+---------------+-----------------------------------------+----------+
|[19717.0,53.0] |[0.9973191704602934,0.002680829539706626]|0.0       |
|[19717.0,53.0] |[0.9973191704602934,0.002680829539706626]|0.0       |
|[13020.0,303.0]|[0.9772573744652105,0.02274262553478946] |0.0       |
|[19717.0,53.0] |[0.9973191704602934,0.002680829539706626]|0.0       |
|[19717.0,53.0] |[0.9973191704602934,0.002680829539706626]|0.0       |
|[19717.0,53.0] |[0.9973191704602934,0.002680829539706626]|0.0       |
|[19717.0,53.0] |[0.9973191704602934,0.002680829539706626]|0.0       |
|[13020.0,303.0]|[0.9772573744652105,0.02274262553478946] |0.0       |
|[19717.0,53.0] |[0.9973191704602934,0.002680829539706626]|0.0       |
|[13020.0,303.0]|[0.9772573744652105,0.02274262553478946] |0.0       |
|[19717.0,53.0] |[0.9973191704602934,0.002680829539706626]|0.0       |
|[1971

In [0]:
# Render the model output columns as a Databricks table.
# display() must receive the DataFrame itself: DataFrame.show() only prints a text
# table and returns None, so wrapping it in display() would render nothing.
display(prediction.select('rawPrediction', 'probability', 'prediction'))

+---------------+-----------------------------------------+----------+
|rawPrediction  |probability                              |prediction|
+---------------+-----------------------------------------+----------+
|[19717.0,53.0] |[0.9973191704602934,0.002680829539706626]|0.0       |
|[19717.0,53.0] |[0.9973191704602934,0.002680829539706626]|0.0       |
|[13020.0,303.0]|[0.9772573744652105,0.02274262553478946] |0.0       |
|[19717.0,53.0] |[0.9973191704602934,0.002680829539706626]|0.0       |
|[19717.0,53.0] |[0.9973191704602934,0.002680829539706626]|0.0       |
|[19717.0,53.0] |[0.9973191704602934,0.002680829539706626]|0.0       |
|[19717.0,53.0] |[0.9973191704602934,0.002680829539706626]|0.0       |
|[13020.0,303.0]|[0.9772573744652105,0.02274262553478946] |0.0       |
|[19717.0,53.0] |[0.9973191704602934,0.002680829539706626]|0.0       |
|[13020.0,303.0]|[0.9772573744652105,0.02274262553478946] |0.0       |
|[19717.0,53.0] |[0.9973191704602934,0.002680829539706626]|0.0       |
|[1971