# Machine Learning con PySpark

* Diego Andrés Alonzo Medinilla

In [58]:
import findspark
findspark.init()

import pandas as pd
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .master("local[16]")\
        .appName('Machine_Learning_w_PySpark')\
        .getOrCreate()

spark



In [59]:
from pyspark.sql.types import *
titanicSchema = StructType([
StructField("age", IntegerType(), True),
StructField("sex", IntegerType(), True),
StructField("ChesPainType", IntegerType(), True),
StructField("RestingBloodPressureInmmHg", FloatType(), True),
StructField("CholesterolMgPerDL", FloatType(), True),
StructField("FastingBloodSugar", FloatType(), True),
StructField("RestingElectrocardiographicResults", IntegerType(), True),
StructField("MaximumHeartRateBasedOnThaliumTest", FloatType(), True),
StructField("Exang", IntegerType(), True),
StructField("STDepression", FloatType(), True),
StructField("SlopeOfThePeakExercise", IntegerType(), True),
StructField("MajorVessels", IntegerType(), True),
StructField("ThalassemiaStatus", IntegerType(), True),
StructField("PresenceOfHeartDisease", IntegerType(), True),
])
data = spark.read.csv(
    "./data/heart.csv",
    sep = ',',
    header = True,
    schema = titanicSchema
    )
data.show()

+---+---+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+----------------------+
|age|sex|ChesPainType|RestingBloodPressureInmmHg|CholesterolMgPerDL|FastingBloodSugar|RestingElectrocardiographicResults|MaximumHeartRateBasedOnThaliumTest|Exang|STDepression|SlopeOfThePeakExercise|MajorVessels|ThalassemiaStatus|PresenceOfHeartDisease|
+---+---+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+----------------------+
| 63|  1|           3|                     145.0|             233.0|              1.0|                                 0|                             150.0|    0|         2.3|                     0|           0|                1|            

In [60]:
data.select("exang").distinct().show()
data.select("RestingElectrocardiographicResults").distinct().show()
data.select("sex").distinct().show()
data.select("FastingBloodSugar").distinct().show()

+-----+
|exang|
+-----+
|    1|
|    0|
+-----+

+----------------------------------+
|RestingElectrocardiographicResults|
+----------------------------------+
|                                 1|
|                                 2|
|                                 0|
+----------------------------------+

+---+
|sex|
+---+
|  1|
|  0|
+---+

+-----------------+
|FastingBloodSugar|
+-----------------+
|              1.0|
|              0.0|
+-----------------+



In [61]:
# Load csv correctly
from pyspark.sql.functions import col
data = data.withColumn("sex",col("sex").cast(BooleanType()))\
        .withColumn("exang", col("exang").cast(BooleanType()))\
        .withColumn("PresenceOfHeartDisease", col("PresenceOfHeartDisease").cast(BooleanType()))\
        .withColumn("FastingBloodSugar", col("exang").cast(BooleanType()))
data.printSchema()
data.show()

root
 |-- age: integer (nullable = true)
 |-- sex: boolean (nullable = true)
 |-- ChesPainType: integer (nullable = true)
 |-- RestingBloodPressureInmmHg: float (nullable = true)
 |-- CholesterolMgPerDL: float (nullable = true)
 |-- FastingBloodSugar: boolean (nullable = true)
 |-- RestingElectrocardiographicResults: integer (nullable = true)
 |-- MaximumHeartRateBasedOnThaliumTest: float (nullable = true)
 |-- exang: boolean (nullable = true)
 |-- STDepression: float (nullable = true)
 |-- SlopeOfThePeakExercise: integer (nullable = true)
 |-- MajorVessels: integer (nullable = true)
 |-- ThalassemiaStatus: integer (nullable = true)
 |-- PresenceOfHeartDisease: boolean (nullable = true)

+---+-----+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+----------------------+
|age|  sex|ChesPainType|RestingBloodPressureInmm

In [62]:
data.describe().toPandas()

Unnamed: 0,summary,age,ChesPainType,RestingBloodPressureInmmHg,CholesterolMgPerDL,RestingElectrocardiographicResults,MaximumHeartRateBasedOnThaliumTest,STDepression,SlopeOfThePeakExercise,MajorVessels,ThalassemiaStatus
0,count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
1,mean,54.366336633663366,0.966996699669967,131.62376237623764,246.26402640264027,0.528052805280528,149.64686468646866,1.0396039587977302,1.3993399339933994,0.7293729372937293,2.3135313531353137
2,stddev,9.08210098983786,1.0320524894832983,17.5381428135171,51.83075098793005,0.525859596359298,22.90516111491409,1.1610750102689429,0.6162261453459622,1.0226063649693276,0.6122765072781408
3,min,29.0,0.0,94.0,126.0,0.0,71.0,0.0,0.0,0.0,0.0
4,max,77.0,3.0,200.0,564.0,2.0,202.0,6.2,2.0,4.0,3.0


In [63]:
%pip install pandas_profiling
%pip install ydata-profiling

Collecting visions[type_image_path]==0.7.4 (from pandas_profiling)
  Using cached visions-0.7.4-py3-none-any.whl (102 kB)
Installing collected packages: visions
  Attempting uninstall: visions
    Found existing installation: visions 0.7.5
    Uninstalling visions-0.7.5:
      Successfully uninstalled visions-0.7.5
Successfully installed visions-0.7.4
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ydata-profiling 4.6.2 requires visions[type_image_path]==0.7.5, but you have visions 0.7.4 which is incompatible.


Collecting visions[type_image_path]==0.7.5 (from ydata-profiling)
  Using cached visions-0.7.5-py3-none-any.whl (102 kB)
Installing collected packages: visions
  Attempting uninstall: visions
    Found existing installation: visions 0.7.4
    Uninstalling visions-0.7.4:
      Successfully uninstalled visions-0.7.4
Successfully installed visions-0.7.5
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-profiling 3.2.0 requires visions[type_image_path]==0.7.4, but you have visions 0.7.5 which is incompatible.


In [64]:
# Notice that profile with pandas_profiling will be deprecated in 1st April.
from pandas_profiling import ProfileReport
data_p = data.toPandas()
profile = ProfileReport(data_p)
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [65]:
# Nota: ydata_profiling es la nueva version de pandas profiling, no tiene clavos a diferencias de pandas profiling, para hacer la revisión del dataset.
from ydata_profiling import ProfileReport
data_p = data.toPandas()
profile1 = ProfileReport(data_p)
profile1

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [66]:
# saving to html format
profile.to_file("profile_pandas.html")

# saving to json format
profile.to_file("profile_pandas.json")
# saving to html format
profile1.to_file("profile_ydata.html")

# saving to json format
profile1.to_file("profile_ydata.json")

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [67]:
# count when is null
from pyspark.sql.functions import *
data.select("*").show()

+---+-----+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+----------------------+
|age|  sex|ChesPainType|RestingBloodPressureInmmHg|CholesterolMgPerDL|FastingBloodSugar|RestingElectrocardiographicResults|MaximumHeartRateBasedOnThaliumTest|exang|STDepression|SlopeOfThePeakExercise|MajorVessels|ThalassemiaStatus|PresenceOfHeartDisease|
+---+-----+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+----------------------+
| 63| true|           3|                     145.0|             233.0|            false|                                 0|                             150.0|false|         2.3|                     0|           0|                1|    

In [68]:
data.where(
        isnull("age")
        | isnull("sex")
        | isnull("RestingBloodPressureInmmHg")
        | isnull("CholesterolMgPerDL") 
        | isnull("FastingBloodSugar")
        | isnull("RestingElectrocardiographicResults")
        | isnull("MaximumHeartRateBasedOnThaliumTest")
        | isnull("exang")
        | isnull("STDepression")
        | isnull("SlopeOfThePeakExercise")
        | isnull("MajorVessels")
        | isnull("ThalassemiaStatus")
        | isnull("PresenceOfHeartDisease")
        ).show()
# So as profiling showed, there are all complete values.
data.select("*").show()

+---+---+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+----------------------+
|age|sex|ChesPainType|RestingBloodPressureInmmHg|CholesterolMgPerDL|FastingBloodSugar|RestingElectrocardiographicResults|MaximumHeartRateBasedOnThaliumTest|exang|STDepression|SlopeOfThePeakExercise|MajorVessels|ThalassemiaStatus|PresenceOfHeartDisease|
+---+---+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+----------------------+
+---+---+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+------------

In [69]:
data = data.withColumn("sex",col("sex").cast(StringType()))\
        .withColumn("exang", col("exang").cast(StringType()))\
        .withColumn("PresenceOfHeartDisease", col("PresenceOfHeartDisease").cast(StringType()))\
        .withColumn("FastingBloodSugar", col("exang").cast(StringType()))
        
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCols=[
                        "sex", 
                        "FastingBloodSugar",
                        "exang",
                        "PresenceOfHeartDisease"
                        ],
              outputCols=[
                        "sex1", 
                        "FastingBloodSugar1",
                        "exang1",
                        "PresenceOfHeartDisease1"
                        ])
indexerModel = indexer.fit(data)
indexer_df = indexerModel.transform(data)
indexer_df.show()

+---+-----+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+----------------------+----+------------------+------+-----------------------+
|age|  sex|ChesPainType|RestingBloodPressureInmmHg|CholesterolMgPerDL|FastingBloodSugar|RestingElectrocardiographicResults|MaximumHeartRateBasedOnThaliumTest|exang|STDepression|SlopeOfThePeakExercise|MajorVessels|ThalassemiaStatus|PresenceOfHeartDisease|sex1|FastingBloodSugar1|exang1|PresenceOfHeartDisease1|
+---+-----+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+----------------------+----+------------------+------+-----------------------+
| 63| true|           3|                     145.0|             233.0|

In [70]:
data = indexer_df.select("age",
                         col("sex1").alias("sex"),
                         "ChesPainType",
                         "RestingBloodPressureInmmHg",
                         "CholesterolMgPerDL",
                         col("FastingBloodSugar1").alias("FastingBloodSugar"),
                         "RestingElectrocardiographicResults",
                         "MaximumHeartRateBasedOnThaliumTest",
                         col("exang1").alias("exang"),
                         "STDepression",
                         "SlopeOfThePeakExercise",
                         "MajorVessels",
                         "ThalassemiaStatus",
                         col("PresenceOfHeartDisease1").alias("PresenceOfHeartDisease")
                         )
data.show()

+---+---+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+----------------------+
|age|sex|ChesPainType|RestingBloodPressureInmmHg|CholesterolMgPerDL|FastingBloodSugar|RestingElectrocardiographicResults|MaximumHeartRateBasedOnThaliumTest|exang|STDepression|SlopeOfThePeakExercise|MajorVessels|ThalassemiaStatus|PresenceOfHeartDisease|
+---+---+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+----------------------+
| 63|0.0|           3|                     145.0|             233.0|              0.0|                                 0|                             150.0|  0.0|         2.3|                     0|           0|                1|            

In [71]:
data.dtypes

[('age', 'int'),
 ('sex', 'double'),
 ('ChesPainType', 'int'),
 ('RestingBloodPressureInmmHg', 'float'),
 ('CholesterolMgPerDL', 'float'),
 ('FastingBloodSugar', 'double'),
 ('RestingElectrocardiographicResults', 'int'),
 ('MaximumHeartRateBasedOnThaliumTest', 'float'),
 ('exang', 'double'),
 ('STDepression', 'float'),
 ('SlopeOfThePeakExercise', 'int'),
 ('MajorVessels', 'int'),
 ('ThalassemiaStatus', 'int'),
 ('PresenceOfHeartDisease', 'double')]

In [72]:
required_features = [
    "age",
    "sex",
    "ChesPainType",
    "RestingBloodPressureInmmHg",
    "CholesterolMgPerDL",
    "FastingBloodSugar",
    "RestingElectrocardiographicResults",
    "MaximumHeartRateBasedOnThaliumTest",
    "exang",
    "STDepression",
    "SlopeOfThePeakExercise",
    "MajorVessels",
    "ThalassemiaStatus"
]
from pyspark.ml.feature import VectorAssembler
vec_assembler = VectorAssembler(inputCols=required_features, outputCol="features")

In [73]:
transformed_data = vec_assembler.transform(data)

In [74]:
# transformed_data.show()
transformed_data.select("features").show()

+--------------------+
|            features|
+--------------------+
|(13,[0,2,3,4,7,9,...|
|[37.0,0.0,2.0,130...|
|[41.0,1.0,1.0,130...|
|[56.0,0.0,1.0,120...|
|[57.0,1.0,0.0,120...|
|[57.0,0.0,0.0,140...|
|[56.0,1.0,1.0,140...|
|[44.0,0.0,1.0,120...|
|[52.0,0.0,2.0,172...|
|[57.0,0.0,2.0,150...|
|[54.0,0.0,0.0,140...|
|[48.0,1.0,2.0,130...|
|[49.0,0.0,1.0,130...|
|[64.0,0.0,3.0,110...|
|[58.0,1.0,3.0,150...|
|[50.0,1.0,2.0,120...|
|[58.0,1.0,2.0,120...|
|[66.0,1.0,3.0,150...|
|[43.0,0.0,0.0,150...|
|[69.0,1.0,3.0,140...|
+--------------------+
only showing top 20 rows



In [75]:
transformed_data.head()
vector_col = col("features").getItem(0)
transformed_data.select(vector_col).show()
# data_final = transformed_data.select("features", "PresenceOfHeartDisease")
# train, test = data_final.randomSplit([0.7,0.3])
# data_final.schema

AnalysisException: [INVALID_EXTRACT_BASE_FIELD_TYPE] Can't extract a value from "features". Need a complex type [STRUCT, ARRAY, MAP] but got "STRUCT<type: TINYINT, size: INT, indices: ARRAY<INT>, values: ARRAY<DOUBLE>>".

In [None]:
train.show()

+--------------------+----------------------+
|            features|PresenceOfHeartDisease|
+--------------------+----------------------+
|(13,[0,1,2,3,4,7,...|                   0.0|
|(13,[0,1,3,4,7,10...|                   0.0|
|(13,[0,1,3,4,7,10...|                   1.0|
|(13,[0,2,3,4,7,9,...|                   0.0|
|(13,[0,2,3,4,7,9,...|                   0.0|
|(13,[0,2,3,4,7,10...|                   0.0|
|(13,[0,2,3,4,7,10...|                   0.0|
|(13,[0,2,3,4,7,10...|                   0.0|
|(13,[0,2,3,4,7,10...|                   0.0|
|(13,[0,2,3,4,7,10...|                   0.0|
|(13,[0,2,3,4,7,10...|                   0.0|
|(13,[0,2,3,4,7,10...|                   0.0|
|(13,[0,2,3,4,7,10...|                   0.0|
|(13,[0,2,3,4,7,10...|                   0.0|
|(13,[0,2,3,4,7,10...|                   1.0|
|(13,[0,2,3,4,7,10...|                   0.0|
|(13,[0,3,4,6,7,10...|                   0.0|
|(13,[0,3,4,7,9,10...|                   1.0|
|(13,[0,3,4,7,9,10...|            

In [None]:
test.show()

+--------------------+----------------------+
|            features|PresenceOfHeartDisease|
+--------------------+----------------------+
|(13,[0,1,3,4,7,10...|                   0.0|
|(13,[0,2,3,4,6,7,...|                   0.0|
|(13,[0,2,3,4,7,9,...|                   0.0|
|(13,[0,2,3,4,7,10...|                   0.0|
|(13,[0,2,3,4,7,10...|                   0.0|
|(13,[0,3,4,6,7,10...|                   1.0|
|(13,[0,3,4,6,7,10...|                   0.0|
|(13,[0,3,4,7,9,10...|                   1.0|
|(13,[0,3,4,7,10,1...|                   1.0|
|(13,[0,3,4,7,10,1...|                   1.0|
|(13,[0,3,4,7,10,1...|                   1.0|
|(13,[0,3,4,7,10,1...|                   0.0|
|[35.0,0.0,0.0,126...|                   1.0|
|[37.0,0.0,2.0,130...|                   0.0|
|[38.0,0.0,2.0,138...|                   0.0|
|[40.0,0.0,0.0,110...|                   1.0|
|[40.0,0.0,3.0,140...|                   0.0|
|[41.0,0.0,1.0,110...|                   0.0|
|[41.0,0.0,2.0,130...|            

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="target",
                        featuresCol="features")
model = lr.fit(train)
# trainingSummary = lrModel.summary
# print(trainingSummary)


IllegalArgumentException: target does not exist. Available: features, PresenceOfHeartDisease

# Correction of loading

In [76]:
titanicSchema = StructType([
StructField("age", IntegerType(), True),
StructField("sex", IntegerType(), True),
StructField("ChesPainType", IntegerType(), True),
StructField("RestingBloodPressureInmmHg", FloatType(), True),
StructField("CholesterolMgPerDL", FloatType(), True),
StructField("FastingBloodSugar", FloatType(), True),
StructField("RestingElectrocardiographicResults", IntegerType(), True),
StructField("MaximumHeartRateBasedOnThaliumTest", FloatType(), True),
StructField("Exang", IntegerType(), True),
StructField("STDepression", FloatType(), True),
StructField("SlopeOfThePeakExercise", IntegerType(), True),
StructField("MajorVessels", IntegerType(), True),
StructField("ThalassemiaStatus", IntegerType(), True),
StructField("PresenceOfHeartDisease", IntegerType(), True),
])
data_f = spark.read.csv(
    "./data/heart.csv",
    sep = ',',
    header = True,
    schema = titanicSchema
    )
data_f.show()

+---+---+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+----------------------+
|age|sex|ChesPainType|RestingBloodPressureInmmHg|CholesterolMgPerDL|FastingBloodSugar|RestingElectrocardiographicResults|MaximumHeartRateBasedOnThaliumTest|Exang|STDepression|SlopeOfThePeakExercise|MajorVessels|ThalassemiaStatus|PresenceOfHeartDisease|
+---+---+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+----------------------+
| 63|  1|           3|                     145.0|             233.0|              1.0|                                 0|                             150.0|    0|         2.3|                     0|           0|                1|            

In [77]:

data_pandas = data_f.toPandas()
profile2 = ProfileReport(data_pandas)
profile2

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [87]:
data_f.where(
        isnull("age")
        | isnull("sex")
        | isnull("RestingBloodPressureInmmHg")
        | isnull("CholesterolMgPerDL") 
        | isnull("FastingBloodSugar")
        | isnull("RestingElectrocardiographicResults")
        | isnull("MaximumHeartRateBasedOnThaliumTest")
        | isnull("exang")
        | isnull("STDepression")
        | isnull("SlopeOfThePeakExercise")
        | isnull("MajorVessels")
        | isnull("ThalassemiaStatus")
        | isnull("PresenceOfHeartDisease")
        ).show()
# So as profiling showed, there are all complete values.
data_f.select("*").show()
data_f.dtypes

+---+---+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+----------------------+
|age|sex|ChesPainType|RestingBloodPressureInmmHg|CholesterolMgPerDL|FastingBloodSugar|RestingElectrocardiographicResults|MaximumHeartRateBasedOnThaliumTest|Exang|STDepression|SlopeOfThePeakExercise|MajorVessels|ThalassemiaStatus|PresenceOfHeartDisease|
+---+---+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+----------------------+
+---+---+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+------------

[('age', 'int'),
 ('sex', 'int'),
 ('ChesPainType', 'int'),
 ('RestingBloodPressureInmmHg', 'float'),
 ('CholesterolMgPerDL', 'float'),
 ('FastingBloodSugar', 'float'),
 ('RestingElectrocardiographicResults', 'int'),
 ('MaximumHeartRateBasedOnThaliumTest', 'float'),
 ('Exang', 'int'),
 ('STDepression', 'float'),
 ('SlopeOfThePeakExercise', 'int'),
 ('MajorVessels', 'int'),
 ('ThalassemiaStatus', 'int'),
 ('PresenceOfHeartDisease', 'int')]

In [93]:
required_features2 = [
    "age",
    "sex",
    "ChesPainType",
    "RestingBloodPressureInmmHg",
    "CholesterolMgPerDL",
    "FastingBloodSugar",
    "RestingElectrocardiographicResults",
    "MaximumHeartRateBasedOnThaliumTest",
    "Exang",
    "STDepression",
    "SlopeOfThePeakExercise",
    "MajorVessels",
    "ThalassemiaStatus"
]
vec_assembler1 = VectorAssembler(inputCols=required_features2, outputCol="features")



In [92]:
data_f.select("*").show(5)

+---+---+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+----------------------+
|age|sex|ChesPainType|RestingBloodPressureInmmHg|CholesterolMgPerDL|FastingBloodSugar|RestingElectrocardiographicResults|MaximumHeartRateBasedOnThaliumTest|Exang|STDepression|SlopeOfThePeakExercise|MajorVessels|ThalassemiaStatus|PresenceOfHeartDisease|
+---+---+------------+--------------------------+------------------+-----------------+----------------------------------+----------------------------------+-----+------------+----------------------+------------+-----------------+----------------------+
| 63|  1|           3|                     145.0|             233.0|              1.0|                                 0|                             150.0|    0|         2.3|                     0|           0|                1|            

In [96]:
transformed_data_2 = vec_assembler1.transform(data_f)
transformed_data_2.select("features").show(5)

+--------------------+
|            features|
+--------------------+
|[63.0,1.0,3.0,145...|
|[37.0,1.0,2.0,130...|
|[41.0,0.0,1.0,130...|
|[56.0,1.0,1.0,120...|
|[57.0,0.0,0.0,120...|
+--------------------+
only showing top 5 rows



In [97]:
final_data = transformed_data_2.select("features", "PresenceOfHeartDisease")
final_data.show()

+--------------------+----------------------+
|            features|PresenceOfHeartDisease|
+--------------------+----------------------+
|[63.0,1.0,3.0,145...|                     1|
|[37.0,1.0,2.0,130...|                     1|
|[41.0,0.0,1.0,130...|                     1|
|[56.0,1.0,1.0,120...|                     1|
|[57.0,0.0,0.0,120...|                     1|
|[57.0,1.0,0.0,140...|                     1|
|[56.0,0.0,1.0,140...|                     1|
|[44.0,1.0,1.0,120...|                     1|
|[52.0,1.0,2.0,172...|                     1|
|[57.0,1.0,2.0,150...|                     1|
|[54.0,1.0,0.0,140...|                     1|
|[48.0,0.0,2.0,130...|                     1|
|[49.0,1.0,1.0,130...|                     1|
|[64.0,1.0,3.0,110...|                     1|
|[58.0,0.0,3.0,150...|                     1|
|[50.0,0.0,2.0,120...|                     1|
|[58.0,0.0,2.0,120...|                     1|
|[66.0,0.0,3.0,150...|                     1|
|[43.0,1.0,0.0,150...|            

In [106]:
train, test = final_data.randomSplit([0.7,0.3])

In [107]:
train.show()

+--------------------+----------------------+
|            features|PresenceOfHeartDisease|
+--------------------+----------------------+
|(13,[0,1,3,4,7,10...|                     1|
|(13,[0,1,3,4,7,10...|                     1|
|(13,[0,2,3,4,7,10...|                     1|
|(13,[0,2,3,4,7,10...|                     1|
|(13,[0,3,4,6,7,10...|                     1|
|(13,[0,3,4,7,8,10...|                     1|
|(13,[0,3,4,7,9,10...|                     1|
|(13,[0,3,4,7,10,1...|                     1|
|(13,[0,3,4,7,10,1...|                     1|
|[29.0,1.0,1.0,130...|                     1|
|[34.0,0.0,1.0,118...|                     1|
|[34.0,1.0,3.0,118...|                     1|
|[35.0,0.0,0.0,138...|                     1|
|[35.0,1.0,0.0,126...|                     0|
|[35.0,1.0,1.0,122...|                     1|
|[37.0,0.0,2.0,120...|                     1|
|[37.0,1.0,2.0,130...|                     1|
|[38.0,1.0,2.0,138...|                     1|
|[38.0,1.0,2.0,138...|            

In [108]:
test.show()

+--------------------+----------------------+
|            features|PresenceOfHeartDisease|
+--------------------+----------------------+
|(13,[0,1,3,4,7,10...|                     0|
|(13,[0,1,3,4,7,10...|                     1|
|(13,[0,2,3,4,7,10...|                     1|
|(13,[0,3,4,6,7,10...|                     1|
|(13,[0,3,4,7,9,10...|                     1|
|(13,[0,3,4,7,9,10...|                     1|
|(13,[0,3,4,7,9,10...|                     1|
|(13,[0,3,4,7,9,11...|                     0|
|(13,[0,3,4,7,9,11...|                     0|
|(13,[0,3,4,7,10,1...|                     1|
|(13,[0,3,4,7,10,1...|                     0|
|[35.0,1.0,0.0,120...|                     0|
|[39.0,1.0,0.0,118...|                     0|
|[40.0,1.0,0.0,110...|                     0|
|[41.0,0.0,1.0,105...|                     1|
|[41.0,1.0,2.0,130...|                     1|
|[42.0,1.0,1.0,120...|                     1|
|[42.0,1.0,2.0,120...|                     1|
|[42.0,1.0,2.0,130...|            

In [112]:
predict_train=model.transform(train)
predict_test=model.transform(test)
predict_test.show()


+--------------------+----------------------+--------------------+--------------------+----------+
|            features|PresenceOfHeartDisease|       rawPrediction|         probability|prediction|
+--------------------+----------------------+--------------------+--------------------+----------+
|(13,[0,1,3,4,7,10...|                     0|[-0.6768254723531...|[0.33697019215557...|       1.0|
|(13,[0,1,3,4,7,10...|                     1|[-1.6794745152544...|[0.15716506426595...|       1.0|
|(13,[0,2,3,4,7,10...|                     1|[-4.4147733067185...|[0.01195270082784...|       1.0|
|(13,[0,3,4,6,7,10...|                     1|[-3.3707062302518...|[0.03322361737960...|       1.0|
|(13,[0,3,4,7,9,10...|                     1|[-1.3166849962565...|[0.21137035243306...|       1.0|
|(13,[0,3,4,7,9,10...|                     1|[-1.3174613012837...|[0.21124097687118...|       1.0|
|(13,[0,3,4,7,9,10...|                     1|[-1.1397068606784...|[0.24237418579389...|       1.0|
|(13,[0,3,

In [113]:
predict_test.select("PresenceOfHeartDisease","prediction").show(10)

+----------------------+----------+
|PresenceOfHeartDisease|prediction|
+----------------------+----------+
|                     0|       1.0|
|                     1|       1.0|
|                     1|       1.0|
|                     1|       1.0|
|                     1|       1.0|
|                     1|       1.0|
|                     1|       1.0|
|                     0|       0.0|
|                     0|       0.0|
|                     1|       1.0|
+----------------------+----------+
only showing top 10 rows

