In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.2.4/spark-3.2.4-bin-hadoop3.2.tgz
!tar -xf spark-3.2.4-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
import os
# Point the process at the Java install and the unpacked Spark distribution.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.2.4-bin-hadoop3.2",
    }
)

In [None]:
# findspark.init() is expected to pick up the SPARK_HOME set in the previous
# cell and make pyspark importable — TODO confirm against findspark docs.
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Load the heart-disease CSV; the first row is the header and column types
# are inferred from the data.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file:///content/heart_2020_cleaned.csv")
)

In [None]:
# Display the DataFrame repr (column names with their inferred types).
data

DataFrame[HeartDisease: string, BMI: double, Smoking: string, AlcoholDrinking: string, Stroke: string, PhysicalHealth: double, MentalHealth: double, DiffWalking: string, Sex: string, AgeCategory: string, Race: string, Diabetic: string, PhysicalActivity: string, GenHealth: string, SleepTime: double, Asthma: string, KidneyDisease: string, SkinCancer: string]

In [None]:
# Number of columns in the loaded DataFrame.
len(data.columns)

18

In [None]:
# Number of rows in the loaded DataFrame.
data.count()

319795

In [None]:
# List the column names.
# NOTE(review): the saved output of this cell includes 'HeartDiseaseLabel',
# which is only added by a later withColumn cell, while the column-count cell
# above reports 18 — the notebook was apparently executed out of order.
# Restart the kernel and run all cells to refresh the outputs.
data.columns

['HeartDisease',
 'BMI',
 'Smoking',
 'AlcoholDrinking',
 'Stroke',
 'PhysicalHealth',
 'MentalHealth',
 'DiffWalking',
 'Sex',
 'AgeCategory',
 'Race',
 'Diabetic',
 'PhysicalActivity',
 'GenHealth',
 'SleepTime',
 'Asthma',
 'KidneyDisease',
 'SkinCancer',
 'HeartDiseaseLabel']

In [None]:
# Print each column's name, type and nullability as a tree.
data.printSchema()

root
 |-- HeartDisease: string (nullable = true)
 |-- BMI: double (nullable = true)
 |-- Smoking: string (nullable = true)
 |-- AlcoholDrinking: string (nullable = true)
 |-- Stroke: string (nullable = true)
 |-- PhysicalHealth: double (nullable = true)
 |-- MentalHealth: double (nullable = true)
 |-- DiffWalking: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- AgeCategory: string (nullable = true)
 |-- Race: string (nullable = true)
 |-- Diabetic: string (nullable = true)
 |-- PhysicalActivity: string (nullable = true)
 |-- GenHealth: string (nullable = true)
 |-- SleepTime: double (nullable = true)
 |-- Asthma: string (nullable = true)
 |-- KidneyDisease: string (nullable = true)
 |-- SkinCancer: string (nullable = true)



In [None]:
# Print the first rows of the DataFrame as a text table.
data.show()

+------------+-----+-------+---------------+------+--------------+------------+-----------+------+-----------+-----+--------------------+----------------+---------+---------+------+-------------+----------+
|HeartDisease|  BMI|Smoking|AlcoholDrinking|Stroke|PhysicalHealth|MentalHealth|DiffWalking|   Sex|AgeCategory| Race|            Diabetic|PhysicalActivity|GenHealth|SleepTime|Asthma|KidneyDisease|SkinCancer|
+------------+-----+-------+---------------+------+--------------+------------+-----------+------+-----------+-----+--------------------+----------------+---------+---------+------+-------------+----------+
|          No| 16.6|    Yes|             No|    No|           3.0|        30.0|         No|Female|      55-59|White|                 Yes|             Yes|Very good|      5.0|   Yes|           No|       Yes|
|          No|20.34|     No|             No|   Yes|           0.0|         0.0|         No|Female|80 or older|White|                  No|             Yes|Very good|      7.

In [None]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [None]:
# Count rows per raw HeartDisease value (the target column).
data.groupBy("HeartDisease").count().show()

+------------+------+
|HeartDisease| count|
+------------+------+
|          No|292422|
|         Yes| 27373|
+------------+------+



In [None]:
def HeartDiseaseLabel(HeartDisease):
    """Encode the HeartDisease answer as an integer.

    Returns 0 when the value is exactly "No" and 1 for any other value.
    """
    return 0 if HeartDisease == "No" else 1

In [None]:
# Wrap the mapper as an integer-returning Spark UDF and attach the encoded column.
udf_HeartDiseaseLabel = udf(f=HeartDiseaseLabel, returnType=IntegerType())
data = data.withColumn(
    colName="HeartDiseaseLabel",
    col=udf_HeartDiseaseLabel(col("HeartDisease")),
)

In [None]:
# Spot-check the encoding: raw value next to its integer label.
data.select("HeartDisease", "HeartDiseaseLabel").show()

+------------+-----------------+
|HeartDisease|HeartDiseaseLabel|
+------------+-----------------+
|          No|                0|
|          No|                0|
|          No|                0|
|          No|                0|
|          No|                0|
|         Yes|                1|
|          No|                0|
|          No|                0|
|          No|                0|
|          No|                0|
|         Yes|                1|
|          No|                0|
|          No|                0|
|          No|                0|
|          No|                0|
|          No|                0|
|          No|                0|
|          No|                0|
|          No|                0|
|          No|                0|
+------------+-----------------+
only showing top 20 rows



In [None]:
# Count rows per encoded label; totals should match the raw-value counts.
data.groupBy("HeartDiseaseLabel").count().show()

+-----------------+------+
|HeartDiseaseLabel| count|
+-----------------+------+
|                1| 27373|
|                0|292422|
+-----------------+------+



In [None]:
# Count rows per distinct BMI value, untruncated, up to 100000 rows.
# NOTE(review): BMI is continuous, so this prints one row per distinct value
# and floods the output; a summary such as describe() may be a more compact
# way to choose bin edges — consider replacing.
data.groupBy("BMI").count().show(n=100000, truncate=False)

+-----+-----+
|BMI  |count|
+-----+-----+
|17.56|19   |
|17.95|27   |
|23.04|52   |
|26.7 |78   |
|26.72|67   |
|43.03|3    |
|45.43|12   |
|30.49|2    |
|75.29|2    |
|38.61|13   |
|14.9 |4    |
|65.78|3    |
|58.92|1    |
|41.42|5    |
|35.17|1    |
|60.98|1    |
|40.11|3    |
|64.2 |3    |
|49.8 |10   |
|73.73|1    |
|53.82|1    |
|40.94|7    |
|24.19|104  |
|23.06|786  |
|16.54|4    |
|37.23|103  |
|45.49|36   |
|19.27|34   |
|15.92|1    |
|47.5 |22   |
|46.62|1    |
|30.79|477  |
|24.96|1653 |
|18.88|343  |
|30.81|377  |
|24.03|1503 |
|21.52|685  |
|28.23|38   |
|18.78|38   |
|27.91|60   |
|36.27|4    |
|51.68|14   |
|18.62|32   |
|21.89|39   |
|20.21|7    |
|37.1 |9    |
|14.75|5    |
|29.47|19   |
|63.27|2    |
|55.61|2    |
|21.72|1    |
|60.06|1    |
|25.1 |2262 |
|35.15|519  |
|31.31|187  |
|32.37|32   |
|27.53|56   |
|23.69|280  |
|28.08|86   |
|22.83|100  |
|29.15|50   |
|46.22|15   |
|43.94|44   |
|47.12|1    |
|29.74|1    |
|41.25|23   |
|49.13|20   |
|44.87|5    |
|44.99

In [None]:
def BMILabel(BMI):
    """Bucket a BMI value into an ordinal class.

    Args:
        BMI: Body-mass index as a number, or None for a missing value.

    Returns:
        0 for BMI < 18.5, 1 for 18.5 <= BMI < 23, 2 for 23 <= BMI < 25,
        3 for BMI >= 25, or None when BMI is None or NaN.
    """
    # A null cell reaches the UDF as None, and `None < 18.5` raises TypeError.
    # NaN compares False against everything, so it would silently fall into
    # the top bucket. Both are passed through as a missing label instead.
    if BMI is None or BMI != BMI:  # `x != x` is True only for NaN
        return None
    # Each test only needs an upper bound: lower bounds are implied by the
    # earlier branches having been skipped.
    if BMI < 18.5:
        return 0
    if BMI < 23:
        return 1
    if BMI < 25:
        return 2
    return 3

In [None]:
# Wrap the mapper as an integer-returning Spark UDF and attach the encoded column.
udf_BMILabel = udf(f=BMILabel, returnType=IntegerType())
data = data.withColumn(
    colName="BMILabel",
    col=udf_BMILabel(col("BMI")),
)

In [None]:
# Spot-check the binning: raw BMI next to its bucket label.
data.select("BMI", "BMILabel").show()

+-----+--------+
|  BMI|BMILabel|
+-----+--------+
| 16.6|       0|
|20.34|       1|
|26.58|       3|
|24.21|       2|
|23.71|       2|
|28.87|       3|
|21.63|       1|
|31.64|       3|
|26.45|       3|
|40.69|       3|
| 34.3|       3|
|28.71|       3|
|28.37|       3|
|28.15|       3|
|29.29|       3|
|29.18|       3|
|26.26|       3|
|22.59|       1|
|29.86|       3|
|18.13|       0|
+-----+--------+
only showing top 20 rows



In [None]:
# Count rows per BMI bucket.
data.groupBy("BMILabel").count().show()

+--------+------+
|BMILabel| count|
+--------+------+
|       1| 52724|
|       3|217354|
|       2| 44607|
|       0|  5110|
+--------+------+



In [None]:
# Count rows per raw Smoking value.
data.groupBy("Smoking").count().show()

+-------+------+
|Smoking| count|
+-------+------+
|     No|187887|
|    Yes|131908|
+-------+------+



In [None]:
def SmokingLabel(Smoking):
    """Encode the Smoking answer as an integer.

    Returns 0 when the value is exactly "No" and 1 for any other value.
    """
    return int(Smoking != "No")

In [None]:
# Wrap the mapper as an integer-returning Spark UDF and attach the encoded column.
udf_SmokingLabel = udf(f=SmokingLabel, returnType=IntegerType())
data = data.withColumn(
    colName="SmokingLabel",
    col=udf_SmokingLabel(col("Smoking")),
)

In [None]:
# Spot-check the encoding: raw value next to its integer label.
data.select("Smoking", "SmokingLabel").show()

+-------+------------+
|Smoking|SmokingLabel|
+-------+------------+
|    Yes|           1|
|     No|           0|
|    Yes|           1|
|     No|           0|
|     No|           0|
|    Yes|           1|
|     No|           0|
|    Yes|           1|
|     No|           0|
|     No|           0|
|    Yes|           1|
|    Yes|           1|
|    Yes|           1|
|     No|           0|
|    Yes|           1|
|     No|           0|
|     No|           0|
|    Yes|           1|
|    Yes|           1|
|     No|           0|
+-------+------------+
only showing top 20 rows



In [None]:
# Count rows per encoded label; totals should match the raw-value counts.
data.groupBy("SmokingLabel").count().show()

+------------+------+
|SmokingLabel| count|
+------------+------+
|           1|131908|
|           0|187887|
+------------+------+



In [None]:
# Count rows per raw AlcoholDrinking value.
data.groupBy("AlcoholDrinking").count().show()

+---------------+------+
|AlcoholDrinking| count|
+---------------+------+
|             No|298018|
|            Yes| 21777|
+---------------+------+



In [None]:
def AlcoholDrinkingLabel(AlcoholDrinking):
    """Encode the AlcoholDrinking answer as an integer.

    Returns 0 when the value is exactly "No" and 1 for any other value.
    """
    if AlcoholDrinking != "No":
        return 1
    return 0

In [None]:
# Wrap the mapper as an integer-returning Spark UDF and attach the encoded column.
udf_AlcoholDrinkingLabel = udf(f=AlcoholDrinkingLabel, returnType=IntegerType())
data = data.withColumn(
    colName="AlcoholDrinkingLabel",
    col=udf_AlcoholDrinkingLabel(col("AlcoholDrinking")),
)

In [None]:
# Spot-check the encoding: raw value next to its integer label.
data.select("AlcoholDrinking", "AlcoholDrinkingLabel").show()

+---------------+--------------------+
|AlcoholDrinking|AlcoholDrinkingLabel|
+---------------+--------------------+
|             No|                   0|
|             No|                   0|
|             No|                   0|
|             No|                   0|
|             No|                   0|
|             No|                   0|
|             No|                   0|
|             No|                   0|
|             No|                   0|
|             No|                   0|
|             No|                   0|
|             No|                   0|
|             No|                   0|
|             No|                   0|
|             No|                   0|
|             No|                   0|
|             No|                   0|
|             No|                   0|
|             No|                   0|
|             No|                   0|
+---------------+--------------------+
only showing top 20 rows



In [None]:
# Count rows per encoded label; totals should match the raw-value counts.
data.groupBy("AlcoholDrinkingLabel").count().show()

+--------------------+------+
|AlcoholDrinkingLabel| count|
+--------------------+------+
|                   1| 21777|
|                   0|298018|
+--------------------+------+



In [None]:
# Count rows per raw Stroke value.
data.groupBy("Stroke").count().show()

+------+------+
|Stroke| count|
+------+------+
|    No|307726|
|   Yes| 12069|
+------+------+



In [None]:
def StrokeLabel(Stroke):
    """Encode the Stroke answer as an integer.

    Returns 0 when the value is exactly "No" and 1 for any other value.
    """
    answered_no = Stroke == "No"
    return 0 if answered_no else 1

In [None]:
# Wrap the mapper as an integer-returning Spark UDF and attach the encoded column.
udf_StrokeLabel = udf(f=StrokeLabel, returnType=IntegerType())
data = data.withColumn(
    colName="StrokeLabel",
    col=udf_StrokeLabel(col("Stroke")),
)

In [None]:
# Spot-check the encoding: raw value next to its integer label.
data.select("Stroke", "StrokeLabel").show()

+------+-----------+
|Stroke|StrokeLabel|
+------+-----------+
|    No|          0|
|   Yes|          1|
|    No|          0|
|    No|          0|
|    No|          0|
|    No|          0|
|    No|          0|
|    No|          0|
|    No|          0|
|    No|          0|
|    No|          0|
|    No|          0|
|    No|          0|
|    No|          0|
|    No|          0|
|    No|          0|
|    No|          0|
|    No|          0|
|    No|          0|
|    No|          0|
+------+-----------+
only showing top 20 rows



In [None]:
# Count rows per encoded label; totals should match the raw-value counts.
data.groupBy("StrokeLabel").count().show()

+-----------+------+
|StrokeLabel| count|
+-----------+------+
|          1| 12069|
|          0|307726|
+-----------+------+



In [None]:
# Count rows per PhysicalHealth value; n=31 leaves room for every value 0-30.
data.groupBy("PhysicalHealth").count().show(n=31)

+--------------+------+
|PhysicalHealth| count|
+--------------+------+
|           8.0|   924|
|           0.0|226589|
|           7.0|  4629|
|          29.0|   204|
|          18.0|   167|
|           1.0| 10489|
|          25.0|  1164|
|           4.0|  4468|
|          23.0|    46|
|          11.0|    85|
|          21.0|   626|
|          14.0|  2893|
|          22.0|    89|
|           3.0|  8617|
|          19.0|    35|
|          28.0|   446|
|           2.0| 14880|
|          17.0|   110|
|          27.0|   124|
|          10.0|  5453|
|          30.0| 19509|
|          13.0|    91|
|           6.0|  1270|
|          20.0|  3216|
|          15.0|  5012|
|           5.0|  7606|
|          24.0|    67|
|          26.0|    66|
|           9.0|   180|
|          16.0|   135|
|          12.0|   605|
+--------------+------+



In [None]:
def PhysicalHealthLabel(PhysicalHealth):
    """Bucket the PhysicalHealth value into an ordinal class.

    Args:
        PhysicalHealth: A number, or None for a missing value.

    Returns:
        0 when the value equals 0, 1 for 0 < value <= 10, 2 for
        10 < value <= 20, and 3 for everything else (values above 20;
        negative inputs also fall through to 3). Returns None when the
        value is None or NaN.
    """
    # A null cell reaches the UDF as None, and `None > 0` raises TypeError.
    # NaN fails every comparison and would silently be labelled 3. Both are
    # passed through as a missing label instead.
    if PhysicalHealth is None or PhysicalHealth != PhysicalHealth:  # NaN != NaN
        return None
    if PhysicalHealth == 0:
        return 0
    if 0 < PhysicalHealth <= 10:
        return 1
    if 10 < PhysicalHealth <= 20:
        return 2
    return 3

In [None]:
# Wrap the mapper as an integer-returning Spark UDF and attach the encoded column.
udf_PhysicalHealthLabel = udf(f=PhysicalHealthLabel, returnType=IntegerType())
data = data.withColumn(
    colName="PhysicalHealthLabel",
    col=udf_PhysicalHealthLabel(col("PhysicalHealth")),
)

In [None]:
# Spot-check the binning: raw value next to its bucket label.
data.select("PhysicalHealth", "PhysicalHealthLabel").show()

+--------------+-------------------+
|PhysicalHealth|PhysicalHealthLabel|
+--------------+-------------------+
|           3.0|                  1|
|           0.0|                  0|
|          20.0|                  2|
|           0.0|                  0|
|          28.0|                  3|
|           6.0|                  1|
|          15.0|                  2|
|           5.0|                  1|
|           0.0|                  0|
|           0.0|                  0|
|          30.0|                  3|
|           0.0|                  0|
|           0.0|                  0|
|           7.0|                  1|
|           0.0|                  0|
|           1.0|                  1|
|           5.0|                  1|
|           0.0|                  0|
|           0.0|                  0|
|           0.0|                  0|
+--------------+-------------------+
only showing top 20 rows



In [None]:
# Count rows per PhysicalHealth bucket.
data.groupBy("PhysicalHealthLabel").count().show()

+-------------------+------+
|PhysicalHealthLabel| count|
+-------------------+------+
|                  1| 58516|
|                  3| 22341|
|                  2| 12349|
|                  0|226589|
+-------------------+------+



In [105]:
# Count rows per MentalHealth value; n=31 leaves room for every value 0-30.
data.groupBy("MentalHealth").count().show(n=31)

+------------+------+
|MentalHealth| count|
+------------+------+
|         8.0|  1094|
|         0.0|205401|
|         7.0|  5528|
|        29.0|   317|
|        18.0|   211|
|         1.0|  9291|
|        25.0|  1954|
|         4.0|  5379|
|        23.0|    68|
|        11.0|    83|
|        21.0|   352|
|        14.0|  2048|
|        22.0|    98|
|         3.0| 10466|
|        19.0|    21|
|        28.0|   515|
|         2.0| 16495|
|        17.0|   128|
|        27.0|   126|
|        10.0| 10513|
|        30.0| 17373|
|        13.0|   110|
|         6.0|  1510|
|        20.0|  5431|
|         5.0| 14149|
|        15.0|  9896|
|        24.0|    67|
|         9.0|   203|
|        26.0|    59|
|        16.0|   152|
|        12.0|   757|
+------------+------+



In [106]:
def MentalHealthLabel(MentalHealth):
    """Bucket the MentalHealth value into an ordinal class.

    Args:
        MentalHealth: A number, or None for a missing value.

    Returns:
        0 when the value equals 0, 1 for 0 < value <= 10, 2 for
        10 < value <= 20, and 3 for everything else (values above 20;
        negative inputs also fall through to 3). Returns None when the
        value is None or NaN.
    """
    # A null cell reaches the UDF as None, and `None > 0` raises TypeError.
    # NaN fails every comparison and would silently be labelled 3. Both are
    # passed through as a missing label instead.
    if MentalHealth is None or MentalHealth != MentalHealth:  # NaN != NaN
        return None
    if MentalHealth == 0:
        return 0
    if 0 < MentalHealth <= 10:
        return 1
    if 10 < MentalHealth <= 20:
        return 2
    return 3

In [108]:
# Wrap the mapper as an integer-returning Spark UDF and attach the encoded column.
udf_MentalHealthLabel = udf(f=MentalHealthLabel, returnType=IntegerType())
data = data.withColumn(
    colName="MentalHealthLabel",
    col=udf_MentalHealthLabel(col("MentalHealth")),
)

In [109]:
# Spot-check the binning: raw value next to its bucket label.
data.select("MentalHealth", "MentalHealthLabel").show()

+------------+-----------------+
|MentalHealth|MentalHealthLabel|
+------------+-----------------+
|        30.0|                3|
|         0.0|                0|
|        30.0|                3|
|         0.0|                0|
|         0.0|                0|
|         0.0|                0|
|         0.0|                0|
|         0.0|                0|
|         0.0|                0|
|         0.0|                0|
|         0.0|                0|
|         0.0|                0|
|         0.0|                0|
|         0.0|                0|
|        30.0|                3|
|         0.0|                0|
|         2.0|                1|
|        30.0|                3|
|         0.0|                0|
|         0.0|                0|
+------------+-----------------+
only showing top 20 rows



In [112]:
# Count rows per MentalHealth bucket.
data.groupBy("MentalHealthLabel").count().show()

+-----------------+------+
|MentalHealthLabel| count|
+-----------------+------+
|                1| 74628|
|                3| 20929|
|                2| 18837|
|                0|205401|
+-----------------+------+



In [113]:
# Count rows per raw DiffWalking value.
data.groupBy("DiffWalking").count().show()

+-----------+------+
|DiffWalking| count|
+-----------+------+
|         No|275385|
|        Yes| 44410|
+-----------+------+



In [114]:
def DiffWalkingLabel(DiffWalking):
    """Encode the DiffWalking answer as an integer.

    Returns 0 when the value is exactly "No" and 1 for any other value.
    """
    label = 1
    if DiffWalking == "No":
        label = 0
    return label

In [115]:
# Wrap the mapper as an integer-returning Spark UDF and attach the encoded column.
udf_DiffWalkingLabel = udf(f=DiffWalkingLabel, returnType=IntegerType())
data = data.withColumn(
    colName="DiffWalkingLabel",
    col=udf_DiffWalkingLabel(col("DiffWalking")),
)

In [117]:
# Spot-check the encoding: raw value next to its integer label.
data.select("DiffWalking", "DiffWalkingLabel").show()

+-----------+----------------+
|DiffWalking|DiffWalkingLabel|
+-----------+----------------+
|         No|               0|
|         No|               0|
|         No|               0|
|         No|               0|
|        Yes|               1|
|        Yes|               1|
|         No|               0|
|        Yes|               1|
|         No|               0|
|        Yes|               1|
|        Yes|               1|
|         No|               0|
|        Yes|               1|
|        Yes|               1|
|        Yes|               1|
|         No|               0|
|         No|               0|
|        Yes|               1|
|        Yes|               1|
|         No|               0|
+-----------+----------------+
only showing top 20 rows



In [118]:
# Count rows per encoded label; totals should match the raw-value counts.
data.groupBy("DiffWalkingLabel").count().show()

+----------------+------+
|DiffWalkingLabel| count|
+----------------+------+
|               1| 44410|
|               0|275385|
+----------------+------+



In [119]:
# Count rows per raw Sex value.
data.groupBy("Sex").count().show()

+------+------+
|   Sex| count|
+------+------+
|Female|167805|
|  Male|151990|
+------+------+



In [120]:
def SexLabel(Sex):
    """Encode the Sex column as an integer.

    Returns 0 when the value is exactly "Female" and 1 for any other value.
    """
    return 0 if Sex == "Female" else 1

In [121]:
# Wrap the mapper as an integer-returning Spark UDF and attach the encoded column.
udf_SexLabel = udf(f=SexLabel, returnType=IntegerType())
data = data.withColumn(
    colName="SexLabel",
    col=udf_SexLabel(col("Sex")),
)

In [122]:
# Spot-check the encoding: raw value next to its integer label.
data.select("Sex", "SexLabel").show()

+------+--------+
|   Sex|SexLabel|
+------+--------+
|Female|       0|
|Female|       0|
|  Male|       1|
|Female|       0|
|Female|       0|
|Female|       0|
|Female|       0|
|Female|       0|
|Female|       0|
|  Male|       1|
|  Male|       1|
|Female|       0|
|  Male|       1|
|Female|       0|
|Female|       0|
|Female|       0|
|Female|       0|
|  Male|       1|
|Female|       0|
|  Male|       1|
+------+--------+
only showing top 20 rows



In [123]:
# Count rows per encoded label; totals should match the raw-value counts.
data.groupBy("SexLabel").count().show()

+--------+------+
|SexLabel| count|
+--------+------+
|       1|151990|
|       0|167805|
+--------+------+



In [125]:
# Count rows per raw AgeCategory value.
data.groupBy("AgeCategory").count().show()

+-----------+-----+
|AgeCategory|count|
+-----------+-----+
|      65-69|34151|
|      75-79|21482|
|80 or older|24153|
|      30-34|18753|
|      70-74|31065|
|      18-24|21064|
|      50-54|25382|
|      35-39|20550|
|      45-49|21791|
|      25-29|16955|
|      40-44|21006|
|      55-59|29757|
|      60-64|33686|
+-----------+-----+



In [127]:
def AgeCategoryLabel(AgeCategory):
    """Encode an age-bracket string as an ordinal integer.

    The twelve listed brackets map to 1..12 in ascending age order; any
    other value (including "80 or older") maps to 13.
    """
    ordered_brackets = (
        "18-24",
        "25-29",
        "30-34",
        "35-39",
        "40-44",
        "45-49",
        "50-54",
        "55-59",
        "60-64",
        "65-69",
        "70-74",
        "75-79",
    )
    # Rank is the 1-based position in the ordered tuple.
    rank_by_bracket = {
        bracket: rank for rank, bracket in enumerate(ordered_brackets, start=1)
    }
    return rank_by_bracket.get(AgeCategory, 13)

In [128]:
# Wrap the mapper as an integer-returning Spark UDF and attach the encoded column.
udf_AgeCategoryLabel = udf(f=AgeCategoryLabel, returnType=IntegerType())
data = data.withColumn(
    colName="AgeCategoryLabel",
    col=udf_AgeCategoryLabel(col("AgeCategory")),
)

In [129]:
# Spot-check the encoding: raw bracket next to its ordinal label.
data.select("AgeCategory", "AgeCategoryLabel").show()

+-----------+----------------+
|AgeCategory|AgeCategoryLabel|
+-----------+----------------+
|      55-59|               8|
|80 or older|              13|
|      65-69|              10|
|      75-79|              12|
|      40-44|               5|
|      75-79|              12|
|      70-74|              11|
|80 or older|              13|
|80 or older|              13|
|      65-69|              10|
|      60-64|               9|
|      55-59|               8|
|      75-79|              12|
|80 or older|              13|
|      60-64|               9|
|      50-54|               7|
|      70-74|              11|
|      70-74|              11|
|      75-79|              12|
|80 or older|              13|
+-----------+----------------+
only showing top 20 rows



In [130]:
# Count rows per encoded label; totals should match the raw-value counts.
data.groupBy("AgeCategoryLabel").count().show()

+----------------+-----+
|AgeCategoryLabel|count|
+----------------+-----+
|              12|21482|
|               1|21064|
|              13|24153|
|               6|21791|
|               3|18753|
|               5|21006|
|               9|33686|
|               4|20550|
|               8|29757|
|               7|25382|
|              10|34151|
|              11|31065|
|               2|16955|
+----------------+-----+



In [131]:
# Count rows per raw Race value (long names are truncated in the output).
data.groupBy("Race").count().show()

+--------------------+------+
|                Race| count|
+--------------------+------+
|American Indian/A...|  5202|
|               Other| 10928|
|               White|245212|
|               Black| 22939|
|            Hispanic| 27446|
|               Asian|  8068|
+--------------------+------+



In [132]:
def RaceLabel(Race):
    """Encode the Race column as an integer code.

    The five listed values map to 1..5; any other value maps to 0.
    """
    code_by_race = {
        "Asian": 1,
        "Black": 2,
        "White": 3,
        "Hispanic": 4,
        "American Indian/Alaskan Native": 5,
    }
    return code_by_race.get(Race, 0)

In [133]:
# Wrap the mapper as an integer-returning Spark UDF and attach the encoded column.
udf_RaceLabel = udf(f=RaceLabel, returnType=IntegerType())
data = data.withColumn(
    colName="RaceLabel",
    col=udf_RaceLabel(col("Race")),
)

In [134]:
# Spot-check the encoding: raw value next to its integer code.
data.select("Race", "RaceLabel").show()

+-----+---------+
| Race|RaceLabel|
+-----+---------+
|White|        3|
|White|        3|
|White|        3|
|White|        3|
|White|        3|
|Black|        2|
|White|        3|
|White|        3|
|White|        3|
|White|        3|
|White|        3|
|White|        3|
|White|        3|
|White|        3|
|White|        3|
|White|        3|
|White|        3|
|White|        3|
|Black|        2|
|White|        3|
+-----+---------+
only showing top 20 rows



In [135]:
# Count rows per encoded label; totals should match the raw-value counts.
data.groupBy("RaceLabel").count().show()

+---------+------+
|RaceLabel| count|
+---------+------+
|        1|  8068|
|        3|245212|
|        5|  5202|
|        4| 27446|
|        2| 22939|
|        0| 10928|
+---------+------+



In [136]:
# Count rows per raw Diabetic value (long names are truncated in the output).
data.groupBy("Diabetic").count().show()

+--------------------+------+
|            Diabetic| count|
+--------------------+------+
|Yes (during pregn...|  2559|
|No, borderline di...|  6781|
|                  No|269653|
|                 Yes| 40802|
+--------------------+------+



In [137]:
def DiabeticLabel(Diabetic):
    """Encode the Diabetic answer as an integer code.

    "No" -> 0, "No, borderline diabetes" -> 1, "Yes (during pregnancy)" -> 2;
    any other value maps to 3.
    """
    code_by_answer = {
        "No": 0,
        "No, borderline diabetes": 1,
        "Yes (during pregnancy)": 2,
    }
    return code_by_answer.get(Diabetic, 3)

In [138]:
# Wrap the mapper as an integer-returning Spark UDF and attach the encoded column.
udf_DiabeticLabel = udf(f=DiabeticLabel, returnType=IntegerType())
data = data.withColumn(
    colName="DiabeticLabel",
    col=udf_DiabeticLabel(col("Diabetic")),
)

In [139]:
# Spot-check the encoding: raw value next to its integer code.
data.select("Diabetic", "DiabeticLabel").show()

+--------------------+-------------+
|            Diabetic|DiabeticLabel|
+--------------------+-------------+
|                 Yes|            3|
|                  No|            0|
|                 Yes|            3|
|                  No|            0|
|                  No|            0|
|                  No|            0|
|                  No|            0|
|                 Yes|            3|
|No, borderline di...|            1|
|                  No|            0|
|                 Yes|            3|
|                  No|            0|
|                 Yes|            3|
|                  No|            0|
|                  No|            0|
|                  No|            0|
|                  No|            0|
|No, borderline di...|            1|
|                 Yes|            3|
|                  No|            0|
+--------------------+-------------+
only showing top 20 rows



In [140]:
# Count rows per encoded label; totals should match the raw-value counts.
data.groupBy("DiabeticLabel").count().show()

+-------------+------+
|DiabeticLabel| count|
+-------------+------+
|            1|  6781|
|            3| 40802|
|            2|  2559|
|            0|269653|
+-------------+------+



In [141]:
# Count rows per raw PhysicalActivity value.
data.groupBy("PhysicalActivity").count().show()

+----------------+------+
|PhysicalActivity| count|
+----------------+------+
|              No| 71838|
|             Yes|247957|
+----------------+------+



In [142]:
def PhysicalActivityLabel(PhysicalActivity):
    """Encode the PhysicalActivity answer as an integer.

    Returns 0 when the value is exactly "No" and 1 for any other value.
    """
    if PhysicalActivity != "No":
        return 1
    return 0

In [143]:
# Wrap the mapper as an integer-returning Spark UDF and attach the encoded column.
udf_PhysicalActivityLabel = udf(f=PhysicalActivityLabel, returnType=IntegerType())
data = data.withColumn(
    colName="PhysicalActivityLabel",
    col=udf_PhysicalActivityLabel(col("PhysicalActivity")),
)

In [144]:
# Spot-check the encoding: raw value next to its integer label.
data.select("PhysicalActivity", "PhysicalActivityLabel").show()

+----------------+---------------------+
|PhysicalActivity|PhysicalActivityLabel|
+----------------+---------------------+
|             Yes|                    1|
|             Yes|                    1|
|             Yes|                    1|
|              No|                    0|
|             Yes|                    1|
|              No|                    0|
|             Yes|                    1|
|              No|                    0|
|              No|                    0|
|             Yes|                    1|
|              No|                    0|
|             Yes|                    1|
|             Yes|                    1|
|              No|                    0|
|              No|                    0|
|             Yes|                    1|
|              No|                    0|
|             Yes|                    1|
|              No|                    0|
|             Yes|                    1|
+----------------+---------------------+
only showing top

In [145]:
# Count rows per encoded label; totals should match the raw-value counts.
data.groupBy("PhysicalActivityLabel").count().show()

+---------------------+------+
|PhysicalActivityLabel| count|
+---------------------+------+
|                    1|247957|
|                    0| 71838|
+---------------------+------+



In [146]:
# Count rows per raw GenHealth value.
data.groupBy("GenHealth").count().show()

+---------+------+
|GenHealth| count|
+---------+------+
|Excellent| 66842|
|     Good| 93129|
|     Fair| 34677|
|Very good|113858|
|     Poor| 11289|
+---------+------+



In [147]:
def GenHealthLabel(GenHealth):
    """Encode the GenHealth rating as an ordinal integer.

    "Poor" -> 0, "Fair" -> 1, "Good" -> 2, "Very good" -> 3; any other value
    (including "Excellent") maps to 4.
    """
    ascending_ratings = ("Poor", "Fair", "Good", "Very good")
    # The label is the rating's position in the ascending tuple.
    if GenHealth in ascending_ratings:
        return ascending_ratings.index(GenHealth)
    return 4

In [148]:
# Wrap the mapper as an integer-returning Spark UDF and attach the encoded column.
udf_GenHealthLabel = udf(f=GenHealthLabel, returnType=IntegerType())
data = data.withColumn(
    colName="GenHealthLabel",
    col=udf_GenHealthLabel(col("GenHealth")),
)

In [149]:
# Spot-check the encoding: raw rating next to its ordinal label.
data.select("GenHealth", "GenHealthLabel").show()

+---------+--------------+
|GenHealth|GenHealthLabel|
+---------+--------------+
|Very good|             3|
|Very good|             3|
|     Fair|             1|
|     Good|             2|
|Very good|             3|
|     Fair|             1|
|     Fair|             1|
|     Good|             2|
|     Fair|             1|
|     Good|             2|
|     Poor|             0|
|Very good|             3|
|Very good|             3|
|     Good|             2|
|     Good|             2|
|Very good|             3|
|Very good|             3|
|     Good|             2|
|     Fair|             1|
|Excellent|             4|
+---------+--------------+
only showing top 20 rows



In [150]:
# Count rows per encoded label; totals should match the raw-value counts.
data.groupBy("GenHealthLabel").count().show()

+--------------+------+
|GenHealthLabel| count|
+--------------+------+
|             1| 34677|
|             3|113858|
|             4| 66842|
|             2| 93129|
|             0| 11289|
+--------------+------+



In [153]:
# Count rows per SleepTime value; n=24 raises the row limit above show()'s default.
data.groupBy("SleepTime").count().show(n=24)

+---------+-----+
|SleepTime|count|
+---------+-----+
|      8.0|97602|
|      7.0|97751|
|     18.0|  102|
|      1.0|  551|
|      4.0| 7750|
|     11.0|  415|
|     21.0|    2|
|     14.0|  243|
|     22.0|    9|
|      3.0| 1992|
|     19.0|    3|
|      2.0|  788|
|     17.0|   21|
|     10.0| 7796|
|     13.0|   97|
|      6.0|66721|
|     20.0|   64|
|      5.0|19184|
|     15.0|  189|
|     24.0|   30|
|      9.0|16041|
|     16.0|  236|
|     12.0| 2205|
|     23.0|    3|
+---------+-----+



In [None]:
def SleepTimeLabel(SleepTime):
    """Encode the SleepTime value as a binary label.

    Args:
        SleepTime: A number, or None for a missing value.

    Returns:
        0 when SleepTime >= 20, 1 when SleepTime < 20, or None when
        SleepTime is None or NaN.
    """
    # A null cell reaches the UDF as None, and `None >= 20` raises TypeError.
    # NaN fails the comparison and would silently be labelled 1. Both are
    # passed through as a missing label instead.
    if SleepTime is None or SleepTime != SleepTime:  # NaN != NaN
        return None
    return 0 if SleepTime >= 20 else 1