## Installing and Importing

In [2]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.4.0'
spark_version = 'spark-3.4.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [Waiting for headers] [Waiting for headers] [1 InRelease 0 B/3,626 B 0%] [Co0% [Waiting for headers] [Waiting for headers] [Connecting to ppa.launchpadcont                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [Waiting for headers] [Waiting for headers] [Connecting to ppa.launchpadcont                                                                               Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
0% [Waiting for headers] [3 InRelease 2,588 B/110 kB 2%] [Connected to ppa.laun                                                                               Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:6 http://archive.ubuntu.com/

In [3]:
# Spark SQL entry point plus the Row / schema helper types
from pyspark.sql import Row, SparkSession
from pyspark.sql.types import (
    DateType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Get the existing SparkSession or create one, with the application name "SparkSQL"
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [5]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

## Reading CSV and Exploring Data

In [19]:
# Fetch Heart_Disease_Prediction.csv from GitHub and load it into a Spark DataFrame
from pyspark import SparkFiles

url = "https://raw.githubusercontent.com/AleidvdZ/Project4HeartDisease/main/Heart_Disease_Prediction.csv"

# Register the remote file with the Spark context, then read the local copy.
spark.sparkContext.addFile(url)

# No schema is supplied or inferred here, so every column is loaded as a string.
hd_df = spark.read.csv(
    SparkFiles.get("Heart_Disease_Prediction.csv"),
    sep=",",
    header=True,
)

# Display the first rows of the DataFrame
hd_df.show()

+-----+---+---+---------------+---+-----------+------------+-----------+------+---------------+-------------+-----------+-----------------------+--------+-------------+
|index|Age|Sex|Chest pain type| BP|Cholesterol|FBS over 120|EKG results|Max HR|Exercise angina|ST depression|Slope of ST|Number of vessels fluro|Thallium|Heart Disease|
+-----+---+---+---------------+---+-----------+------------+-----------+------+---------------+-------------+-----------+-----------------------+--------+-------------+
|    0| 70|  1|              4|130|        322|           0|          2|   109|              0|          2.4|          2|                      3|       3|     Presence|
|    1| 67|  0|              3|115|        564|           0|          2|   160|              0|          1.6|          2|                      0|       7|      Absence|
|    2| 57|  1|              2|124|        261|           0|          0|   141|              0|          0.3|          1|                      0|       7| 

In [10]:
# Print each column's name, type and nullability as Spark loaded them from the CSV
hd_df.printSchema()

root
 |-- index: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Chest pain type: string (nullable = true)
 |-- BP: string (nullable = true)
 |-- Cholesterol: string (nullable = true)
 |-- FBS over 120: string (nullable = true)
 |-- EKG results: string (nullable = true)
 |-- Max HR: string (nullable = true)
 |-- Exercise angina: string (nullable = true)
 |-- ST depression: string (nullable = true)
 |-- Slope of ST: string (nullable = true)
 |-- Number of vessels fluro: string (nullable = true)
 |-- Thallium: string (nullable = true)
 |-- Heart Disease: string (nullable = true)



In [11]:
# Register the DataFrame as a temporary view named 'data' so it can be queried with SQL
# (an existing view with that name is replaced).
hd_df.createOrReplaceTempView('data')

# Preview the first 10 rows of the view using SparkSQL
spark.sql("select * from data limit 10").show()

+-----+---+---+---------------+---+-----------+------------+-----------+------+---------------+-------------+-----------+-----------------------+--------+-------------+
|index|Age|Sex|Chest pain type| BP|Cholesterol|FBS over 120|EKG results|Max HR|Exercise angina|ST depression|Slope of ST|Number of vessels fluro|Thallium|Heart Disease|
+-----+---+---+---------------+---+-----------+------------+-----------+------+---------------+-------------+-----------+-----------------------+--------+-------------+
|    0| 70|  1|              4|130|        322|           0|          2|   109|              0|          2.4|          2|                      3|       3|     Presence|
|    1| 67|  0|              3|115|        564|           0|          2|   160|              0|          1.6|          2|                      0|       7|      Absence|
|    2| 57|  1|              2|124|        261|           0|          0|   141|              0|          0.3|          1|                      0|       7| 

To analyze:

*  Total number of individuals
*  Number/percent of individuals of each sex
*  Mean/Median/Mode of ages by sex
*  % of male/female with heart disease

In [22]:
# Total number of individuals: counts the rows of the 'data' view that have a
# non-null `index` value.
spark.sql("""
  SELECT COUNT(index) AS Number_of_Indiv
  FROM data
    """).show()

+---------------+
|Number_of_Indiv|
+---------------+
|            270|
+---------------+



In [34]:
# Min / max / mean / median / standard deviation of Age over ALL individuals.
# TODO: the per-sex breakdown and the mode listed in the analysis plan are not
# computed here yet.
#
# The CSV columns are loaded as strings, so Age is cast to INT in a subquery
# before aggregating: MIN/MAX on a string column compare lexicographically
# (e.g. '9' sorts after '100'), which would give wrong extremes as soon as the
# ages do not all have the same number of digits.
spark.sql("""
  SELECT
    MIN(Age) AS min_value,
    MAX(Age) AS max_value,
    ROUND(AVG(Age),1) AS mean_value,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY Age) AS median_value,
    ROUND(STDDEV(Age),1) AS std_deviation,
    COUNT(Age) AS count
FROM (SELECT CAST(Age AS INT) AS Age FROM data) AS ages;
    """).show()

+---------+---------+----------+------------+-------------+-----+
|min_value|max_value|mean_value|median_value|std_deviation|count|
+---------+---------+----------+------------+-------------+-----+
|       29|       77|      54.4|        55.0|          9.1|  270|
+---------+---------+----------+------------+-------------+-----+



In [35]:
# Number of individuals of each sex (0 = female, 1 = male), most frequent first.
# NOTE: this query reports counts only; the percentage per sex is not computed yet.
spark.sql("""
  SELECT Sex,
  COUNT(*) AS Frequency
  FROM data
  GROUP BY Sex
  ORDER BY Frequency DESC
  """).show()


  # TODO: draft query for the percentage per sex - untested; the earlier draft
  # was missing the comma after `Sex`:
  # SELECT Sex,
  #   (COUNT(*) / (SELECT COUNT(*) FROM data)) * 100 AS Percentage
  #   FROM data
  #   GROUP BY Sex
  #   ORDER BY Percentage DESC


+---+---------+
|Sex|Frequency|
+---+---------+
|  1|      183|
|  0|       87|
+---+---------+



## Preprocessing Data

Rename columns  
Convert strings to numeric types (all feature columns; ST depression holds decimal values, so it cannot be an integer)  
Heart Disease or Not: presence = 1, absence = 0  
Scaling: Age, BP, Cholesterol, MaxHR, StDep  
Dummy-encode (more than 2 categories): Chest Pain, EKG Result, Slope of ST, Thallium  

## Machine Learning Model - Neural Network

In [None]:
# Build the feature matrix X and target vector y for the neural network.
#
# hd_df is a Spark DataFrame, which has no `.values` / `drop(columns=...)`, and
# the target column is named "Heart Disease" (with a space), so the data is
# first collected into pandas.
# The "index" column is only a row identifier, so it is excluded from the features.
hd_pdf = hd_df.toPandas().drop(columns="index")

# Encode the target: Presence -> 1, Absence -> 0. Any other label would map to
# NaN, so fail loudly instead of training on a corrupted target.
y = hd_pdf["Heart Disease"].map({"Absence": 0, "Presence": 1})
assert y.notna().all(), "unexpected label in 'Heart Disease' column"
y = y.astype(int).values

# Every remaining column was read from the CSV as a string; cast to float so
# the scaler and the network receive numeric input.
# TODO: the categorical columns (chest pain type, EKG results, slope of ST,
# thallium) are still plain numeric codes here, not dummy-encoded.
X = hd_pdf.drop(columns="Heart Disease").astype(float).values

# Split training/test datasets, keeping the class balance the same in both
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
# Standardize the features: fit the scaler on the training split only, then
# apply that same fitted scaler to both the training and the test split.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Network dimensions: the input width is taken from the training matrix; the
# two hidden layers have 8 and 3 units respectively.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 3

# Feed-forward model: two ReLU hidden layers followed by a single sigmoid
# output unit.
nn = tf.keras.models.Sequential([
    # First hidden layer (also declares the input size)
    tf.keras.layers.Dense(
        units=hidden_nodes_layer1,
        input_dim=number_input_features,
        activation="relu",
    ),
    # Second hidden layer
    tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"),
    # Output layer
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Print the layer-by-layer structure of the model
nn.summary()

In [None]:
# Compile the model: binary cross-entropy loss (the output is a single sigmoid
# unit), the Adam optimizer, and accuracy as the reported metric.
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train the model on the scaled training data for 100 epochs; the value
# returned by fit() is kept in fit_model.
fit_model = nn.fit(X_train_scaled,y_train,epochs=100)

In [None]:
# Evaluate the trained model on the scaled test data, unpack the returned loss
# and accuracy, and print both.
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")