In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import when

In [None]:
# Start the notebook's Spark session, reusing an already-running one if present
spark = (
    SparkSession.builder
    .appName("HeartDiseaseClassification")
    .getOrCreate()
)

In [None]:
# Load the space-delimited, header-less data file, letting Spark infer column types
reader = spark.read.option("delimiter", " ")
df = reader.csv('data/heart.dat', header=False, inferSchema=True)

In [None]:
# Preview the first five rows as loaded (columns still carry Spark's default names)
df.show(n=5)

In [None]:
# Names for the first 13 columns of the file, listed in column order
new_column_names = [
    'year', 'sex', 'tPain', 'restPressure', 'colesterol',
    'bloodSugarL120', 'electrocardioRest', 'maxHeartRate',
    'angina', 'oldPeak', 'stSlope', 'numVessels', 'thal',
]
# Apply the names positionally. Renaming keeps column order, so indexing
# df.columns again on each pass still addresses the column at that position.
for position, name in enumerate(new_column_names):
    df = df.withColumnRenamed(df.columns[position], name)

# Drop the trailing column only while one exists beyond the named ones, so that
# re-running this cell (or loading a file that has exactly the named columns)
# cannot silently drop 'thal', which later cells read.
# NOTE(review): the dropped column is presumably the dataset's own class
# label - confirm that deriving the target from 'thal' instead is intended.
if len(df.columns) > len(new_column_names):
    df = df.drop(df.columns[-1])
df.show(5)

In [None]:
# Inspect the (column name, type) pairs produced by schema inference
df.dtypes

In [None]:
# Binary target: 0 when 'thal' is 3 or 6, 1 for every other value
thal = df['thal']
thal_is_3_or_6 = (thal == 3) | (thal == 6)
df = df.withColumn('sick', when(thal_is_3_or_6, 0).otherwise(1))
df.show(n=5)

In [None]:
def _add_new_sick(frame):
    """Return `frame` with a 'newSick' column: 0 where 'thal' is 3 or 6, otherwise 1."""
    thal_is_3_or_6 = (frame['thal'] == 3) | (frame['thal'] == 6)
    return frame.withColumn('newSick', when(thal_is_3_or_6, 0).otherwise(1))


# Build the same flag again, this time through DataFrame.transform
df = df.transform(_add_new_sick)

df.show(n=5)


In [None]:
# Remove the 'newSick' column again; 'sick' is the column used from here on
df = df.drop('newSick')
df.show(n=5)

In [None]:
# Assemble the predictor columns into a single vector column, 'features'.
# 'thal' is not included; the 'sick' target is derived from it.
feature_columns = [
    'year', 'sex', 'tPain', 'restPressure',
    'colesterol', 'bloodSugarL120', 'electrocardioRest',
    'maxHeartRate', 'angina', 'oldPeak', 'stSlope',
    'numVessels',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Drop any 'features' column left by a previous run of this cell first:
# VectorAssembler rejects an output column that already exists, and
# DataFrame.drop is a no-op when the column is absent, so a first run is unchanged.
df = assembler.transform(df.drop('features'))

In [None]:
# Preview the first five rows, now including the assembled 'features' vector
df.show(n=5)

In [None]:
# Keep only the assembled 'features' vector and the 'sick' target for the model
model_data = df.select('features', 'sick')

# Rename 'sick' to 'label', the default label column name used by LogisticRegression
model_data = model_data.withColumnRenamed('sick', 'label')

# Split into training (70%) and test (30%) sets. The fixed seed makes the split
# repeatable for the same input data, so results survive Restart & Run All.
train_data, test_data = model_data.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Logistic regression estimator; the column names are spelled out but equal the defaults
lr = LogisticRegression(featuresCol='features', labelCol='label')

# Fit on the training split
lr_model = lr.fit(train_data)

# Score the held-out test split
predictions = lr_model.transform(test_data)

# Display the first 20 scored rows (the default row count of show())
predictions.show(n=20)

In [None]:
# Stop the Spark session; cells above must be re-run from the session-creation cell afterwards
spark.stop()