## Importing Dependencies

In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, ChiSqSelector
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import StringIndexer



In [2]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("HeartDiseasePrediction") \
    .getOrCreate()

print(f"Number of Cores used by Spark: {spark.sparkContext.defaultParallelism}")

Number of Cores used by Pyspark: 2


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
file_path = "/content/drive/MyDrive/Patients Data ( Used for Heart Disease Prediction ) - Copy.xlsx"
data_pandas = pd.read_excel(file_path, engine='openpyxl')

In [5]:
data = spark.createDataFrame(data_pandas)


In [6]:
data.printSchema()

root
 |-- PatientID: long (nullable = true)
 |-- State: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- GeneralHealth: string (nullable = true)
 |-- AgeCategory: string (nullable = true)
 |-- HeightInMeters: double (nullable = true)
 |-- WeightInKilograms: double (nullable = true)
 |-- BMI: double (nullable = true)
 |-- HadHeartAttack: long (nullable = true)
 |-- HadAngina: long (nullable = true)
 |-- HadStroke: long (nullable = true)
 |-- HadAsthma: long (nullable = true)
 |-- HadSkinCancer: long (nullable = true)
 |-- HadCOPD: long (nullable = true)
 |-- HadDepressiveDisorder: long (nullable = true)
 |-- HadKidneyDisease: long (nullable = true)
 |-- HadArthritis: long (nullable = true)
 |-- HadDiabetes: string (nullable = true)
 |-- DeafOrHardOfHearing: long (nullable = true)
 |-- BlindOrVisionDifficulty: long (nullable = true)
 |-- DifficultyConcentrating: long (nullable = true)
 |-- DifficultyWalking: long (nullable = true)
 |-- DifficultyDressingBathing: long (nul

In [7]:
data = data.dropna()

In [8]:
class_counts = data.groupBy('HadHeartAttack').count().show()
print(class_counts)

+--------------+-----+
|HadHeartAttack|count|
+--------------+-----+
|             0|35657|
|             1|13201|
+--------------+-----+

None


In [9]:
categorical_cols = ['State', 'Sex', 'GeneralHealth', 'AgeCategory',
                    'HadDiabetes', 'SmokerStatus', 'ECigaretteUsage',
                    'RaceEthnicityCategory', 'TetanusLast10Tdap']
for col in categorical_cols:
    indexed_col = f"{col}_indexed"
    if indexed_col not in data.columns:
        indexer = StringIndexer(inputCol=col, outputCol=indexed_col)
        data = indexer.fit(data).transform(data)



In [10]:
feature_cols = [f"{col}_indexed" if col in categorical_cols else col for col in data.columns if col != 'HadHeartAttack']
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data)


## Splitting Data

In [11]:
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)


print(f"Training Data Count: {train_data.count()}")
print(f"Test Data Count: {test_data.count()}")

Training Data Count: 39218
Test Data Count: 9640


In [12]:
lr = LogisticRegression(featuresCol="features", labelCol="HadHeartAttack")
lr_model = lr.fit(train_data)

print("Coefficients:", lr_model.coefficients)
print("Intercept:", lr_model.intercept)

Coefficients: [-5.075665247872051e-05,0.12921919912524055,-0.4327237346224876,0.0059194748651298194,-0.049367415455473546,-1.7517912794591992,0.0025277939894454964,-0.004093212417761397,2.594518529816419,0.8906473998504293,-0.06125032749273623,-0.09095951872154615,0.023255501972868235,0.10758589533490759,0.28043227788844605,0.10141531309160733,0.08951921253259333,0.10809149340423051,0.1625267407652095,0.05094708541095427,0.2780032215601072,0.08842184242752836,0.3003339801796646,0.12989257108781943,0.0008919310471812723,0.7311586928975048,-0.030866648691675883,-0.2538492495947612,-0.05006290583431754,-0.10431804236862767,0.07928569419067168,-0.044606814750431115,0.01406057166204923,0.02231618245753884,0.12921919912524055,-0.4327237346224876,0.0059194748651298194,-0.049367415455473546,0.08951921253259333,0.12989257108781943,0.0008919310471812723,-0.030866648691675883,-0.044606814750431115]
Intercept: 10.141319172666641


In [13]:
# Step 7: Evaluate Model
predictions = lr_model.transform(test_data)
evaluator = BinaryClassificationEvaluator(labelCol="HadHeartAttack")
auc = evaluator.evaluate(predictions)
print(f"Test AUC: {auc:.2f}")

Test AUC: 0.98


In [14]:
predictions.select("features", "HadHeartAttack", "prediction").show(10)


+--------------------+--------------+----------+
|            features|HadHeartAttack|prediction|
+--------------------+--------------+----------+
|(43,[0,1,3,4,5,6,...|             1|       1.0|
|(43,[0,1,2,3,4,5,...|             1|       1.0|
|(43,[0,1,3,4,5,6,...|             1|       1.0|
|(43,[0,1,2,4,5,6,...|             1|       1.0|
|(43,[0,1,3,5,6,7,...|             1|       1.0|
|(43,[0,1,2,3,4,5,...|             1|       1.0|
|(43,[0,1,3,4,5,6,...|             1|       1.0|
|(43,[0,1,3,4,5,6,...|             1|       1.0|
|(43,[0,1,3,5,6,7,...|             1|       1.0|
|(43,[0,1,3,5,6,7,...|             1|       1.0|
+--------------------+--------------+----------+
only showing top 10 rows

