In [3]:
import findspark
# Keep this call ahead of `import pyspark`: per findspark's documentation,
# init() locates the Spark installation and adds its Python bindings to
# sys.path, which is what makes the following import resolvable.
findspark.init()
import pyspark

In [4]:
import os

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.sql import SparkSession

In [6]:
# Initialize (or reuse) the Spark session.
# The application name now matches the dataset this notebook actually
# analyses (diabetes.csv); it previously read "Heart Disease".
spark = (
    SparkSession.builder
    .appName("Diabetes Prediction")
    .getOrCreate()
)

# Step 1: Read the dataset
# The CSV location can be overridden with the DIABETES_CSV environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original local path is used unchanged.
input_file = os.environ.get(
    "DIABETES_CSV",
    r"D:\Data Science\diabeties\diabetes.csv",
)
# header=True takes column names from the first line of the file;
# inferSchema=True asks Spark to infer column types from the data.
data = spark.read.csv(input_file, header=True, inferSchema=True)


Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
     ---------------------------------------- 0.0/317.3 MB ? eta -:--:--
     -------------------------------------- 0.0/317.3 MB 217.9 kB/s eta 0:24:17
     -------------------------------------- 0.0/317.3 MB 326.8 kB/s eta 0:16:11
     -------------------------------------- 0.1/317.3 MB 491.5 kB/s eta 0:10:46
     ---------------------------------------- 0.3/317.3 MB 1.6 MB/s eta 0:03:14
     ---------------------------------------- 0.6/317.3 MB 2.7 MB/s eta 0:01:58
     ---------------------------------------- 0.9/317.3 MB 3.5 MB/s eta 0:01:31
     ---------------------------------------- 1.0/317.3 MB 3.1 MB/s eta 0:01:41
     ---------------------------------------- 1.0/317.3 MB 3.1 MB/s eta 0:01:41
     ---------------------------------------- 1.0/317.3 MB 3.1 MB/s eta 0:01:41
     ---------------------------------------- 1.5/317.3 MB 3.2 MB/s eta 0:01:38
     ---------------------------------------- 1.8/317.3 MB 3.4

In [8]:
# Peek at the first row to sanity-check the column names and inferred types.
data.head()

Row(Pregnancies=6, Glucose=148, BloodPressure=72, SkinThickness=35, Insulin=0, BMI=33.6, DiabetesPedigreeFunction=0.627, Age=50, Outcome=1)

In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Show the initial data
print("Initial Data:")
data.show()

# Step 2: Data Preprocessing

# Index the binary "Outcome" column into a double-typed "label" column.
# stringOrderType="alphabetAsc" pins the mapping to 0 -> 0.0 and 1 -> 1.0.
# StringIndexer's default ordering ("frequencyDesc") assigns index 0.0 to the
# most frequent value, so the mapping would silently flip on any dataset in
# which Outcome == 1 is the majority class.
label_indexer = StringIndexer(
    inputCol="Outcome",
    outputCol="label",
    stringOrderType="alphabetAsc",
)

# Drop any stale pipeline output columns so this cell can be re-run safely:
# both stages below fail if their output column already exists.
# DataFrame.drop is a no-op for column names that are not present.
data = data.drop("label", "features")

# Create feature vector
# NOTE(review): Pregnancies, DiabetesPedigreeFunction and Age are columns of
# the CSV but are not used as features here -- confirm this is intentional.
feature_assembler = VectorAssembler(
    inputCols=[
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "Insulin",
        "BMI",
    ],
    outputCol="features",
)

# Create the preprocessing-only pipeline (label indexing + feature assembly)
pipeline = Pipeline(stages=[label_indexer, feature_assembler])

# Fit the pipeline (this fits the label indexer; the assembler is stateless)
model = pipeline.fit(data)

# Transform the data with the fitted pipeline
data_transformed = model.transform(data)

# Show the transformed data
data_transformed.select("label", "features").show()


Initial Data:
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|31.0|                   0.248

In [10]:
# Step 3: Define the machine learning model
# The input columns are spelled out for readability; they are the same values
# LogisticRegression uses by default, and they match the output columns of
# feature_assembler ("features") and label_indexer ("label").
lr = LogisticRegression(featuresCol="features", labelCol="label")

# Chain label indexing, feature assembly and the classifier into a single
# estimator so that one fit()/transform() call runs all three stages.
pipeline = Pipeline(
    stages=[
        label_indexer,
        feature_assembler,
        lr,
    ]
)



In [11]:
# Step 4: Fit the model
# Fits every stage of `pipeline` on the full `data` DataFrame; no train/test
# split is made in the visible cells of this notebook.
model = pipeline.fit(data)

In [12]:
# Step 5: Make predictions
# NOTE: when the notebook is run top to bottom, `data` is the same DataFrame
# the pipeline was fitted on, so these are in-sample predictions and say
# nothing about performance on unseen data.
predictions = model.transform(data)

# Print a header, then the leading rows of the columns of interest
print("Predictions:")
shown_columns = ["features", "label", "prediction"]
predictions.select(*shown_columns).show()

# Stop the Spark session; cells that use Spark cannot be re-run after this
# without creating a new session first.
spark.stop()


Predictions:
+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|[148.0,72.0,35.0,...|  1.0|       1.0|
|[85.0,66.0,29.0,0...|  0.0|       0.0|
|[183.0,64.0,0.0,0...|  1.0|       1.0|
|[89.0,66.0,23.0,9...|  0.0|       0.0|
|[137.0,40.0,35.0,...|  1.0|       1.0|
|[116.0,74.0,0.0,0...|  0.0|       0.0|
|[78.0,50.0,32.0,8...|  1.0|       0.0|
|(5,[0,4],[115.0,3...|  0.0|       0.0|
|[197.0,70.0,45.0,...|  1.0|       1.0|
|(5,[0,1],[125.0,9...|  1.0|       0.0|
|[110.0,92.0,0.0,0...|  0.0|       0.0|
|[168.0,74.0,0.0,0...|  1.0|       1.0|
|[139.0,80.0,0.0,0...|  0.0|       0.0|
|[189.0,60.0,23.0,...|  1.0|       1.0|
|[166.0,72.0,19.0,...|  1.0|       1.0|
|(5,[0,4],[100.0,3...|  1.0|       0.0|
|[118.0,84.0,47.0,...|  1.0|       0.0|
|[107.0,74.0,0.0,0...|  1.0|       0.0|
|[103.0,30.0,38.0,...|  0.0|       0.0|
|[115.0,70.0,30.0,...|  1.0|       0.0|
+--------------------+-----+----------+
only showing top 20 rows

