# Classification Example - Diabetes Dataset

#### Imports and PySpark Init

In [1]:
from pathlib import Path
import sys

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [3]:
conf = SparkConf() \
    .setAppName('pySparkExamples') \
    .setMaster('local')

sc = SparkContext(conf=conf)
spark = SparkSession(sc)

### Reading Dataset

In [4]:
PROJECT_ROOT = Path.cwd().parent.as_posix()
csv_file = f"{PROJECT_ROOT}/data/diabetes.csv"
uri_scheme = "file://"

In [5]:
input_path = f"{uri_scheme}/{csv_file}" if sys.platform == "win32" else f"{uri_scheme}{csv_file}"
df = spark.read.csv(input_path, header=True, inferSchema=True)

### Exploring Data

In [6]:
df.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



In [7]:
df.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|31.0|                   0.248| 26|      1|


In [8]:
# Check null value count
from pyspark.sql.functions import isnan, when, count, col

df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|          0|      0|            0|            0|      0|  0|                       0|  0|      0|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+



### Preprocessing

#### Build Feature Vector

In [9]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

assemblerInputs = df.columns[:-1]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

partialPipeline = Pipeline().setStages([assembler])
pipelineModel = partialPipeline.fit(df)
preppedDataDF = pipelineModel.transform(df)

#### Train Test Split

In [10]:
dataset = preppedDataDF.select(df.columns + ["features"])
display(dataset)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print("Train Data Count - ", trainingData.count())
print("Test  Data Count - ", testData.count())

DataFrame[Pregnancies: int, Glucose: int, BloodPressure: int, SkinThickness: int, Insulin: int, BMI: double, DiabetesPedigreeFunction: double, Age: int, Outcome: int, features: vector]

Train Data Count -  547
Test  Data Count -  221


### Classification

#### Decision Tree Classifier

In [11]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [12]:
# Train a DecisionTree model
dt = DecisionTreeClassifier(labelCol="Outcome")

dt_model = dt.fit(trainingData)
dt_predictions = dt_model.transform(testData)

In [13]:
dt_predictions.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [14]:
# Select example rows to display.
dt_predictions.select("prediction", "Outcome", "features").show(5)

+----------+-------+--------------------+
|prediction|Outcome|            features|
+----------+-------+--------------------+
|       0.0|      0|[0.0,74.0,52.0,10...|
|       0.0|      0|[0.0,93.0,60.0,0....|
|       0.0|      0|[0.0,93.0,60.0,25...|
|       0.0|      0|[0.0,94.0,70.0,27...|
|       0.0|      0|[0.0,95.0,64.0,39...|
+----------+-------+--------------------+
only showing top 5 rows



In [15]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="Outcome", predictionCol="prediction", metricName="accuracy")
dt_accuracy = evaluator.evaluate(dt_predictions)
print("Decision Tree - Test Accuracy = %g" % dt_accuracy)

Decision Tree - Test Accuracy = 0.733032


#### Logistic Regression

In [16]:
from pyspark.ml.classification import LogisticRegression

In [17]:
lr = LogisticRegression(labelCol="Outcome", maxIter=10)

lr_model = lr.fit(trainingData)
lr_predictions = lr_model.transform(testData)

In [18]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="Outcome", predictionCol="prediction", metricName="accuracy")
lr_accuracy = evaluator.evaluate(lr_predictions)
print("Decision Tree - Test Accuracy = %g" % lr_accuracy)

Decision Tree - Test Accuracy = 0.773756


#### Random Forest Classifier

In [19]:
from pyspark.ml.classification import RandomForestClassifier

In [20]:
rf = RandomForestClassifier(labelCol="Outcome")

rf_model = rf.fit(trainingData)
rf_predictions = rf_model.transform(testData)

In [21]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="Outcome", predictionCol="prediction", metricName="accuracy")
rf_accuracy = evaluator.evaluate(rf_predictions)
print("Decision Tree - Test Accuracy = %g" % rf_accuracy)

Decision Tree - Test Accuracy = 0.791855
