In [1]:
# Importing the SparkSession module from the pyspark.sql package
from pyspark.sql import SparkSession

# Importing specific functions like col, when, count from the pyspark.sql.functions module
from pyspark.sql.functions import col, when, count

# Importing the VectorAssembler module from the pyspark.ml.feature package
from pyspark.ml.feature import VectorAssembler

# Importing the StandardScaler module from the pyspark.ml.feature package
from pyspark.ml.feature import StandardScaler

# importing classification libraries
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier

# importing classification metrics
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [2]:
# Start (or reuse) a Spark session registered under the application name "Classification Model"
spark = (
    SparkSession.builder
    .appName("Classification Model")
    .getOrCreate()
)

# Load "./diabetes.csv" into `dataset`, treating the first line as the column header.
# No schema is supplied or inferred, so every column arrives as a string.
dataset = spark.read.option("header", True).csv("./diabetes.csv")

In [3]:
# Preview the freshly loaded rows to sanity-check the header and values
dataset.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|  31|                   0.248| 26|      1|


In [4]:
# Inspect the column types; the CSV was read without schema inference, so all columns are strings
dataset.printSchema()

root
 |-- Pregnancies: string (nullable = true)
 |-- Glucose: string (nullable = true)
 |-- BloodPressure: string (nullable = true)
 |-- SkinThickness: string (nullable = true)
 |-- Insulin: string (nullable = true)
 |-- BMI: string (nullable = true)
 |-- DiabetesPedigreeFunction: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Outcome: string (nullable = true)



In [5]:
# Every column was loaded as a string, so cast all of them (features and the
# Outcome label alike) to float while keeping the original column names.
# The previous `if cols in column_names` guard was always true, because
# `column_names` is exactly `dataset.columns`; its else branch could never run.
column_names = dataset.columns
dataset = dataset.select(*[col(name).cast('float').alias(name) for name in column_names])

In [6]:
# Confirm the cast: every column should now be reported as float
dataset.printSchema()

root
 |-- Pregnancies: float (nullable = true)
 |-- Glucose: float (nullable = true)
 |-- BloodPressure: float (nullable = true)
 |-- SkinThickness: float (nullable = true)
 |-- Insulin: float (nullable = true)
 |-- BMI: float (nullable = true)
 |-- DiabetesPedigreeFunction: float (nullable = true)
 |-- Age: float (nullable = true)
 |-- Outcome: float (nullable = true)



In [7]:
# Count the null entries in each column.
# when() produces a value only for null cells, and count() tallies only those
# non-null results, so each output column holds that column's number of nulls.
null_values = dataset.select(
    *(
        count(when(col(name).isNull(), name)).alias(name)
        for name in dataset.columns
    )
)
null_values.show()

+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|          0|      0|            0|            0|      0|  0|                       0|  0|      0|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+



In [8]:
# Predictor-only view of the data: everything except the "Outcome" label
new_data = dataset.drop("Outcome")
new_data.show()

# Pack the predictor columns into a single vector column named "features".
# The assembler is applied to the full `dataset`, so "Outcome" is still present in `output`.
assembler = (
    VectorAssembler()
    .setInputCols(new_data.columns)
    .setOutputCol("features")
)
output = assembler.transform(dataset)

+-----------+-------+-------------+-------------+-------+----+------------------------+----+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction| Age|
+-----------+-------+-------------+-------------+-------+----+------------------------+----+
|        6.0|  148.0|         72.0|         35.0|    0.0|33.6|                   0.627|50.0|
|        1.0|   85.0|         66.0|         29.0|    0.0|26.6|                   0.351|31.0|
|        8.0|  183.0|         64.0|          0.0|    0.0|23.3|                   0.672|32.0|
|        1.0|   89.0|         66.0|         23.0|   94.0|28.1|                   0.167|21.0|
|        0.0|  137.0|         40.0|         35.0|  168.0|43.1|                   2.288|33.0|
|        5.0|  116.0|         74.0|          0.0|    0.0|25.6|                   0.201|30.0|
|        3.0|   78.0|         50.0|         32.0|   88.0|31.0|                   0.248|26.0|
|       10.0|  115.0|          0.0|          0.0|    0.0|35.3|        

In [9]:
# Preview the assembled frame, including the new "features" vector column
output.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction| Age|Outcome|            features|
+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+--------------------+
|        6.0|  148.0|         72.0|         35.0|    0.0|33.6|                   0.627|50.0|    1.0|[6.0,148.0,72.0,3...|
|        1.0|   85.0|         66.0|         29.0|    0.0|26.6|                   0.351|31.0|    0.0|[1.0,85.0,66.0,29...|
|        8.0|  183.0|         64.0|          0.0|    0.0|23.3|                   0.672|32.0|    1.0|[8.0,183.0,64.0,0...|
|        1.0|   89.0|         66.0|         23.0|   94.0|28.1|                   0.167|21.0|    0.0|[1.0,89.0,66.0,23...|
|        0.0|  137.0|         40.0|         35.0|  168.0|43.1|                   2.288|33.0|    1.0|[0.0,137.0,40.0,3...|
|        5.0|  116.0|   

In [10]:
# Keep only what the models need: the assembled feature vector and the label
data = output.select(col("features"), col("Outcome"))
data.show()

+--------------------+-------+
|            features|Outcome|
+--------------------+-------+
|[6.0,148.0,72.0,3...|    1.0|
|[1.0,85.0,66.0,29...|    0.0|
|[8.0,183.0,64.0,0...|    1.0|
|[1.0,89.0,66.0,23...|    0.0|
|[0.0,137.0,40.0,3...|    1.0|
|[5.0,116.0,74.0,0...|    0.0|
|[3.0,78.0,50.0,32...|    1.0|
|[10.0,115.0,0.0,0...|    0.0|
|[2.0,197.0,70.0,4...|    1.0|
|[8.0,125.0,96.0,0...|    1.0|
|[4.0,110.0,92.0,0...|    0.0|
|[10.0,168.0,74.0,...|    1.0|
|[10.0,139.0,80.0,...|    0.0|
|[1.0,189.0,60.0,2...|    1.0|
|[5.0,166.0,72.0,1...|    1.0|
|[7.0,100.0,0.0,0....|    1.0|
|[0.0,118.0,84.0,4...|    1.0|
|[7.0,107.0,74.0,0...|    1.0|
|[1.0,103.0,30.0,3...|    0.0|
|[1.0,115.0,70.0,3...|    1.0|
+--------------------+-------+
only showing top 20 rows



In [11]:
# Feature scaling: fit a StandardScaler on the "features" column and write the
# scaled vectors to a new "scaledFeatures" column.
# NOTE(review): the scaler is fitted on the full `data` frame before the
# train/test split later in the notebook, so test rows contribute to the
# scaling statistics. Consider fitting on the training split only.
scaler = StandardScaler().setInputCol("features").setOutputCol("scaledFeatures")
scalerModel = scaler.fit(data)
scaledData = scalerModel.transform(data)

In [12]:
# Preview the frame with the added "scaledFeatures" column next to the raw features
scaledData.show()

+--------------------+-------+--------------------+
|            features|Outcome|      scaledFeatures|
+--------------------+-------+--------------------+
|[6.0,148.0,72.0,3...|    1.0|[1.78063837321943...|
|[1.0,85.0,66.0,29...|    0.0|[0.29677306220323...|
|[8.0,183.0,64.0,0...|    1.0|[2.37418449762590...|
|[1.0,89.0,66.0,23...|    0.0|[0.29677306220323...|
|[0.0,137.0,40.0,3...|    1.0|[0.0,4.2849165233...|
|[5.0,116.0,74.0,0...|    0.0|[1.48386531101619...|
|[3.0,78.0,50.0,32...|    1.0|[0.89031918660971...|
|[10.0,115.0,0.0,0...|    0.0|[2.96773062203238...|
|[2.0,197.0,70.0,4...|    1.0|[0.59354612440647...|
|[8.0,125.0,96.0,0...|    1.0|[2.37418449762590...|
|[4.0,110.0,92.0,0...|    0.0|[1.18709224881295...|
|[10.0,168.0,74.0,...|    1.0|[2.96773062203238...|
|[10.0,139.0,80.0,...|    0.0|[2.96773062203238...|
|[1.0,189.0,60.0,2...|    1.0|[0.29677306220323...|
|[5.0,166.0,72.0,1...|    1.0|[1.48386531101619...|
|[7.0,100.0,0.0,0....|    1.0|[2.07741143542266...|
|[0.0,118.0,

In [13]:
# Modelling frame: scaled feature vector plus the label; the unscaled vector is dropped
final_data = scaledData.select(col("scaledFeatures"), col("Outcome"))
final_data.show()

+--------------------+-------+
|      scaledFeatures|Outcome|
+--------------------+-------+
|[1.78063837321943...|    1.0|
|[0.29677306220323...|    0.0|
|[2.37418449762590...|    1.0|
|[0.29677306220323...|    0.0|
|[0.0,4.2849165233...|    1.0|
|[1.48386531101619...|    0.0|
|[0.89031918660971...|    1.0|
|[2.96773062203238...|    0.0|
|[0.59354612440647...|    1.0|
|[2.37418449762590...|    1.0|
|[1.18709224881295...|    0.0|
|[2.96773062203238...|    1.0|
|[2.96773062203238...|    0.0|
|[0.29677306220323...|    1.0|
|[1.48386531101619...|    1.0|
|[2.07741143542266...|    1.0|
|[0.0,3.6906580274...|    1.0|
|[2.07741143542266...|    1.0|
|[0.29677306220323...|    0.0|
|[0.29677306220323...|    1.0|
+--------------------+-------+
only showing top 20 rows



In [14]:
# Total number of rows available before splitting into train and test
final_data.count()

768

In [15]:
# Split the rows into train and test sets with 80/20 weights.
# randomSplit assigns rows at random, so the resulting sizes are only
# approximately 80/20. A fixed seed keeps the split, and every metric computed
# from it, repeatable across kernel restarts.
train, test = final_data.randomSplit([0.80, 0.20], seed=42)

In [17]:
# Row counts of the train and test splits, displayed together as a tuple
train.count() , test.count()

(626, 142)

In [18]:
# Preview the training split
train.show()

+--------------------+-------+
|      scaledFeatures|Outcome|
+--------------------+-------+
|(8,[0,1,6,7],[0.5...|    0.0|
|(8,[0,1,6,7],[0.5...|    0.0|
|(8,[0,1,6,7],[1.7...|    0.0|
|(8,[0,1,6,7],[2.0...|    0.0|
|(8,[0,1,6,7],[2.9...|    1.0|
|(8,[1,5,6,7],[3.0...|    0.0|
|(8,[1,5,6,7],[3.6...|    0.0|
|(8,[1,5,6,7],[3.7...|    1.0|
|(8,[1,5,6,7],[4.0...|    1.0|
|(8,[1,5,6,7],[4.3...|    1.0|
|(8,[1,5,6,7],[4.4...|    1.0|
|(8,[1,5,6,7],[5.2...|    1.0|
|(8,[1,6,7],[2.940...|    0.0|
|[0.0,2.0955431172...|    0.0|
|[0.0,2.3144804578...|    0.0|
|[0.0,2.4395875096...|    0.0|
|[0.0,2.6272480873...|    0.0|
|[0.0,2.6272480873...|    0.0|
|[0.0,2.6898016132...|    0.0|
|[0.0,2.8461854279...|    0.0|
+--------------------+-------+
only showing top 20 rows



# LogisticRegression

In [19]:
# Configure logistic regression on the scaled features (capped at 40 iterations),
# fit it on the training split, then score the held-out test split.
log_reg = LogisticRegression(
    featuresCol='scaledFeatures',
    labelCol='Outcome',
    maxIter=40,
)
model = log_reg.fit(train)
log_reg_result = model.transform(test)

In [20]:
# Preview the test-set predictions; the fitted model adds rawPrediction, probability and prediction columns
log_reg_result.show()

+--------------------+-------+--------------------+--------------------+----------+
|      scaledFeatures|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[0,1,6,7],[0.8...|    0.0|[4.69011582230359...|[0.99089798547195...|       0.0|
|(8,[1,5,6,7],[2.2...|    0.0|[3.48348612932497...|[0.97021422968870...|       0.0|
|(8,[1,5,6,7],[4.5...|    1.0|[-1.4278041368036...|[0.19344105574367...|       1.0|
|[0.0,1.7827754878...|    0.0|[4.09463786930082...|[0.98361128619422...|       0.0|
|[0.0,2.8461854279...|    0.0|[2.90120478233164...|[0.94790596148790...|       0.0|
|[0.0,2.9087389538...|    0.0|[1.50601663512985...|[0.81847012193783...|       0.0|
|[0.0,3.1589530573...|    0.0|[3.48140054561187...|[0.97015390014310...|       0.0|
|[0.0,3.1902298203...|    0.0|[3.20220091283643...|[0.96091701749334...|       0.0|
|[0.0,3.1902298203...|    0.0|[1.42678053226126...|[0.80639918970758...|    

In [23]:
# Measure accuracy by comparing the "prediction" column with the true "Outcome"
# label on the logistic-regression test results, then report it as a percentage.
prediction_evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='Outcome',
    predictionCol='prediction',
)
score = prediction_evaluator.evaluate(log_reg_result)
print('Score: ', score * 100)

Score:  73.23943661971832


In [None]:
# Decision tree on the same scaled features: fit on the training split and
# produce predictions for the test split.
decision_tree = DecisionTreeClassifier(
    featuresCol='scaledFeatures',
    labelCol='Outcome',
)
treemodel = decision_tree.fit(train)
treemodel_result = treemodel.transform(test)