In [4]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import count
from pyspark.sql.types import StringType , IntegerType , StructField , StructType 

from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.getOrCreate()


In [5]:
df = spark.read.csv('diabetes.csv' ,header = False , inferSchema =True)
df = df.toDF('pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label')

df.show()

+--------+-------+---+----+-------+----+--------+---+-----+
|pregnant|glucose| bp|skin|insulin| bmi|pedigree|age|label|
+--------+-------+---+----+-------+----+--------+---+-----+
|       6|    148| 72|  35|      0|33.6|   0.627| 50|    1|
|       1|     85| 66|  29|      0|26.6|   0.351| 31|    0|
|       8|    183| 64|   0|      0|23.3|   0.672| 32|    1|
|       1|     89| 66|  23|     94|28.1|   0.167| 21|    0|
|       0|    137| 40|  35|    168|43.1|   2.288| 33|    1|
|       5|    116| 74|   0|      0|25.6|   0.201| 30|    0|
|       3|     78| 50|  32|     88|31.0|   0.248| 26|    1|
|      10|    115|  0|   0|      0|35.3|   0.134| 29|    0|
|       2|    197| 70|  45|    543|30.5|   0.158| 53|    1|
|       8|    125| 96|   0|      0| 0.0|   0.232| 54|    1|
|       4|    110| 92|   0|      0|37.6|   0.191| 30|    0|
|      10|    168| 74|   0|      0|38.0|   0.537| 34|    1|
|      10|    139| 80|   0|      0|27.1|   1.441| 57|    0|
|       1|    189| 60|  23|    846|30.1|

In [7]:
df.printSchema()


root
 |-- pregnant: integer (nullable = true)
 |-- glucose: integer (nullable = true)
 |-- bp: integer (nullable = true)
 |-- skin: integer (nullable = true)
 |-- insulin: integer (nullable = true)
 |-- bmi: double (nullable = true)
 |-- pedigree: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- label: integer (nullable = true)



In [9]:
df = df.na.fill(0)


In [17]:
# Split the data into training and test sets (20% held out for testing)
trainDF, testDF = df.randomSplit([.8,.2],seed=42)
print(f"There are {trainDF.count()} rows in the training set, and {testDF.count()} in the test set")

trainX = trainDF.drop("label")
testX = testDF.drop("label")

There are 645 rows in the training set, and 123 in the test set


In [27]:
assembler = VectorAssembler(
    inputCols=['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age'], outputCol="features")
final_df = assembler.transform(df)

vectTrainDF = assembler.transform(trainDF)

final_df.select(['features','label']).show(truncate=False)
final_df.count()

+-------------------------------------------+-----+
|features                                   |label|
+-------------------------------------------+-----+
|[6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0]  |1    |
|[1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0]   |0    |
|[8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0]   |1    |
|[1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0]  |0    |
|[0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0]|1    |
|[5.0,116.0,74.0,0.0,0.0,25.6,0.201,30.0]   |0    |
|[3.0,78.0,50.0,32.0,88.0,31.0,0.248,26.0]  |1    |
|[10.0,115.0,0.0,0.0,0.0,35.3,0.134,29.0]   |0    |
|[2.0,197.0,70.0,45.0,543.0,30.5,0.158,53.0]|1    |
|[8.0,125.0,96.0,0.0,0.0,0.0,0.232,54.0]    |1    |
|[4.0,110.0,92.0,0.0,0.0,37.6,0.191,30.0]   |0    |
|[10.0,168.0,74.0,0.0,0.0,38.0,0.537,34.0]  |1    |
|[10.0,139.0,80.0,0.0,0.0,27.1,1.441,57.0]  |0    |
|[1.0,189.0,60.0,23.0,846.0,30.1,0.398,59.0]|1    |
|[5.0,166.0,72.0,19.0,175.0,25.8,0.587,51.0]|1    |
|[7.0,100.0,0.0,0.0,0.0,30.0,0.484,32.0]    |1    |
|[0.0,118.0,

768

In [28]:
# Train a GBT model.
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)

In [29]:
# Train model.  This also runs the indexers.
model = gbt.fit(vectTrainDF)

# Make predictions.
predictions = model.transform(testDF)

# Select example rows to display.
predictions.select("prediction", "features").show(5)

IllegalArgumentException: features does not exist. Available: pregnant, glucose, bp, skin, insulin, bmi, pedigree, age, label

In [31]:
pipeline = Pipeline(stages=[assembler,gbt])
pipelineModel = pipeline.fit(trainDF)

In [35]:
predDF = pipelineModel.transform(testDF)
predDF.select('features','label','prediction').show(15)

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(8,[1,5,6,7],[73....|    0|       0.0|
|[0.0,84.0,82.0,31...|    0|       0.0|
|[0.0,91.0,68.0,32...|    0|       0.0|
|(8,[1,6,7],[94.0,...|    0|       0.0|
|[0.0,98.0,82.0,15...|    0|       0.0|
|[0.0,101.0,62.0,0...|    0|       0.0|
|[0.0,102.0,75.0,2...|    0|       0.0|
|[0.0,105.0,64.0,4...|    0|       0.0|
|[0.0,111.0,65.0,0...|    0|       0.0|
|[0.0,113.0,76.0,0...|    1|       0.0|
|[0.0,113.0,80.0,1...|    0|       0.0|
|(8,[1,5,6,7],[117...|    0|       1.0|
|[0.0,117.0,80.0,3...|    0|       0.0|
|[0.0,119.0,64.0,1...|    0|       0.0|
|[0.0,124.0,70.0,2...|    1|       1.0|
+--------------------+-----+----------+
only showing top 15 rows



In [41]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predDF)
print("accuracy is : ",accuracy)
print("Test Error = %g" % (1.0 - accuracy))

gbtModel = pipelineModel.stages[1]
print(gbtModel)  # summary only

accuracy is :  0.7479674796747967
Test Error = 0.252033
GBTClassificationModel: uid = GBTClassifier_f0c3319a10e9, numTrees=10, numClasses=2, numFeatures=8
