# Diabetes prediction: gradient boosting model with Spark

## 1. Notebook set-up

In [1]:
import pandas as pd
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

# Attach to the Spark master at spark://0.0.0.0:7077; getOrCreate() hands back
# an already-active session if this kernel has one instead of building another.
spark = (
    SparkSession.builder
    .master('spark://0.0.0.0:7077')
    .appName('GradientBoostingClassifier')
    .getOrCreate()
)

25/03/16 03:49:12 WARN Utils: Your hostname, codespaces-b9e35a resolves to a loopback address: 127.0.0.1; using 10.0.2.237 instead (on interface eth0)
25/03/16 03:49:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/16 03:49:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## 2. Data preparation

### 2.1. Data loading

In [2]:
# Diabetes CSV hosted in the 4GeeksAcademy decision-tree tutorial repository.
url='https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv'

# drop_duplicates() returns a NEW frame, so the cleaned result must be assigned.
# Calling reset_index(inplace=True) on that temporary only mutates the temporary
# and returns None, leaving the loaded frame untouched.
data_df=(
    pd.read_csv(url)
    .drop_duplicates()
    .reset_index(drop=True)
)
data_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### 2.2. Train-test split

In [3]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the
# partition identical from run to run.
training_df, testing_df=train_test_split(data_df, test_size=0.25, random_state=315)

### 2.3. Imputation of zeros

In [4]:
# Columns in which a value of 0 is treated as "missing" and replaced by a
# distance-weighted average of the nearest neighbours.
imputed_features=['Insulin','SkinThickness','BloodPressure','BMI','Glucose']
knn_imputer=KNNImputer(missing_values=0.0, weights='distance')

# Learn the neighbours from the training rows only and impute them in one step,
# then reuse the same fitted imputer on the test rows so that no test
# information is used when fitting.
training_df[imputed_features]=knn_imputer.fit_transform(training_df[imputed_features])
testing_df[imputed_features]=knn_imputer.transform(testing_df[imputed_features])

### 2.4. Convert to Spark dataframe

In [5]:
# Convert both pandas frames (training first, then testing) into Spark
# DataFrames using the active session.
training_sdf, testing_sdf = (
    spark.createDataFrame(pandas_df)
    for pandas_df in (training_df, testing_df)
)

# Preview the converted training rows.
training_sdf.show()

                                                                                

+-----------+------------------+-------------+------------------+------------------+----+------------------------+---+-------+
|Pregnancies|           Glucose|BloodPressure|     SkinThickness|           Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+------------------+-------------+------------------+------------------+----+------------------------+---+-------+
|          4|             110.0|         92.0|30.539537443023107|139.60582970412264|37.6|                   0.191| 30|      0|
|          0|             111.0|         65.0| 20.26110582322676| 145.4968462354687|24.6|                    0.66| 31|      0|
|          9|             122.0|         56.0|31.994558582756465|122.47686213223982|33.3|                   1.114| 33|      1|
|          1|             128.0|         98.0|              41.0|              58.0|32.0|                   1.321| 33|      1|
|          1|             172.0|         68.0|              49.0|             579.0|42.4|                   0.7

### 2.5. Convert features to vector column

In [6]:
# Name of the target column; every other column is used as a model input.
label_name = 'Outcome'
feature_names = [name for name in training_sdf.columns if name != label_name]

# Packs the individual feature columns into one vector column named 'Features'.
assembler = VectorAssembler(
    inputCols=feature_names,
    outputCol='Features'
)

# Keep only the assembled vector and the label. The label column is referenced
# through label_name so the target is defined in exactly one place.
vectorized_training_sdf = assembler.transform(training_sdf).select('Features', label_name)
vectorized_testing_sdf = assembler.transform(testing_sdf).select('Features', label_name)

vectorized_training_sdf.show()

[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+-------+
|            Features|Outcome|
+--------------------+-------+
|[4.0,110.0,92.0,3...|      0|
|[0.0,111.0,65.0,2...|      0|
|[9.0,122.0,56.0,3...|      1|
|[1.0,128.0,98.0,4...|      1|
|[1.0,172.0,68.0,4...|      1|
|[4.0,184.0,78.0,3...|      1|
|[9.0,119.0,80.0,3...|      1|
|[0.0,147.0,85.0,5...|      0|
|[6.0,151.0,62.0,3...|      0|
|[8.0,143.0,66.0,3...|      1|
|[2.0,120.0,54.0,1...|      0|
|[12.0,88.0,74.0,4...|      0|
|[10.0,125.0,70.0,...|      1|
|[1.0,143.13753029...|      0|
|[10.0,179.0,70.0,...|      0|
|[6.0,129.0,90.0,7...|      0|
|[0.0,93.0,60.0,25...|      0|
|[10.0,139.0,80.0,...|      0|
|[8.0,109.0,76.0,3...|      1|
|[4.0,145.0,82.0,1...|      1|
+--------------------+-------+
only showing top 20 rows



                                                                                

## 3. Gradient boosting classifier model

### 3.1. Model training

In [7]:
# Create the gradient boosting classifier.
# The seed is set explicitly (same value as the train-test split's
# random_state) so the estimator does not depend on its default seed and a
# Restart & Run All trains with the same seed every time.
gbt_classifier = GBTClassifier(
    labelCol=label_name,
    featuresCol='Features',
    maxIter=100,
    seed=315
)

# Train on the vectorized training set and keep the fitted model
model = gbt_classifier.fit(vectorized_training_sdf)

                                                                                

### 3.2. Model evaluation

In [10]:
# Apply the fitted model to the held-out test set.
predictions = model.transform(vectorized_testing_sdf)

# Compare the 'prediction' column against the label column. The label is
# referenced through label_name, the same variable the classifier was
# configured with, so training and evaluation cannot drift apart.
evaluator = MulticlassClassificationEvaluator(
    labelCol=label_name,
    predictionCol='prediction',
    metricName='accuracy'
)

# evaluate() returns the metric as a fraction; report it as a percentage.
accuracy = evaluator.evaluate(predictions)
print(f'Test accuracy: {accuracy*100:.1f}%')

Test accuracy: 77.1%


## 4. End SparkSession

In [11]:
# Stop the SparkSession created at the top of the notebook now that the
# analysis is finished.
spark.stop()