# Diabetes prediction: gradient boosting model with XGBoost & Spark

## 1. Notebook set-up

In [None]:
import pickle

import pandas as pd
from pyspark.sql import SparkSession
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from xgboost.spark import SparkXGBClassifier
from xgboost.spark import SparkXGBRegressor

# Build (or fetch, via getOrCreate) the Spark session used by the rest of
# the notebook, pointing it at the master URL given below
spark=(
    SparkSession.builder
    .master('spark://0.0.0.0:7077')
    .appName('GradientBoostingClassifier')
    .getOrCreate()
)

## 2. Data preparation

### 2.1. Data loading

In [None]:
# CSV hosted in the 4GeeksAcademy decision-tree-project-tutorial repository
url='https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv'
data_df=pd.read_csv(url)

# drop_duplicates() returns a new dataframe, so the result must be assigned
# back; chaining reset_index(inplace=True) onto it only modifies a temporary
# and leaves data_df unchanged
data_df=data_df.drop_duplicates().reset_index(drop=True)

# Preview the first rows of the cleaned data
data_df.head()

### 2.2. Train-test split

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs
training_df, testing_df=train_test_split(data_df, test_size=0.25, random_state=315)

### 2.3. Imputation of zeros

In [None]:
# Features whose zero entries are treated as missing values
# (see missing_values in the imputer below)
imputed_features=['Insulin','SkinThickness','BloodPressure','BMI','Glucose']

# The frames returned by train_test_split are derived from data_df; take
# explicit copies so the column assignments below cannot raise pandas'
# SettingWithCopyWarning or be applied to a temporary object
training_df=training_df.copy()
testing_df=testing_df.copy()

# Fit on the training rows only, then apply the same fitted imputer to both
# splits so the test set never influences the imputation
knn_imputer=KNNImputer(missing_values=0.0, weights='distance')
knn_imputer.fit(training_df[imputed_features])
training_df[imputed_features]=knn_imputer.transform(training_df[imputed_features])
testing_df[imputed_features]=knn_imputer.transform(testing_df[imputed_features])

### 2.4. Convert to Spark dataframe

In [None]:
# Convert both pandas splits into Spark dataframes through the active
# session, training split first
training_sdf, testing_sdf=(
    spark.createDataFrame(pandas_split)
    for pandas_split in (training_df, testing_df)
)

# Display a preview of the converted training data
training_sdf.show()

## 3. XGBoost model

In [None]:
label_name = 'Outcome'

# every column except the label is used as a model input
feature_names = [field.name for field in training_sdf.schema if field.name != label_name]

# The notebook frames this as a classification task (the Spark app is named
# 'GradientBoostingClassifier'), so fit a classifier rather than a regressor.
# NOTE(review): assumes 'Outcome' holds integer class labels (0/1) - confirm.
# No `device` argument is passed, so training uses XGBoost's default device;
# add device="cuda" here only if the workers have GPUs.
classifier = SparkXGBClassifier(
    features_col=feature_names,
    label_col=label_name,
    num_workers=1
)

# train and return the model
model = classifier.fit(training_sdf)

# predict on test data and display a preview of the predictions
predict_df = model.transform(testing_sdf)
predict_df.show()

## 4. End SparkSession

In [None]:
# Stop the SparkSession created during notebook set-up now that all work is done
spark.stop()