# Diabetes prediction: gradient boosting

## Notebook set-up

In [5]:
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

import configuration as config
import functions as funcs

## 1. Data preparation

### 1.1. Load data from disk

In [6]:
# Read the pickled dataset from the path configured in config.DATA_FILE.
# NOTE(review): pickle.load executes arbitrary code from the file; only use it
# on data files produced by this project.
with open(config.DATA_FILE, 'rb') as input_file:
    dataset = pickle.load(input_file)

# The loaded object is indexed by split name; pull out both splits at once.
training_df, testing_df = dataset['training'], dataset['testing']

### 1.2. Inspect

In [7]:
# Preview the first five rows of the training split.
training_df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
717,10,94.0,72.0,18.0,,23.1,0.595,56,0
64,7,114.0,66.0,,,32.8,0.258,42,1
510,12,84.0,72.0,31.0,,29.7,0.297,46,1
104,2,85.0,65.0,,,39.6,0.93,27,0
657,1,120.0,80.0,48.0,200.0,38.9,1.162,41,0


## 2. Baseline model

In [None]:
# Baseline: histogram-based gradient boosting with default hyperparameters,
# fit on the training split and scored on the testing split.
#
# The feature/label splits are built here so the cell runs on a fresh kernel
# (no earlier cell defines X_train / y_train / X_test / y_test).
# Assumes testing_df carries the same 'Outcome' label column as training_df
# - TODO confirm against the data preparation notebook.
X_train = training_df.drop(columns="Outcome")
y_train = training_df["Outcome"]
X_test = testing_df.drop(columns="Outcome")
y_test = testing_df["Outcome"]

hgb = HistGradientBoostingClassifier(random_state=42)  # accepts NaNs
hgb.fit(X_train, y_train)
y_pred = hgb.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.75


In [20]:
# Cross-validate a median-imputation + gradient boosting pipeline on the
# training split. The imputer lives inside the pipeline, so cross_validate
# re-fits it on the training portion of every fold.
X = training_df.drop(columns="Outcome")
y = training_df["Outcome"]

# Show the five features with the most missing values.
print(X.isna().sum().sort_values(ascending=False).head())

pipe = make_pipeline(
    SimpleImputer(strategy="median"),          # fills NaNs
    GradientBoostingClassifier(random_state=42),
)

# Stratified, shuffled folds with a fixed seed keep the class balance per fold
# and make the scores reproducible.
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

scores = cross_validate(
    pipe, X, y,
    cv=cv,
    scoring={"accuracy": "accuracy", "f1": "f1"},
    n_jobs=-1,
    return_train_score=False,
)

# Report every metric that was requested in `scoring`; previously F1 was
# computed but never shown.
print(f"CV Accuracy: {np.mean(scores['test_accuracy']):.3f} ± {np.std(scores['test_accuracy']):.3f}")
print(f"CV F1: {np.mean(scores['test_f1']):.3f} ± {np.std(scores['test_f1']):.3f}")

Insulin          193
SkinThickness    122
BloodPressure     20
BMI                7
Glucose            3
dtype: int64
CV Accuracy: 0.761 ± 0.025


## 3. Hyperparameter optimization

### 3.1. Hyperparameter grid search

In [None]:
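# TODO: your code here. Run the hyperparameter grid search and define
# `search_results`, which the cell in section 3.2 passes to
# funcs.plot_cross_validation().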

### 3.2. Hyperparameter optimization results

In [None]:
# Hand the search results to the project's plot_cross_validation helper
# (helper not shown here; presumably it plots the cross-validation scores).
# NOTE(review): `search_results` is not defined in any cell shown above; it is
# presumably produced by the grid search in section 3.1 - confirm before running.
funcs.plot_cross_validation(search_results)

### 3.3. Cross-validation of optimized model

In [None]:
# Your code here...

## 4. Evaluation

### 4.1. Model comparison

In [None]:
# This time, compare the optimized version of all three models: decision tree,
# random forest and gradient boosting. You might also want to throw in a
# logistic regression as a standard of comparison.
#
# TODO: your code here. It must define `cross_val_scores` as a dict with a
# 'Model' key and a 'Score' key holding equal-length lists (one entry per
# model/fold score), because the plot below reads those two columns.

scores_df = pd.DataFrame.from_dict(cross_val_scores)

# Pass the frame via `data=`: older seaborn releases bind a positional first
# argument to `x`, which would clash with x='Model'.
ax = sns.boxplot(data=scores_df, x='Model', y='Score')
ax.set_title('Cross-validation scores by model')
plt.show()

### 4.2. Test set performance

In [None]:
# Evaluate the test set performance of all three models
# Your code here...

## 5. Save

### 5.1. Optimized hyperparameters

In [None]:
# Persist the best hyperparameter combination to the configured path.
# NOTE(review): `hyperparameters` is not defined in any cell shown above; it is
# presumably the fitted search object from section 3 (the plotting cell uses
# `search_results`) - confirm the name before running.
#
# Serialize before opening the file: opening with 'wb' truncates immediately,
# so a NameError or pickling failure inside the `with` block would wipe a
# previously saved file.
hyperparameters_bytes = pickle.dumps(hyperparameters.best_params_)

with open(config.GRADIENT_BOOSTING_HYPERPARAMETERS, 'wb') as output_file:
    output_file.write(hyperparameters_bytes)

### 5.2. Model

In [None]:
# Persist the model object to the configured path.
# NOTE(review): `model` is not defined in any cell shown above; presumably it is
# the optimized gradient boosting estimator fitted in section 3 or 4 - confirm
# before running.
#
# Serialize before opening the file: opening with 'wb' truncates immediately,
# so a NameError or pickling failure inside the `with` block would wipe a
# previously saved model.
model_bytes = pickle.dumps(model)

with open(config.GRADIENT_BOOSTING_MODEL, 'wb') as output_file:
    output_file.write(model_bytes)