# Diabetes prediction: gradient boosting

## Notebook set-up

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import randint, uniform, loguniform
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

import configuration as config
import functions as funcs

## 1. Data loading

### 1.1. Load data from disk

Load the pre-processed data from the decision tree notebook:

In [None]:
# Unpickle the dataset dictionary stored at the configured data path
with open(config.DATA_FILE, mode='rb') as input_file:
    dataset = pickle.load(input_file)

# The dictionary holds one dataframe per split, keyed by split name
training_df, testing_df = dataset['training'], dataset['testing']

### 1.2. Inspect

In [None]:
# Your code here


## 2. Model training

### 2.1. Previous scores

In [None]:
# Your code here


### 2.2. Gradient boosting model

In [None]:
# Fit a gradient boosting classifier to the data and check the training accuracy


In [None]:
# Score the gradient boosting classifier with cross-validation


## 3. Hyperparameter optimization

### 3.1. Hyperparameter search

In [None]:
# Optimize the gradient boosting classifier using grid or random search


### 3.2. Hyperparameter optimization results

In [None]:
# Plot the hyperparameter search results with the project's plotting helper.
# NOTE(review): `search_results` is not defined in any visible cell; presumably
# it is produced by the search in the hyperparameter search cell. Confirm the
# type that funcs.plot_cross_validation expects.
funcs.plot_cross_validation(search_results)

### 3.3. Cross-validation of optimized model

In [None]:
# Score the optimized model with cross-validation


## 4. Evaluation

### 4.1. Model comparison

In [None]:
# Compare the performance of the models

### 5.3. Test set performance

#### 5.3.1. Load the previous models

In [None]:
# Collect every model to be compared, keyed by a display name
models = {}

# Logistic regression is fit here on the training data rather than read from disk
models['Logistic regression'] = LogisticRegression(
    class_weight=config.CLASS_WEIGHT,
    max_iter=5000,
).fit(training_df.drop(columns='Outcome'), training_df['Outcome'])

# The tree-based models are unpickled from their configured paths, in order
for saved_name, saved_path in (
    ('Decision tree', config.DECISION_TREE_MODEL),
    ('Random forest', config.RANDOM_FOREST_MODEL),
):
    with open(saved_path, 'rb') as input_file:
        models[saved_name] = pickle.load(input_file)

#### 5.3.2. Gradient boosting model

In [None]:
# Build a fresh gradient boosting classifier from the winning hyperparameters
best_model = GradientBoostingClassifier(**winning_hyperparameters)

# Fit on the whole training set: every column except 'Outcome' is a feature,
# 'Outcome' is the label. `result` keeps the return value of fit().
result = best_model.fit(
    training_df.drop(columns='Outcome'),
    training_df['Outcome'],
)

# Register the retrained model alongside the previously loaded ones
models['Gradient boosting'] = best_model

#### 5.3.3. Evaluate each model

In [None]:
# Compare the performance of all of the optimized models on the test set


## 6. Save

### 6.1. Cross-validation scores

In [None]:
# Pickle `cross_val_scores` to the path configured as CROSS_VAL_SCORES_FILE
with open(config.CROSS_VAL_SCORES_FILE, mode='wb') as scores_file:
    pickle.dump(cross_val_scores, scores_file)

In [None]:
# Pickle the winning hyperparameters to the path configured as
# GRADIENT_BOOSTING_HYPERPARAMETERS
with open(config.GRADIENT_BOOSTING_HYPERPARAMETERS, mode='wb') as hyperparameters_file:
    pickle.dump(winning_hyperparameters, hyperparameters_file)

### 6.2. Model

In [None]:
# Save the winning hyperparameters
# NOTE(review): this writes the same object to the same path as the preceding
# cell; the repeat is harmless, but one of the two writes could be removed.
with open(config.GRADIENT_BOOSTING_HYPERPARAMETERS, 'wb') as output_file:
    pickle.dump(winning_hyperparameters, output_file)

# Save the best model. `best_model` is the classifier retrained on the full
# training set earlier in this notebook; the name `model` is never bound in
# the provided cells, so dumping it would fail or save the wrong estimator.
with open(config.GRADIENT_BOOSTING_MODEL, 'wb') as output_file:
    pickle.dump(best_model, output_file)