# Diabetes prediction: decision tree

## Notebook set-up

In [None]:
# Python standard library imports
import pickle
from pathlib import Path
from itertools import combinations

# PyPI imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeClassifier

# Internal imports
import configuration as config
import functions as funcs

## 1. Data loading

### 1.1. Load data from URL

In [None]:
# Location of the diabetes dataset (CSV)
url = 'https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv'

# Read the CSV straight from the URL into a dataframe
data_df = pd.read_csv(url)

# Remove duplicate rows and renumber the index from zero. The result has to be
# assigned back: drop_duplicates() returns a new dataframe, so chaining
# reset_index(..., inplace=True) onto it only modifies that temporary object
# and leaves data_df unchanged.
data_df = data_df.drop_duplicates().reset_index(drop=True)

### 1.2. Save a local copy

In [None]:
# Your code here


### 1.3. Inspect

In [None]:
# Your code here


## 2. EDA
### 2.1. Data composition

In [None]:
# Names of the feature columns used throughout the EDA
features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# Plot feature distributions with histograms

### 2.2. Feature interactions

#### 2.2.1. Feature cross-correlations

In [None]:
# Draw scatter plots to show the correlation between each pair of features


#### 2.2.2. Feature-label interactions

In [None]:
# Use box plots to show the distribution for each feature in diabetes positive and negative populations


## 3. Data preparation

### 3.1. Test-train split

In [None]:
# Your code here


### 3.2. Imputation of zeros

In [None]:
# Take a close look at zero values in each feature. Is something strange going on?

In [None]:
# Devise and implement a strategy to fill in zeros with reasonable values


## 4. Model training

### 4.1. Baseline

In [None]:
# Set a performance baseline


### 4.2. Decision tree classifier

In [None]:
# Train and evaluate a decision tree


In [None]:
# Score the model with cross-validation on the training data.
# Reference: https://scikit-learn.org/stable/modules/cross_validation.html

# cv=7 splits the training data into seven folds; n_jobs=-1 lets scikit-learn
# use all available processors. 'Outcome' is the label column, every other
# column is passed as a feature.
scores = cross_val_score(
    estimator=model,
    X=training_df.drop(columns='Outcome'),
    y=training_df['Outcome'],
    cv=7,
    n_jobs=-1,
)

# Long-format record of the results: one entry per fold, each tagged with the
# model name and holding that fold's score as a percentage
cross_val_scores = {
    'Model': ['Baseline'] * len(scores),
    'Score': list(scores * 100),
}

# Summarize the folds as mean +/- standard deviation, in percent
print(f'Cross-validation accuracy: {np.mean(scores)*100:.1f} +/- {np.std(scores)*100:.1f}%')

## 4. Hyperparameter optimization

### 4.1. Hyperparameter grid search

In [None]:
# Perform a grid search on some or all of the hyperparameters from the slides


### 4.2. Hyperparameter optimization results

In [None]:
# Evaluate the results


## 5. Evaluation

### 5.1. Model comparison

In [None]:
# Compare the naive to the optimized model


### 5.2. Test set performance

In [None]:
# Evaluate the model on the test set


## 6. Save

### 6.1. Data

In [None]:
# Raw data: make sure the target directory exists, then write the full
# dataframe as parquet to the path set in the project configuration
raw_directory = Path('../data/raw')
raw_directory.mkdir(parents=True, exist_ok=True)
data_df.to_parquet(config.RAW_DATA_FILE)

# Processed data: bundle training_df and testing_df into a single dictionary
# so both can be restored from one pickle file
processed_directory = Path('../data/processed')
processed_directory.mkdir(parents=True, exist_ok=True)
data = {
    'training': training_df,
    'testing': testing_df,
}

with open(config.DATA_FILE, 'wb') as handle:
    pickle.dump(data, handle)

### 6.2. Optimized hyperparameters

In [None]:
# Make sure the models directory exists before anything is written
models_directory = Path('../models')
models_directory.mkdir(parents=True, exist_ok=True)

# Pickle the `hyperparameters` object to the path set in the project configuration
with open(config.DECISION_TREE_HYPERPARAMETERS, 'wb') as handle:
    pickle.dump(hyperparameters, handle)

### 6.3. Model

In [None]:
# Pickle the `model` object to the path set in the project configuration.
# NOTE(review): this cell does not create the target directory itself;
# presumably it already exists by the time this runs -- confirm.
with open(config.DECISION_TREE_MODEL, 'wb') as handle:
    pickle.dump(model, handle)