<a href="https://colab.research.google.com/github/CSpanias/ml_practice/blob/master/DecisionTree_pidiabetes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 
from sklearn.metrics import precision_score, recall_score, f1_score

In [3]:
# load data
from google.colab import files
uploaded = files.upload()

Saving diabetes.csv to diabetes.csv


In [4]:
# make a list of headers for the dataset (*cannot pass them as X later when the
# original headers are retained - it's not a whitespace issue!)
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree',
    'age', 'label']

# read the CSV file, ignore the top row, and use the above headers
df = pd.read_csv('diabetes.csv', skiprows = 1, names=col_names)

# print the first 5 rows of data
df.head()

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# print info about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pregnant  768 non-null    int64  
 1   glucose   768 non-null    int64  
 2   bp        768 non-null    int64  
 3   skin      768 non-null    int64  
 4   insulin   768 non-null    int64  
 5   bmi       768 non-null    float64
 6   pedigree  768 non-null    float64
 7   age       768 non-null    int64  
 8   label     768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [6]:
# check for missing values
df.isna().sum()

pregnant    0
glucose     0
bp          0
skin        0
insulin     0
bmi         0
pedigree    0
age         0
label       0
dtype: int64

In [7]:
# check for duplicates
df.duplicated().sum()

0

In [8]:
# assign predictors/features
X = df[['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree',
        'age']]
# assing target
y = df['label']
# split training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,
                                                    random_state=44,
                                                    shuffle=True)
# select model with default settings
model = DecisionTreeClassifier()
# train the model
model.fit(X_train, y_train)
# predict on the training data
y_pred_train = model.predict(X_train)
# predict on the test data
y_pred = model.predict(X_test)

# evaluate the model
print("Classification Report:\n:", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

print("\nTraining Accuracy:", accuracy_score(y_train, y_pred_train))
print("Testing Accuracy:", round(accuracy_score(y_test, y_pred), 4))


Classification Report:
:               precision    recall  f1-score   support

           0       0.79      0.83      0.81        90
           1       0.75      0.69      0.72        64

    accuracy                           0.77       154
   macro avg       0.77      0.76      0.76       154
weighted avg       0.77      0.77      0.77       154


Confusion Matrix:
 [[75 15]
 [20 44]]

Training Accuracy: 1.0
Testing Accuracy: 0.7727


In [9]:
# choose how many train/test sets we want by "n_splits"
kfold = KFold(n_splits=5, shuffle=True)

# instantiate SVM model with default parameters
model = DecisionTreeClassifier()

# calculate the accuracy score of each of the 5 train/test sets
model_scores = cross_val_score(model, X, y, cv=kfold)

# print cross-validation scores with default model parameters
print('Stratified cross-validation scores with default model parameters:\n'
'    Split 1    Split 2    Split 3    Split 4    Split 5\n', (model_scores))

# print average cross-validation score with default parameteres
print('\nAverage stratified cross-validation score with default model parameters:\n\n',
      round(model_scores.mean(), 4))

Stratified cross-validation scores with default model parameters:
    Split 1    Split 2    Split 3    Split 4    Split 5
 [0.72727273 0.70779221 0.71428571 0.67973856 0.70588235]

Average stratified cross-validation score with default model parameters:

 0.707


In [10]:
# instantiate classifier with default parameters
model = DecisionTreeClassifier()

# declare parameters for hyperparameter tuning
params = [
          {'criterion': ['gini', 'entropy'],
           'splitter': ['best', 'random'],
           'max_depth': [10, 100, 150],
           'min_samples_split': [1, 2, 4, 8],
           'min_samples_leaf': [1, 2, 3],
           'max_features': ['auto', 'sqrt', 'log2'],
           'max_leaf_nodes': [1, 3, 'None']
           }
          ]

# create the grid search object
gs = GridSearchCV(model, params, scoring='accuracy', cv=5, n_jobs=1, verbose=3)

# fit the gs object
gs.fit(X_train, y_train)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[CV 1/5] END criterion=gini, max_depth=100, max_features=sqrt, max_leaf_nodes=1, min_samples_leaf=2, min_samples_split=1, splitter=best;, score=nan total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=100, max_features=sqrt, max_leaf_nodes=1, min_samples_leaf=2, min_samples_split=1, splitter=best;, score=nan total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=100, max_features=sqrt, max_leaf_nodes=1, min_samples_leaf=2, min_samples_split=1, splitter=best;, score=nan total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=100, max_features=sqrt, max_leaf_nodes=1, min_samples_leaf=2, min_samples_split=1, splitter=best;, score=nan total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=100, max_features=sqrt, max_leaf_nodes=1, min_samples_leaf=2, min_samples_split=1, splitter=best;, score=nan total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=100, max_features=sqrt, max_leaf_nodes=1, min_samples_leaf=2,

4860 fits failed out of a total of 6480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1620 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 942, in fit
    X_idx_sorted=X_idx_sorted,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 254, in fit
    % self.min_samples_split
ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

---------------------------------------------------------------

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=1,
             param_grid=[{'criterion': ['gini', 'entropy'],
                          'max_depth': [10, 100, 150],
                          'max_features': ['auto', 'sqrt', 'log2'],
                          'max_leaf_nodes': [1, 3, 'None'],
                          'min_samples_leaf': [1, 2, 3],
                          'min_samples_split': [1, 2, 4, 8],
                          'splitter': ['best', 'random']}],
             scoring='accuracy', verbose=3)

In [13]:
# get the best parameters of the model
print('Parameters that give the best results:', gs.best_params_)

# print estimator that was chosen by the GridSearch
print('\nEstimator that was chosen by the search:', gs.best_estimator_)

# get the model with the mean cross-validation training accuracy
print('\nMean cross-validation training accucary:', (round(gs.best_score_, 4)))

# calculate GridSearch CV score
print('\nGridSearchCV score:', round(gs.score(X_test, y_test), 4))

Parameters that give the best results: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'max_leaf_nodes': 3, 'min_samples_leaf': 3, 'min_samples_split': 4, 'splitter': 'best'}

Estimator that was chosen by the search: DecisionTreeClassifier(max_depth=10, max_features='log2', max_leaf_nodes=3,
                       min_samples_leaf=3, min_samples_split=4)

Mean cross-validation training accucary: 0.7411

GridSearchCV score: 0.7792


In [17]:
model = DecisionTreeClassifier(max_depth=10, max_features='log2', max_leaf_nodes=3,
                       min_samples_leaf=3, min_samples_split=4)
model.fit(X_train, y_train)
y_pred_GS = model.predict(X_test)
print(accuracy_score(y_test, y_pred_GS))

0.7792207792207793
