# Diabetes prediction model using a Decision Tree Classifier
## 1. Data download and inspection

### 1.1 Import Modules 

In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn import tree
from notebook_helper_functions import impute
from notebook_helper_functions import plot_scatter_matrix
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFECV
from notebook_helper_functions import plot_cross_validation
from scipy.stats import uniform
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV


### 1.2 Data Download 

In [8]:
# Read the diabetes CSV from the 4GeeksAcademy decision-tree-project-tutorial
# repository on GitHub into a DataFrame, then display it.
data_url = "https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv"
data_df = pd.read_csv(data_url)
data_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [9]:
# Remove duplicate rows and renumber the index from 0.
# drop_duplicates() returns a NEW frame, so the result has to be assigned back:
# calling reset_index(..., inplace=True) on that temporary only mutates the
# temporary and leaves data_df exactly as it was.
data_df = data_df.drop_duplicates().reset_index(drop=True)
data_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


### 1.3 Train-test split 

In [10]:
# Split configuration used by the train/test split below:
#   random_seed      - passed as random_state so the shuffle is repeatable
#   testing_fraction - share of rows held out for testing
random_seed = 42
testing_fraction = 0.33

In [11]:
# Partition data_df into training and testing frames. The held-out share and
# the seed come from the configuration cell, so the split is reproducible.
split_frames = train_test_split(data_df, test_size=testing_fraction, random_state=random_seed)
training_df, testing_df = split_frames

In [12]:
# Preview the training partition; rows still carry their original data_df index labels.
training_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
464,10,115,98,0,0,24.0,1.022,34,0
223,7,142,60,33,190,28.8,0.687,61,0
393,4,116,72,12,87,22.1,0.463,37,0
766,1,126,60,0,0,30.1,0.349,47,1
570,3,78,70,0,0,32.5,0.270,39,0
...,...,...,...,...,...,...,...,...,...
71,5,139,64,35,140,28.6,0.411,26,0
106,1,96,122,0,0,22.4,0.207,27,0
270,10,101,86,37,0,45.6,1.136,38,1
435,0,141,0,0,0,42.4,0.205,29,1


In [13]:
# Preview the testing partition; rows still carry their original data_df index labels.
testing_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
668,6,98,58,33,190,34.0,0.430,43,0
324,2,112,75,32,0,35.7,0.148,21,0
624,2,108,64,0,0,30.8,0.158,21,0
690,8,107,80,0,0,24.6,0.856,34,0
473,7,136,90,0,0,29.9,0.210,50,0
...,...,...,...,...,...,...,...,...,...
311,0,106,70,37,148,39.4,0.605,22,0
429,1,95,82,25,180,35.0,0.233,43,1
182,1,0,74,20,23,27.7,0.299,21,0
586,8,143,66,0,0,34.9,0.129,41,1


In [14]:
# Renumber both partitions from 0, discarding the shuffled index labels
# inherited from data_df (drop=True keeps the old index from becoming a column).
training_df = training_df.reset_index(drop=True)
testing_df = testing_df.reset_index(drop=True)

In [15]:
# Check the training partition after the index reset: labels should now start at 0.
training_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,10,115,98,0,0,24.0,1.022,34,0
1,7,142,60,33,190,28.8,0.687,61,0
2,4,116,72,12,87,22.1,0.463,37,0
3,1,126,60,0,0,30.1,0.349,47,1
4,3,78,70,0,0,32.5,0.270,39,0
...,...,...,...,...,...,...,...,...,...
509,5,139,64,35,140,28.6,0.411,26,0
510,1,96,122,0,0,22.4,0.207,27,0
511,10,101,86,37,0,45.6,1.136,38,1
512,0,141,0,0,0,42.4,0.205,29,1


In [16]:
# Check the testing partition after the index reset: labels should now start at 0.
testing_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98,58,33,190,34.0,0.430,43,0
1,2,112,75,32,0,35.7,0.148,21,0
2,2,108,64,0,0,30.8,0.158,21,0
3,8,107,80,0,0,24.6,0.856,34,0
4,7,136,90,0,0,29.9,0.210,50,0
...,...,...,...,...,...,...,...,...,...
249,0,106,70,37,148,39.4,0.605,22,0
250,1,95,82,25,180,35.0,0.233,43,1
251,1,0,74,20,23,27.7,0.299,21,0
252,8,143,66,0,0,34.9,0.129,41,1


In [17]:
# Training labels: the "Outcome" column of the training partition.
training_labels = training_df.loc[:, "Outcome"]
training_labels




0      0
1      0
2      0
3      1
4      0
      ..
509    0
510    0
511    1
512    1
513    0
Name: Outcome, Length: 514, dtype: int64

In [18]:
# Training features: every column of the training partition except the label.
training_features = training_df.drop(columns=["Outcome"])
training_features

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,10,115,98,0,0,24.0,1.022,34
1,7,142,60,33,190,28.8,0.687,61
2,4,116,72,12,87,22.1,0.463,37
3,1,126,60,0,0,30.1,0.349,47
4,3,78,70,0,0,32.5,0.270,39
...,...,...,...,...,...,...,...,...
509,5,139,64,35,140,28.6,0.411,26
510,1,96,122,0,0,22.4,0.207,27
511,10,101,86,37,0,45.6,1.136,38
512,0,141,0,0,0,42.4,0.205,29
