<a href="https://colab.research.google.com/github/BitanGh/udemy/blob/main/Diabetes_Prediction_using_DecisionTree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
#Dataset loading
from sklearn.datasets import load_diabetes
dataset = load_diabetes()
print(dataset.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442

:Number of Attributes: First 10 columns are numeric predictive values

:Target: Column 11 is a quantitative measure of disease progression one year after baseline

:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

Note: Each of these 10 feature variables have bee

In [7]:
df = pd.DataFrame(dataset.data, columns = dataset.feature_names)
df.head()


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [8]:
X = df
y = dataset.target
X,y

(          age       sex       bmi        bp        s1        s2        s3  \
 0    0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
 1   -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
 2    0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
 3   -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
 4    0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   
 ..        ...       ...       ...       ...       ...       ...       ...   
 437  0.041708  0.050680  0.019662  0.059744 -0.005697 -0.002566 -0.028674   
 438 -0.005515  0.050680 -0.015906 -0.067642  0.049341  0.079165 -0.028674   
 439  0.041708  0.050680 -0.015906  0.017293 -0.037344 -0.013840 -0.024993   
 440 -0.045472 -0.044642  0.039062  0.001215  0.016318  0.015283 -0.028674   
 441 -0.045472 -0.044642 -0.073030 -0.081413  0.083740  0.027809  0.173816   
 
            s4        s5        s6  
 0   -0.002592  0.019907 

In [9]:
#train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((353, 10), (89, 10), (353,), (89,))

In [10]:
#Decision tree regressior
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

In [13]:
#Hyperparameter Tuning
param = {
    'criterion':['squared_error', 'absolute_error', 'poisson', 'friedman_mse' ],
    'splitter':['best', 'random'],
    'max_depth':[1,2,3,4,5,6,7,8,9,10],


}
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(dt, param, cv=5)
grid.fit(X_train, y_train)

In [14]:
grid.best_params_

{'criterion': 'poisson', 'max_depth': 2, 'splitter': 'best'}

In [15]:
#Prediction
y_pred = grid.predict(X_test)
y_pred

array([156.75342466, 191.10169492, 156.75342466, 191.10169492,
        97.26470588,  97.26470588, 271.07692308, 191.10169492,
       156.75342466, 191.10169492,  97.26470588, 156.75342466,
        97.26470588, 191.10169492,  97.26470588, 191.10169492,
       191.10169492, 271.07692308, 191.10169492, 191.10169492,
       191.10169492,  97.26470588,  97.26470588, 191.10169492,
       191.10169492, 191.10169492, 191.10169492,  97.26470588,
        97.26470588,  97.26470588, 271.07692308,  97.26470588,
       191.10169492, 156.75342466, 191.10169492, 271.07692308,
       156.75342466, 156.75342466, 156.75342466,  97.26470588,
        97.26470588,  97.26470588, 156.75342466, 156.75342466,
       156.75342466,  97.26470588,  97.26470588,  97.26470588,
        97.26470588, 156.75342466, 191.10169492,  97.26470588,
       191.10169492,  97.26470588, 156.75342466, 191.10169492,
        97.26470588, 191.10169492,  97.26470588,  97.26470588,
       191.10169492, 191.10169492, 191.10169492,  97.26

In [17]:
#Accuracy
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)



0.2770138039725839