In [1]:
# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# loading the diabetes dataset
diabetes_data = load_diabetes()

In [3]:
# checking for properties of the dataset
dir(diabetes_data)

['DESCR',
 'data',
 'data_filename',
 'feature_names',
 'frame',
 'target',
 'target_filename']

In [4]:
# checking the feature(column) names
diabetes_data.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [5]:
# converting the dataset to a DataFrame
diabetes = pd.DataFrame(diabetes_data.data, columns = diabetes_data.feature_names)
diabetes.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [6]:
# checking the shape of the DataFrame
diabetes.shape

(442, 10)

In [7]:
# adding the dependent variable column to the dataframe
diabetes['target'] = diabetes_data.target
diabetes.sample(5)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
78,0.005383,-0.044642,-0.057941,-0.022885,-0.067615,-0.068328,-0.054446,-0.002592,0.042896,-0.08392,252.0
49,-0.04184,0.05068,0.014272,-0.005671,-0.012577,0.006202,-0.072854,0.07121,0.035462,-0.013504,142.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
236,0.027178,-0.044642,0.006728,0.035644,0.079612,0.07071,0.015505,0.034309,0.040672,0.011349,67.0
313,0.059871,0.05068,0.053074,0.052858,0.03283,0.019667,-0.010266,0.034309,0.055205,-0.001078,163.0


In [8]:
# assigning the independent and dependent variables
x = diabetes.drop('target', axis=1)
y = diabetes[['target']]

In [9]:
# splitting the dataset into test and train data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30)

In [10]:
x

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930


In [11]:
y

Unnamed: 0,target
0,151.0
1,75.0
2,141.0
3,206.0
4,135.0
...,...
437,178.0
438,104.0
439,132.0
440,220.0


In [12]:
# training the model
model = RandomForestClassifier(n_estimators = 20)
model.fit(x_train, y_train)

  model.fit(x_train, y_train)


RandomForestClassifier(n_estimators=20)

In [13]:
# checking the accuracy of the model
model.score(x_test, y_test)

0.007518796992481203

In [14]:
y_predict = model.predict(x_test)

In [15]:
y_predict

array([ 72.,  97.,  65.,  78., 124., 187.,  45.,  31., 220.,  61., 137.,
       202., 292., 140.,  97.,  48., 111.,  45.,  97., 164., 122., 279.,
       128.,  99., 128., 102., 102., 200., 128., 164., 170., 142.,  60.,
        85., 209.,  47., 108., 116., 128.,  47., 174., 101., 151.,  84.,
       150., 126.,  61.,  96.,  97.,  68., 170.,  49.,  71.,  66., 332.,
        52., 202., 200., 202.,  68., 275.,  63., 268.,  70.,  94.,  60.,
       179.,  77.,  44.,  85., 274.,  65.,  52.,  68., 137.,  53.,  45.,
        68., 306.,  65., 147.,  72.,  84.,  53., 295., 109.,  61., 235.,
       118., 102., 151., 258.,  39., 128., 127.,  90., 121., 122., 128.,
        65., 336.,  84.,  81.,  96.,  64.,  64., 270.,  97.,  77., 248.,
       181., 183., 122., 121., 302., 118., 166.,  99., 140., 258., 248.,
        45., 110.,  47., 219., 187.,  55.,  64., 118., 166., 182.,  50.,
        68.])

In [16]:
con_max = confusion_matrix(y_test, y_predict)
con_max

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [34]:
# trying with a random forest regressor
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators = 20)
model.fit(x_train, y_train)

  model.fit(x_train, y_train)


RandomForestRegressor(n_estimators=20)

In [35]:
model.score(x_test, y_test)

0.3985848349812059

In [19]:
# making predictions using test data
y_predict = model.predict(x_test)

In [20]:
y_predict

array([ 87.8  ,  82.725, 104.1  , 216.825, 181.1  , 149.375, 125.75 ,
       130.475, 205.875, 142.475,  87.85 , 160.425, 143.55 , 148.525,
        99.225,  87.275, 147.325, 146.85 , 105.45 , 166.975, 166.275,
       150.425, 168.375,  91.1  , 200.025, 104.7  ,  99.7  , 119.35 ,
       183.05 , 139.225, 156.975, 206.625, 188.9  ,  99.475, 220.425,
       110.8  , 211.8  , 120.225, 205.05 ,  97.6  , 192.95 , 124.05 ,
       212.55 , 118.025, 129.35 , 142.825, 102.2  , 107.65 ,  93.325,
       159.825, 164.875, 163.975, 118.85 , 162.6  , 279.6  ,  74.325,
       120.75 , 116.45 , 134.3  , 135.775, 197.5  , 115.5  , 195.075,
       161.075, 149.45 , 192.875, 109.3  , 135.175, 179.65 ,  91.85 ,
       254.175, 125.5  , 172.725, 211.2  , 128.225, 117.225,  89.475,
       203.4  , 272.875,  91.   , 112.775,  95.9  , 100.025, 102.775,
       275.8  , 127.775, 137.25 , 194.5  ,  74.825, 122.775, 234.   ,
       147.7  , 172.9  , 173.975, 170.2  , 133.725, 130.575, 139.7  ,
       173.5  , 109.

It seems a confusion matrix will not work here because it is a classification metric so it can't be used with a regression problem