In [1]:
# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# loading the diabetes dataset
diabetes_data = load_diabetes()

In [3]:
# checking for properties of the dataset
dir(diabetes_data)

['DESCR',
 'data',
 'data_filename',
 'feature_names',
 'frame',
 'target',
 'target_filename']

In [4]:
# checking the feature(column) names
diabetes_data.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [5]:
# converting the dataset to a DataFrame
diabetes = pd.DataFrame(diabetes_data.data, columns = diabetes_data.feature_names)
diabetes.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [6]:
# checking the shape of the DataFrame
diabetes.shape

(442, 10)

In [7]:
# adding the dependent variable column to the dataframe
diabetes['target'] = diabetes_data.target
diabetes.sample(5)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
369,-0.009147,-0.044642,0.037984,-0.040099,-0.02496,-0.003819,-0.043401,0.015858,-0.005145,0.027917,167.0
433,-0.02731,-0.044642,-0.060097,-0.029771,0.046589,0.01998,0.122273,-0.039493,-0.051401,-0.009362,72.0
96,0.056239,0.05068,0.009961,0.049415,-0.004321,-0.012274,-0.043401,0.034309,0.060788,0.032059,150.0
98,0.001751,0.05068,-0.005128,-0.012556,-0.015328,-0.01384,0.008142,-0.039493,-0.00608,-0.067351,92.0
55,-0.04184,-0.044642,-0.049318,-0.036656,-0.007073,-0.022608,0.085456,-0.039493,-0.066488,0.007207,128.0


In [8]:
# assigning the independent and dependent variables
x = diabetes.drop('target', axis=1)
y = diabetes[['target']]

In [9]:
# splitting the dataset into test and train data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30)

In [10]:
x

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930


In [11]:
y

Unnamed: 0,target
0,151.0
1,75.0
2,141.0
3,206.0
4,135.0
...,...
437,178.0
438,104.0
439,132.0
440,220.0


In [12]:
# training the model
model = RandomForestClassifier(n_estimators = 20)
model.fit(x_train, y_train)

  model.fit(x_train, y_train)


RandomForestClassifier(n_estimators=20)

In [13]:
# checking the accuracy of the model
model.score(x_test, y_test)

0.015037593984962405

In [14]:
y_predict = model.predict(x_test)

In [15]:
y_predict

array([ 84., 270.,  72., 152.,  96., 113.,  85., 170., 131., 154., 144.,
        72., 336., 160.,  72., 126., 101., 170.,  44., 268., 198., 147.,
        84., 152.,  78.,  91., 225., 138., 103.,  63., 113., 195.,  31.,
        91., 187.,  81.,  77., 220.,  63.,  89.,  88., 163., 143.,  93.,
       219.,  42., 156., 208., 245., 160., 200.,  75., 279.,  55., 258.,
       160., 253., 111.,  65.,  66.,  83., 185., 275.,  88., 198., 118.,
       220.,  66., 272.,  72.,  55., 101.,  72., 170., 111., 139., 279.,
        49., 152.,  96., 144.,  77.,  97., 182., 150., 281.,  53., 225.,
        91., 110.,  47., 152., 336., 198.,  69.,  55.,  91.,  73., 235.,
        52., 310., 138., 230.,  55.,  81., 179.,  84., 152., 137., 102.,
       152., 155., 270., 129., 198.,  83., 275.,  71., 142.,  90.,  99.,
        90.,  81.,  60., 139., 175.,  60.,  55.,  59.,  75., 144.,  72.,
        94.])

In [16]:
con_max = confusion_matrix(y_test, y_predict)
con_max

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
# trying with a random forest regressor
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators = 20)
model.fit(x_train, y_train)

  model.fit(x_train, y_train)


RandomForestRegressor(n_estimators=20)

In [18]:
model.score(x_test, y_test)

0.4207529688345659

In [19]:
# making predictions using test data
y_predict = model.predict(x_test)

In [20]:
y_predict

array([118.55, 267.6 , 116.45, 184.75,  90.65,  74.3 , 245.55, 114.65,
       170.55, 200.6 , 139.1 ,  77.35, 289.1 , 148.65, 106.95, 153.25,
       149.65, 139.4 , 142.05, 189.4 , 257.05, 159.6 , 112.8 , 175.75,
       139.8 , 151.  , 173.15, 159.8 , 126.6 ,  78.95,  95.5 , 195.2 ,
       154.7 , 212.15, 146.3 , 148.55, 224.5 , 207.55,  97.1 , 167.4 ,
       145.4 , 244.55,  94.55, 129.6 , 198.75, 107.05, 115.05, 179.75,
       222.25, 165.65, 195.35,  71.55, 155.55, 101.1 , 265.8 , 130.6 ,
       116.15,  81.9 , 111.  , 210.6 , 123.75, 175.9 , 205.25, 126.85,
       132.55, 157.45, 241.35, 127.  , 213.6 ,  83.95, 155.05,  87.8 ,
       111.5 , 115.6 ,  88.95, 224.5 , 233.8 ,  85.9 , 140.55,  92.3 ,
       137.45, 155.55, 198.6 , 107.85, 241.2 , 186.2 , 141.85, 186.35,
       205.8 , 230.75, 109.5 , 247.85, 263.45, 222.7 ,  82.25, 177.6 ,
       159.65, 186.4 , 239.5 , 164.  , 273.9 , 182.05, 169.95,  74.45,
       122.55, 130.15, 126.5 , 267.45, 187.2 , 156.3 , 190.65, 197.1 ,
      

It seems a confusion matrix will not work here because it is a classification metric so it can't be used with a regression problem