# Using the diabetes dataset in sklearn, build a random forest tree model.

In [2]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [3]:
#Loading inbuilt dataset 2: Diabetes
from sklearn.datasets import load_diabetes
db= load_diabetes()

In [4]:
#exploring diabetes dataset
dir(db)

['DESCR',
 'data',
 'data_filename',
 'data_module',
 'feature_names',
 'frame',
 'target',
 'target_filename']

In [5]:
db.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [6]:
db.target[0:15]

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118.])

In [7]:
#converting diabetes dataset to a dataframe
db_df=pd.DataFrame({
    'age':db.data[:,0],
    'sex':db.data[:,1],
    'bmi':db.data[:,2],
    'bp':db.data[:,3],
    's1':db.data[:,4],
    's2':db.data[:,5],
    's3':db.data[:,6],
    's4':db.data[:,7],
    's5':db.data[:,8],
    's6':db.data[:,9],
    'target':db.target
})

In [8]:
#Calling the first 10 rows to visualize
db_df.head(10)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0
5,-0.092695,-0.044642,-0.040696,-0.019442,-0.068991,-0.079288,0.041277,-0.076395,-0.04118,-0.096346,97.0
6,-0.045472,0.05068,-0.047163,-0.015999,-0.040096,-0.0248,0.000779,-0.039493,-0.062913,-0.038357,138.0
7,0.063504,0.05068,-0.001895,0.06663,0.09062,0.108914,0.022869,0.017703,-0.035817,0.003064,63.0
8,0.041708,0.05068,0.061696,-0.040099,-0.013953,0.006202,-0.028674,-0.002592,-0.014956,0.011349,110.0
9,-0.0709,-0.044642,0.039062,-0.033214,-0.012577,-0.034508,-0.024993,-0.002592,0.067736,-0.013504,310.0


In [9]:
#checking for the info of the diabetes dataset to know the data types distribution
db_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
 10  target  442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB


In [10]:
#checking for ten random distribution of the diabetes dataset
db_df.sample(10)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
395,-0.060003,-0.044642,0.001339,-0.029771,-0.007073,-0.021669,0.011824,-0.002592,0.031815,-0.054925,258.0
415,-0.005515,-0.044642,0.008883,-0.050428,0.02595,0.047224,-0.043401,0.07121,0.014823,0.003064,174.0
73,0.012648,0.05068,-0.020218,-0.002228,0.038334,0.053174,-0.006584,0.034309,-0.005145,-0.009362,111.0
312,-0.074533,-0.044642,-0.023451,-0.005671,-0.020832,-0.014153,0.015505,-0.039493,-0.038459,-0.030072,144.0
374,-0.107226,-0.044642,-0.034229,-0.067642,-0.063487,-0.07052,0.008142,-0.039493,-0.000609,-0.079778,140.0
99,-0.001882,-0.044642,-0.064408,0.011544,0.027326,0.037517,-0.013948,0.034309,0.011784,-0.054925,83.0
257,-0.02731,0.05068,-0.055785,0.025315,-0.007073,-0.023547,0.052322,-0.039493,-0.005145,-0.050783,63.0
364,0.001751,0.05068,-0.006206,-0.019442,-0.009825,0.004949,-0.039719,0.034309,0.014823,0.098333,262.0
288,0.070769,0.05068,-0.016984,0.021872,0.043837,0.056305,0.037595,-0.002592,-0.070209,-0.017646,80.0
303,0.074401,-0.044642,0.034751,0.094173,0.057597,0.020293,0.022869,-0.002592,0.073802,-0.021788,236.0


In [11]:
#checking the number of rows and columns of the diabetes dataset
db_df.shape

(442, 11)

In [13]:
#defining a function to further explore the diabetes dataset
def db_info():
    info = pd.DataFrame(index = db_df.columns)
    info["null values"] = db_df.isnull().sum()
    info["duplicate"] = db_df.duplicated().sum()
    info["unique values"] = db_df.nunique()
    info["data type"] = db_df.dtypes
    return info
db_info()

Unnamed: 0,null values,duplicate,unique values,data type
age,0,0,58,float64
sex,0,0,2,float64
bmi,0,0,163,float64
bp,0,0,100,float64
s1,0,0,141,float64
s2,0,0,302,float64
s3,0,0,63,float64
s4,0,0,66,float64
s5,0,0,184,float64
s6,0,0,56,float64


Training and Testing

Importing train_test_split function

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
#assigning feature and target labels 
X= db_df.drop("target", axis = 1)
y= db_df.target

In [22]:
#splitting iris dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3) #80% training and 20% testing.

In [23]:
#importing the model for prediction using Random Forest Regressor 
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor (n_estimators = 100)
model.fit(X_train, y_train)

RandomForestRegressor()

In [24]:
#Calling X_test
X_test

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
44,0.045341,0.050680,0.068163,0.008101,-0.016704,0.004636,-0.076536,0.071210,0.032433,-0.017646
243,0.016281,0.050680,-0.046085,0.011544,-0.033216,-0.016032,-0.010266,-0.002592,-0.043985,-0.042499
258,0.078034,0.050680,-0.024529,-0.042395,0.006687,0.052861,-0.069172,0.080804,-0.037128,0.056912
403,-0.020045,-0.044642,0.097264,-0.005671,-0.005697,-0.023861,-0.021311,-0.002592,0.061686,0.040343
18,-0.038207,-0.044642,-0.010517,-0.036656,-0.037344,-0.019476,-0.028674,-0.002592,-0.018118,-0.017646
...,...,...,...,...,...,...,...,...,...,...
166,-0.056370,0.050680,-0.060097,-0.036656,-0.088254,-0.070833,-0.013948,-0.039493,-0.078141,-0.104630
381,-0.070900,0.050680,-0.089197,-0.074528,-0.042848,-0.025739,-0.032356,-0.002592,-0.012908,-0.054925
7,0.063504,0.050680,-0.001895,0.066630,0.090620,0.108914,0.022869,0.017703,-0.035817,0.003064
390,0.009016,0.050680,0.069241,0.059744,0.017694,-0.023234,-0.047082,0.034309,0.103292,0.073480


In [25]:
#Calling y_test
y_test

44     259.0
243     47.0
258     89.0
403    275.0
18      97.0
       ...  
166     70.0
381    104.0
7       63.0
390    277.0
149    126.0
Name: target, Length: 133, dtype: float64

In [26]:
#checking for accuracy of model
model.score(X_test, y_test)

0.5025946633458601

Prediction of Diabetes dataset

In [28]:
y_predicted_value = model.predict(X_test)
pd.DataFrame ({"Predicted Values" : y_predicted_value})

Unnamed: 0,Predicted Values
0,199.73
1,87.39
2,126.74
3,250.19
4,125.57
...,...
128,95.25
129,117.80
130,133.51
131,266.16


In [31]:
#Evaluation of model using mean squared error
from sklearn.metrics import mean_squared_error
import math
print(mean_squared_error(y_test, y_predicted_value))
print(math.sqrt(mean_squared_error(y_test, y_predicted_value)))

2942.2645661654133
54.24264527256587


In [32]:
#Evaluation of model using the Rsquared and Rsquared adjusted
import statsmodels.api as sm
X = sm.add_constant(X)
result = sm.OLS(y, X).fit()
print(result.rsquared, result.rsquared_adj)

0.5177494254132934 0.506560316954205
