In [1]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import sklearn

In [2]:
#loading the iris dataset into a dataframe
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
diabetes_df = pd.DataFrame(data = diabetes.data, columns = diabetes.feature_names)
diabetes_df["target"] = diabetes.target

In [3]:
#checking the first five rows of the data
diabetes_df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [4]:
#checking random rows of the dataframe
diabetes_df.sample(5)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
332,0.030811,-0.044642,0.104809,0.076958,-0.011201,-0.011335,-0.058127,0.034309,0.057104,0.036201,270.0
71,-0.001882,-0.044642,0.033673,0.125158,0.024574,0.026243,-0.010266,-0.002592,0.026714,0.061054,270.0
88,-0.052738,0.05068,-0.040696,-0.067642,-0.03184,-0.037013,0.037595,-0.039493,-0.034524,0.069338,42.0
247,-0.081798,-0.044642,-0.081653,-0.040099,0.002559,-0.018537,0.07073,-0.039493,-0.010904,-0.092204,51.0
290,0.059871,0.05068,0.076786,0.025315,0.001183,0.016849,-0.054446,0.034309,0.029936,0.044485,332.0


In [5]:
#checking the number of rows and columns of the dataframe
diabetes_df.shape

(442, 11)

In [6]:
#defining a function to get information about the data 
def get_info():
    info = pd.DataFrame(index = diabetes_df.columns)
    info["null values"] = diabetes_df.isnull().sum()
    info["duplicates"] = diabetes_df.duplicated().sum()
    info["unique values"] = diabetes_df.nunique()
    info["data type"] = diabetes_df.dtypes
    return info
get_info()

Unnamed: 0,null values,duplicates,unique values,data type
age,0,0,58,float64
sex,0,0,2,float64
bmi,0,0,163,float64
bp,0,0,100,float64
s1,0,0,141,float64
s2,0,0,302,float64
s3,0,0,63,float64
s4,0,0,66,float64
s5,0,0,184,float64
s6,0,0,56,float64


In [7]:
#assigning variables to the data 
X = diabetes_df.drop("target", axis = 1)
y = diabetes_df.target

In [8]:
#importing the train_test library
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [9]:
#importing the model for prediction and training it
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor (n_estimators = 30)
model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=30)

In [10]:
#values for X_test
X_test

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
327,0.074401,-0.044642,0.114509,0.028758,0.024574,0.024991,0.019187,-0.002592,-0.000609,-0.005220
150,0.023546,-0.044642,0.070319,0.025315,-0.034592,-0.014466,-0.032356,-0.002592,-0.019197,-0.009362
414,0.081666,0.050680,0.006728,-0.004523,0.109883,0.117056,-0.032356,0.091875,0.054724,0.007207
349,0.001751,0.050680,-0.057941,-0.043542,-0.096510,-0.047034,-0.098625,0.034309,-0.061177,-0.071494
242,-0.103593,0.050680,-0.023451,-0.022885,-0.086878,-0.067701,-0.017629,-0.039493,-0.078141,-0.071494
...,...,...,...,...,...,...,...,...,...,...
36,0.012648,-0.044642,0.022895,0.052858,0.008063,-0.028558,0.037595,-0.039493,0.054724,-0.025930
218,-0.041840,-0.044642,-0.065486,-0.040099,-0.005697,0.014344,-0.043401,0.034309,0.007027,-0.013504
366,-0.045472,0.050680,0.137143,-0.015999,0.041086,0.031880,-0.043401,0.071210,0.071022,0.048628
190,0.009016,-0.044642,-0.012673,0.028758,-0.018080,-0.005072,-0.047082,0.034309,0.023375,-0.005220


In [11]:
#values for the y_test
y_test

327    237.0
150    288.0
414    131.0
349     88.0
242     71.0
       ...  
36     265.0
218    214.0
366    233.0
190    292.0
350    243.0
Name: target, Length: 111, dtype: float64

In [12]:
#checking the accuracy of the model
model.score(X_test, y_test)

0.5278875997799195

In [13]:
#model prediction
y_predict = model.predict(X_test)
pd.DataFrame({"predicted_values" : y_predict})

Unnamed: 0,predicted_values
0,215.600000
1,178.266667
2,185.766667
3,95.366667
4,108.733333
...,...
106,220.366667
107,133.433333
108,235.833333
109,153.766667


In [14]:
#Evaluating the model using mean squared error
from sklearn.metrics import mean_squared_error
import math
print(mean_squared_error(y_test, y_predict))
print(math.sqrt(mean_squared_error(y_test, y_predict)))

2834.0548348348348
53.23584163732959


In [15]:
#Evaluating the model using the Rsquared and Rsquared adjusted
import statsmodels.api as sm
X = sm.add_constant(X)
result = sm.OLS(y, X).fit()
print(result.rsquared, result.rsquared_adj)

0.5177494254132934 0.506560316954205
