In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

In [2]:
dataset = load_diabetes()

In [10]:
#Create data frame for the given values
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [4]:
#This will tell us the total number of non null observations present including the total number of entries. Once number of entries isn’t equal to number of non null observations, we can begin to suspect missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
dtypes: float64(10)
memory usage: 34.7 KB


In [5]:
#This will tell us the total number of NaN in or data.
df.isnull().sum()

age    0
sex    0
bmi    0
bp     0
s1     0
s2     0
s3     0
s4     0
s5     0
s6     0
dtype: int64

In [12]:
from sklearn.preprocessing import StandardScaler 
array = df.values
bparr = array[0:5,3:4]
print("BP array:\n",bparr)

BP array:
 [[ 0.02]
 [-0.03]
 [-0.01]
 [-0.04]
 [ 0.02]]


In [7]:
hdlarr = array[0:5,6:7]
print("HDL array: \n",hdlarr)

In [8]:
scaler = StandardScaler()
rescaledBP = scaler.fit_transform(bparr) 
np.set_printoptions(precision=2) 


In [13]:
rescaledhdl = scaler.fit_transform(hdlarr) 
np.set_printoptions(precision=2) 
print("Rescaled HDL:\n",rescaledhdl[0:5,:])

Rescaled HDL:
 [[-0.91]
 [ 1.56]
 [-0.68]
 [-0.76]
 [ 0.17]]


In [15]:
x_train, x_test, y_train, y_test = train_test_split(rescaledBP,rescaledhdl,test_size=0.3,random_state=42)
print("BP test:\n",x_test[0:10])

BP test:
 [[-0.34]
 [ 0.46]
 [-0.26]
 [ 1.67]
 [-0.05]
 [-0.55]
 [ 0.32]
 [ 0.46]
 [-1.2 ]
 [-0.41]]


In [16]:
print("HDL test:\n",y_test[0:10])

HDL test:
 [[ 0.4 ]
 [ 0.02]
 [ 1.18]
 [-1.61]
 [-0.14]
 [ 1.49]
 [-0.45]
 [ 0.02]
 [-0.14]
 [ 0.4 ]]


In [17]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

x_train, x_test, y_train, y_test = train_test_split(rescaledBP,rescaledhdl,test_size=0.3,random_state=42)

In [18]:
mlp = MLPRegressor(hidden_layer_sizes=(10,10,10), activation='relu', max_iter=1000)
mlp.fit(x_train,y_train)

predict_train = mlp.predict(x_train)
predict_test = mlp.predict(x_test)

print("Predict train: \n",predict_train[0:10])
print("Predict test: \n",predict_test[0:10])

Predict train: 
 [-0.2  -0.2  -0.09 -0.2   0.11  0.18 -0.11  0.08 -0.13  0.07]
Predict test: 
 [ 0.11 -0.07  0.1  -0.19  0.07  0.1  -0.03 -0.07  0.11  0.11]


  y = column_or_1d(y, warn=True)


In [19]:
from sklearn.metrics import mean_absolute_error
print("Mean absolute error for training samples ",mean_absolute_error(predict_train,y_train))
print("Mean absolute error for testing samples ",mean_absolute_error(predict_test, y_test))

Mean absolute error for training samples  0.7570299931206426
Mean absolute error for testing samples  0.7903444141844985


In [None]:
from sklearn.metrics import mean_squared_error
print("Mean squared error for training samples ",mean_squared_error(predict_train,y_train))
print("Mean squared error for testing samples ",mean_squared_error(predict_test, y_test))

Mean squared error for training samples  0.9546008281348414
Mean squared error for testing samples  0.9452076921968656


In [None]:
from sklearn.metrics import r2_score
print("r2 score for training samples",r2_score(predict_train,y_train))
print("r2 score for testing samples",r2_score(predict_test, y_test))

r2 score for training samples -37.286606902149835
r2 score for testing samples -32.8990211141563


In [None]:
x_train, x_test, y_train, y_test = train_test_split(rescaledBP,rescaledhdl,test_size=0.50,random_state=42)
mlp = MLPRegressor(hidden_layer_sizes=(10,10,10), activation='relu', max_iter=1000)
mlp.fit(x_train,y_train)
predict_train = mlp.predict(x_train)
predict_test = mlp.predict(x_test)

  y = column_or_1d(y, warn=True)


In [None]:
from sklearn.metrics import mean_absolute_error
print("Mean absolute error for training samples ",mean_absolute_error(predict_train,y_train))
print("Mean absolute error for testing samples ",mean_absolute_error(predict_test, y_test))
from sklearn.metrics import mean_squared_error
print("Mean squared error for training samples ",mean_squared_error(predict_train,y_train))
print("Mean squared error for testing samples ",mean_squared_error(predict_test, y_test))
from sklearn.metrics import r2_score
print("r2 score for training samples",r2_score(predict_train,y_train))
print("r2 score for testing samples",r2_score(predict_test, y_test))

Mean absolute error for training samples  0.7786924365571825
Mean absolute error for testing samples  0.7736004453846547
Mean squared error for training samples  1.0140401257801852
Mean squared error for testing samples  0.9197977682787534
r2 score for training samples -31.82549936095471
r2 score for testing samples -29.638265952071233


In [None]:
x_train, x_test, y_train, y_test = train_test_split(rescaledBP,rescaledhdl,test_size=0.75,random_state=42)
mlp = MLPRegressor(hidden_layer_sizes=(10,10,10), activation='relu',max_iter=1000)
mlp.fit(x_train,y_train)
predict_train = mlp.predict(x_train)
predict_test = mlp.predict(x_test)

  y = column_or_1d(y, warn=True)


In [None]:
from sklearn.metrics import mean_absolute_error
print("Mean absolute error for training samples ",mean_absolute_error(predict_train,y_train))
print("Mean absolute error for testing samples ",mean_absolute_error(predict_test, y_test))
from sklearn.metrics import mean_squared_error
print("Mean squared error for training samples ",mean_squared_error(predict_train,y_train))
print("Mean squared error for testing samples ",mean_squared_error(predict_test, y_test))
from sklearn.metrics import r2_score
print("r2 score for training samples",r2_score(predict_train,y_train))
print("r2 score for testing samples",r2_score(predict_test, y_test))

Mean absolute error for training samples  0.7445598563208751
Mean absolute error for testing samples  0.8468949093357337
Mean squared error for training samples  0.9297458056467313
Mean squared error for testing samples  1.0900236688007257
r2 score for training samples -5.752192553058887
r2 score for testing samples -4.898403147742305


The most optimal sample is the 50% testing samples where the errors are minimum when compared to 30% and 75% 
samples. 