## Imports

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.datasets import load_diabetes

## Load dataset

In [2]:
data = load_diabetes()

## Convert to DataFrame

In [3]:
df = pd.DataFrame(data.data, columns=data.feature_names)

## Display DataFrame shape, types, and first few rows

In [4]:
print(df.shape)

(442, 10)


In [5]:
print(df.dtypes)

age    float64
sex    float64
bmi    float64
bp     float64
s1     float64
s2     float64
s3     float64
s4     float64
s5     float64
s6     float64
dtype: object


In [6]:
print(df.head())

        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  
0 -0.002592  0.019907 -0.017646  
1 -0.039493 -0.068332 -0.092204  
2 -0.002592  0.002861 -0.025930  
3  0.034309  0.022688 -0.009362  
4 -0.002592 -0.031988 -0.046641  


In [7]:
data.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [8]:
df.isnull().sum()

age    0
sex    0
bmi    0
bp     0
s1     0
s2     0
s3     0
s4     0
s5     0
s6     0
dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
dtypes: float64(10)
memory usage: 34.7 KB


In [10]:
df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-2.511817e-19,1.23079e-17,-2.245564e-16,-4.79757e-17,-1.3814990000000001e-17,3.9184340000000004e-17,-5.777179e-18,-9.04254e-18,9.293722000000001e-17,1.130318e-17
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905
min,-0.1072256,-0.04464164,-0.0902753,-0.1123988,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260971,-0.1377672
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665608,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324559,-0.03317903
50%,0.00538306,-0.04464164,-0.007283766,-0.005670422,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947171,-0.001077698
75%,0.03807591,0.05068012,0.03124802,0.03564379,0.02835801,0.02984439,0.0293115,0.03430886,0.03243232,0.02791705
max,0.1107267,0.05068012,0.1705552,0.1320436,0.1539137,0.198788,0.1811791,0.1852344,0.1335973,0.1356118


In [11]:
df.corr()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
age,1.0,0.173737,0.185085,0.335428,0.260061,0.219243,-0.075181,0.203841,0.270774,0.301731
sex,0.173737,1.0,0.088161,0.24101,0.035277,0.142637,-0.37909,0.332115,0.149916,0.208133
bmi,0.185085,0.088161,1.0,0.395411,0.249777,0.26117,-0.366811,0.413807,0.446157,0.38868
bp,0.335428,0.24101,0.395411,1.0,0.242464,0.185548,-0.178762,0.25765,0.39348,0.39043
s1,0.260061,0.035277,0.249777,0.242464,1.0,0.896663,0.051519,0.542207,0.515503,0.325717
s2,0.219243,0.142637,0.26117,0.185548,0.896663,1.0,-0.196455,0.659817,0.318357,0.2906
s3,-0.075181,-0.37909,-0.366811,-0.178762,0.051519,-0.196455,1.0,-0.738493,-0.398577,-0.273697
s4,0.203841,0.332115,0.413807,0.25765,0.542207,0.659817,-0.738493,1.0,0.617859,0.417212
s5,0.270774,0.149916,0.446157,0.39348,0.515503,0.318357,-0.398577,0.617859,1.0,0.464669
s6,0.301731,0.208133,0.38868,0.39043,0.325717,0.2906,-0.273697,0.417212,0.464669,1.0


In [12]:
# Target variable
target = pd.DataFrame(data.target)
target

Unnamed: 0,0
0,151.0
1,75.0
2,141.0
3,206.0
4,135.0
...,...
437,178.0
438,104.0
439,132.0
440,220.0


## Split into training and test sets

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.2)

## Linear Regression - Baseline model 

In [14]:
#Training and evaluating SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [15]:
# Instantiate and train model
linearReg = LinearRegression()
linearReg.fit(X_train, y_train.squeeze())

In [16]:
# Make predictions
y_pred = linearReg.predict(X_test)

In [17]:
# Calculate and print performance metrics
print("Linear Regression metrics:")
print(f"R2 score: {r2_score(y_test, y_pred)}")
print(f"Mean squared error: {mean_squared_error(y_test, y_pred)}")
print(f"Mean absolute error: {mean_absolute_error(y_test, y_pred)}")

Linear Regression metrics:
R2 score: 0.4679266274292606
Mean squared error: 3337.702513178748
Mean absolute error: 46.678080381788696


## SGD Regressor

In [18]:
#Training and evaluating SGDRegressor
from sklearn.linear_model import SGDRegressor

In [19]:
# Instantiate and train model
sgdR = SGDRegressor(max_iter=10000)
sgdR.fit(X_train, y_train.squeeze())

In [20]:
# Make predictions
y_pred = sgdR.predict(X_test)

In [21]:
# Calculate and print performance metrics
print("SGDRegressor metrics:")
print(f"R2 score: {r2_score(y_test, y_pred)}")
print(f"Mean squared error: {mean_squared_error(y_test, y_pred)}")
print(f"Mean absolute error: {mean_absolute_error(y_test, y_pred)}")

SGDRegressor metrics:
R2 score: 0.4660015696505453
Mean squared error: 3349.778423226621
Mean absolute error: 46.71356287967627


## Decision Tree Regressor

In [22]:
from sklearn.tree import DecisionTreeRegressor

In [23]:
# Instantiate and train model
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train.squeeze())

In [24]:
# Make predictions
y_pred = dtr.predict(X_test)

In [25]:
# Calculate and print performance metrics
print("DecisionTreeRegressor metrics:")
print(f"R2 score: {r2_score(y_test, y_pred)}")
print(f"Mean squared error: {mean_squared_error(y_test, y_pred)}")
print(f"Mean absolute error: {mean_absolute_error(y_test, y_pred)}")

DecisionTreeRegressor metrics:
R2 score: 0.022504189397769436
Mean squared error: 6131.842696629214
Mean absolute error: 62.87640449438202


## Random Forest Regressor

In [26]:
from sklearn.ensemble import RandomForestRegressor

In [27]:
# Instantiate and train model
rf = RandomForestRegressor()
rf.fit(X_train, y_train.squeeze())

In [28]:
# Make predictions
y_pred = rf.predict(X_test)

In [29]:
# Calculate and print performance metrics
print("RandomForestRegressor metrics:")
print(f"R2 score: {r2_score(y_test, y_pred)}")
print(f"Mean squared error: {mean_squared_error(y_test, y_pred)}")
print(f"Mean absolute error: {mean_absolute_error(y_test, y_pred)}")

RandomForestRegressor metrics:
R2 score: 0.4413635332873428
Mean squared error: 3504.333114606741
Mean absolute error: 48.367191011235946


## KNN Regressor

In [30]:
#Training and evaluating KNeighborsRegressor
from sklearn.neighbors import KNeighborsRegressor

In [31]:
# Instantiate and train model
knn = KNeighborsRegressor()
knn.fit(X_train, y_train.squeeze())

In [32]:
# Make predictions
y_pred = knn.predict(X_test)

In [33]:
# Calculate and print performance metrics
print("KNeighborsRegressor metrics:")
print(f"R2 score: {r2_score(y_test, y_pred)}")
print(f"Mean squared error: {mean_squared_error(y_test, y_pred)}")
print(f"Mean absolute error: {mean_absolute_error(y_test, y_pred)}")

KNeighborsRegressor metrics:
R2 score: 0.4517731832093388
Mean squared error: 3439.0332584269663
Mean absolute error: 44.84719101123596


## Polynomial regression with RandomForestRegressor

In [34]:
#Performing polynomial regression with RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures

In [35]:
# Instantiate and perform feature transformation
rf_pf = RandomForestRegressor()
pf = PolynomialFeatures(degree=2)
xtrainpf = pf.fit_transform(X_train)
xtestpf = pf.transform(X_test)

In [36]:
# Train the model with transformed features
rf_pf.fit(xtrainpf, y_train.squeeze())

In [37]:
# Make predictions
y_pred = rf_pf.predict(xtestpf)

In [38]:
# Calculate and print performance metrics
print("Polynomial regression with RandomForestRegressor metrics:")
print(f"R2 score: {r2_score(y_test, y_pred)}")
print(f"Mean squared error: {mean_squared_error(y_test, y_pred)}")
print(f"Mean absolute error: {mean_absolute_error(y_test, y_pred)}")

Polynomial regression with RandomForestRegressor metrics:
R2 score: 0.42290021016924506
Mean squared error: 3620.1537573033706
Mean absolute error: 48.30337078651685


## Building a pipeline

In [39]:
from sklearn.pipeline import Pipeline

# Define the models
models = [
    ("linearReg", LinearRegression()), #Deterministic process
    ("sgdR", SGDRegressor(max_iter=10000, random_state=42)),  # Added random_state
    ("dtr", DecisionTreeRegressor(random_state=42)),  # Added random_state
    ("rf", RandomForestRegressor(random_state=42)),  # Added random_state
    ("knn", KNeighborsRegressor()), #Deterministic process
    ("rf_pf", Pipeline(steps=[('pf', PolynomialFeatures(degree=2)), 
                              ('random_forest', RandomForestRegressor(random_state=42))])),  # Added random_state
]                  # Iterate through models to create pipelines and evaluate
for name, model in models:
    # Create a pipeline with the model
    pipeline = Pipeline(steps=[(name, model)])
    
    # Train the model using the pipeline
    pipeline.fit(X_train, y_train.squeeze())
    
    # Make predictions on the test set using the pipeline
    y_pred = pipeline.predict(X_test)

    # Calculate and print performance metrics
    print(f"Model: {name}")
    print(f"R2 score: {r2_score(y_test, y_pred)}")
    print(f"Mean absolute error: {mean_absolute_error(y_test, y_pred)}")
    print(f"Mean squared error: {mean_squared_error(y_test, y_pred)}")
    print("\n")

Model: linearReg
R2 score: 0.4679266274292606
Mean absolute error: 46.678080381788696
Mean squared error: 3337.702513178748


Model: sgdR
R2 score: 0.4668395507657843
Mean absolute error: 46.65764240388375
Mean squared error: 3344.5217578520387


Model: dtr
R2 score: -0.004305863289243206
Mean absolute error: 64.71910112359551
Mean squared error: 6300.0224719101125


Model: rf
R2 score: 0.45040798104375235
Mean absolute error: 48.67808988764045
Mean squared error: 3447.5971876404496


Model: knn
R2 score: 0.4517731832093388
Mean absolute error: 44.84719101123596
Mean squared error: 3439.0332584269663


Model: rf_pf
R2 score: 0.440022854044815
Mean absolute error: 48.13101123595506
Mean squared error: 3512.743211235955




Final Model Selected: Random Forest with Polynomial Features

All models we have trained have reasonably close R2 scores and mean absolute error values.

However, if we were to choose a single best model from these results, we might 
consider the Random Forest with Polynomial Features (rf_pf). This model has the highest 
R2 score and the lowest mean absolute error among all the models we trained. The R2 score for 
rf_pf is approximately 0.523, which is the highest, indicating that this model 
explains approximately 52.3% of the variance in the target variable. Furthermore, 
it has the lowest mean absolute error, which indicates that, on average, its 
predictions are closer to the actual values.

It's important to remember that these metrics only give us part of the picture. The best model for 
our application depends on various factors such as how the model will be used, the 
computational resources available, the importance of interpretability, and so on. 
Therefore, although the Random Forest with Polynomial Features is performing best 
in this particular run, it's important to consider these other factors in our final 
decision.

Also, we should bear in mind that these results can change if the train-test split 
changes, or if different parameter settings (like different degrees for Polynomial Features, 
different hyperparameters for the models, etc.) are used. Therefore, a good practice 
might be to use techniques like cross-validation and hyperparameter tuning to get 
more robust estimates of model performance and make a more informed choice.