In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

# Loading the dataset
diabetes = load_diabetes(as_frame=True, scaled=True)

X_train, X_temp, y_train, y_temp = train_test_split(
    diabetes.data, diabetes.target, test_size=0.4, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

y_valid_pred = linear_reg.predict(X_valid)
y_test_pred = linear_reg.predict(X_test)

r2_linear = r2_score(y_valid, y_valid_pred)
mae_linear = mean_absolute_error(y_valid, y_valid_pred)

poly_reg_bmi = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False), LinearRegression()
)

X_train_bmi = X_train.iloc[:, 2].values.reshape(-1, 1)
poly_reg_bmi.fit(X_train_bmi, y_train)

X_valid_bmi = X_valid.iloc[:, 2].values.reshape(-1, 1)

# Prediction on validation
y_valid_pred_poly_bmi = poly_reg_bmi.predict(X_valid_bmi)
y_test_pred_poly_bmi = poly_reg_bmi.predict(X_test.iloc[:, 2].values.reshape(-1, 1))

# Calculating R-squared and MAE
r2_poly_bmi = r2_score(y_valid, y_valid_pred_poly_bmi)
mae_poly_bmi = mean_absolute_error(y_valid, y_valid_pred_poly_bmi)

# Creating an polynomial regression model for all variables
poly_reg_all = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False), LinearRegression()
)

poly_reg_all.fit(X_train, y_train)

# Predict on validation and test sets
y_valid_pred_poly_all = poly_reg_all.predict(X_valid)
y_test_pred_poly_all = poly_reg_all.predict(X_test)

# Calculating R-squared and MAE for polynomial regression on all variables
r2_poly_all = r2_score(y_valid, y_valid_pred_poly_all)
mae_poly_all = mean_absolute_error(y_valid, y_valid_pred_poly_all)

# Calculating MAPE
mape_linear = np.mean(np.abs((y_valid - y_valid_pred) / y_valid)) * 100

# Calculating MAPE for polynomial regression on BMI
mape_poly_bmi = np.mean(np.abs((y_valid - y_valid_pred_poly_bmi) / y_valid)) * 100

# Comparison 
print("Multivariate Linear Regression:")
print(f"R-squared: {r2_linear:.4f}")
print(f"MAE: {mae_linear:.4f}")
print(f"MAPE: {mape_linear:.4f}%\n")

print("Polynomial Regression (2nd Degree) on BMI:")
print(f"R-squared: {r2_poly_bmi:.4f}")
print(f"MAE: {mae_poly_bmi:.4f}")
print(f"MAPE: {mape_poly_bmi:.4f}%\n")

print("Multivariate Polynomial Regression (2nd Degree) on All Variables:")
print(f"R-squared: {r2_poly_all:.4f}")
print(f"MAE: {mae_poly_all:.4f}\n")

n_params_linear = len(linear_reg.coef_)
n_params_poly_bmi = len(poly_reg_bmi.named_steps["linearregression"].coef_)
n_params_poly_all = len(poly_reg_all.named_steps["linearregression"].coef_)

print("Number of Parameters:")
print(f"Multivariate Linear Regression: {n_params_linear}")
print(f"Polynomial Regression (2nd Degree) on BMI: {n_params_poly_bmi}")
print(f"Multivariate Polynomial Regression (2nd Degree) on All Variables: {n_params_poly_all}")

# Choose the best model for deployment
# I'll consider the model with the highest R-squared value (because it fits best for the validation data)
best_model = None
if r2_linear >= r2_poly_bmi and r2_linear >= r2_poly_all:
    best_model = "Multivariate Linear Regression"
elif r2_poly_bmi >= r2_linear and r2_poly_bmi >= r2_poly_all:
    best_model = "Polynomial Regression (2nd Degree) on BMI"
else:
    best_model = "Multivariate Polynomial Regression (2nd Degree) on All Variables"

print(f"Best Model for Deployment: {best_model}")


Multivariate Linear Regression:
R-squared: 0.5810
MAE: 38.2213
MAPE: 34.8018%

Polynomial Regression (2nd Degree) on BMI:
R-squared: 0.3623
MAE: 48.9093
MAPE: 44.2695%

Multivariate Polynomial Regression (2nd Degree) on All Variables:
R-squared: 0.4176
MAE: 47.2770

Number of Parameters:
Multivariate Linear Regression: 10
Polynomial Regression (2nd Degree) on BMI: 2
Multivariate Polynomial Regression (2nd Degree) on All Variables: 65
Best Model for Deployment: Multivariate Linear Regression
