In [205]:
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

### 1. Get the data, and run the train-test split with the starting code.

In [206]:
# Load the unscaled diabetes data: X holds the feature columns, y the target
X, y = datasets.load_diabetes(
    return_X_y=True,
    as_frame=True,
    scaled=False,
)

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### 2. Running a Multivariate Linear Regression

In [207]:
# Linear regression over all of the original features, fitted on the train split
# (fit() returns the estimator itself, so construction and fitting can be chained)
mv_model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out test rows
y_pred_test = mv_model.predict(X_test)

# Predictions for the rows the model was fitted on
y_pred_train = mv_model.predict(X_train)

### 3. Multivariate Polynomial Regression on the BMI feature

In [208]:
def create_polynomial_model(degree, X_train, X_test, y_train=None):
    """
    Fit a linear regression on a polynomial expansion of the given features.
    Inputs:
        degree: the polynomial degree passed to PolynomialFeatures
        X_train: the training features; the expansion is fitted on these
        X_test: the test features; only transformed with the fitted expansion
        y_train: the training targets that belong to X_train. When omitted,
            the notebook-level `y_train` is used so that existing calls
            (which never passed the target) keep working.
    Returns:
        (poly_model, poly_features, X_train_poly, X_test_poly): the fitted
        LinearRegression, the fitted PolynomialFeatures transformer, and the
        expanded train and test feature matrices.
    """
    if y_train is None:
        # Backward-compatible fallback: earlier versions of this function
        # always read the notebook-level y_train instead of taking it as an
        # argument. Prefer passing y_train explicitly in new code.
        y_train = globals()["y_train"]

    # include_bias=False leaves out the constant column; LinearRegression
    # fits its own intercept by default
    poly_features = PolynomialFeatures(degree=degree, include_bias=False)

    # Fit the expansion on the training data only, then reuse it for the test data
    X_train_poly = poly_features.fit_transform(X_train)
    X_test_poly = poly_features.transform(X_test)

    # Linear model on top of the expanded features
    poly_model = LinearRegression()
    poly_model.fit(X_train_poly, y_train)

    return poly_model, poly_features, X_train_poly, X_test_poly

In [209]:
# Select BMI as a single-column 2-D array (reshape(-1, 1)) for the transformer.
# The raw arrays get their own names so that bmi_X_train / bmi_X_test only ever
# refer to the polynomial-expanded matrices returned below.
bmi_train_raw = X_train['bmi'].values.reshape(-1, 1)
bmi_test_raw = X_test['bmi'].values.reshape(-1, 1)

# Degree-2 polynomial regression using BMI alone
bmi_model, bmi_features, bmi_X_train, bmi_X_test = create_polynomial_model(
    degree=2,
    X_train=bmi_train_raw,
    X_test=bmi_test_raw,
)

In [210]:
# Predictions of the BMI-only polynomial model on the expanded BMI features,
# first for the train split and then for the test split
bmi_y_pred_train, bmi_y_pred_test = (
    bmi_model.predict(expanded) for expanded in (bmi_X_train, bmi_X_test)
)

### 4. Multivariate Polynomial Regression

In [211]:
# Degree-2 polynomial regression over all of the original features
poly_model, poly_features, X_train_poly, X_test_poly = create_polynomial_model(
    degree=2,
    X_train=X_train,
    X_test=X_test,
)

In [212]:
# Predictions of the full polynomial model for the held-out test rows
poly_y_pred_test = poly_model.predict(X_test_poly)

# Predictions of the full polynomial model for the rows it was fitted on
poly_y_pred_train = poly_model.predict(X_train_poly)

### 5. Compare the three models by looking at R-squared, MAPE and MAE

In [213]:
def calculate_metrics(y_test, y_pred_test):
    """
    Print the MAE, MAPE and R-squared of a set of predictions.
    Inputs:
        y_test: the true y values of the test dataset
        y_pred_test: the predicted y values of the test dataset
    Nothing is returned; each metric is printed on its own line, rounded to
    two decimal places.
    """
    mae = mean_absolute_error(y_test, y_pred_test)
    print(f'Mean Absolute Error: {mae:.2f}')

    mape = mean_absolute_percentage_error(y_test, y_pred_test)
    print(f'Mean Absolute Percentage Error: {mape:.2f}')

    r_squared = r2_score(y_test, y_pred_test)
    print(f'R-Squared: {r_squared:.2f}')

In [214]:
# Report the test-set MAE, MAPE and R-squared of the multivariate linear regression model
calculate_metrics(y_test, y_pred_test)

Mean Absolute Error: 42.79
Mean Absolute Percentage Error: 0.37
R-Squared: 0.45


In [215]:
# Report the test-set MAE, MAPE and R-squared of the degree-2 polynomial regression model fitted on the BMI feature only
calculate_metrics(y_test, bmi_y_pred_test)

Mean Absolute Error: 52.38
Mean Absolute Percentage Error: 0.46
R-Squared: 0.23


In [216]:
# Report the test-set MAE, MAPE and R-squared of the degree-2 polynomial regression model fitted on all features
calculate_metrics(y_test, poly_y_pred_test)

Mean Absolute Error: 43.58
Mean Absolute Percentage Error: 0.38
R-Squared: 0.42


**R-Squared** 
<p> Measures the proportion of the variance in the target that the model explains. The closer the value of R-squared
  is to 1, the better the model is at explaining and predicting the observed values. On the other hand, an R-squared value close to 0 suggests that the model explains almost none of the variation in the target, i.e. it does little better than always predicting the mean.</p>

**Mean Absolute Percentage Error (MAPE)** 
<p> Measures the average percentage difference between the predicted and actual values. A lower MAPE indicates better accuracy. </p>

**Mean Absolute Error (MAE)** 
<p> Measures the average absolute difference between the predicted and actual values, in the same units as the target. As with MAPE, a lower MAE indicates better accuracy. </p>

*I would choose the linear regression model because it has a lower MAE and a higher R-Squared on the test set. However, it is also important to mention that the differences in MAE and MAPE between the linear model and the full polynomial model are small, so the polynomial regression model could be used as well.*

### 6.1 How many parameters are we fitting for in linear regression and in polynomial regression?

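<p> In linear regression, the fitted parameters are the coefficients, one for each feature (independent variable) used to predict the target (dependent variable), plus the intercept. The cell below lists the features, so the model fits one coefficient per listed feature and one intercept. </p>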

In [217]:
# feature_names_in_ holds the names of the columns the linear model was fitted on.
# NOTE(review): this counts the input features (one coefficient each); the model was
# built with default arguments, so it presumably also fits an intercept that is not
# included in this count - confirm whether it should be reported as a parameter.
linear_params = mv_model.feature_names_in_
print("Parameters of the linear regression model:\n",linear_params,"\n") 
print("Total length:\n",len(linear_params))

Parameters of the linear regression model:
 ['age' 'sex' 'bmi' 'bp' 's1' 's2' 's3' 's4' 's5' 's6'] 

Total length:
 10


<p> In polynomial regression, the fitted parameters are the coefficients of every term produced by the polynomial expansion of the features (independent variables), plus the intercept. The number of terms depends on the degree: in this case the degree is 2, so the terms are the original features, their squares and every pairwise product, i.e. (f1, f2, ..., fn, f1^2, f1*f2, ..., f1*fn, f2^2, ..., fn^2). </p>

In [218]:
# get_feature_names_out() lists the names of the terms generated by the fitted
# polynomial transformer (original features, squares and pairwise products).
# NOTE(review): this counts the expanded terms (one coefficient each); the intercept
# fitted by the linear model on top of them is presumably not included - confirm.
poly_params = poly_features.get_feature_names_out()
print("Parameters of the polynomial regression model:\n",poly_params,"\n")
print("Total length:\n",len(poly_params))

Parameters of the polynomial regression model:
 ['age' 'sex' 'bmi' 'bp' 's1' 's2' 's3' 's4' 's5' 's6' 'age^2' 'age sex'
 'age bmi' 'age bp' 'age s1' 'age s2' 'age s3' 'age s4' 'age s5' 'age s6'
 'sex^2' 'sex bmi' 'sex bp' 'sex s1' 'sex s2' 'sex s3' 'sex s4' 'sex s5'
 'sex s6' 'bmi^2' 'bmi bp' 'bmi s1' 'bmi s2' 'bmi s3' 'bmi s4' 'bmi s5'
 'bmi s6' 'bp^2' 'bp s1' 'bp s2' 'bp s3' 'bp s4' 'bp s5' 'bp s6' 's1^2'
 's1 s2' 's1 s3' 's1 s4' 's1 s5' 's1 s6' 's2^2' 's2 s3' 's2 s4' 's2 s5'
 's2 s6' 's3^2' 's3 s4' 's3 s5' 's3 s6' 's4^2' 's4 s5' 's4 s6' 's5^2'
 's5 s6' 's6^2'] 

Total length:
 65


### 6.2. Which model would you choose for deployment, and why?

<p> Comparing the test-set metrics, I would choose the linear regression model, because it has a lower MAE, a higher R-Squared and a slightly lower MAPE. It is also the simpler model, using 10 features instead of 65 polynomial terms. However, it is also important to mention that the differences between the two models are small, so the polynomial regression model could be used as well. </p>