In [46]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.calibration import cross_val_predict
from sklearn.metrics import r2_score,mean_absolute_error,mean_absolute_percentage_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.pipeline import make_pipeline

In [47]:
# Load diabetes dataset
X, y = load_diabetes(return_X_y=True)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Performing cross-validation and showing the results:

In [48]:
# Define the degrees of the polynomial
degrees = np.arange(0, 9)

# Lists to store the results
cross_val_scores = []
r2_values = []
mae_values = []
mape_values = []

r2_mean_values = []
mae_mean_values = []
mape_mean_values = []
r2_std_values = []
mae_std_values = []
mape_std_values = []

for degree in degrees:
    # Get the polynomial features
    # Create a pipeline that creates polynomial features, then applies linear regression
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())

    # Calculate the cross-validation score
    score = cross_val_score(model, X_test, y_test, cv=5).mean()
    cross_val_scores.append(score)
    
    # Calculate the R-Squared and MAE metrics
    y_pred = cross_val_predict(model, X, y, cv=5)
    r2 = r2_score(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    mape = mean_absolute_percentage_error(y, y_pred)
    
    r2_values.append(r2)
    mae_values.append(mae)
    mape_values.append(mape)

    # Add results for each degree to the lists
    r2_mean_values.append(np.mean(r2_values))
    mae_mean_values.append(np.mean(mae_values))
    mape_mean_values.append(np.mean(mape_values))
    r2_std_values.append(np.std(r2_values))
    mae_std_values.append(np.std(mae_values))
    mape_std_values.append(np.std(mape_values))

# Create a DataFrame for the results
cv_results_df = pd.DataFrame({
    'Degree': degrees,
    'Cross validation scores': cross_val_scores,
    'R-Squared': r2_values,
    'MAE': mae_values,
    'MAPE': mape_values,
    'R-Squared_mean': r2_mean_values,
    'MAE_mean': mae_mean_values,
    'MAPE_mean': mape_mean_values,
    'R-Squared_std': r2_std_values,
    'MAE_std': mae_std_values,
    'MAPE_std': mape_std_values
})

print(cv_results_df)


   Degree  Cross validation scores   R-Squared         MAE      MAPE   
0       0                -0.139139   -0.008824   66.039250  0.623684  \
1       1                 0.225655    0.495322   44.274856  0.394893   
2       2               -84.353812    0.410853   46.602887  0.402750   
3       3                -6.180406 -172.927827  340.295172  2.338418   
4       4                -6.160489  -71.859940  303.102402  2.453773   
5       5                -6.160743  -68.544073  295.638158  2.405314   
6       6                -6.164401  -68.611760  295.585132  2.405045   
7       7                -6.161375  -68.611367  295.582727  2.405035   
8       8                -6.147353  -68.609803  295.558540  2.404842   

   R-Squared_mean    MAE_mean  MAPE_mean  R-Squared_std     MAE_std  MAPE_std  
0       -0.008824   66.039250   0.623684       0.000000    0.000000  0.000000  
1        0.243249   55.157053   0.509289       0.252073   10.882197  0.114395  
2        0.299117   52.305664   0.47377

### Identification of the Best Model based on the Cross-Validation Results:

In [49]:
# Choose the degree with the highest cross-validation score
best_degree = degrees[np.argmax(cross_val_scores)]
print(f"The best degree is {best_degree}")

The best degree is 1


### Identification of the Best Model based on the R-squared, MAE, and MAPE metrics:

In [50]:
# Identify the best model based on R-Squared
best_degree_r_squared = degrees[np.argmax(r2_values)]
print(f"The best degree of R-Squared is {best_degree_r_squared}")

# Identify the best model based on MAE
best_degree_mae = degrees[np.argmin(mae_values)]
print(f"The best degree of MAE is {best_degree_mae}")

# Identify the best model based on MAPE
best_degree_mape = degrees[np.argmin(mape_values)]
print(f"The best degree of MAPE is {best_degree_mape}")

The best degree of R-Squared is 1
The best degree of MAE is 1
The best degree of MAPE is 1
