# In [3]:
"""
Practical Lab 2: Multivariate Linear Regression, Non-Parametric Models and Cross-Validation

Objective:
To build a model that best predicts the risk of diabetes progression. It will be used as a screening tool to help physicians identify patients at risk.
The models we explore include:
1. Univariate polynomial regression models
2. Multivariate polynomial regression models
3. Decision trees
4. k-nearest neighbors (kNN)

Evaluation Metrics:
- R-squared (R²)
- Mean Absolute Percentage Error (MAPE)
- Mean Absolute Error (MAE)
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

"""
Part 1: Data Exploration and Preprocessing
"""

# Step 1: Load the Diabetes dataset
# Keep the object returned by load_diabetes() and build a single frame that
# holds the feature columns plus a 'target' column.
diabetes = datasets.load_diabetes()
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names).assign(
    target=diabetes.target
)

# Step 2: Exploratory Data Analysis (EDA)
def explore_data(df):
    """Print a textual overview of ``df`` and plot its distributions.

    Prints the head of the frame, its summary statistics and the per-column
    count of missing values, then shows a histogram grid followed by an
    annotated correlation heatmap.

    Args:
        df: DataFrame to inspect.
    """
    # Each report is computed right before it is printed.
    reports = (
        ("Dataset Overview:\n", df.head),
        ("\nSummary Statistics:\n", df.describe),
        ("\nMissing Values:\n", lambda: df.isnull().sum()),
    )
    for heading, build_report in reports:
        print(heading, build_report())

    # Visualizing data distribution
    df.hist(figsize=(12, 10))
    plt.show()

    # Correlation matrix
    plt.figure(figsize=(10, 8))
    correlations = df.corr()
    sns.heatmap(correlations, cmap='coolwarm', annot=True)
    plt.title("Feature Correlation Matrix")
    plt.show()

explore_data(df)

# Step 3: Data Splitting
# Hold out 25% of the rows, then cut that hold-out into validation (40% of it)
# and test (60% of it): roughly 75% train / 10% validation / 15% test overall.
train_data, temp_data = train_test_split(df, random_state=42, test_size=0.25)
val_data, test_data = train_test_split(temp_data, random_state=42, test_size=0.6)

"""
Part 2: Univariate Polynomial Regression
"""

# Step 4: Select BMI as feature
# [['bmi']] keeps the feature as a one-column frame; ['target'] is the label column.
bmi_train_x, bmi_train_y = train_data[['bmi']], train_data['target']
bmi_val_x, bmi_val_y = val_data[['bmi']], val_data['target']
bmi_test_x, bmi_test_y = test_data[['bmi']], test_data['target']

def polynomial_regression(degree):
    """Fit a polynomial regression of the target on BMI and score it.

    The BMI column is expanded with ``PolynomialFeatures(degree)`` fitted on
    the training split, a ``LinearRegression`` is trained on the expanded
    training data, and the fit is scored on the training and validation
    splits. Reads the module-level ``bmi_*`` splits.

    Args:
        degree: Polynomial degree passed to ``PolynomialFeatures``.

    Returns:
        dict with the degree, R2 / MAE / MAPE for the training and validation
        splits, the fitted model (``model``), the fitted feature expander
        (``poly``) and the expanded test features (``test_x_poly``).
    """
    expander = PolynomialFeatures(degree)
    # Fit the expansion on train only; the other splits reuse it.
    expanded_train = expander.fit_transform(bmi_train_x)
    expanded_val = expander.transform(bmi_val_x)
    expanded_test = expander.transform(bmi_test_x)

    regressor = LinearRegression()
    regressor.fit(expanded_train, bmi_train_y)

    fitted_train = regressor.predict(expanded_train)
    fitted_val = regressor.predict(expanded_val)

    # Same three metrics, in the same order, for both splits.
    scorers = (r2_score, mean_absolute_error, mean_absolute_percentage_error)
    train_r2, train_mae, train_mape = [
        score(bmi_train_y, fitted_train) for score in scorers
    ]
    val_r2, val_mae, val_mape = [
        score(bmi_val_y, fitted_val) for score in scorers
    ]

    return {
        "degree": degree,
        "train_r2": train_r2,
        "train_mae": train_mae,
        "train_mape": train_mape,
        "val_r2": val_r2,
        "val_mae": val_mae,
        "val_mape": val_mape,
        "model": regressor,
        "poly": expander,
        "test_x_poly": expanded_test,
    }

# Step 5: Model Evaluation
# Fit one BMI model per polynomial degree (0 through 5) and tabulate the results.
results = [polynomial_regression(d) for d in range(6)]
results_df = pd.DataFrame(results)
# These columns hold fitted objects and a feature matrix, not metrics. They
# stay in results_df for later use but are left out of the printed tables,
# where their reprs would bury the scores.
artifact_columns = ['model', 'poly', 'test_x_poly']
print(results_df.drop(columns=artifact_columns))

# Step 6: Select Best Model
# "Best" is the row with the highest validation R2.
best_model = results_df.loc[results_df['val_r2'].idxmax()]
print("Best Model:\n", best_model.drop(labels=artifact_columns))

# Step 7: Test Best Model
# Score the selected model on the test split, reusing the test features that
# were expanded when that model was fitted.
test_poly = best_model['poly']
test_model = best_model['model']
bmi_test_x_poly = best_model['test_x_poly']
test_pred = test_model.predict(bmi_test_x_poly)

test_r2 = r2_score(bmi_test_y, test_pred)
test_mae = mean_absolute_error(bmi_test_y, test_pred)
test_mape = mean_absolute_percentage_error(bmi_test_y, test_pred)
print(f"Test R2: {test_r2}, Test MAE: {test_mae}, Test MAPE: {test_mape}")

"""
Part 3: Multivariate Models (Polynomial, Decision Tree, kNN)
"""

def train_model(model, train_x, train_y, val_x, val_y):
    """Fit ``model`` on the training split and score it on train and validation.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place and returned in the result.
        train_x: Training features.
        train_y: Training targets.
        val_x: Validation features.
        val_y: Validation targets.

    Returns:
        dict with R2 / MAE / MAPE for the training and validation splits and
        the fitted estimator under ``model``.
    """
    model.fit(train_x, train_y)
    fitted_train = model.predict(train_x)
    fitted_val = model.predict(val_x)

    # Same three metrics, in the same order, for both splits.
    scorers = (r2_score, mean_absolute_error, mean_absolute_percentage_error)
    train_r2, train_mae, train_mape = [
        score(train_y, fitted_train) for score in scorers
    ]
    val_r2, val_mae, val_mape = [
        score(val_y, fitted_val) for score in scorers
    ]

    return {
        "train_r2": train_r2,
        "train_mae": train_mae,
        "train_mape": train_mape,
        "val_r2": val_r2,
        "val_mae": val_mae,
        "val_mape": val_mape,
        "model": model,
    }

# Step 8: Train Multivariate Models
# Use every column except 'target' as a feature.
# NOTE: test_x / test_y are prepared here but not used by the comparison below.
train_x, train_y = train_data.drop(columns=['target']), train_data['target']
val_x, val_y = val_data.drop(columns=['target']), val_data['target']
test_x, test_y = test_data.drop(columns=['target']), test_data['target']

# Degree-2 and degree-3 feature expansions, fitted on the training split and
# applied unchanged to the validation split.
poly2 = PolynomialFeatures(2)
poly3 = PolynomialFeatures(3)
train_x_poly2, val_x_poly2 = poly2.fit_transform(train_x), poly2.transform(val_x)
train_x_poly3, val_x_poly3 = poly3.fit_transform(train_x), poly3.transform(val_x)

poly_results = [
    train_model(LinearRegression(), train_x_poly2, train_y, val_x_poly2, val_y),
    train_model(LinearRegression(), train_x_poly3, train_y, val_x_poly3, val_y)
]

# DecisionTreeRegressor permutes the features at each split, so without a fixed
# seed the fitted tree and its scores can differ between runs. Reuse the seed
# already used for the data split so the comparison is reproducible.
tree_results = [
    train_model(DecisionTreeRegressor(max_depth=3, random_state=42), train_x, train_y, val_x, val_y),
    train_model(DecisionTreeRegressor(max_depth=5, random_state=42), train_x, train_y, val_x, val_y)
]

knn_results = [
    train_model(KNeighborsRegressor(n_neighbors=3), train_x, train_y, val_x, val_y),
    train_model(KNeighborsRegressor(n_neighbors=5), train_x, train_y, val_x, val_y)
]

# Step 9: Final Model Comparison
# One row per model, labelled in the same order the result lists are concatenated.
df_results = pd.DataFrame(poly_results + tree_results + knn_results,
                          index=['Poly-2', 'Poly-3', 'Tree-3', 'Tree-5', 'kNN-3', 'kNN-5'])
print(df_results)


# Output: SyntaxError: invalid character '²' (U+00B2) (581137690.py, line 14)