In [None]:
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import clear_output, display, HTML
from ipywidgets import Button, Checkbox, Dropdown, HBox, IntSlider, Layout, Output, VBox, interact
from sklearn import datasets
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

We will be working with real breast cancer data from the [Wisconsin Diagnostic Breast Cancer Database (WDBC)](https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic). We will replace the "malignant" and "benign" values in the 'Diagnosis' column with 0s and 1s. A 0 will indicate malignant and a 1 will indicate benign.

In [None]:
# Fetch the WDBC data that ships with scikit-learn
cancer = datasets.load_breast_cancer()
X, y, features = cancer.data, cancer.target, cancer.feature_names

# One table holding every measurement plus the label (0 for malignant, 1 for benign)
df = pd.DataFrame(X, columns=features).assign(Diagnosis=y)

# From here on `features` names the ten "mean ..." measurements followed by the label column
features = [
    'mean radius',
    'mean texture',
    'mean perimeter',
    'mean area',
    'mean smoothness',
    'mean compactness',
    'mean concavity',
    'mean concave points',
    'mean symmetry',
    'mean fractal dimension',
    'Diagnosis',
]
df = df.loc[:, features]

# Preview the first few rows
df.head()

In [None]:
# Summary statistics, one row per column so the wide table stays readable
df.describe().transpose()

To remind ourselves of the distribution of these features for cancerous vs. non-cancerous cells, let's plot a correlation plot for each feature.

In [None]:
# Plotting correlation plots of a selection of independent variables against the dependent variable
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(15, 10))
fig.subplots_adjust(hspace=0.5)
fig.suptitle('Correlation Plots')

selected_features = features[:10]
flat_axes = axes.flatten()

# One scatter panel per feature: feature value on x, diagnosis label on y
for ax, feature in zip(flat_axes, selected_features):
    ax.scatter(df[feature], df['Diagnosis'])
    ax.set_title(f'{feature} vs Diagnosis')
    ax.set_xlabel(feature)
    ax.set_ylabel('Diagnosis')

# The 4x3 grid has more panels than there are features; hide the leftovers
# so the figure does not end with empty frames.
for ax in flat_axes[len(selected_features):]:
    ax.set_visible(False)

plt.show()

We want to build a model that can accurately predict whether a cell is cancerous. First we will split our data into test and training data, and then use the training data to train a linear regression model.

In [None]:
# Separate the label from the predictors, then hold out 20% of the rows as a test set
y = df['Diagnosis']
X = df.drop(columns='Diagnosis')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

The code below allows you to explore potential linear regression models. You can select the features you want to include in the model by checking the boxes next to the feature names and select the lambda value for the LASSO and Ridge models. The code will then train a standard MLR model, a LASSO model, and a Ridge model. It will report the coefficients of each model and both the R^2 and the adjusted R^2 of the model's fit to the training data.

To the right, the code simultaneously estimates coefficients for a standard MLR model, a LASSO model, and a Ridge model using your specified lambda value and all features available in the dataset. It then reports the coefficients of each model and both the R^2 and the adjusted R^2 of the model's fit to the training data.

In [None]:
# Output widget for displaying model results
output = widgets.Output()

# Create checkboxes for each feature
checkboxes = [widgets.Checkbox(value=False, description=feature, disabled=False) for feature in features[:10]]

# Slider for alpha value
alpha_slider = widgets.FloatSlider(
    value=0.01,
    min=0.01,
    max=2.0,
    step=0.01,
    description='Lambda:',
    disabled=False
)


def _fit_linear_models(feature_names, alpha):
    """Fit MLR, Ridge and Lasso on the standardized training columns `feature_names`.

    `alpha` is the regularization strength passed to Ridge and Lasso.

    Returns a dict keyed by model name. Each value is a dict with the fitted
    coefficients ('Coef', a Series indexed by feature name), the training
    'R^2', the training 'Adjusted R^2' (penalized for len(feature_names)
    predictors) and 'Num Non-zero Params' (count of non-zero coefficients).
    """
    X_scaled = StandardScaler().fit_transform(X_train[feature_names])
    n = len(y_train)        # Number of observations
    p = len(feature_names)  # Number of predictors
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge': Ridge(alpha=alpha),
        'Lasso': Lasso(alpha=alpha),
    }

    results = {}
    for name, model in models.items():
        model.fit(X_scaled, y_train)
        coef = pd.Series(model.coef_, index=feature_names)
        r2 = r2_score(y_train, model.predict(X_scaled))
        results[name] = {
            'Coef': coef,
            'R^2': r2,
            'Adjusted R^2': 1 - ((1 - r2) * (n - 1) / (n - p - 1)),
            'Num Non-zero Params': int((coef != 0).sum()),
        }
    return results


def _format_metrics(results, label):
    """Render the training metrics of every model in `results` as one HTML snippet."""
    return "<br>".join(
        f"{name} with {label}: <br> Training R^2 = {info['R^2']:.3f} | Training Adjusted R^2 = {info['Adjusted R^2']:.3f} | Num Params = {info['Num Non-zero Params']} <br>"
        for name, info in results.items()
    )


# Function to update the model based on selected features and alpha
def update_model(button, *, _checkboxes=checkboxes, _output=output):
    """Refit the models and show coefficients and training metrics side by side.

    The left panel uses only the checked features; the right panel uses every
    feature that has a checkbox. Both use the current slider value as lambda.

    Parameters
    ----------
    button
        The clicked widget, supplied by `on_click`; not used.
    _checkboxes, _output
        This cell's own widgets, captured as defaults when the function is
        defined. Later cells assign new objects to the names `checkboxes` and
        `output`; looking those names up at click time would read the wrong
        checkboxes and write the results under a different cell.
    """
    with _output:
        clear_output(wait=True)  # Clear only the output in the output widget
        all_features = [checkbox.description for checkbox in _checkboxes]
        selected_features = [checkbox.description for checkbox in _checkboxes if checkbox.value]
        alpha = alpha_slider.value

        # Right panel: models fitted on every available feature
        all_results = _fit_linear_models(all_features, alpha)
        all_coefs_df = pd.DataFrame({name: info['Coef'] for name, info in all_results.items()})
        right_metrics = _format_metrics(all_results, 'All Features')

        # Left panel: models fitted on the checked features, or a hint when none is checked
        if selected_features:
            selected_results = _fit_linear_models(selected_features, alpha)
            selected_coefs_df = pd.DataFrame({name: info['Coef'] for name, info in selected_results.items()})
            left_table = selected_coefs_df.to_html()
            left_metrics = _format_metrics(selected_results, 'Selected Features')
        else:
            left_table = ''
            left_metrics = 'Please select at least one feature.'

        display(HTML('<div style="display: flex; justify-content: space-between;">'
                     '<div style="width: 50%;"><h3>Selected Features Models</h3>{}<p>{}</p></div>'
                     '<div style="width: 50%;"><h3>All Features Models</h3>{}<p>{}</p></div>'
                     '</div>'.format(left_table, left_metrics, all_coefs_df.to_html(), right_metrics)))


# Widget container for checkboxes
checkbox_container = widgets.VBox(children=checkboxes)

# Button to update model
button = widgets.Button(description="Update Model")
button.on_click(update_model)  # Connect the button to the update_model function

# Display all widgets and the separate output widget for model results
display(checkbox_container, alpha_slider, button, output)


Discussion Questions:
- How does a model with all features compare to a model with only the features you selected?
- When estimating parameters for a LASSO model, which features often get set to zero? Why?

Assignment Questions:
- Please answer question 9 in the assignment

Choose a model (MLR/LASSO/Ridge and set of independent variables) that you think performs reasonably well. We will now perform 10-fold cross validation to evaluate your model and tune the lambda value (if relevant).

The code below will perform 10-fold cross validation on your chosen model and calculate the adjusted R^2 for all 10 folds. It will repeat this 10-fold cross validation process for different lambda values and plot the adjusted R^2 for each lambda value. Select the range of lambda values you want to explore and see how increasing lambda impacts model performance.

In [None]:
# Display area that receives the cross-validation plot and any validation messages
output = Output()

# Adjusted R^2: R^2 penalized for the number of predictors
def adjusted_r2_score(y_true, y_pred, p):
    """Return the adjusted R^2 of `y_pred` against `y_true` for a model with `p` predictors.

    The number of observations is taken from len(y_true).
    """
    n_obs = len(y_true)
    unexplained = 1 - r2_score(y_true, y_pred)
    return 1 - (unexplained * (n_obs - 1)) / (n_obs - p - 1)

# Scoring callable with the (estimator, X, y) signature used for cross-validation
def custom_adjusted_r2_scorer(model, X, y_true):
    """Score a fitted `model` on (X, y_true) with adjusted R^2.

    For a Lasso model the predictor count is the number of non-zero
    coefficients; for any other model it is the number of columns in X.
    """
    n_predictors = np.sum(model.coef_ != 0) if isinstance(model, Lasso) else X.shape[1]
    return adjusted_r2_score(y_true, model.predict(X), n_predictors)

# Thin wrapper around cross_val_score: 10 folds, scored with adjusted R^2
def perform_cv(X, y, model):
    """Cross-validate `model` on (X, y) with cv=10 and return what cross_val_score produces."""
    return cross_val_score(model, X, y, scoring=custom_adjusted_r2_scorer, cv=10)

# Create checkboxes for each feature
checkboxes = [widgets.Checkbox(value=False, description=feature, disabled=False) for feature in X_train.columns]

# Dropdown for model selection
model_dropdown = widgets.Dropdown(
    options=['MLR', 'Ridge', 'Lasso'],
    value='MLR',
    description='Model:',
    disabled=False,
)

# Input fields for lambda values
lambda_start_input = widgets.FloatText(value=0.001, description='λ Start:')
lambda_end_input = widgets.FloatText(value=1, description='λ End:')
lambda_count_input = widgets.IntText(value=50, description='Number of λs:', style={'description_width': 'initial'})


# Function to update results based on user input
def update_results(button, *, _output=output):
    """Cross-validate the chosen model over a grid of lambda values and plot the scores.

    Reads the checked features, the model type and the lambda range from the
    widgets, runs `perform_cv` once per lambda on the standardized training
    data, and plots every fold's adjusted R^2 together with the per-lambda mean.
    Invalid input (empty feature selection, start >= end, negative lambda,
    fewer than one lambda) prints a message instead of plotting.

    Parameters
    ----------
    button
        The clicked widget, supplied by `on_click`; not used.
    _output
        This cell's output widget, captured as a default when the function is
        defined. A later cell assigns a new widget to the name `output`;
        looking the name up at click time would draw the plot under that cell.
    """
    with _output:
        clear_output(wait=True)
        selected_features = [checkbox.description for checkbox in checkboxes if checkbox.value]
        model_type = model_dropdown.value
        lambda_start = lambda_start_input.value
        lambda_end = lambda_end_input.value
        num_lambdas = lambda_count_input.value

        if lambda_start >= lambda_end:
            print("Lambda Start must be less than Lambda End.")
            return
        if lambda_start < 0 or lambda_end < 0:
            print("Lambda values must be positive.")
            return
        # np.linspace rejects a negative count, and a count of zero leaves nothing to plot
        if num_lambdas < 1:
            print("Number of λs must be at least 1.")
            return
        if not selected_features:
            print("Please select at least one feature.")
            return

        def build_model(alpha):
            """Create a fresh estimator of the selected type for one lambda value."""
            if model_type == 'Ridge':
                return Ridge(alpha=alpha)
            if model_type == 'Lasso':
                return Lasso(alpha=alpha)
            return LinearRegression()  # 'MLR' has no regularization strength

        X = X_train[selected_features].values
        y = y_train
        # NOTE(review): the scaler is fitted on the whole training set before the
        # folds are split, so each validation fold contributes to the scaling statistics.
        X_scaled = StandardScaler().fit_transform(X)
        lambdas = np.linspace(lambda_start, lambda_end, num_lambdas)

        fold_lambdas = []  # lambda value of every individual fold score
        fold_scores = []   # adjusted R^2 of every individual fold
        mean_scores = []   # mean adjusted R^2 per lambda
        for alpha in lambdas:
            scores = perform_cv(X_scaled, y, build_model(alpha))
            fold_lambdas.extend([alpha] * len(scores))
            fold_scores.extend(scores)
            mean_scores.append(np.mean(scores))

        # Plotting
        fig, ax = plt.subplots(figsize=(8, 4))
        ax.scatter(fold_lambdas, fold_scores, alpha=0.7, label='Adjusted R^2 for each fold')
        ax.plot(lambdas, mean_scores, 'r-', label='Mean Adjusted R^2')  # Mean scores in red line
        ax.set_xlabel('Lambda')
        ax.set_ylabel('Adjusted R^2')
        ax.set_title('Cross-Validation Results for Different Lambdas')
        ax.legend()
        ax.grid(True)
        plt.show()


# Button to update model
button = widgets.Button(description="Run 10-fold Cross-Validation", layout=Layout(width='20%'))
button.on_click(update_results)  # Connect the button to the update_results function

# Display all widgets and the output widget
display(*checkboxes, model_dropdown, lambda_start_input, lambda_end_input, lambda_count_input, button, output)


Discussion Question:
- Predict how your model will perform on the test data. What are you basing your predictions on?

Assignment Questions:
- Please answer questions 10 - 13 in the assignment

Now you have a model that you think will perform well on the test data. Let's test it out! The code below will use your chosen model and lambda value to predict the test data. It will then calculate the R^2 and adjusted R^2 of the model's fit to the test data.

In [None]:
# Display area that receives the test-set evaluation result
output = Output()

# Adjusted R^2 with the sample size supplied by the caller
def adjusted_r2_score(y_true, y_pred, n, p):
    """Return the adjusted R^2 of `y_pred` against `y_true` for `n` observations and `p` predictors."""
    penalized = (1 - r2_score(y_true, y_pred)) * (n - 1) / (n - p - 1)
    return 1 - penalized

# Custom scorer for cross-validation
def custom_adjusted_r2_scorer(model, X, y_true):
    """Score a fitted `model` on (X, y_true) with adjusted R^2.

    This definition replaces the earlier function of the same name once this
    cell has run, and `perform_cv` looks the name up at call time. It therefore
    applies the same rule as the earlier version: for a Lasso model the
    predictor count is the number of non-zero coefficients, otherwise it is
    the number of columns in X.
    """
    y_pred = model.predict(X)
    if isinstance(model, Lasso):
        p = np.sum(model.coef_ != 0)
    else:
        p = X.shape[1]
    return adjusted_r2_score(y_true, y_pred, len(y_true), p)

# Initialize the scaler
scaler = StandardScaler()

# Function to evaluate model on test data
def evaluate_on_test(button):
    """Train the chosen model on the training set and report its fit on the test set.

    Reads the checked features, the model type and lambda from the widgets,
    standardizes the data with statistics from the training set only, fits
    the model, and prints R^2 and adjusted R^2 for the test predictions.
    A negative lambda or an empty feature selection prints a message instead.

    Parameters
    ----------
    button
        The clicked widget, supplied by `on_click`; not used.
    """
    with output:
        # Clear first, so a validation message replaces the previous result
        # instead of being appended underneath it.
        clear_output(wait=True)
        if lambda_input.value < 0:
            print("Lambda must be positive.")
            return
        selected_features = [checkbox.description for checkbox in checkboxes if checkbox.value]
        if not selected_features:
            print("Please select at least one feature.")
            return

        model_type = model_dropdown.value
        # Fit the scaler on the training data and reuse it, unchanged, on the test data
        X_train_scaled = scaler.fit_transform(X_train[selected_features])
        X_test_scaled = scaler.transform(X_test[selected_features])
        if model_type == 'MLR':
            model = LinearRegression()
        elif model_type == 'Ridge':
            model = Ridge(alpha=lambda_input.value)
        elif model_type == 'Lasso':
            model = Lasso(alpha=lambda_input.value)

        # Train the model
        model.fit(X_train_scaled, y_train)

        # Predict on test data
        y_pred = model.predict(X_test_scaled)

        # Calculate R^2 and adjusted R^2
        n = len(y_test)
        p = len(selected_features)
        r2 = r2_score(y_test, y_pred)
        adj_r2 = adjusted_r2_score(y_test, y_pred, n, p)

        print(f'R^2 on Test Data with λ = {lambda_input.value}: {r2:.3f}')
        print(f'Adjusted R^2 on Test Data with λ = {lambda_input.value}: {adj_r2:.3f}')

# Numeric entry for the regularization strength used in the final evaluation
lambda_input = widgets.FloatText(description='λ:', value=0.001)

# Button that triggers the test-set evaluation
button = widgets.Button(
    description="Evaluate Model on Test Data",
    layout=Layout(width='20%'),
)
button.on_click(evaluate_on_test)  # evaluate_on_test runs on every click

# Show the feature checkboxes and model dropdown again, followed by this cell's own controls
display(*checkboxes, model_dropdown, lambda_input, button, output)


Discussion Question:
- How did your model performance compare to your prediction? Why do you think your prediction was aligned or misaligned?
- How did regularization affect your training adjusted R^2? How did it affect the cross-validation adjusted R^2?