# Combined Actuarial Neural Networks
Author: Alberto Gutierrez

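This notebook demonstrates the implementation and evaluation of several claim-frequency models: a null model, a GLM, gradient boosted trees (GBM), a GLM combined with a GBM, a neural network, and a Combined Actuarial Neural Network (CANN). The notebook also includes data preprocessing, model training, and performance evaluation using metrics such as Poisson deviance.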

## 1. Get Data
The freMTPL2freq dataset will be used. The data set is read into a Pandas dataframe.

In [None]:
import pandas as pd

# Load the freMTPL2freq dataset from the local data folder into a DataFrame.
data = pd.read_csv("./data/freMTPL2freq.csv")
# Print the column names, dtypes and non-null counts.
data.info()

## 2. Analyze the Data

The sweetviz package is a great package to do a quick exploratory data analysis.

In [None]:
import sweetviz as sv

# Build the sweetviz exploratory report for the full dataset and render it inline.
data_report = sv.analyze(data)
data_report.show_notebook()

## 3. Data Prep

This section defines the list of features, weight (exposure), and the target (response variable). It also splits the data into train and test datasets.

In [None]:
from sklearn.model_selection import train_test_split

# Columns fed to every model in this notebook.
features = ['Area','VehBrand','VehGas','Region','BonusMalus','VehPower','VehAge','DrivAge','Density']
# Exposure column: used as the weight / offset of each row.
weight = 'Exposure'
# Response variable (presumably the claim count per policy - confirm against the data dictionary).
target = 'ClaimNb'

# Hold out 20% of the rows as the test set; the fixed random_state keeps the split reproducible.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=12345)
# Replace the shuffled original row labels with a fresh 0..n-1 index on both splits.
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

# Display the results 
print("Training DataFrame:\n", train_data.shape) 
print("Testing DataFrame:\n", test_data.shape)

## 4. Modelling

We will be building various models including Null, GLM, GBM, GLM with GBM, Neural Net, CANN.


### 4.1. Null Model

The null model is simply the weighted average of the target, and the prediction is the weighted average multiplied by the weight.

In [None]:
# Portfolio-wide frequency on the training split: total target divided by total weight.
average_prediction = train_data[target].sum() / train_data[weight].sum()


def null_predict(df):
    """Return the null-model prediction for each row of ``df``.

    Every row receives the training-set average frequency scaled by its own
    weight (exposure), so the result is on the same scale as the target.
    """
    exposure = df[weight]
    return exposure * average_prediction


def null_predict_std(df):
    """Return the null-model prediction per unit of weight for each row of ``df``."""
    expected = null_predict(df)
    return expected / df[weight]

In [None]:
# Store the null-model predictions as a new column on both splits.
test_data['null_prediction'] = null_predict(test_data)
train_data['null_prediction'] = null_predict(train_data)

### 4.2. GLM

There are many GLM packages such as glum, py-glm, pyglmnet, statsmodels, H2O, etc. Here we will be using the H2O Python package, so let's start the local H2O server instance.

In [None]:
import h2o
# Initialise the local H2O instance that the GLM cells below rely on.
h2o.init()

Now we need to do the following:
1. Convert the Pandas dataframe to an H2O dataframe using `h2o.H2OFrame`
2. Convert all feature columns (including the numeric ones) to categorical
3. Use the log of exposure as an offset
4. Set lambda_ to 0 to fit a basic GLM instead of a regularized GLM (ElasticNet)
5. Train the model

In [None]:
from h2o.estimators import H2OGeneralizedLinearEstimator
import numpy as np

def h2o_preprocess(df):
    """Convert a pandas DataFrame into the H2OFrame layout expected by the GLM.

    Every column listed in ``features`` is cast to a factor (categorical),
    including the numeric ones, and an ``offset`` column holding the log of
    the ``weight`` (exposure) column is appended.

    Returns the resulting H2OFrame.
    """
    frame = h2o.H2OFrame(df)
    for name in features:
        frame[name] = frame[name].asfactor()
    frame['offset'] = frame[weight].log()
    return frame

# H2O versions of both splits (features as factors, plus the log-exposure offset column).
h2o_train = h2o_preprocess(train_data)
h2o_test = h2o_preprocess(test_data)

# Poisson GLM with a log link; lambda_=0 switches off the regularisation penalty.
glm_model = H2OGeneralizedLinearEstimator(
    family="poisson", link="Log",
    lambda_= 0, 
    seed=123

)

# Fit on the training frame with log(exposure) as offset; the test frame is passed
# as validation frame so H2O also reports metrics on it.
glm_model.train(
    offset_column='offset',
    training_frame = h2o_train,
    validation_frame= h2o_test,
    x=features,
    y=target
)

Now get predicted values from the model

In [None]:
def glm_predict(df):
    """Score ``df`` with the fitted H2O GLM.

    The frame is pushed through ``h2o_preprocess`` (so the offset column is
    present) and the ``predict`` column of the H2O output is returned as a
    NumPy array, one value per row of ``df``.
    """
    h2o_frame = h2o_preprocess(df)
    scored = glm_model.predict(h2o_frame)
    return scored.as_data_frame()['predict'].values


def glm_predict_std(df):
    """Return the GLM prediction divided by the ``weight`` column of ``df``."""
    return glm_predict(df) / df[weight]

In [None]:
# Store the GLM predictions on both splits; later cells read this column as the prior
# for the GLM-GBM and CANN models.
test_data['glm_prediction'] = glm_predict(test_data)
train_data['glm_prediction'] = glm_predict(train_data)

### 4.3. XGBoost


In [None]:
import xgboost as xgb

def xgb_preprocess(df):
    """Build the XGBoost ``DMatrix`` for ``df`` with log-exposure as base margin.

    Object-typed columns are cast to pandas ``category`` on a copy of ``df``
    (the input is not modified). Only the ``features`` columns are used as
    inputs, ``target`` is the label, and ``log(weight)`` is supplied as the
    per-row base margin.
    """
    object_columns = df.select_dtypes(include='object').columns
    frame = df.copy().astype({name: 'category' for name in object_columns})
    log_exposure = np.log(frame[weight])
    return xgb.DMatrix(
        frame[features],
        label=frame[target],
        enable_categorical=True,
        base_margin=log_exposure,
    )

# DMatrix versions of both splits, each carrying log(exposure) as base margin.
xgb_train = xgb_preprocess(train_data)
xgb_test = xgb_preprocess(test_data)

# Booster hyper-parameters.
# NOTE(review): 'eta' and 'learning_rate' are documented by XGBoost as aliases of the
# same parameter; both are set to 0.1 here, so one of the two entries is redundant.
params = {
    'objective': 'count:poisson',  # For count data with Poisson distribution
    'booster': 'gbtree',
    'max_depth': 6,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'nthread': 4,
    'learning_rate': 0.1,
    'seed': 123,
}

# Train the XGBoost model
# NOTE(review): the test split is the last entry of the watchlist, which is presumably
# the one early stopping monitors - so the test data influences the number of rounds.
watchlist = [(xgb_train, 'train'), (xgb_test, 'val')]
# Upper bound on boosting rounds; training stops earlier after 10 rounds without improvement.
num_rounds = 200
xgb_model = xgb.train(params, xgb_train, num_rounds, evals=watchlist, early_stopping_rounds=10)

Now get the predicted values

In [None]:
def xgb_predict(df):
    """Score ``df`` with the stand-alone XGBoost model.

    The frame is converted with ``xgb_preprocess`` first, so the prediction
    includes the log-exposure base margin.
    """
    return xgb_model.predict(xgb_preprocess(df))


def xgb_predict_std(df):
    """Return the XGBoost prediction divided by the ``weight`` column of ``df``."""
    raw = xgb_predict(df)
    return raw / df[weight]

In [None]:
# Store the stand-alone XGBoost predictions as a new column on both splits.
test_data['xgb_prediction'] = xgb_predict(test_data)
train_data['xgb_prediction'] = xgb_predict(train_data)

### 4.4. GLM-GBM


In [None]:
import xgboost as xgb

def glm_xgb_preprocess(df):
    """Build the XGBoost ``DMatrix`` for ``df`` with the GLM prediction as prior.

    Same layout as the stand-alone XGBoost input, except that the base margin
    is the log of the ``glm_prediction`` column, so the trees are boosted on
    top of the GLM. ``df`` must therefore already contain ``glm_prediction``;
    the input frame itself is not modified.
    """
    object_columns = df.select_dtypes(include='object').columns
    frame = df.copy().astype({name: 'category' for name in object_columns})
    log_glm = np.log(frame['glm_prediction'])
    return xgb.DMatrix(
        frame[features],
        label=frame[target],
        enable_categorical=True,
        base_margin=log_glm,
    )

# DMatrix versions of both splits with log(GLM prediction) as base margin.
# NOTE: this rebinds xgb_train / xgb_test (and params / watchlist below) that the
# stand-alone XGBoost cell defined earlier.
xgb_train = glm_xgb_preprocess(train_data)
xgb_test = glm_xgb_preprocess(test_data)

# Same hyper-parameters as the stand-alone XGBoost model.
# NOTE(review): 'eta' and 'learning_rate' are documented by XGBoost as aliases of the
# same parameter; both are set to 0.1 here, so one of the two entries is redundant.
params = {
    'objective': 'count:poisson',  # For count data with Poisson distribution
    'booster': 'gbtree',
    'max_depth': 6,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'nthread': 4,
    'learning_rate': 0.1,
    'seed': 123,
}

# Train the XGBoost model
# NOTE(review): the test split is the last entry of the watchlist, which is presumably
# the one early stopping monitors - so the test data influences the number of rounds.
watchlist = [(xgb_train, 'train'), (xgb_test, 'val')]
# Upper bound on boosting rounds; training stops earlier after 10 rounds without improvement.
num_rounds = 200
glm_xgb_model = xgb.train(params, xgb_train, num_rounds, evals=watchlist, early_stopping_rounds=10)

In [None]:
def glm_xgb_predict(df):
    """Score ``df`` with the XGBoost model that was boosted on top of the GLM.

    ``df`` must contain the ``glm_prediction`` column, which
    ``glm_xgb_preprocess`` feeds in as the base margin.
    """
    return glm_xgb_model.predict(glm_xgb_preprocess(df))


def glm_xgb_predict_std(df):
    """Return the GLM+XGBoost prediction divided by the ``weight`` column of ``df``."""
    return glm_xgb_predict(df) / df[weight]

In [None]:
# Store the GLM+XGBoost predictions as a new column on both splits.
test_data['glm_xgb_prediction'] = glm_xgb_predict(test_data)
train_data['glm_xgb_prediction'] = glm_xgb_predict(train_data)

### 4.5. Neural Network

We will be using pytorch (using the lightning API) to build and train our neural network.

We begin by defining our data class which does the following:
1. Initialise the instance based on the inputs: pandas dataframes (train and test), weight, target variables, the prior prediction value (for implementing CANNs) and batch size.
2. Preprocess the data by standardizing the numeric variables, one-hot encoding the categorical variables, and converting the result to a pytorch tensor dataset
3. Create dataloaders for the train and test sets which will return batches from the tensors during training and validation

In [None]:
import os
# os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16:8'

import pytorch_lightning as L
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class CustomDataModule(L.LightningDataModule):
    """Lightning data module that turns the train/test frames into tensor datasets.

    On construction it copies both frames, fits the scaling / one-hot
    encoding pipeline on the training copy and immediately materialises the
    tensors for both splits. When ``prior_prediction`` names a column, that
    column is carried along as the fourth tensor of every batch; otherwise
    the weight tensor is used in its place.
    """

    def __init__(self, train_data, test_data, features, target, weight, prior_prediction=None, batch_size=10000):
        super().__init__()
        # Private copies, so the caller's frames are never touched.
        self.nn_train = train_data.copy()
        self.nn_test = test_data.copy()
        self.features = features
        self.target = target
        self.weight = weight
        self.batch_size = batch_size
        self.prior_prediction = prior_prediction
        # The pipeline is fitted on the training split only and reused for the test split.
        self.fit_transform_pipeline(self.nn_train)
        self.preprocess_data()

    def fit_transform_pipeline(self, data):
        """Fit the column transformer on ``data`` and record the output column names."""

        def feature_columns_of(dtypes):
            # Columns of the requested dtypes, restricted to the model features.
            candidates = data.select_dtypes(include=dtypes).columns
            return [name for name in candidates if name in self.features]

        cat_cols = feature_columns_of(['object'])
        num_cols = feature_columns_of(['int64', 'float64'])

        # Numeric features are standardised, object features one-hot encoded.
        scaler_step = ('num', StandardScaler(), num_cols)
        encoder_step = ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
        transformer = ColumnTransformer(transformers=[scaler_step, encoder_step])

        self.pipeline = transformer.fit(data)
        encoded_names = transformer.named_transformers_['cat'].get_feature_names_out(cat_cols)
        self.pipeline_features = num_cols + list(encoded_names)

    def preprocess(self, data):
        """Transform ``data`` with the fitted pipeline.

        Returns ``(X, y, w, p, dataset)``: float32 tensors for the features,
        target, weight and prior, plus a ``TensorDataset`` over all four.
        """

        def as_float_tensor(values):
            return torch.tensor(values, dtype=torch.float32)

        design = pd.DataFrame(self.pipeline.transform(data), columns=self.pipeline_features)
        X = as_float_tensor(design.values)
        y = as_float_tensor(data[self.target].values)
        w = as_float_tensor(data[self.weight].values)
        # Without a prior column the weight tensor doubles as the prior.
        p = as_float_tensor(data[self.prior_prediction].values) if self.prior_prediction else w
        return X, y, w, p, TensorDataset(X, y, w, p)

    def preprocess_data(self):
        """Materialise tensors and datasets for the stored train and test copies."""
        train_parts = self.preprocess(self.nn_train)
        test_parts = self.preprocess(self.nn_test)
        self.X_train, self.y_train, self.w_train, self.p_train, self.train_dataset = train_parts
        self.X_test, self.y_test, self.w_test, self.p_test, self.test_dataset = test_parts

    def _loader_options(self):
        # Settings shared by the training and validation loaders.
        return dict(batch_size=self.batch_size, num_workers=12, persistent_workers=True, pin_memory=True)

    def train_dataloader(self):
        """Shuffled training loader; the last incomplete batch is dropped."""
        return DataLoader(self.train_dataset, shuffle=True, drop_last=True, **self._loader_options())

    def val_dataloader(self):
        """Unshuffled loader over the test split, used for validation."""
        return DataLoader(self.test_dataset, shuffle=False, **self._loader_options())


Now define the neural net model with an exponential activation as the final output that uses the Poisson Deviance loss function.
When this output is multiplied by the prior prediction (such as the GLM prediction) before being passed to the loss function, the model becomes the CANN model.

In [None]:

from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint


class CANN(L.LightningModule):
    """Residual network with an exponential output that scales a prior prediction.

    The network output is multiplied by the prior ``p`` of each row. Because
    the final linear layer starts at zero, the untrained network outputs
    ``exp(0) = 1`` and the initial prediction equals the prior itself.
    """

    class PoissonDevianceLoss(nn.Module):
        """Mean Poisson deviance between predictions and observations."""

        def forward(self, y_pred, y_true):
            # The tiny constant keeps the logarithm finite when y_true is zero.
            log_ratio = torch.log((y_true + 1e-16) / y_pred)
            unit_deviance = 2 * (y_true * log_ratio - (y_true - y_pred))
            return unit_deviance.mean()

    class ExpActivation(nn.Module):
        """Element-wise exponential, keeping the output strictly positive."""

        def forward(self, x):
            return x.exp()

    class ResBlock(nn.Module):
        """Two same-width linear layers with dropout and a skip connection."""

        def __init__(self, input_dim):
            super().__init__()
            self.fc1 = nn.Linear(input_dim, input_dim)
            self.fc2 = nn.Linear(input_dim, input_dim)
            self.dropout = nn.Dropout(p=0.2)

        def forward(self, x):
            hidden = self.dropout(F.relu(self.fc1(x)))
            # Add the block input back before the final activation.
            return F.relu(self.fc2(hidden) + x)

    def __init__(self, input_dim):
        super().__init__()
        # Ten residual blocks, then a single-output linear head, then exp().
        blocks = [self.ResBlock(input_dim) for _ in range(10)]
        head = nn.Linear(input_dim, 1)
        self.NeuralNet = nn.Sequential(*blocks, head, self.ExpActivation())
        # Zero the head so that the initial multiplier is exactly one.
        nn.init.zeros_(head.weight)
        nn.init.zeros_(head.bias)

        self.loss_fn = self.PoissonDevianceLoss()

    def forward(self, x, p=1):
        """Return the flattened network output multiplied by the prior ``p``."""
        multiplier = self.NeuralNet(x).view(-1)
        return multiplier * p

    def _batch_loss(self, batch):
        # Batches are (features, target, weight, prior); the weight is not used here.
        x, y, _, p = batch
        return self.loss_fn(self.forward(x, p), y)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._batch_loss(batch)
        self.log('train_loss', loss, on_step=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self._batch_loss(batch)
        self.log('val_loss', loss, on_step=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Use NAdam with its default hyper-parameters."""
        return optim.NAdam(self.parameters())

# Training setup and execution
def train_model(train_data, test_data, features, target, weight, prior_prediction=None, max_epochs=100, batch_size=10000):
    """Fit a ``CANN`` network on the given frames and return the fitted pieces.

    Seeds are fixed first, then the data module is built (which fits the
    preprocessing pipeline), the network is sized from the transformed
    training matrix, and a Lightning trainer with early stopping runs the fit.
    Pass ``prior_prediction`` as a column name to train on top of an existing
    prediction; leave it ``None`` to use the weight column as the prior.

    Returns ``(model, trainer, data_module)``.
    """
    L.seed_everything(123, workers=True)

    data_module = CustomDataModule(
        train_data,
        test_data,
        features,
        target,
        weight,
        prior_prediction=prior_prediction,
        batch_size=batch_size,
    )
    # Input width is the number of columns produced by the preprocessing pipeline.
    model = CANN(data_module.X_train.shape[1])

    # Stop once the monitored validation loss has not improved by 1e-5 for 5 checks.
    # NOTE(review): 'val_loss_epoch' is presumably the epoch-level name Lightning
    # derives from the 'val_loss' key logged in validation_step - confirm.
    stopper = EarlyStopping(monitor='val_loss_epoch', patience=5, min_delta=1e-5, mode='min')

    trainer = L.Trainer(
        max_epochs=max_epochs,
        accelerator='auto',
        callbacks=[stopper],
        enable_progress_bar=True,
        deterministic=True,
    )
    trainer.fit(model, data_module)
    return model, trainer, data_module


Now train the NN model.

In [None]:
# Plain neural network: no prior_prediction is passed, so the data module uses the
# weight (exposure) column as the prior that multiplies the network output.
nn_model, trainer, data_module = train_model(
    train_data=train_data,
    test_data=test_data,
    features=features,
    target=target,
    weight=weight
)

Now get the predicted values from the model.

In [None]:
def nn_predict(df):
    """Score ``df`` with the plain neural network.

    The frame is transformed with the pipeline fitted for ``nn_model``, the
    network is evaluated in eval mode (dropout disabled), and its per-row
    output is multiplied by the ``weight`` column. The result is a pandas
    Series aligned with ``df``.
    """
    X, _, _, _, _ = data_module.preprocess(df)
    nn_model.eval()
    # Inference only: skip autograd bookkeeping so no graph is kept for the whole frame.
    with torch.no_grad():
        multiplier = nn_model(X).detach().numpy()
    return multiplier * df[weight]


def nn_predict_std(df):
    """Return the neural-network prediction divided by the ``weight`` column of ``df``."""
    return nn_predict(df) / df[weight]

In [None]:
# Store the neural-network predictions as a new column on both splits.
test_data['nn_prediction']=nn_predict(test_data)
train_data['nn_prediction']=nn_predict(train_data)

### 4.6. CANN
Using the same function and model class to train the model, set the prior_prediction parameter to the column name that contains the prior prediction, e.g. GLM prediction.

In [None]:
# CANN: same network and training routine, but the GLM prediction column is used as
# the prior that multiplies the network output.
cann_model, cann_trainer, cann_data_module = train_model(
    train_data=train_data,
    test_data=test_data,
    features=features,
    target=target,
    weight=weight,
    prior_prediction='glm_prediction',
)

Get predicted values from model

In [None]:
def cann_predict(df):
    """Score ``df`` with the CANN model.

    The frame is transformed with the pipeline fitted for ``cann_model``, the
    network is evaluated in eval mode (dropout disabled), and its per-row
    output is multiplied by the ``glm_prediction`` column, which must already
    be present in ``df``. The result is a pandas Series aligned with ``df``.
    """
    X, _, _, _, _ = cann_data_module.preprocess(df)
    cann_model.eval()
    # Inference only: skip autograd bookkeeping so no graph is kept for the whole frame.
    with torch.no_grad():
        multiplier = cann_model(X).detach().numpy()
    return multiplier * df['glm_prediction']


def cann_predict_std(df):
    """Return the CANN prediction divided by the ``weight`` column of ``df``."""
    return cann_predict(df) / df[weight]

In [None]:
# Store the CANN predictions as a new column on both splits.
test_data['cann_prediction']=cann_predict(test_data)
train_data['cann_prediction']=cann_predict(train_data)

## 5. Evaluate Models

### 5.1. Deviance & Gini

In [None]:
from model_diagnostics.scoring import decompose, PoissonDeviance
import pandas as pd
import numpy as np

# All evaluation below runs on a copy of the test split.
data_eval = test_data.copy()
# Prediction columns to compare, in display order.
models = ['null_prediction','glm_prediction','xgb_prediction','glm_xgb_prediction','nn_prediction','cann_prediction']
# Per-unit-of-weight prediction function for each model; only referenced by the
# commented-out partial-dependence options in the plotting cells below.
model_prediction_functions = {
    'null_prediction': null_predict_std, 
    'glm_prediction': glm_predict_std,
    'xgb_prediction': xgb_predict_std,
    'glm_xgb_prediction': glm_xgb_predict_std,
    'nn_prediction': nn_predict_std,
    'cann_prediction': cann_predict_std,
}

# Scoring object, called with y_obs / y_pred in the evaluation loop.
PoissonDeviance_scoring = PoissonDeviance()

def gini_coefficient(y_true, y_pred):
    """Return the Gini coefficient of ``y_true`` when ranked by ``y_pred``.

    Observations are sorted by ascending prediction, the Lorenz curve of the
    observed values is built over that ordering (starting at the origin), and
    the result is ``1 - 2 * area`` where the area under the curve is obtained
    with the trapezoidal rule on an evenly spaced grid.

    Parameters
    ----------
    y_true, y_pred : array-like of equal length
        Observed values and the scores used to rank them. Lists, NumPy arrays
        and pandas Series (with any index) are accepted; both are handled
        purely by position.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    # Work on plain arrays so that indexing is positional even for pandas input.
    observed = np.asarray(y_true, dtype=float)
    scores = np.asarray(y_pred)
    if observed.shape != scores.shape:
        raise ValueError("y_true and y_pred must have the same shape")
    if observed.size == 0:
        raise ValueError("gini_coefficient requires at least one observation")

    ranking = np.argsort(scores)
    cumulative_share = np.cumsum(observed[ranking]) / np.sum(observed)
    lorenz_curve = np.concatenate(([0.0], cumulative_share))
    # Trapezoidal rule with spacing 1/n, written out to avoid the NumPy-version
    # dependent np.trapz / np.trapezoid naming.
    area = (lorenz_curve[:-1] + lorenz_curve[1:]).sum() / (2 * observed.size)
    return 1 - 2 * area


# Summary metrics per model, keyed by prediction column name.
model_eval_dict = {}

# Quantities that do not depend on the model are computed once, outside the loop.
observed_rate = (data_eval[target] / data_eval[weight]).values
actual_total = data_eval[target].sum()
total_weight = data_eval[weight].sum()
# Gini obtained when the observations are ranked by themselves (same for every model).
true_gini = gini_coefficient(observed_rate, observed_rate)

for model in models:
    predicted_rate = (data_eval[model] / data_eval[weight]).values
    predicted_total = data_eval[model].sum()
    PoissonDeviance_score = PoissonDeviance_scoring(y_obs=data_eval[target], y_pred=data_eval[model])
    gini = gini_coefficient(observed_rate, predicted_rate)
    model_eval_dict[model] = {
        'PoissonDeviance': PoissonDeviance_score,
        'ActualAverageResponse': actual_total / total_weight,
        'PredictedAverageResponse': predicted_total / total_weight,
        'ActualTotalResponse': actual_total,
        'PredictedTotalResponse': predicted_total,
        'TrueGiniCoefficient': true_gini,
        'GiniCoefficient': gini,
    }

# One column per model, one row per metric (displayed as the cell output).
pd.DataFrame(model_eval_dict)

### 5.2. Reliability Plots

In [None]:
from model_diagnostics import config_context
from model_diagnostics.calibration import plot_marginal, plot_reliability_diagram


# Reliability diagram for all models at once: observed and predicted values are both
# divided by the weight column, and the weight is passed as observation weight.
with config_context(plot_backend="plotly"):
    fig = plot_reliability_diagram(
        y_obs=data_eval[target].div(data_eval[weight], axis=0),
        y_pred=data_eval[models].div(data_eval[weight], axis=0),
        weights=data_eval[weight],
    )
fig.show(renderer="notebook")

### 5.3. Model Bias Plots

In [None]:
from model_diagnostics import config_context
from model_diagnostics.calibration import plot_bias


# Bias plot for all models at once, using 30 bins; observed and predicted values are
# both divided by the weight column, and the weight is passed as observation weight.
with config_context(plot_backend="plotly"):
    fig = plot_bias(
        y_obs=data_eval[target]/data_eval[weight],
        y_pred=data_eval[models].div(data_eval[weight], axis=0),
        weights=data_eval[weight],
        n_bins=30,
    )
fig.show(renderer="notebook")

### 5.4. Bias Plots by Feature

In [None]:
from model_diagnostics import config_context
from model_diagnostics.calibration import plot_bias

# One bias plot per model feature: same inputs as the overall bias plot, with the
# feature column passed in so the bias is broken down by its values (up to 30 bins).
for col in features:
    with config_context(plot_backend="plotly"):
        fig = plot_bias(
            y_obs=data_eval[target]/data_eval[weight],
            y_pred=data_eval[models].div(data_eval[weight], axis=0),
            weights=data_eval[weight],
            feature=data_eval[col],
            n_bins=30,
        )
    fig.show(renderer="notebook")

### 5.5. Marginal/Partial Dependency Plots by Feature

In [None]:
from model_diagnostics import config_context
from model_diagnostics.calibration import compute_marginal
import plotly.graph_objects as go

# Silence H2O progress bars for the rest of the session.
h2o.no_progress()

# One figure per feature: weight totals as bars on the left axis, observed and
# predicted averages per bin as lines on the right axis (one line per model).
for feature in features:
    fig = go.Figure(
        layout=dict( 
            title=f'Marginal Plots for feature {feature}', 
            xaxis_title=feature, 
            yaxis_title='Sum of Weights', 
            yaxis2=dict(title='Average Value', overlaying='y', side='right'), 
            barmode='group' 
        )
    )
    for i, model in enumerate(models):
        marginals = compute_marginal(
            y_obs=data_eval[target]/data_eval[weight],
            y_pred=data_eval[model]/data_eval[weight],
            X=data_eval,
            weights=data_eval[weight],
            feature_name=feature,
            # predict_function=model_prediction_functions[model], #this is for PDP plots
            # n_max=10000, #this is for PDP plots
            n_bins=30,
        )
        # Weights and observed means are model-independent, so they are drawn only once.
        if i==0:
            fig.add_trace(go.Bar(x=marginals[feature], y=marginals["weights"],name='Weights'))
            fig.add_trace( go.Scatter(x=marginals[feature],y=marginals["y_obs_mean"],mode='lines+markers',name='y_obs_mean',yaxis='y2',line=dict(dash='dot')))
        fig.add_trace(go.Scatter(x=marginals[feature],y=marginals["y_pred_mean"],mode='lines+markers',name=model,yaxis='y2'))
        # fig.add_trace(go.Scatter(x=marginals[feature],y=marginals["partial_dependence"],mode='lines+markers',name=model,yaxis='y2')) #this is for PDP plots
    fig.show(renderer="notebook")  

## 6. Analyse GBM boosting component

#### 6.1. Distribution of boosting component

In [None]:
import plotly.figure_factory as ff

# Plot density graph using plotly
# The boosting component is the ratio of the GLM+XGBoost prediction to the GLM
# prediction on the evaluation data; a value of 1 means the trees left the GLM unchanged.
fig = ff.create_distplot([data_eval['glm_xgb_prediction'] / data_eval['glm_prediction']], group_labels=['XGB Boosting Component'])
fig.update_layout(
    title='Density Plot of XGB Boosting Component',
    xaxis_title='XGB Boosting Component',
    yaxis_title='Density',
    # Restrict the visible x-axis to ratios between 0 and 5.
    xaxis=dict(range=[0, 5])
)
fig.show()


#### 6.2. Data Profile Comparison 

In [None]:
import pandas as pd
import plotly.graph_objects as go
import ipywidgets as widgets
from IPython.display import display, clear_output


# Prediction column whose ratio to 'glm_prediction' defines the boosting component
# used to select the two subsets compared below.
boosted_model = 'glm_xgb_prediction'
# Helper that builds the histogram bins for a numeric feature
def create_bins(df, feature):
    """Return the automatic histogram bins of ``df[feature]`` as a DataFrame.

    The bin edges are chosen by NumPy's ``'auto'`` rule. The result has one
    row per bin with the columns ``bin_start``, ``bin_end`` and ``x`` (the
    midpoint of the bin); ``bin_start`` is stored as float.
    """
    edges = np.histogram_bin_edges(df[feature], bins='auto')
    lower, upper = edges[:-1], edges[1:]
    bins_df = pd.DataFrame({'bin_start': lower, 'bin_end': upper, 'x': (lower + upper) / 2})
    bins_df['bin_start'] = bins_df['bin_start'].astype(float)
    return bins_df

def plot_graphs(fig, df, feature, bins, lower_threshold, upper_threshold, color):
    """Add the weight distribution and actual-vs-expected curves of one subset to ``fig``.

    The subset consists of the rows of ``df`` whose boosting component
    (``boosted_model`` prediction divided by ``glm_prediction``) lies in
    ``(lower_threshold, upper_threshold]``. For that subset three traces are
    added, all drawn in ``color``:

    * a bar trace with the share of the subset's weight per feature level/bin,
    * the actual-vs-expected ratio of the GLM (target sum / GLM prediction sum),
    * the actual-vs-expected ratio of the boosted model.

    Numeric features are grouped into the bins given by ``bins`` (columns
    ``bin_start`` and ``x``, as produced by ``create_bins``); other features
    are grouped by their own values and ``bins`` is ignored.

    Returns ``fig``; ``df`` itself is not modified.
    """
    boost_ratio = df[boosted_model] / df['glm_prediction']
    in_range = (boost_ratio > lower_threshold) & (boost_ratio <= upper_threshold)
    # Explicit copy: the numeric branch assigns a column, which must not write
    # into (or warn about) a slice of the caller's frame.
    filtered_df = df[in_range].copy()

    if pd.api.types.is_numeric_dtype(df[feature]):
        # merge_asof needs matching key dtypes, and the bin starts are floats.
        filtered_df[feature] = filtered_df[feature].astype(float)
        # Attach to every row the bin whose start is the closest one at or below its value.
        merged_df = pd.merge_asof(filtered_df.sort_values(by=feature), bins[['bin_start', 'x']], left_on=feature, right_on='bin_start')
        # Group by bin midpoint and add traces
        grouped = merged_df.groupby('x')
        bin_midpoints = grouped.size().index
        fig.add_traces([
            go.Bar(x=bin_midpoints,y=grouped[weight].sum()/filtered_df[weight].sum(),name=f'Weights {lower_threshold} < x <= {upper_threshold}',opacity=0.3, marker=dict(color=color)),
            go.Scatter(x=bin_midpoints,y=grouped[target].sum()/grouped['glm_prediction'].sum(),
                mode='lines+markers',line=dict(dash='dot'),marker=dict(color=color),
                name=f'GLM AvE ({lower_threshold} < x <= {upper_threshold})',
                yaxis='y2'
            ),
            go.Scatter(x=bin_midpoints,y=grouped[target].sum() / grouped[boosted_model].sum(),
                mode='lines+markers', marker=dict(color=color),
                name=f'GLM with Boosting AvE ({lower_threshold} < x <= {upper_threshold})',
                yaxis='y2'
            )
        ])
    else:
        # Non-numeric feature: aggregate directly by feature level.
        grouped_filtered = filtered_df.groupby(feature)[[target,'glm_prediction',boosted_model,weight]].sum().reset_index()
        fig.add_trace(go.Bar(x=grouped_filtered[feature],y=grouped_filtered[weight]/grouped_filtered[weight].sum(),marker=dict(color=color), name=f'Weights ({lower_threshold} < x <= {upper_threshold})', opacity=0.3))
        fig.add_trace(go.Scatter(x=grouped_filtered[feature],y=grouped_filtered[target]/grouped_filtered['glm_prediction'],mode='lines+markers',marker=dict(color=color),line=dict(dash='dot'),name=f'GLM AvE ({lower_threshold} < x <= {upper_threshold})',yaxis='y2'))
        fig.add_trace(go.Scatter(x=grouped_filtered[feature],y=grouped_filtered[target]/grouped_filtered[boosted_model],mode='lines+markers',marker=dict(color=color),name=f'GLM with Boosting AvE ({lower_threshold} < x <= {upper_threshold})',yaxis='y2'))
    return fig
# Function to create combined bar chart
def create_combined_bar_chart(feature, lower_1, upper_1, lower_2, upper_2):
    """Redraw the widgets and the two-subset comparison chart for ``feature``.

    Subset 1 (green) and subset 2 (red) are the rows of ``data_eval`` whose
    boosting component falls in ``(lower_1, upper_1]`` and
    ``(lower_2, upper_2]`` respectively. The cell output is cleared first and
    the control widgets are displayed again above the new figure.
    """
    clear_output(wait=True)
    display(feature_selector, lower_slider_1, upper_slider_1, lower_slider_2, upper_slider_2, update_button)

    # Bins are only needed (and only computed) for numeric features.
    feature_is_numeric = pd.api.types.is_numeric_dtype(data_eval[feature])
    bins = create_bins(data_eval, feature) if feature_is_numeric else None

    fig = go.Figure()
    subsets = ((lower_1, upper_1, 'green'), (lower_2, upper_2, 'red'))
    for lower, upper, colour in subsets:
        fig = plot_graphs(fig, data_eval, feature, bins, lower, upper, colour)

    # Bars share the left axis and overlap; the AvE lines use the right axis.
    fig.update_layout(
        title=f'Distribution of {feature} (Subset 1 vs Subset 2) where subset is based on x=Booster Component',
        barmode='overlay',
        xaxis_title=feature,
        yaxis_title='Percent',
        yaxis2=dict(title='AvE Values', overlaying='y', side='right'),
        legend_title='Dataset',
        bargap=0.2,
    )
    fig.show()

# Dropdown widget for feature selection
feature_selector = widgets.Dropdown(options=features,description='Feature:',disabled=False)

# Slider widgets for the boosting-component range of subset 1 (default 0 to 0.95)
lower_slider_1 = widgets.FloatSlider(value=0, min=0, max=2, step=0.1,description='1 Lower Threshold:',continuous_update=False)
upper_slider_1 = widgets.FloatSlider(value=0.95, min=0, max=2, step=0.1,description='1 Upper Threshold:',continuous_update=False)

# Slider widgets for the boosting-component range of subset 2 (default 1.05 to 2)
lower_slider_2 = widgets.FloatSlider(value=1.05, min=0, max=2, step=0.1,description='2 Lower Threshold:',continuous_update=False)
upper_slider_2 = widgets.FloatSlider(value=2, min=0, max=2, step=0.1,description='2 Upper Threshold:',continuous_update=False)

# Button to update plot
update_button = widgets.Button(description="Update Plot")

def on_button_click(b):
    """Redraw the comparison chart from the current widget values (``b`` is the clicked button, unused)."""
    create_combined_bar_chart(feature_selector.value, lower_slider_1.value, upper_slider_1.value, lower_slider_2.value, upper_slider_2.value)

# The chart is only drawn when the button is clicked, not on every slider change.
update_button.on_click(on_button_click)

# Display widgets
display(feature_selector, lower_slider_1, upper_slider_1, lower_slider_2, upper_slider_2, update_button)


In [None]:
from model_diagnostics import config_context
from model_diagnostics.calibration import compute_marginal
import plotly.graph_objects as go

# One figure per feature showing how the boosting component varies across the feature:
# weight totals as bars on the left axis, average boosting component per bin as a line
# with error bars on the right axis.
for feature in features:
    fig = go.Figure(
        layout=dict( 
            title=f'Marginal Plots for feature {feature}', 
            xaxis_title=feature, 
            yaxis_title='Sum of Weights', 
            yaxis2=dict(title='Average Value', overlaying='y', side='right'), 
            barmode='group' 
        )
    )
    marginals = compute_marginal(
        # The prediction divided by itself: a constant reference level of 1.
        y_obs=data_eval[boosted_model]/data_eval[boosted_model],
        # Boosting component: boosted prediction relative to the GLM prediction.
        y_pred=data_eval[boosted_model]/data_eval['glm_prediction'],
        X=data_eval,
        weights=data_eval[weight],
        feature_name=feature,
        # predict_function=model_prediction_functions[boosted_model],
        # n_max=100000,
        n_bins=30,
    )
    fig.add_trace(go.Bar(x=marginals[feature], y=marginals["weights"], name='Weights'))
    fig.add_trace(go.Scatter(x=marginals[feature], y=marginals["y_obs_mean"], mode='lines+markers', name='y_obs_mean', yaxis='y2', line=dict(dash='dot')))
    # Error bars span two times the reported standard error of the per-bin mean.
    fig.add_trace(go.Scatter(x=marginals[feature], y=marginals["y_pred_mean"],error_y=dict(type='data', array=marginals["y_pred_stderr"]*2), mode='lines+markers', name=boosted_model, yaxis='y2'))
    # fig.add_trace(go.Scatter(x=marginals[feature], y=marginals["partial_dependence"], mode='lines+markers', name="Partial Dependence", yaxis='y2'))
    fig.show(renderer="notebook")
