In [1]:
from IPython.display import display

In [2]:
from pdstools.explanations import Explanations

import datetime
import polars as pl

## Aggregate data exported from infinity

If this step has not already been done, you need to aggregate the explanation data exported from infinity. Unless otherwise specified, the resulting aggregated data will be stored in the `.tmp/aggregate/data` directory.

This step can be skipped if you have already aggregated the data.

In [3]:
# Build the Explanations entry point on top of the exported explanation files
# for the 'AdaptiveBoostCT' model.
# NOTE(review): `to_date` presumably bounds the export period that is read — confirm
# against the Explanations documentation.
explanations = Explanations(
    data_folder='../../data/explanations/',
    model_name='AdaptiveBoostCT',
    to_date=datetime.date(year=2025, month=3, day=28),
)

# Run the aggregation step described above; it can be skipped when the
# aggregated data already exists.
explanations.aggregates.generate()

## Simple Data Exploration
For a quick overview of the data, you can use the ExplanationsExplorer class. It provides a simple interface to select a context and then plot the corresponding predictor contribution plots.

### Selecting a context
Select the desired context from the list available on the right.
* Selecting 'Any' means the plots will display an aggregation across all contexts.
* Selecting a specific context will result in plots which aggregate the data for that context only.

If you have a very large list of possible contexts, you can filter the list on the right by selecting specific context keys from the comboboxes on the left.

In [4]:
# Show the interactive context-selection widget. The context chosen here is
# the one used by the explorer's plotting call further down.
explanations.explorer.display()

GridBox(children=(HTML(value='<h3>Select Context Filters</>', layout=Layout(width='auto')), HTML(value='<h3>Co…

## Plotting the explanation data

The method shown below can be used to plot the explanation data for the context selected above.
* The first plot shows the top 10 predictors sorted by their average contributions to predictions; the number can be changed by passing a different value to the `top_n` parameter.
* The subsequent plots show the average prediction contributions for the different values of the top predictors.
    * Numeric predictor values are binned into deciles.
    * Categorical predictors are sorted and limited to the top 10 highest contributing values.

Note that the number of top predictors and top values can be changed by passing different values for the `top_n` and `top_k` parameters, respectively.

Additionally, if you are interested in seeing the least contributing predictors, the `descending` parameter can be set to `False`. This will plot the least contributing predictors instead of the most contributing ones.

In [5]:
# Plot contributions for the context picked in the explorer widget; when no
# context is selected the overall contributions are plotted instead.
# `top_n` limits the number of predictors and `top_k` the number of values
# shown per predictor (5 each here).
plots = explanations.explorer.plot_contributions(top_n = 5, top_k = 5)

No context selected, plotting overall contributions.


## Advanced Data Exploration
For more advanced data exploration you can directly use the ExplanationsDataLoader and ExplanationsDataLoader classes. These classes provide more flexibility in how the data is loaded and processed, allowing you to inspect the data before plotting.

In [7]:
# Short handle to the data loader; the cells below query it directly for
# predictor and predictor-value contributions.
data = explanations.data_loader

In [8]:
# Fetch the 5 top predictors by contribution, aggregated over the whole model.
# NOTE(review): `remaining=False` presumably drops the roll-up row for all
# predictors outside the top 5 — confirm against the data loader docs.
top_predictors = data.get_top_n_predictor_contribution_overall(
    top_n=5,
    remaining=False,
)

# Leave the frame as the last expression so the notebook renders it as a table.
top_predictors

partition,predictor_name,predictor_type,contribution,contribution_abs,contribution_weighted,contribution_weighted_abs,frequency,contribution_min,contribution_max
str,str,str,f64,f64,f64,f64,i64,f64,f64
"""whole_model""","""pyName""","""SYMBOLIC""",-0.021191,0.021255,-9.6e-05,9.7e-05,50000,-0.044283,0.029203
"""whole_model""","""Age""","""NUMERIC""",-0.011306,0.011738,-9.2e-05,9.5e-05,50000,-0.034704,0.023485
"""whole_model""","""Occupation""","""SYMBOLIC""",-0.008904,0.010193,-1.7e-05,1.9e-05,50000,-0.032185,0.06411
"""whole_model""","""CustomerName""","""SYMBOLIC""",-0.005215,0.005814,-4.2977e-07,5.4566e-07,50000,-0.024571,0.029192
"""whole_model""","""NumX""","""NUMERIC""",-0.003837,0.005153,-3.1e-05,4.2e-05,50000,-0.025888,0.029391


We can get the top predictors and inspect their most influential values

In [9]:
# Collect the names of the top predictors found above. `maintain_order=True`
# keeps them in the same row order as `top_predictors`; a plain `unique()`
# gives no ordering guarantee, so the list could come out in a different
# order from one run to the next.
predictors = (
    top_predictors
    .select(pl.col('predictor_name'))
    .unique(maintain_order=True)
    .to_series()
    .to_list()
)

# For each of those predictors, fetch its top 5 values across the whole model.
# NOTE(review): `remaining=False` presumably drops the roll-up row for values
# outside the top 5 — confirm against the data loader docs.
data.get_top_k_predictor_value_contribution_overall(predictors=predictors, top_k=5, remaining=False)

partition,predictor_name,predictor_type,bin_order,bin_contents,contribution,contribution_abs,contribution_weighted,contribution_weighted_abs,frequency,contribution_min,contribution_max,sort_column,sort_value
str,str,str,i64,str,f64,f64,f64,f64,i64,f64,f64,str,f64
"""whole_model""","""Age""","""NUMERIC""",0,"""MISSING""",-0.013647,0.015131,-0.000503,0.000557,1842,-0.034704,0.023485,"""bin_order""",0.0
"""whole_model""","""Age""","""NUMERIC""",4,"""[38.000:43.000]""",-0.012779,0.013046,-0.001231,0.001257,4816,-0.026804,0.014212,"""bin_order""",4.0
"""whole_model""","""Age""","""NUMERIC""",5,"""[43.000:48.000]""",-0.013108,0.013126,-0.001263,0.001264,4816,-0.026145,0.006043,"""bin_order""",5.0
"""whole_model""","""Age""","""NUMERIC""",6,"""[48.000:53.000]""",-0.012977,0.012984,-0.00125,0.001251,4816,-0.027359,0.007416,"""bin_order""",6.0
"""whole_model""","""Age""","""NUMERIC""",7,"""[53.000:58.000]""",-0.012498,0.012541,-0.001204,0.001208,4816,-0.025815,0.009732,"""bin_order""",7.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""whole_model""","""pyName""","""SYMBOLIC""",5,"""P19""",-0.02322,0.02324,-0.001171,0.001172,2522,-0.035388,0.008371,"""contribution""",0.02322
"""whole_model""","""pyName""","""SYMBOLIC""",4,"""P15""",-0.023321,0.023321,-0.001183,0.001183,2536,-0.034834,-0.003991,"""contribution""",0.023321
"""whole_model""","""pyName""","""SYMBOLIC""",3,"""P3""",-0.023462,0.023462,-0.001197,0.001197,2550,-0.034775,-0.006639,"""contribution""",0.023462
"""whole_model""","""pyName""","""SYMBOLIC""",15,"""P18""",-0.024462,0.024462,-0.001208,0.001208,2469,-0.03746,-0.009315,"""contribution""",0.024462


Let's repeat the same again, but this time we will inspect a specific context, instead of the entire model.

In [10]:
import random

# Seed for picking the example context. A dedicated, seeded generator makes
# Restart & Run All select the same context every time without touching the
# global `random` state.
# NOTE(review): this is only fully reproducible if get_unique_contexts_list()
# returns the contexts in a stable order — confirm.
CONTEXT_SEED = 42

context_info = random.Random(CONTEXT_SEED).choice(data.get_unique_contexts_list())

# Show which context was picked, one "key: value" pair per line.
print('Selected random context: \n')
for key, value in context_info.items():
    print(f'{key}: {value}')

# Same top-5 predictor query as for the whole model, restricted to the picked context.
top_predictors_for_selected_context = data.get_top_n_predictor_contribution_by_context(
    context=context_info,
    top_n=5,
    remaining=False,
)
top_predictors_for_selected_context


Selected random context: 

pyChannel: PegaBatch
pyDirection: E2E Test
pyGroup: E2E Test
pyIssue: Batch
pyName: P2


partition,predictor_name,predictor_type,contribution,contribution_abs,contribution_weighted,contribution_weighted_abs,frequency,contribution_min,contribution_max
str,str,str,f64,f64,f64,f64,i64,f64,f64
"""{""partition"":{""pyChannel"":""Peg…","""pyName""","""SYMBOLIC""",-0.021438,0.021438,-0.001949,0.001949,2461,-0.033658,-0.001903
"""{""partition"":{""pyChannel"":""Peg…","""Age""","""NUMERIC""",-0.010652,0.010948,-8.6e-05,8.8e-05,2461,-0.025554,0.019359
"""{""partition"":{""pyChannel"":""Peg…","""Occupation""","""SYMBOLIC""",-0.008352,0.009558,-1.6e-05,1.8e-05,2461,-0.024913,0.040308
"""{""partition"":{""pyChannel"":""Peg…","""CustomerName""","""SYMBOLIC""",-0.004549,0.005229,-4.602e-07,5.9636e-07,2461,-0.018731,0.024085
"""{""partition"":{""pyChannel"":""Peg…","""NumX""","""NUMERIC""",-0.003242,0.004546,-2.6e-05,3.6e-05,2461,-0.016668,0.012681


In [11]:
# Collect the names of the top predictors for the selected context.
# `maintain_order=True` keeps them in the same row order as
# `top_predictors_for_selected_context`; a plain `unique()` gives no ordering
# guarantee, so the list could come out in a different order between runs.
predictors_for_selected_context = (
    top_predictors_for_selected_context
    .select(pl.col('predictor_name'))
    .unique(maintain_order=True)
    .to_series()
    .to_list()
)

# For each of those predictors, fetch its top 5 values within the selected context.
# NOTE(review): `remaining=False` presumably drops the roll-up row for values
# outside the top 5 — confirm against the data loader docs.
data.get_top_k_predictor_value_contribution_by_context(
    predictors=predictors_for_selected_context,
    top_k=5,
    context=context_info,
    remaining=False,
)

partition,predictor_name,predictor_type,bin_order,bin_contents,contribution,contribution_abs,contribution_weighted,contribution_weighted_abs,frequency,contribution_min,contribution_max,sort_column,sort_value
str,str,str,i64,str,f64,f64,f64,f64,i64,f64,f64,str,f64
"""{""partition"":{""pyChannel"":""Peg…","""Age""","""NUMERIC""",0,"""MISSING""",-0.014828,0.01534,-0.000506,0.000524,84,-0.025554,0.004663,"""bin_order""",0.0
"""{""partition"":{""pyChannel"":""Peg…","""Age""","""NUMERIC""",4,"""[38.000:43.000]""",-0.011797,0.012285,-0.001141,0.001188,238,-0.021092,0.006446,"""bin_order""",4.0
"""{""partition"":{""pyChannel"":""Peg…","""Age""","""NUMERIC""",6,"""[48.000:53.000]""",-0.011554,0.011554,-0.001117,0.001117,238,-0.018953,-0.003113,"""bin_order""",6.0
"""{""partition"":{""pyChannel"":""Peg…","""Age""","""NUMERIC""",8,"""[59.000:64.000]""",-0.011352,0.011352,-0.001093,0.001093,237,-0.017332,-0.00378,"""bin_order""",8.0
"""{""partition"":{""pyChannel"":""Peg…","""Age""","""NUMERIC""",9,"""[64.000:71.000]""",-0.011313,0.011365,-0.001089,0.001094,237,-0.020122,0.002269,"""bin_order""",9.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""{""partition"":{""pyChannel"":""Peg…","""Occupation""","""SYMBOLIC""",2,"""Historic buildings inspector/c…",-0.01329,0.01329,-0.000459,0.000459,85,-0.021598,-0.002461,"""contribution""",0.01329
"""{""partition"":{""pyChannel"":""Peg…","""Occupation""","""SYMBOLIC""",11,"""Operational investment banker""",-0.013435,0.013435,-0.000317,0.000317,58,-0.021134,-0.001961,"""contribution""",0.013435
"""{""partition"":{""pyChannel"":""Peg…","""Occupation""","""SYMBOLIC""",29,"""Psychotherapist, child""",-0.01416,0.01416,-0.000253,0.000253,44,-0.022091,-0.003855,"""contribution""",0.01416
"""{""partition"":{""pyChannel"":""Peg…","""Occupation""","""SYMBOLIC""",44,"""Geneticist, molecular""",-0.016624,0.016624,-0.000257,0.000257,38,-0.024913,-0.010677,"""contribution""",0.016624
