In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline

from calidhayte.calibrate import Calibrate

import shap
import matplotlib.pyplot as plt

# Apply matplotlib's 'bmh' style sheet to every figure drawn after this point
plt.style.use('bmh')

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
# Seed the legacy global RNG so the synthetic data are reproducible
np.random.seed(4)

# Four uniform-random predictor columns of 300 samples each. The columns are
# drawn one after another so the random stream is consumed in a fixed order.
x_df = pd.DataFrame(
    {name: np.random.rand(300) for name in ('x', 'a', 'b', 'c')}
)
# One normally distributed coefficient per predictor column
coeffs = np.random.randn(4)

# The target is the coefficient-weighted sum of the predictors
modded = x_df * coeffs
y_df = pd.DataFrame(
    {
        'x': modded.sum(axis=1),
        # Five consecutive folds (0-4) of 60 samples each
        'Fold': [fold for fold in range(5) for _ in range(60)],
    }
)

# Fit the calibration techniques that are explained further down
cal = Calibrate(x_df, y_df, target='x')
cal.linreg()
cal.theil_sen()
cal.random_forest()

models = cal.return_models()

The least populated class in y has only 2 members, which is less than n_splits=5.


In [28]:
class ShapPlots:
    """
    Calculates shap values using the [shap](https://shap.readthedocs.io/)
    package's KernelExplainer so that they can be plotted in a style that
    matches the rest of the module.

    Calling an instance computes the shap values of every fold and returns
    them as a single dataframe. Plotting is not implemented yet.
    """

    def __init__(
        self,
        x: pd.DataFrame,
        y: pd.DataFrame,
        pipeline: "dict[int, Pipeline]"
    ):
        """
        Parameters
        ----------
        x : pd.DataFrame
            Independent variable(s) that are calibrated against `y`, the
            dependent variable. Index should match `y`.
        y : pd.DataFrame
            Dependent variable used to calibrate the independent variables `x`.
            Index should match `x`. Must contain a `Fold` column whose values
            correspond to the keys of `pipeline`.
        pipeline : dict[int, Pipeline]
            Prefitted scikit-learn pipelines used to calibrate data, keyed by
            fold. Last step of each pipeline should be a regressor, previous
            steps preprocess data.

        Raises
        ------
        ValueError
            If `x` and `y` index don't match
        """
        # Sort once and reuse the result for both the check and the attributes
        x_sorted = x.sort_index()
        y_sorted = y.sort_index()
        # Compare the sorted labels directly rather than through index-aligned
        # Series, whose alignment is unreliable when labels are duplicated
        if not x_sorted.index.equals(y_sorted.index):
            raise ValueError(
                'Index of x and y do not match. Output of Calibrate class '
                'in calidhayte should have matching indexes'
            )
        self.y: pd.DataFrame = y_sorted
        """
        Dependent variable used to calibrate the independent variables `x`,
        sorted by index. Contains the `Fold` column used to split the data.
        """
        self.pipeline = pipeline
        """
        Prefitted scikit-learn pipelines used to calibrate data, keyed by
        fold. Last step of each pipeline should be a regressor, previous steps
        preprocess data.
        """
        self.x: pd.DataFrame = x_sorted
        """
        Independent variable(s) that are calibrated against `y`, the dependent
        variable, sorted by index.
        """

    def __call__(self) -> pd.DataFrame:
        """
        Calculate the shap values of every fold.

        For each fold, the rows of `x` belonging to that fold are passed
        through the preprocessing steps of the fold's pipeline and then
        explained with a KernelExplainer built around the `predict` method of
        the pipeline's last step. The same preprocessed rows are used as the
        explainer's background data.

        Returns
        -------
        pd.DataFrame
            One row per explained measurement (indexed like `x`), one column
            per column of `x` holding its shap value, plus a `Fold` column.
            Empty if there are no folds.
        """
        fold_frames = []
        for fold, fold_pipeline in self.pipeline.items():
            fold_index = self.y.loc[self.y['Fold'] == fold].index
            x_fold = self.x.loc[fold_index, :]
            # NOTE(review): reusing the column names of `x` assumes the
            # preprocessing steps keep the number and order of columns
            x_data = pd.DataFrame(
                fold_pipeline[:-1].transform(x_fold),
                index=x_fold.index,
                columns=x_fold.columns
            )
            explainer = shap.KernelExplainer(
                model=fold_pipeline[-1].predict,
                data=x_data,
                link='identity'
            )
            shaps_fold = pd.DataFrame(
                explainer.shap_values(x_data),
                index=x_data.index,
                columns=x_data.columns
            )
            shaps_fold['Fold'] = fold
            fold_frames.append(shaps_fold)

        # Concatenate once instead of growing a dataframe inside the loop;
        # pd.concat rejects an empty list, so fall back to an empty frame
        shaps: pd.DataFrame = (
            pd.concat(fold_frames) if fold_frames else pd.DataFrame()
        )
        # Kept so that existing cells which call the instance without using
        # the return value still show the result
        print(shaps)
        return shaps
        
        
        


[This is a link](https://uhoh)

In [29]:
# Only the first technique returned by the calibration is explained: the loop
# stops after a single iteration.
for key in models:
    # Per-fold pipelines trained without scaling on all four predictors
    fold_pipelines = models[key]['None']['x + a + b + c']
    shapley = ShapPlots(x_df, y_df, fold_pipelines)
    shapley()
    break

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

            x         a         b         c  Fold
0   -0.278434 -0.117119 -0.076039  0.012128     0
1   -0.015930  0.086103  0.139363  0.203220     0
2   -0.281970  0.009281  0.134900  0.095498     0
3   -0.120722 -0.231971 -0.015637 -0.081366     0
4   -0.110037 -0.190966 -0.056087 -0.191517     0
..        ...       ...       ...       ...   ...
235  0.244293 -0.131339  0.095616 -0.204153     3
236  0.176266 -0.154836 -0.124654  0.120692     3
237  0.082595 -0.145298  0.180265 -0.229416     3
238  0.295627  0.252997 -0.003423  0.111409     3
239 -0.202004  0.183220  0.185652  0.183143     3

[300 rows x 5 columns]
