In [16]:
import shap
import explanations


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from utils import feature_names
import time

In [17]:
# This cell defines the functions we use to generate the data in our scenario

import numpy as np
import pandas as pd
import scipy.stats
import sklearn
import xgboost


class FixableDataFrame(pd.DataFrame):
    """DataFrame in which selected columns are pinned to fixed values.

    Whenever a column whose name appears in ``fixed`` is assigned, the
    assigned value is immediately overwritten with the fixed one. This lets a
    generative model be written once and re-run with some variables held
    constant.

    Parameters
    ----------
    *args, **kwargs
        Forwarded unchanged to ``pd.DataFrame``.
    fixed : dict, optional
        Mapping of column name -> value to force for that column. ``None``
        (the default) means no column is pinned.
    """

    def __init__(self, *args, fixed=None, **kwargs):
        # A fresh dict per instance: a mutable default argument would be one
        # object shared by every frame constructed without ``fixed``.
        # The mapping is written straight into the instance ``__dict__``
        # instead of going through attribute assignment.
        self.__dict__["__fixed_var_dictionary"] = {} if fixed is None else fixed
        super().__init__(*args, **kwargs)

    def __setitem__(self, key, value):
        """Assign ``value`` to ``key``, then re-assign the fixed value if ``key`` is pinned."""
        # The requested value is written first and the pinned value second.
        # NOTE(review): presumably the first write is what gives the frame its
        # index, so a scalar pinned value can broadcast to n rows - confirm.
        out = super().__setitem__(key, value)
        # ``.get`` keeps plain assignment working even if the bookkeeping
        # entry is missing from ``__dict__``.
        pinned = self.__dict__.get("__fixed_var_dictionary", {})
        if isinstance(key, str) and key in pinned:
            out = super().__setitem__(key, pinned[key])
        return out


# generate the data
def generator(n, fixed=None, seed=0):
    """Simulate ``n`` customers for the subscriber retention example.

    Parameters
    ----------
    n : int
        Number of rows to simulate.
    fixed : dict, optional
        Mapping of column name -> value handed to ``FixableDataFrame``; a
        column named here is overwritten with that value right after it is
        generated, and every later column is computed from the overwritten
        value. ``None`` (the default) pins nothing.
    seed : int or None, optional
        Seed applied to NumPy's *global* random state before sampling.
        ``None`` leaves the global state as it is.

    Returns
    -------
    FixableDataFrame
        Columns, in order: "Sales calls", "Interactions", "Product need",
        "Discount", "Monthly usage", "Ad spend" and the 0/1 label
        "Did renew".

    Notes
    -----
    The "Economy", "Last upgrade", "Bugs faced" and "Bugs reported" features
    are not generated in this version of the scenario.
    The order of the random draws below determines the seeded output, so it
    must not be rearranged.
    """
    if seed is not None:
        np.random.seed(seed)
    # Resolve the default here so a shared mutable default is never used and
    # the frame always receives a real dict.
    X = FixableDataFrame(fixed={} if fixed is None else fixed)

    # the number of sales calls made to this customer (rounded to 0..4)
    X["Sales calls"] = np.random.uniform(0, 4, size=(n,)).round()

    # total interactions: the sales calls plus a Poisson(0.2) number of extras
    X["Interactions"] = X["Sales calls"] + np.random.poisson(0.2, size=(n,))

    # how much the user perceives that they need the product
    X["Product need"] = X["Sales calls"] * 0.1 + np.random.normal(0, 1, size=(n,))

    # the fractional discount offered to this customer upon renewal;
    # it shrinks as product need grows and always lies between 0 and 0.5
    X["Discount"] = ((1 - scipy.special.expit(X["Product need"])) * 0.5 + 0.5 * np.random.uniform(0, 1, size=(n,))) / 2

    # fraction (0..1) of the days in the last period on which the user was
    # actively using the product
    X["Monthly usage"] = scipy.special.expit(X["Product need"] * 0.3 + np.random.normal(0, 1, size=(n,)))

    # how much ad money we spent per user targeted at this user (or a group
    # this user is in): monthly usage scaled by a factor between 0.9 and 0.99.
    # NOTE(review): the bounds are passed as (low=0.99, high=0.9). NumPy
    # documents high < low as officially undefined and possibly an error in
    # the future. They are left as-is because swapping them changes the
    # seeded values; swap them when the notebook outputs are regenerated.
    X["Ad spend"] = (
        X["Monthly usage"] * np.random.uniform(0.99, 0.9, size=(n,))
    )

    # probability that the user renews
    X["Did renew"] = scipy.special.expit(
        7
        * (
            0.1 * X["Product need"]
            + 0.1 * X["Monthly usage"]
            + 0.1 * X["Discount"]
            + 0.1 * np.random.normal(0, 1, size=(n,))
            + 0.1 * X["Sales calls"]
            + 0.1 * X["Interactions"]
            + 0.1 * X["Ad spend"]
            - 0.45
        )
    )

    # Replace the probability with a random 0/1 draw, as would be observed in
    # real life. Commenting this line out keeps the label as the probability,
    # which gives less noisy plots with the same basic results.
    X["Did renew"] = scipy.stats.bernoulli.rvs(X["Did renew"])

    return X


def user_retention_dataset(n=10000):
    """Build the observed training data for the retention scenario.

    Parameters
    ----------
    n : int, optional
        Number of customers to simulate with ``generator``. Defaults to
        10000, the size that was previously hard-coded.

    Returns
    -------
    X : DataFrame
        The generated columns without the label "Did renew" and without
        "Product need", so a model trained on ``X`` never sees either.
    y : Series
        The "Did renew" column.
    """
    X_full = generator(n)
    y = X_full["Did renew"]
    X = X_full.drop(["Did renew", "Product need"], axis=1)
    return X, y


def fit_xgboost(X, y):
    """Fit an XGBoost booster on ``X``/``y``, stopping early on a held-out split.

    The data is split with scikit-learn's default proportions. No
    ``random_state`` is passed to the split. Returns the trained booster.
    """
    booster_params = {
        "eta": 0.001,
        "subsample": 0.5,
        "max_depth": 2,
        "objective": "reg:logistic",
    }

    splits = sklearn.model_selection.train_test_split(X, y)
    X_train, X_test, y_train, y_test = splits

    train_matrix = xgboost.DMatrix(X_train, label=y_train)
    eval_matrix = xgboost.DMatrix(X_test, label=y_test)

    # The round budget is deliberately huge; training ends once the "test"
    # metric has not improved for 20 consecutive rounds.
    return xgboost.train(
        booster_params,
        train_matrix,
        num_boost_round=200000,
        evals=[(eval_matrix, "test")],
        early_stopping_rounds=20,
        verbose_eval=False,
    )

In [18]:
# Build the observed features/labels and hold out 20% of the rows.
X, y = user_retention_dataset()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Fit the random forest, then show its score on the held-out rows.
model = RandomForestClassifier(max_depth=100, random_state=42).fit(X_train, y_train)
model.score(X_test, y_test)

0.7455

In [19]:
# Display the feature matrix returned by user_retention_dataset().
X

Unnamed: 0,Sales calls,Interactions,Discount,Monthly usage,Ad spend
0,2.0,2.0,0.317545,0.269396,0.246908
1,3.0,3.0,0.174127,0.722157,0.700239
2,2.0,2.0,0.196308,0.507010,0.473837
3,2.0,2.0,0.152589,0.694552,0.685110
4,2.0,2.0,0.059299,0.221702,0.204957
...,...,...,...,...,...
9995,2.0,3.0,0.169215,0.614662,0.606331
9996,2.0,2.0,0.127022,0.500035,0.456393
9997,3.0,3.0,0.237312,0.625581,0.581991
9998,0.0,1.0,0.172525,0.263055,0.241789


In [20]:
# Time the project-local explanation routine on the held-out split.
# perf_counter() is used instead of time.time() because it is the clock
# intended for measuring short intervals and is unaffected by system clock
# adjustments.
st = time.perf_counter()
print(type(y_test))
explanation_app = explanations.model_explanations(X_test, y_test, model, output='text')
et = time.perf_counter()
print(et - st)

# With output='text' the result is displayed directly as the cell output.
explanation_app


<class 'pandas.core.series.Series'>
0.17219996452331543


Unnamed: 0,Feature,Importance
0,Sales calls,0.156
1,Interactions,0.087
2,Discount,0.054
3,Monthly usage,0.044
4,Ad spend,0.043


In [21]:
# Time SHAP on the same model and rows, for comparison with the cell above.
# perf_counter() is the clock intended for interval timing.
st = time.perf_counter()
# X_test is passed both as the explainer's second argument and as the rows
# to explain. The additivity check is switched off.
explainer = shap.Explainer(model, X_test)
shap_values = explainer(X_test, check_additivity=False)
et = time.perf_counter()
print(et - st)



77.57912969589233


In [22]:
# Average the SHAP values over axis 0, keep index 0 of the trailing axis, and
# flatten to one number per column of X_test.
# NOTE(review): presumably the values are (rows, features, classes), so this
# keeps the class-0 attributions - confirm that class 0 is the intended one.
# NOTE(review): this is a signed mean, not a mean of absolute values - verify
# that feature_names() expects that.
mean_shap = shap_values.values.mean(axis=0)
attributions = mean_shap[:, 0].reshape(X_test.shape[1])
feature_names(X_test, attributions)


Unnamed: 0,Feature,Importance
2,Discount,0.005
0,Sales calls,0.003
1,Interactions,0.003
3,Monthly usage,0.003
4,Ad spend,0.001
