# Preparations

In [1]:

from pprint import pprint

import numpy
import pandas
import sklearn
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from pytolemaic.pytrust import SklearnTrustBase

from pytolemaic.utils.dmd import DMD
from pytolemaic.utils.general import GeneralUtils
from pytolemaic.utils.metrics import Metrics


In [2]:
# Load the wine dataset that ships with scikit-learn.
from sklearn.datasets import load_wine

data = load_wine()
x, y = data.data, data.target

# Generic placeholder names, one per column of x.
feature_names = ['feature #{:d}'.format(k) for k in range(x.shape[1])]
print(x.shape, y.shape)


(178, 13) (178,)


In [3]:
# Seeded generator so the injected gaps are reproducible.
rs = numpy.random.RandomState(0)

# Multiplicative mask over the flattened array: 1.0 keeps a value, NaN blanks it.
# 100 randomly chosen flat positions are set to NaN.
nan_locs = numpy.ones(x.size)
nan_locs[rs.permutation(nan_locs.size)[:100]] = numpy.nan
nan_locs = nan_locs.reshape(x.shape)

# Rebind x to a new array; anything multiplied by NaN becomes NaN.
x = x * nan_locs
print("number of missing valies in x:", numpy.isnan(x).sum(), "out of", x.size)

number of missing valies in x: 100 out of 2314


In [4]:
# Shuffle the row indices, then hold out the last 60 shuffled rows as the test set.
perm = rs.permutation(y.shape[0])
ntrain = y.shape[0] - 60
xtrain, ytrain = x[perm[:ntrain]], y[perm[:ntrain]]
xtest, ytest = x[perm[ntrain:]], y[perm[ntrain:]]

In [5]:
# Fit a RandomForest classifier behind an imputation step.
# The imputation step is required for the sensitivity-to-missing-values analysis.
estimators = [
    ('Imputer', SimpleImputer()),
    ('Estimator', RandomForestClassifier(n_estimators=10, random_state=0)),
]
estimator = Pipeline(steps=estimators)

# Last expression of the cell: fits the pipeline and displays its repr.
estimator.fit(xtrain, ytrain)

Pipeline(memory=None,
         steps=[('Imputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('Estimator',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=10, n_jobs=None,
                                        oob_score=False, random_state=0,
                               

In [6]:
# Configuration values handed to SklearnTrustBase.

# Splitting strategy.
splitter = 'shuffled'

# Name of the metric of interest (recall).
metric_of_interest = Metrics.recall.name

# Init Pytrust object

In [7]:
    # No per-sample metadata (e.g. sample weights) is used in this example.
    sample_meta_train = sample_meta_test = None

    # Column metadata: attach the feature names.
    columns_meta = {
        DMD.FEATURE_NAMES: feature_names,
    }

    # Bundle the fitted model, both data splits and the settings into one object.
    pytrust = SklearnTrustBase(
        model=estimator,
        xtrain=xtrain,
        ytrain=ytrain,
        xtest=xtest,
        ytest=ytest,
        sample_meta_train=sample_meta_train,
        sample_meta_test=sample_meta_test,
        columns_meta=columns_meta,
        metric=metric_of_interest,
        splitter=splitter,
    )



# Sensitivity Analysis Example

In [8]:

    # Build the sensitivity report from the trust object and keep it in
    # `sensitivity_report`, then pretty-print its simplified-key view.
    sensitivity_report = pytrust.sensitivity_report()
    pprint(sensitivity_report.simplified_keys())


{'MISSING': {'META': {'N_FEATURES': 13,
                      'N_LOW': 5,
                      'N_NON_ZERO': 8,
                      'N_ZERO': 5},
             'SENSITIVITY': {'feature #0': 0.03508,
                             'feature #1': 0.03508,
                             'feature #10': 0.07016,
                             'feature #11': 0.0,
                             'feature #12': 0.12591,
                             'feature #2': 0.0,
                             'feature #3': 0.0,
                             'feature #4': 0.0,
                             'feature #5': 0.10524,
                             'feature #6': 0.44551,
                             'feature #7': 0.0,
                             'feature #8': 0.06101,
                             'feature #9': 0.12202}},
 'SHUFFLE': {'META': {'N_FEATURES': 13,
                      'N_LOW': 6,
                      'N_NON_ZERO': 7,
                      'N_ZERO': 6},
             'SENSITIVITY': {'feature #0'

# Scoring example

In [17]:
# Pretty-print the simplified-key view of the scoring report.
pprint(pytrust.scoring_report().simplified_keys())

{'QUALITY': 1.0,
 'recall': {'CI_HIGH': 0.95014, 'CI_LOW': 0.89584, 'SCORE_VALUE': 0.91707}}


# Quality report

In [18]:
# Pretty-print the simplified-key view of the quality report.
pprint(pytrust.quality_report().simplified_keys())

{'test_set': {'overall_quality': 0.9407896889005202,
              'quality_components': {'ci_ratio': 0.9407896889005202,
                                     'separation_quality': 1.0}},
 'train_set': {'overall_quality': 0,
               'quality_components': {'imputation': 0.59553,
                                      'leakage': 0.99033,
                                      'overfit': 0.32062999999999997}}}
