# Building a scan programmatically

In this notebook we'll build up a scan definition from first principles, against a local model trained within the
notebook.  We will then run that scan and save its results.  Finally we will extract the scan definition as YAML, which could be used to run the same scan (potentially on revised models or datasets) via the Certifai stand-alone scanner.

MODIFIED A LITTLE AS A TESTBED FOR SEGMENTED MODELS - not illustrative of anything apart from how to go about that

In [1]:
import pandas as pd
import matplotlib as plt
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import numpy as np
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from copy import copy

from certifai.common.utils import set_verbose
from certifai.scanner.builder import (CertifaiScanBuilder, CertifaiPredictorWrapper, CertifaiModel, CertifaiModelMetric,
                                      CertifaiDataset, CertifaiGroupingFeature, CertifaiDatasetSource,
                                      CertifaiPredictionTask, CertifaiTaskOutcomes, CertifaiOutcomeValue)
from certifai.scanner.report_utils import scores, construct_scores_dataframe

# Turn verbose mode off to limit the amount of logging produced by the cells below
set_verbose(False)

In [2]:
# Special import: CatEncoder has to live outside this notebook.
# For multiprocessing to work in a notebook, pickled classes must be importable from a
# separate package or notebook, so the encoder class cannot be defined in a cell here.
#
# Alternatives that were tried and rejected:
#   from ipynb.fs.defs.cat_encoder import CatEncoder   # <- doesn't work on Azure Notebooks
#   %run cat_encoder.py                                # <- doesn't work because code doesn't remain external
#
# Azure Notebooks workaround: put the current working directory at the front of the
# module search path and import the encoder as an ordinary module.
import os
import sys

working_dir = os.path.abspath('.')
sys.path.insert(0, working_dir)
from cat_encoder import CatEncoder

In [3]:
# The example trains its classifiers on the German Credit dataset
base_path = 'data'
all_data_file = f"{base_path}/german_credit_eval.csv"
df = pd.read_csv(all_data_file)

# Column holding the ground-truth outcome
label_column = 'outcome'

# Columns listed as categorical (this list is handed to the encoder further down)
cat_columns = [
    'checkingstatus', 'history', 'purpose', 'savings', 'employ', 'status', 'others',
    'property', 'age', 'otherplans', 'housing', 'job', 'telephone', 'foreign',
]

# SEGMENT MODEL TEST
# Add a randomized column that will select between two 'segment' models on a per-row basis.
# Note - we're not really doing any REAL segmentation here - this is just a structural example
# to allow a model to be chosen based on the value of a column.  For this test the selection
# column is set at random and chooses between a logistic and an SVM model.
# A dedicated, seeded RandomState makes the assignment (and therefore the saved CSV and
# everything trained or scanned from it) repeatable across kernel restarts, without
# touching NumPy's global random state.
selector_rng = np.random.RandomState(42)
df["model_selector"] = selector_rng.choice(["logistic", "SVM"], len(df))
cat_columns.append('model_selector')

# Save this so we have a CSV that matches our augmented data and contains a segment label
augmented_data_file = 'augmented_data'
df.to_csv(augmented_data_file, index=False)

# Split the frame into model inputs (X) and the ground-truth outcome (y)
X = df.drop(columns=label_column)
y = df[label_column]

# Hold back 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Note - the encoder MUST be defined in a separate file to support python multi-processing
# in the context of a notebook, which is why `CatEncoder` is imported rather than declared here
encoder = CatEncoder(cat_columns, X)

def build_model(data, name, model_family, test=None):
    """Grid-search a classifier of the requested family and print its accuracy.

    Args:
        data: ``(features, labels)`` pair the model is fitted on.
        name: Human-readable model name; only used in the printed accuracy line.
        model_family: Either ``'SVM'`` or ``'logistic'``.
        test: Optional ``(features, labels)`` pair to score on.  When omitted the
            model is scored on ``data`` itself.

    Returns:
        The fitted ``GridSearchCV`` object.

    Raises:
        ValueError: If ``model_family`` is not one of the supported families.
    """
    supported_families = ('SVM', 'logistic')
    # Fail fast with a clear message instead of hitting an unbound local further down
    if model_family not in supported_families:
        raise ValueError(
            f"Unsupported model_family {model_family!r}; expected one of {supported_families}")

    if test is None:
        test = data

    if model_family == 'SVM':
        parameters = {'kernel':('linear', 'rbf', 'poly'), 'C':[0.1, .5, 1, 2, 4, 10], 'gamma':['auto']}
        m = svm.SVC()
    else:  # 'logistic'
        parameters = {'C': (0.5, 1.0, 2.0), 'solver': ['lbfgs'], 'max_iter': [1000]}
        m = LogisticRegression()
    model = GridSearchCV(m, parameters, cv=3)
    model.fit(data[0], data[1])

    # Assess on the test data; np.asarray accepts a pandas Series as well as a plain array/list
    accuracy = model.score(test[0], np.asarray(test[1]))
    print(f"Model '{name}' accuracy is {accuracy}")
    return model

# Encode the train and test features once and share them between both models, rather
# than re-encoding the same frames for every build_model call.
# NOTE(review): assumes `encoder(...)` is a pure transform of its input - confirm in cat_encoder.py
X_train_encoded = encoder(X_train.values)
X_test_encoded = encoder(X_test.values)

svm_model = build_model((X_train_encoded, y_train),
                        'Support Vector Machine',
                        'SVM',
                        test=(X_test_encoded, y_test))

logistic_model = build_model((X_train_encoded, y_train),
                             'Logistic classifier',
                             'logistic',
                             test=(X_test_encoded, y_test))

Model 'Support Vector Machine' accuracy is 0.76
Model 'Logistic classifier' accuracy is 0.77


In [4]:
# SEGMENT MODEL TEST

# Build a joint model that delegates to the appropriate segment model.  Note - JointModel lives
# in a separate file because of the issue with Python multi-processing and pickling with notebooks
from joint_model import JointModel

segment_selection_model_column_mapping = encoder.cat_indexes_of_feature('model_selector')

# Attach each segment model to the entry for its one-hot encoded selector column
segment_models = {
    'model_selector_SVM': svm_model,
    'model_selector_logistic': logistic_model,
}
for selector_column, segment_model in segment_models.items():
    segment_selection_model_column_mapping[selector_column]['model'] = segment_model

joint_model = JointModel(segment_selection_model_column_mapping)

# The models work in the one-hot encoded space, so encode the data
encoded_X = encoder(X.values)

# The FOLLOWING is only a sanity check that the delegation is wired up correctly -
# it is not needed for real usage.

# Predictions from the joint model, which should delegate per row
all_joint_predictions = joint_model.predict(encoded_X)

# Predictions from each segment model run directly on its own segment's rows
logistic_indexes = X['model_selector'] == 'logistic'
SVM_indexes = X['model_selector'] == 'SVM'
logistic_rows = encoded_X[logistic_indexes]
SVM_rows = encoded_X[SVM_indexes]
logistic_predictions = logistic_model.predict(logistic_rows)
SVM_predictions = svm_model.predict(SVM_rows)

# Compare each segment's direct predictions with the joint model's output for the same rows
segment_checks = (
    ("Logistic", logistic_indexes, logistic_predictions),
    ("SVM", SVM_indexes, SVM_predictions),
)
for segment_label, segment_mask, segment_predictions in segment_checks:
    joint_segment_predictions = all_joint_predictions[np.where(segment_mask.values)]
    if np.all(segment_predictions == joint_segment_predictions):
        print(f"{segment_label} segment predictions match")
    else:
        print(f"{segment_label} segment predictions do NOT match!!")

Logistic segment predictions match
SVM segment predictions match


In [5]:
# Wrap the joint model up for use by Certifai as a local model.
# The encoder is handed to the wrapper alongside the model - presumably so that Certifai can
# encode raw dataset rows before calling predict; confirm against the Certifai documentation
joint_model_proxy = CertifaiPredictorWrapper(joint_model, encoder=encoder)

In [None]:
# Build the scan object from scratch with the ScanBuilder class

# Start with the possible prediction outcomes; a granted loan is the favorable one
loan_outcomes = [
    CertifaiOutcomeValue(1, name='Loan granted', favorable=True),
    CertifaiOutcomeValue(2, name='Loan denied'),
]
outcome_spec = CertifaiTaskOutcomes.classification(loan_outcomes)
task = CertifaiPredictionTask(outcome_spec,
                              prediction_description='Determine whether a loan should be granted')

scan = CertifaiScanBuilder.create('test_user_case', prediction_task=task)

# Register our local model with the scan
first_model = CertifaiModel('Joint', local_predictor=joint_model_proxy)
scan.add_model(first_model)

# Register the evaluation dataset, read from the augmented data file as CSV
eval_dataset = CertifaiDataset('evaluation', CertifaiDatasetSource.csv(augmented_data_file))
scan.add_dataset(eval_dataset)

# Set up a fairness evaluation of the model on the above dataset.
# Disparity is examined between groups defined by age and marital status.
for grouping_feature_name in ('age', 'status'):
    scan.add_fairness_grouping_feature(CertifaiGroupingFeature(grouping_feature_name))
scan.add_evaluation_type('fairness')
scan.evaluation_dataset_id = 'evaluation'

# The dataset carries a ground truth outcome column that the model does not expect as
# input.  That cannot be inferred from the CSV, so it has to be declared in the dataset schema.
scan.dataset_schema.outcome_feature_name = 'outcome'

# Run the scan.
# By default a scan writes its results into individual report files (one per model and evaluation
# type) in the 'reports' directory relative to the Jupyter root.  Here that file output is
# switched off by passing `write_reports=False`, so the reports are only returned in memory.
# The result is a dictionary of dictionaries of reports.  The top level dict key is the evaluation type
# and the second level key is model id.
# Reports saved as JSON (which `write_reports=True` will do) may be visualized in the console app
result = scan.run(write_reports=False)


2020-05-15 18:52:38,028 root   INFO     Validating license...
2020-05-15 18:52:38,029 root   INFO     License is valid - expires: n/a
2020-05-15 18:52:38,048 root   INFO     Generated unique scan id: 5d57ed885339
2020-05-15 18:52:38,050 root   INFO     Validating input data...
2020-05-15 18:52:38,051 root   INFO     Creating dataset with id: evaluation
2020-05-15 18:52:38,079 root   INFO     Inferring dataset features and applying user overrides
2020-05-15 18:52:38,081 root   INFO     Integer-valued feature 'duration' inferred to be numeric (sample cardinality 33)
2020-05-15 18:52:38,085 root   INFO     Integer-valued feature 'amount' inferred to be numeric (sample cardinality 921)
2020-05-15 18:52:38,087 root   INFO     Integer-valued feature 'installment' inferred to be categorical (sample cardinality 4)
2020-05-15 18:52:38,090 root   INFO     Integer-valued feature 'residence' inferred to be categorical (sample cardinality 4)
2020-05-15 18:52:38,099 root   INFO     Integer-valued fe

In [None]:
# `result` is a dictionary keyed on analysis, holding reports keyed on model id.
# The console app is the recommended way to view these (after saving the results to file,
# see the previous cell), but the result can also be analysed programmatically as done here.
fairness_scores = scores('fairness', result)
df = construct_scores_dataframe(fairness_scores, include_confidence=False)
display(df)