Copyright (c) 2020. Cognitive Scale Inc. All rights reserved.
Licensed under CognitiveScale Example Code [License](https://github.com/CognitiveScale/cortex-certifai-examples/blob/7998b8a481fccd467463deb1fc46d19622079b0e/LICENSE.md)


# Building a scan programmatically for a regression use case

In this notebook we'll go through the end-to-end process of building local models for a regression use case and running a Certifai scan on said models to analyze Robustness, Fairness, Explainability, Explanation, and Performance. The models will be predicting the final settled claim amount for auto insurance claims.

In [1]:
import pandas as pd
import matplotlib as plt
from IPython.display import display
from sklearn.model_selection import train_test_split
import numpy as np
import random
import pprint

from sklearn.svm import LinearSVR
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score

from certifai.common.utils.encoding import CatEncoder
from certifai.scanner.builder import (CertifaiScanBuilder, CertifaiPredictorWrapper, CertifaiModel, CertifaiModelMetric,
                                      CertifaiDataset, CertifaiGroupingFeature, CertifaiDatasetSource,
                                      CertifaiPredictionTask, CertifaiTaskOutcomes, CertifaiOutcomeValue)
from certifai.scanner.report_utils import scores, construct_scores_dataframe
from certifai.scanner.explanation_utils import explanations

In [2]:
# Configuration: seed and dataset locations used throughout the notebook
RANDOM_SEED = 42
base_path = '..'
all_data_file = f"{base_path}/datasets/auto_insurance_claims_dataset.csv"
explanation_data_file = f"{base_path}/datasets/auto_insurance_explan.csv"

# Name of the ground-truth column, and the features that are passed to the
# encoder as categorical
label_column = "Total Claim Amount"
cat_columns = [
    'State Code',
    'Coverage',
    'Education',
    'EmploymentStatus',
    'Gender',
    'Location Code',
    'Marital Status',
    'Policy',
    'Claim Reason',
    'Sales Channel',
    'Vehicle Class',
    'Vehicle Size',
]

# Load the full dataset and separate the label from the input features
df = pd.read_csv(all_data_file)
Y = df[label_column]
X = df.drop(columns=[label_column])

# Hold out 20% of the rows for testing; the split is seeded for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=RANDOM_SEED
)

# The encoder is built from the full feature frame and is shared by both models
encoder = CatEncoder(cat_columns, X)

# Train the L1-regularised linear model and report its r-squared on the held-out rows.
# NOTE(review): the recorded output of this cell includes an "Objective did not converge"
# warning - presumably from this Lasso fit; consider raising `max_iter` if a fully
# converged fit is required.
linl1_model = Lasso(alpha=1e-4, random_state=RANDOM_SEED).fit(encoder(X_train.values), Y_train)
linl1_predictions = linl1_model.predict(encoder(X_test.values))
linl1_r2_score = r2_score(Y_test, linl1_predictions)
print(linl1_r2_score)

# Train the linear support vector regressor and report its r-squared in the same way
svm_model = LinearSVR(random_state=RANDOM_SEED).fit(encoder(X_train.values), Y_train)
svm_predictions = svm_model.predict(encoder(X_test.values))
svm_r2_score = r2_score(Y_test, svm_predictions)
print(svm_r2_score)

0.7623698392411863
0.7422361025338485


Objective did not converge. You might want to increase the number of iterations. Duality gap: 9789028.07067509, tolerance: 62512.58675713785


In [3]:
# Pair each trained model with the shared encoder so that Certifai can use it as a local model
linl1_model_proxy, svm_model_proxy = (
    CertifaiPredictorWrapper(trained_model, encoder=encoder)
    for trained_model in (linl1_model, svm_model)
)

In [4]:
# Create the scan object from scratch using the ScanBuilder class

# Here we define the favorable direction to be increasing, and consider
# a change of 0.5 standard deviations to be significant
task = CertifaiPredictionTask(CertifaiTaskOutcomes.regression(True, 0.5),
                              prediction_description='Amount of Settled Claim')

scan = CertifaiScanBuilder.create('test_user_case',
                                  prediction_task=task)

# Add our local models (the wrapped predictors created in the previous cell)
first_model = CertifaiModel('LinL1', local_predictor=linl1_model_proxy)
scan.add_model(first_model)

second_model = CertifaiModel('SVM', local_predictor=svm_model_proxy)
scan.add_model(second_model)


# Add datasets to the scan
# The evaluation dataset is the full CSV that the models were built from
eval_dataset = CertifaiDataset('evaluation', CertifaiDatasetSource.csv(all_data_file))
scan.add_dataset(eval_dataset)
scan.evaluation_dataset_id = eval_dataset.id

# The explanation dataset is a separate CSV holding the rows to generate explanations for
explan_dataset = CertifaiDataset('explanation', CertifaiDatasetSource.csv(explanation_data_file))
scan.add_dataset(explan_dataset)
scan.explanation_dataset_id = explan_dataset.id

# Here we are using a dataframe (from our test/train split) instead of a CSV file for the performance evaluation
test_df = pd.concat([X_test, Y_test], axis=1)
test_dataset = CertifaiDataset('test', CertifaiDatasetSource.dataframe(test_df))
scan.add_dataset(test_dataset)
scan.test_dataset_id = test_dataset.id

# Because the dataset contains a ground truth outcome column which the model does not
# expect to receive as input we need to state that in the dataset schema (since it cannot
# be inferred from the CSV). We reuse `label_column` rather than repeating the literal
# column name so the schema always matches the column that was split off as `Y` above.
scan.dataset_schema.outcome_feature_name = label_column


# Setup an evaluation that includes Robustness, Explainability, Explanations, Fairness and Performance
scan.add_evaluation_type('robustness')
scan.add_evaluation_type('explainability')
scan.add_evaluation_type('explanation')

# We'll look at disparity between groups defined by Marital Status and Gender in the fairness evaluation
scan.add_evaluation_type('fairness')
scan.add_fairness_grouping_feature(CertifaiGroupingFeature('Marital Status'))
scan.add_fairness_grouping_feature(CertifaiGroupingFeature('Gender'))

# We'll compute the r-squared metric in the performance evaluation and in doing so
# verify that it matches the value computed earlier during testing.
# The same metric is selected as the performance metric for the ATX score.
scan.add_evaluation_type('performance')
scan.add_metric(CertifaiModelMetric('r2', 'r-squared'))
scan.atx_performance_metric = 'r2'


# Set the output path that reports should be written to
scan.output_path = '../local_reports'

In [5]:
# Run a preflight scan before the full scan.
# Preflight runs a series of checks against each model to warn about situations that may
# affect the scan, and also produces a time estimate for it. One preflight report is written
# per model; the data it contains is used to offer time estimates when running future scans.
preflight_result = scan.run_preflight()

# Pretty-print the returned result using a 100-character line width
pprint.PrettyPrinter(width=100).pprint(preflight_result)

Starting Preflight Scan
[--------------------] 2020-08-31 13:34:46.338564 - 0 of 8 checks (0.0% complete) - Running model nondeterminism preflight check for model LinL1
[##------------------] 2020-08-31 13:34:46.390290 - 1 of 8 checks (12.5% complete) - Running unknown outcome class preflight check for model LinL1
[#####---------------] 2020-08-31 13:34:46.390466 - 2 of 8 checks (25.0% complete) - Running scan time estimate preflight check for model LinL1
[#######-------------] 2020-08-31 13:35:30.132985 - 3 of 8 checks (37.5% complete) - Running fairness class samples preflight check for model LinL1
[##########----------] 2020-08-31 13:35:30.147065 - 4 of 8 checks (50.0% complete) - Finished all preflight checks for model LinL1
[##########----------] 2020-08-31 13:35:30.147242 - 4 of 8 checks (50.0% complete) - Running model nondeterminism preflight check for model SVM
[############--------] 2020-08-31 13:35:30.201300 - 5 of 8 checks (62.5% complete) - Running unknown outcome class pr

In [6]:
# Run the scan.
# By default this will write the results into individual report files (one per model and evaluation
# type) under the scan's configured `output_path` ('../local_reports', set when the scan was built).
# Writing reports may be disabled by specifying `write_reports=False`, as is done below.
# The result is a dictionary of dictionaries of reports. The top level dict key is the evaluation type
# and the second level key is the model id.
result = scan.run(write_reports=False)

Starting scan with model_use_case_id: 'test_user_case' and scan_id: 'a51d36316a84', total estimated time is 10 minutes
[--------------------] 2020-08-31 13:36:14.593370 - 0 of 10 reports (0.0% complete) - Running robustness evaluation for model: LinL1, estimated time is 53 seconds
[##------------------] 2020-08-31 13:37:03.426579 - 1 of 10 reports (10.0% complete) - Running explainability evaluation for model: LinL1, estimated time is 53 seconds
[####----------------] 2020-08-31 13:38:42.393185 - 2 of 10 reports (20.0% complete) - Running explanation evaluation for model: LinL1, estimated time is 44 seconds
[######--------------] 2020-08-31 13:39:59.921306 - 3 of 10 reports (30.0% complete) - Running fairness evaluation for model: LinL1, estimated time is 139 seconds
[########------------] 2020-08-31 13:42:14.903681 - 4 of 10 reports (40.0% complete) - Running performance evaluation for model: LinL1, estimated time is 10 seconds
[##########----------] 2020-08-31 13:42:14.915923 - 5 of 

In [7]:
# Display the scores for each model and evaluation type, as well as the overall ATX score.
# The "atx" scores are extracted from the scan result and rendered as one row per model.
atx_df = construct_scores_dataframe(scores("atx", result))
display(atx_df)

Unnamed: 0,context,ATX,explainability,fairness,performance,robustness
LinL1,LinL1,90.461219,99.430894,97.114499,76.236984,89.0625
SVM,SVM,89.181263,98.693878,97.088815,74.22361,86.71875


In [8]:
# Display the results of the robustness evaluation for both models
# The table shows each model's robustness score together with its lower and upper bounds
robustness_df = construct_scores_dataframe(scores("robustness", result))
display(robustness_df)

Unnamed: 0,context,robustness,robustness lower bound,robustness upper bound
LinL1,LinL1,89.0625,87.65625,90.46875
SVM,SVM,86.71875,85.3125,88.125


In [9]:
# Display the results of the explainability evaluation for both models
# The table displays, for each model, the distribution over the number of features that
# must be changed to alter a prediction (one "Num features (k)" column per count)
explainability_df = construct_scores_dataframe(scores("explainability", result))
display(explainability_df)

Unnamed: 0,context,explainability,Num features (1),Num features (10),Num features (2),Num features (3),Num features (4),Num features (5),Num features (6),Num features (7),Num features (8),Num features (9)
LinL1,LinL1,99.430894,97.96748,0.406504,1.626016,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SVM,SVM,98.693878,86.938776,0.0,13.061224,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Display the results of the fairness evaluation for both models
# The table displays the fairness scores across different groups within the "Marital Status" and "Gender" features.
# You can set `include_confidence=True` to include confidence intervals for each burden value
fairness_df = construct_scores_dataframe(scores("fairness", result), include_confidence=False)
display(fairness_df)

Unnamed: 0,context,type,overall fairness,Feature (Gender),Group details (F),Group details (M),Feature (Marital Status),Group details (Divorced),Group details (Married),Group details (Single)
LinL1 (burden),LinL1,burden,97.114499,97.963882,0.170481,0.16367,97.740154,0.170462,0.161783,0.168504
SVM (burden),SVM,burden,97.088815,97.40387,0.175995,0.167114,98.129803,0.17506,0.168074,0.171106


In [11]:
# Show the r-squared value that the performance evaluation computed for each model.
# Performance reports are keyed by model id; the first entry of each report's
# 'performance_metrics' list is read here.
performance_reports = result['performance']
print('LinL1 r2_score: ', performance_reports['LinL1']['performance_metrics'][0]['value'])
print('SVM r2_score: ', performance_reports['SVM']['performance_metrics'][0]['value'])

LinL1 r2_score:  0.7623698392411863
SVM r2_score:  0.7422361025338485


In [12]:
# Certifai's explanation utilities let us explore, programmatically, the counterfactuals
# produced during the explanation evaluation. Here we just look at a single observation.
linl1_explanations = explanations(result, 'LinL1')
svm_explanations = explanations(result, 'SVM')


# Take the first explained observation for the LinL1 model and show its original input,
# followed by a blank line
first = linl1_explanations['LinL1'][0]
print('Original input:', first.instance, end='\n\n')

# Note that for this regression use case a counterfactual is produced both in the
# favorable direction (increasing) and in the unfavorable direction (decreasing).
# Each one is printed as its type, then its data, then a blank separator line.
for cf in first.explanation.best_individuals:
    print(cf.counterfactual_type, cf.data, '', sep='\n')

Original input: ['NE' 387.3647046 'Basic' 'College' 'Retired' 'M' 28142 'Suburban'
 'Married' 105 18 50 '0' '1' 'Personal L3' 'Hail' 'Agent' 'Sports Car'
 'Medsize']

prediction increased
['NE' 387.3647046 'Basic' 'College' 'Retired' 'M' 28142 'Suburban'
 'Married' 128 18 50 '0' '1' 'Personal L3' 'Hail' 'Agent' 'Sports Car'
 'Medsize']

prediction decreased
['NE' 387.3647046 'Basic' 'College' 'Retired' 'M' 28142 'Rural' 'Married'
 105 18 50 '0' '1' 'Personal L3' 'Hail' 'Agent' 'Sports Car' 'Medsize']

