In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pandas as pd
from model_tool import ToxModel
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import model_tool
import model_bias_analysis

# autoreload makes it easier to interactively work on code in the model_bias_analysis module.
%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


HELLO from model_tool


# Intro

This notebook shows how one can rebalance a dataset and use debiased word embeddings to create fairer classifiers for toxicity classification. We show that using debiased word embeddings can improve fairness, as measured by the metrics proposed in http://www.aies-conference.com/wp-content/papers/main/AIES_2018_paper_9.pdf. This notebook contains code from https://github.com/conversationai/unintended-ml-bias-analysis and is split into 3 parts:

- Dataset Loading and Model Training
- Evaluate Model Fairness
- Visualize Results

# Dataset Loading and Model Training

In [2]:
SPLITS = ['train', 'dev', 'test']

# Split name -> CSV path, one dict per training corpus (named after the file
# prefixes: wiki_*, wiki_debias_*, wiki_debias_random_*).
wiki = {split: '../data/toxicity/wiki_%s.csv' % split for split in SPLITS}
debias = {split: '../data/toxicity/wiki_debias_%s.csv' % split for split in SPLITS}
random = {split: '../data/toxicity/wiki_debias_random_%s.csv' % split for split in SPLITS}

# Hyperparameters shared by every model trained in this notebook
# (100-dimensional embeddings, hence the name).
hparams_100 = {
    # Input / embedding layer.
    'max_sequence_length': 250,
    'max_num_words': 10000,
    'embedding_dim': 100,
    'embedding_trainable': False,
    # Optimisation.
    'learning_rate': 0.00005,
    'stop_early': True,
    'es_patience': 1,  # Only relevant if 'stop_early' is True
    'es_min_delta': 0,  # Only relevant if 'stop_early' is True
    'batch_size': 128,
    'epochs': 4,
    'dropout_rate': 0.3,
    # Convolutional stack: one entry per layer.
    'cnn_filter_sizes': [128, 128, 128],
    'cnn_kernel_sizes': [5, 5, 5],
    'cnn_pooling_sizes': [5, 5, 40],
    'verbose': True,
}

In [3]:
def train_models(name, data, word_embeddings_path, params=None):
    """Train one ToxModel per hyperparameter set and print its test AUC.

    Models are named ``<name>_<n>`` where ``n`` starts at 100 and increases by
    one for each entry in ``params``.

    Args:
      name: Prefix used to build each model's name.
      data: Dict with 'train', 'dev' and 'test' keys. The 'test' value is read
        with ``pd.read_csv``; 'train' and 'dev' are handed to ``ToxModel.train``.
      word_embeddings_path: Forwarded to ``ToxModel`` as ``embeddings_path``.
      params: Sequence of hyperparameter dicts, one model per entry. ``None``
        or an empty sequence trains nothing.

    Returns:
      None. Progress and the test AUC of every model are printed.
    """
    if not params:
        return

    # The test set is identical for every model, so load it once up front.
    test = pd.read_csv(data['test'])

    for offset, hparams in enumerate(params):
        model_version = '{}_{}'.format(name, 100 + offset)
        model = ToxModel(hparams=hparams, embeddings_path=word_embeddings_path)
        # str.format (not an f-string) keeps this cell runnable on Python 2.
        print("Training {}".format(model_version))
        model.train(data['train'], data['dev'], text_column='comment',
                    label_column='is_toxic', model_name=model_version)
        print("Testing Model")
        print(model.score_auc(test['comment'], test['is_toxic']))

In [4]:
# Train 10 models with identical hyperparameters on the `random` splits
# (wiki_debias_random_*.csv) using the glove.6B.100d embeddings file.
train_models('wiki_debias_random_cnn_v3',random,'../data/embeddings/glove.6B/glove.6B.100d.txt',[hparams_100]*10)

Hyperparameters
---------------
dropout_rate: 0.3
verbose: True
cnn_pooling_sizes: [5, 5, 40]
learning_rate: 5e-05
es_patience: 1
batch_size: 128
epochs: 4
cnn_filter_sizes: [128, 128, 128]
max_sequence_length: 250
stop_early: True
embedding_trainable: False
max_num_words: 10000
es_min_delta: 0
embedding_dim: 100
cnn_kernel_sizes: [5, 5, 5]

Training {model_version}
Fitting tokenizer...
Tokenizer fitted!
Preparing data...
Data prepared!
Loading embeddings...
Embeddings loaded!
Building model graph...
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Training model...
Train on 99157 samples, validate on 33283 samples
Epoch 1/4
Epoch 00001: val_loss improved from inf to 0.17061, saving model to ../models/wiki_debias_random_cnn_v3_100_model.h5
 - 9s - loss: 0.2315 - acc: 0.9193 - val_loss: 0.1706 - val_acc: 0.9385
Epoch 2/

Epoch 4/4
Epoch 00004: val_loss improved from 0.14428 to 0.12928, saving model to ../models/wiki_debias_random_cnn_v3_104_model.h5
 - 8s - loss: 0.1321 - acc: 0.9516 - val_loss: 0.1293 - val_acc: 0.9516
Model trained!
Best model saved to ../models/wiki_debias_random_cnn_v3_104_model.h5
Loading best model from checkpoint...
Model loaded!
Testing Model
0.9551762651565763
Hyperparameters
---------------
dropout_rate: 0.3
verbose: True
cnn_pooling_sizes: [5, 5, 40]
learning_rate: 5e-05
es_patience: 1
batch_size: 128
epochs: 4
cnn_filter_sizes: [128, 128, 128]
max_sequence_length: 250
stop_early: True
embedding_trainable: False
max_num_words: 10000
es_min_delta: 0
embedding_dim: 100
cnn_kernel_sizes: [5, 5, 5]

Training {model_version}
Fitting tokenizer...
Tokenizer fitted!
Preparing data...
Data prepared!
Loading embeddings...
Embeddings loaded!
Building model graph...
Training model...
Train on 99157 samples, validate on 33283 samples
Epoch 1/4
Epoch 00001: val_loss improved from inf to 0

In [5]:
# Train 10 models with identical hyperparameters on the `wiki` splits
# (wiki_*.csv) using the glove.6B.100d embeddings file.
train_models('wiki_cnn_v3',wiki,'../data/embeddings/glove.6B/glove.6B.100d.txt',[hparams_100]*10)

Hyperparameters
---------------
dropout_rate: 0.3
verbose: True
cnn_pooling_sizes: [5, 5, 40]
learning_rate: 5e-05
es_patience: 1
batch_size: 128
epochs: 4
cnn_filter_sizes: [128, 128, 128]
max_sequence_length: 250
stop_early: True
embedding_trainable: False
max_num_words: 10000
es_min_delta: 0
embedding_dim: 100
cnn_kernel_sizes: [5, 5, 5]

Training {model_version}
Fitting tokenizer...
Tokenizer fitted!
Preparing data...
Data prepared!
Loading embeddings...
Embeddings loaded!
Building model graph...
Training model...
Train on 95692 samples, validate on 32128 samples
Epoch 1/4
Epoch 00001: val_loss improved from inf to 0.17539, saving model to ../models/wiki_cnn_v3_100_model.h5
 - 11s - loss: 0.2432 - acc: 0.9150 - val_loss: 0.1754 - val_acc: 0.9363
Epoch 2/4
Epoch 00002: val_loss improved from 0.17539 to 0.15374, saving model to ../models/wiki_cnn_v3_100_model.h5
 - 8s - loss: 0.1687 - acc: 0.9386 - val_loss: 0.1537 - val_acc: 0.9420
Epoch 3/4
Epoch 00003: val_loss improved from 0.153

Epoch 3/4
Epoch 00003: val_loss improved from 0.15119 to 0.14127, saving model to ../models/wiki_cnn_v3_105_model.h5
 - 8s - loss: 0.1490 - acc: 0.9454 - val_loss: 0.1413 - val_acc: 0.9475
Epoch 4/4
Epoch 00004: val_loss improved from 0.14127 to 0.13298, saving model to ../models/wiki_cnn_v3_105_model.h5
 - 8s - loss: 0.1367 - acc: 0.9491 - val_loss: 0.1330 - val_acc: 0.9505
Model trained!
Best model saved to ../models/wiki_cnn_v3_105_model.h5
Loading best model from checkpoint...
Model loaded!
Testing Model
0.9546075797625027
Hyperparameters
---------------
dropout_rate: 0.3
verbose: True
cnn_pooling_sizes: [5, 5, 40]
learning_rate: 5e-05
es_patience: 1
batch_size: 128
epochs: 4
cnn_filter_sizes: [128, 128, 128]
max_sequence_length: 250
stop_early: True
embedding_trainable: False
max_num_words: 10000
es_min_delta: 0
embedding_dim: 100
cnn_kernel_sizes: [5, 5, 5]

Training {model_version}
Fitting tokenizer...
Tokenizer fitted!
Preparing data...
Data prepared!
Loading embeddings...
Embe

In [8]:
# Train 10 models with identical hyperparameters on the `debias` splits
# (wiki_debias_*.csv) using the glove.6B.100d embeddings file.
train_models('wiki_debias_cnn_v3',debias,'../data/embeddings/glove.6B/glove.6B.100d.txt',[hparams_100]*10)

Hyperparameters
---------------
dropout_rate: 0.3
verbose: True
cnn_pooling_sizes: [5, 5, 40]
learning_rate: 5e-05
es_patience: 1
batch_size: 128
epochs: 4
cnn_filter_sizes: [128, 128, 128]
max_sequence_length: 250
stop_early: True
embedding_trainable: False
max_num_words: 10000
es_min_delta: 0
embedding_dim: 100
cnn_kernel_sizes: [5, 5, 5]

Training {model_version}
Fitting tokenizer...
Tokenizer fitted!
Preparing data...
Data prepared!
Loading embeddings...
Embeddings loaded!
Building model graph...
Training model...
Train on 99157 samples, validate on 33283 samples
Epoch 1/4
Epoch 00001: val_loss improved from inf to 0.17303, saving model to ../models/wiki_debias_cnn_v3_100_model.h5
 - 16s - loss: 0.2444 - acc: 0.9142 - val_loss: 0.1730 - val_acc: 0.9374
Epoch 2/4
Epoch 00002: val_loss improved from 0.17303 to 0.14716, saving model to ../models/wiki_debias_cnn_v3_100_model.h5
 - 8s - loss: 0.1653 - acc: 0.9399 - val_loss: 0.1472 - val_acc: 0.9460
Epoch 3/4
Epoch 00003: val_loss impro

 - 19s - loss: 0.2382 - acc: 0.9184 - val_loss: 0.1698 - val_acc: 0.9381
Epoch 2/4
Epoch 00002: val_loss improved from 0.16977 to 0.14791, saving model to ../models/wiki_debias_cnn_v3_105_model.h5
 - 8s - loss: 0.1635 - acc: 0.9411 - val_loss: 0.1479 - val_acc: 0.9453
Epoch 3/4
Epoch 00003: val_loss improved from 0.14791 to 0.13681, saving model to ../models/wiki_debias_cnn_v3_105_model.h5
 - 8s - loss: 0.1453 - acc: 0.9476 - val_loss: 0.1368 - val_acc: 0.9490
Epoch 4/4
Epoch 00004: val_loss improved from 0.13681 to 0.13249, saving model to ../models/wiki_debias_cnn_v3_105_model.h5
 - 8s - loss: 0.1341 - acc: 0.9510 - val_loss: 0.1325 - val_acc: 0.9515
Model trained!
Best model saved to ../models/wiki_debias_cnn_v3_105_model.h5
Loading best model from checkpoint...
Model loaded!
Testing Model
0.9538015699256188
Hyperparameters
---------------
dropout_rate: 0.3
verbose: True
cnn_pooling_sizes: [5, 5, 40]
learning_rate: 5e-05
es_patience: 1
batch_size: 128
epochs: 4
cnn_filter_sizes: [12

In [9]:
# Train 10 models on the `debias` splits (wiki_debias_*.csv), this time with the
# glove_debias_toxic_projection.txt embeddings instead of the stock GloVe file.
train_models('cnn_debias_tox_v3_debiased_WE',debias,'../data/embeddings/glove.6B/glove_debias_toxic_projection.txt',[hparams_100]*10)

Hyperparameters
---------------
dropout_rate: 0.3
verbose: True
cnn_pooling_sizes: [5, 5, 40]
learning_rate: 5e-05
es_patience: 1
batch_size: 128
epochs: 4
cnn_filter_sizes: [128, 128, 128]
max_sequence_length: 250
stop_early: True
embedding_trainable: False
max_num_words: 10000
es_min_delta: 0
embedding_dim: 100
cnn_kernel_sizes: [5, 5, 5]

Training {model_version}
Fitting tokenizer...
Tokenizer fitted!
Preparing data...
Data prepared!
Loading embeddings...
Embeddings loaded!
Building model graph...
Training model...
Train on 99157 samples, validate on 33283 samples
Epoch 1/4
Epoch 00001: val_loss improved from inf to 0.16905, saving model to ../models/cnn_debias_tox_v3_debiased_WE_100_model.h5
 - 22s - loss: 0.2260 - acc: 0.9208 - val_loss: 0.1690 - val_acc: 0.9392
Epoch 2/4
Epoch 00002: val_loss improved from 0.16905 to 0.14736, saving model to ../models/cnn_debias_tox_v3_debiased_WE_100_model.h5
 - 8s - loss: 0.1614 - acc: 0.9416 - val_loss: 0.1474 - val_acc: 0.9462
Epoch 3/4
Epoch

 - 24s - loss: 0.2344 - acc: 0.9190 - val_loss: 0.1702 - val_acc: 0.9395
Epoch 2/4
Epoch 00002: val_loss improved from 0.17018 to 0.14884, saving model to ../models/cnn_debias_tox_v3_debiased_WE_105_model.h5
 - 8s - loss: 0.1626 - acc: 0.9404 - val_loss: 0.1488 - val_acc: 0.9445
Epoch 3/4
Epoch 00003: val_loss improved from 0.14884 to 0.13668, saving model to ../models/cnn_debias_tox_v3_debiased_WE_105_model.h5
 - 8s - loss: 0.1438 - acc: 0.9470 - val_loss: 0.1367 - val_acc: 0.9493
Epoch 4/4
Epoch 00004: val_loss improved from 0.13668 to 0.13228, saving model to ../models/cnn_debias_tox_v3_debiased_WE_105_model.h5
 - 8s - loss: 0.1326 - acc: 0.9509 - val_loss: 0.1323 - val_acc: 0.9504
Model trained!
Best model saved to ../models/cnn_debias_tox_v3_debiased_WE_105_model.h5
Loading best model from checkpoint...
Model loaded!
Testing Model
0.9535967267741319
Hyperparameters
---------------
dropout_rate: 0.3
verbose: True
cnn_pooling_sizes: [5, 5, 40]
learning_rate: 5e-05
es_patience: 1
bat

In [10]:
# Train 10 models on the `wiki` splits (wiki_*.csv) with the
# glove_debias_toxic_projection.txt embeddings.
train_models('we_wiki_cnn',wiki,'../data/embeddings/glove.6B/glove_debias_toxic_projection.txt',[hparams_100]*10)

Hyperparameters
---------------
dropout_rate: 0.3
verbose: True
cnn_pooling_sizes: [5, 5, 40]
learning_rate: 5e-05
es_patience: 1
batch_size: 128
epochs: 4
cnn_filter_sizes: [128, 128, 128]
max_sequence_length: 250
stop_early: True
embedding_trainable: False
max_num_words: 10000
es_min_delta: 0
embedding_dim: 100
cnn_kernel_sizes: [5, 5, 5]

Training {model_version}
Fitting tokenizer...
Tokenizer fitted!
Preparing data...
Data prepared!
Loading embeddings...
Embeddings loaded!
Building model graph...
Training model...
Train on 95692 samples, validate on 32128 samples
Epoch 1/4
Epoch 00001: val_loss improved from inf to 0.17723, saving model to ../models/we_wiki_cnn_100_model.h5
 - 27s - loss: 0.2487 - acc: 0.9129 - val_loss: 0.1772 - val_acc: 0.9358
Epoch 2/4
Epoch 00002: val_loss improved from 0.17723 to 0.15435, saving model to ../models/we_wiki_cnn_100_model.h5
 - 8s - loss: 0.1687 - acc: 0.9381 - val_loss: 0.1544 - val_acc: 0.9437
Epoch 3/4
Epoch 00003: val_loss improved from 0.154

Epoch 3/4
Epoch 00003: val_loss improved from 0.15054 to 0.14311, saving model to ../models/we_wiki_cnn_105_model.h5
 - 8s - loss: 0.1477 - acc: 0.9450 - val_loss: 0.1431 - val_acc: 0.9450
Epoch 4/4
Epoch 00004: val_loss improved from 0.14311 to 0.13481, saving model to ../models/we_wiki_cnn_105_model.h5
 - 8s - loss: 0.1373 - acc: 0.9493 - val_loss: 0.1348 - val_acc: 0.9495
Model trained!
Best model saved to ../models/we_wiki_cnn_105_model.h5
Loading best model from checkpoint...
Model loaded!
Testing Model
0.9520576255653864
Hyperparameters
---------------
dropout_rate: 0.3
verbose: True
cnn_pooling_sizes: [5, 5, 40]
learning_rate: 5e-05
es_patience: 1
batch_size: 128
epochs: 4
cnn_filter_sizes: [128, 128, 128]
max_sequence_length: 250
stop_early: True
embedding_trainable: False
max_num_words: 10000
es_min_delta: 0
embedding_dim: 100
cnn_kernel_sizes: [5, 5, 5]

Training {model_version}
Fitting tokenizer...
Tokenizer fitted!
Preparing data...
Data prepared!
Loading embeddings...
Embe

# Evaluate Model Fairness

In [None]:
%%time

MODEL_DIR = '../models/'

# NOTE(review): this is the embeddings path from the previously commented-out
# loaders. The training cells used
# '../data/embeddings/glove.6B/glove_debias_toxic_projection.txt' -- confirm
# which file the saved debiased-embedding models expect.
DEBIASED_EMBEDDINGS_PATH = './glove_debias_new_toxic.txt'

# Every family is loaded here because the evaluation cells below reference all
# five of them (models and names). `range` works on both Python 2 and 3.
wiki_model_names = ['wiki_cnn_v3_{}'.format(i) for i in range(100, 110)]
wiki_models = [model_tool.ToxModel(name, model_dir=MODEL_DIR) for name in wiki_model_names]

random_model_names = ['wiki_debias_random_cnn_v3_{}'.format(i) for i in range(100, 110)]
random_models = [model_tool.ToxModel(name, model_dir=MODEL_DIR) for name in random_model_names]

debias_model_names = ['wiki_debias_cnn_v3_{}'.format(i) for i in range(100, 110)]
debias_models = [model_tool.ToxModel(name, model_dir=MODEL_DIR) for name in debias_model_names]

we_debias_model_names = ['cnn_debias_tox_v3_debiased_WE_{}'.format(i) for i in range(100, 110)]
we_debias_models = [
    model_tool.ToxModel(name, embeddings_path=DEBIASED_EMBEDDINGS_PATH, model_dir=MODEL_DIR)
    for name in we_debias_model_names]

we_wiki_model_names = ['we_wiki_cnn_{}'.format(i) for i in range(100, 110)]
we_wiki_models = [
    model_tool.ToxModel(name, embeddings_path=DEBIASED_EMBEDDINGS_PATH, model_dir=MODEL_DIR)
    for name in we_wiki_model_names]


In [None]:
# Loaded models and their names, grouped per family; both lists use the same
# family order (wiki, random, debias, we_debias, we_wiki).
all_model_families = [wiki_models, random_models, debias_models,we_debias_models,we_wiki_models]
all_model_families_names = [wiki_model_names, random_model_names, debias_model_names,we_debias_model_names,we_wiki_model_names]

In [None]:
# Flat list of every loaded model, in family order; passed to load_maybe_score below.
all_models = wiki_models + random_models + debias_models+ we_debias_models+we_wiki_models

In [None]:
# Delete any previously scored file before scoring. Presumably
# load_maybe_score reuses `scored_path` when it already exists -- verify
# against model_tool. The existence check keeps a first run (no scored file
# yet) from failing in os.remove.
MADLIBS_SCORED_PATH = '../data/toxicity/eval_datasets/bias_madlibs_77k_scored.csv'
if os.path.exists(MADLIBS_SCORED_PATH):
    os.remove(MADLIBS_SCORED_PATH)

madlibs = model_tool.load_maybe_score(
    all_models,
    orig_path='../data/toxicity/eval_datasets/bias_madlibs_77k.csv',
    scored_path=MADLIBS_SCORED_PATH,
    postprocess_fn=model_tool.postprocess_madlibs,
    )

In [None]:
# Delete any previously scored file before scoring; the existence check keeps
# a first run (no scored file yet) from failing in os.remove.
WIKI_TEST_SCORED_PATH = '../data/toxicity/wiki_test_scored.csv'
if os.path.exists(WIKI_TEST_SCORED_PATH):
    os.remove(WIKI_TEST_SCORED_PATH)

wiki_test = model_tool.load_maybe_score(
    all_models,
    orig_path='../data/toxicity/wiki_test.csv',
    scored_path=WIKI_TEST_SCORED_PATH,
    postprocess_fn=model_tool.postprocess_wiki_dataset)

In [None]:
WIKI_DEBIAS_TEST_SCORED_PATH = '../data/toxicity/wiki_debias_test_scored_newest.csv'
WIKI_RANDOM_TEST_SCORED_PATH = '../data/toxicity/wiki_debias_random_test_scored_newest.csv'

# Delete any previously scored files before scoring; the existence check keeps
# a first run (no scored files yet) from failing in os.remove.
for _scored_path in (WIKI_DEBIAS_TEST_SCORED_PATH, WIKI_RANDOM_TEST_SCORED_PATH):
    if os.path.exists(_scored_path):
        os.remove(_scored_path)

wiki_debias_test = model_tool.load_maybe_score(
    all_models,
    orig_path='../data/toxicity/wiki_debias_test.csv',
    scored_path=WIKI_DEBIAS_TEST_SCORED_PATH,
    postprocess_fn=model_tool.postprocess_wiki_dataset)

wiki_random_test = model_tool.load_maybe_score(
    all_models,
    orig_path='../data/toxicity/wiki_debias_random_test.csv',
    scored_path=WIKI_RANDOM_TEST_SCORED_PATH,
    postprocess_fn=model_tool.postprocess_wiki_dataset)

# Visualize Results

## AUC

In [None]:
# One AUC plot per model family on the madlibs set, each preceded by its heading.
_family_auc_plots = [
    ('original models:', wiki_model_names),
    ('\n\nrandom models:', random_model_names),
    ('\n\ndebias models:', debias_model_names),
    ('\n\nwe debias models:', we_debias_model_names),
    ('\n\nwe wiki models:', we_wiki_model_names),
]
for _heading, _family_names in _family_auc_plots:
    print(_heading)
    model_bias_analysis.plot_model_family_auc(madlibs, _family_names, 'label')

In [None]:
# Display the per-family model names as a sanity check.
all_model_families_names

In [None]:
# (display name, scored frame) for every evaluation set.
_eval_datasets = [
    ('orig test', wiki_test),
    ('debias test', wiki_debias_test),
    ('random test', wiki_random_test),
    ('madlibs', madlibs),
]
_auc_row = '{:30s}  mean {:.4f}\t median {:.4f}\t stddev {:.4f}'

# Print mean / median / stddev AUC of each model family on each dataset.
for dataset_name, dataset in _eval_datasets:
    print('\n\nAUCs on', dataset_name)
    for model_family in all_model_families_names:
        fam_name = model_bias_analysis.model_family_name(model_family)
        fam_auc = model_bias_analysis.model_family_auc(dataset, model_family, 'label')
        print(_auc_row.format(fam_name, fam_auc['mean'], fam_auc['median'], fam_auc['std']))

## Per-term pinned AUC

Per-term pinned AUC values show improved scores and less disparity for the debiased model.

In [None]:
# Identity terms read from the madlibs adjectives file; used as the subgroups below.
madlibs_terms = model_bias_analysis.read_identity_terms('../data/toxicity/bias_madlibs_data/adjectives_people.txt')

In [None]:
# The return value is discarded, so this presumably adds one subgroup column per
# term to `madlibs` in place, based on its 'text' column -- confirm in model_bias_analysis.
model_bias_analysis.add_subgroup_columns_from_text(madlibs, 'text', madlibs_terms)

In [None]:
# Per-term AUCs on madlibs for every model family, scored against the 'label' column.
# The cells below read '<family>_mean' and '<family>_aucs' columns from this frame.
_raw_term_madlibs_aucs = model_bias_analysis.per_subgroup_aucs(madlibs, madlibs_terms, all_model_families_names, 'label')

In [None]:
# Column holding each family's mean per-term AUC. The '_10' suffix looks like
# the common prefix of the model numbers 100..109 -- TODO confirm against
# model_bias_analysis.model_family_name.
orig_mean = 'wiki_cnn_v3_10_mean'
random_mean = 'wiki_debias_random_cnn_v3_10_mean'
debias_mean = 'wiki_debias_cnn_v3_10_mean'
we_debias_mean = 'cnn_debias_tox_v3_debiased_WE_10_mean'
we_wiki_mean = 'we_wiki_cnn_10_mean'

_mean_columns = [orig_mean, random_mean, debias_mean, we_debias_mean, we_wiki_mean]

# One histogram per family, all on the same x-range so they can be compared.
for mean_col in _mean_columns:
    print('per-term AUC histogram: mean AUCs across terms for:', mean_col)
    _raw_term_madlibs_aucs[mean_col].hist()
    plt.gca().set_xlim((0.85, 1.0))
    plt.show()

In [None]:
# NOTE(review): `execfile` exists only on Python 2 and runs the module's source in
# the notebook namespace. The module is already imported as `model_bias_analysis`
# with `%autoreload 2` enabled in the first cell, so this cell looks like a
# leftover -- confirm before removing.
execfile('model_bias_analysis.py')

In [None]:
# Column holding each family's per-model AUC collection.
orig_aucs = 'wiki_cnn_v3_10_aucs'
random_aucs = 'wiki_debias_random_cnn_v3_10_aucs'
debias_aucs = 'wiki_debias_cnn_v3_10_aucs'
we_debias_aucs = 'cnn_debias_tox_v3_debiased_WE_10_aucs'
we_wiki_aucs = 'we_wiki_cnn_10_aucs'

_madlibs_auc_plots = [
    ('original model', orig_aucs),
    ('random treatment', random_aucs),
    ('debiased dataset treatment', debias_aucs),
    ('debiased dataset and word embedding treatment', we_debias_aucs),
    ('debiased word embedding treatment', we_wiki_aucs),
]

# One per-term scatterplot per family on the madlibs AUCs.
for title, auc_collection_col in _madlibs_auc_plots:
    print(auc_collection_col)
    model_bias_analysis.per_subgroup_scatterplots(
        _raw_term_madlibs_aucs,
        'subgroup',
        auc_collection_col,
        title='Per-term AUC distributions for ' + title,
        file_name='madlibs',
        y_lim=(0.85, .99))

### Per-term  AUCs on the wikipedia dataset

In [None]:
# Same subgroup tagging as for madlibs, applied to the wikipedia test set
# (presumably in place, since the return value is discarded -- confirm).
model_bias_analysis.add_subgroup_columns_from_text(wiki_test, 'text', madlibs_terms)

In [None]:
%%time
# Per-term AUCs on the wikipedia test set for every model family.
_raw_term_wiki_aucs = model_bias_analysis.per_subgroup_aucs(wiki_test, madlibs_terms, all_model_families_names, 'label')

We don't see a strong effect in per-term pinned AUCs on the wikipedia test set. This isn't surprising, as the dataset bias affects the test set too: there are few non-toxic examples using the bias terms, and so the AUC of the original models does not suffer.

In [None]:
# AUC on tiny datasets is very noisy, so keep only terms with more than 50 examples.
_has_enough_examples = _raw_term_wiki_aucs['subset_size'] > 50
_term_wiki_aucs_nottiny = _raw_term_wiki_aucs[_has_enough_examples]

_wiki_auc_plots = [
    ('original', orig_aucs),
    ('random', random_aucs),
    ('debias', debias_aucs),
    ('we_debias', we_debias_aucs),
    ('we_wiki', we_wiki_aucs),
]

for title, auc_collection_col in _wiki_auc_plots:
    model_bias_analysis.per_subgroup_scatterplots(
        _term_wiki_aucs_nottiny,
        'subgroup',
        auc_collection_col,
        title='Per-term AUC distributions for ' + title,
        y_lim=(0.8, 1.02),
        file_name='wiki')

## Qualitative look

In [None]:
# TODO(jetpack): add column to bias madlibs dataset to allow selecting by template
# to more easily observe discrepancies in scores for the same template.

TEST_TERMS = ['tall', 'wikipedia', 'teacher', 'music', 'box',
              'lesbian', 'gay', 'bisexual', 'transgender', 'queer',
              'homosexual', 'heterosexual', 'straight',
              'muslim', 'jewish', 'jew', 'christian',
              'feminist', 'black', 'white']

def _print_template_scores(models):
    """Print and plot model scores for "i am a <term> person" sentences.

    For every term in TEST_TERMS the sentence is scored by each model. The
    median and standard deviation across models are printed in ascending
    order of median, followed by a histogram of the medians.

    Args:
      models: Iterable of objects exposing ``predict(list_of_texts)``.
    """
    texts = ['i am a {} person'.format(term) for term in TEST_TERMS]

    median_scores = []
    for text in texts:
        # Score each text once per model and derive both statistics from the
        # same predictions, instead of running every model twice.
        predictions = [model.predict([text]) for model in models]
        median_scores.append({'median': np.median(predictions),
                              'stddev': np.std(predictions)})

    scored_texts = list(zip(median_scores, texts))
    # Tuple-unpacking lambdas (`lambda (a, b): ...`) are a SyntaxError on
    # Python 3, so index into the (score, text) pair instead.
    for score, text in sorted(scored_texts, key=lambda pair: pair[0]['median']):
        print('{:.3f}  (std: {:.4f})  {}'.format(score['median'], score['stddev'], text))

    plt.hist([x['median'] for x in median_scores])
    plt.gca().set_xlim([0, 1])
    plt.gca().set_ylim([0, 10])

In [None]:
# Template scores for the models trained on the wiki_*.csv splits.
_print_template_scores(wiki_models)

In [None]:
# Template scores for the models trained on the wiki_debias_random_*.csv splits.
_print_template_scores(random_models)

In [None]:
# Template scores for the models trained on the wiki_debias_*.csv splits.
_print_template_scores(debias_models)

### Equality of Opportunity

To evaluate equality of opportunity, we compare false negative rates on a per-term basis.

In [None]:
def neg_table_means(term_neg_rates):
    """Return only the mean FNR/TNR columns, ordered by the original models' mean FNR.

    Args:
      term_neg_rates: DataFrame with a 'subgroup' column and
        '<family>_<rate>_mean' columns for the families orig/random/debias
        and the rates fnr/tnr.

    Returns:
      A DataFrame with 'subgroup' followed by the three FNR means and the
      three TNR means, sorted ascending by 'orig_fnr_mean'.
    """
    mean_columns = ['subgroup'] + [
        '{}_{}_mean'.format(family, rate)
        for rate in ('fnr', 'tnr')
        for family in ('orig', 'random', 'debias')
    ]
    means_only = term_neg_rates[mean_columns]
    return means_only.sort_values('orig_fnr_mean')

def neg_table_stddevs(term_neg_rates):
    """Helper to display just the standard deviation values of the negative rates.

    Args:
      term_neg_rates: DataFrame with a 'subgroup' column, '<family>_fnr_std'
        and '<family>_tnr_std' columns for the families orig/random/debias,
        and an 'orig_fnr_mean' column.

    Returns:
      A DataFrame with 'subgroup', the three FNR standard deviations and the
      three TNR standard deviations, with rows in ascending order of
      'orig_fnr_mean' (the same row order as neg_table_means).
    """
    return (term_neg_rates
            [['subgroup',
              # FNR standard deviations come first, mirroring neg_table_means.
              'orig_fnr_std',
              'random_fnr_std',
              'debias_fnr_std',
              'orig_tnr_std',
              'random_tnr_std',
              'debias_tnr_std',
              'orig_fnr_mean',  # just for sorting
            ]]
            .sort_values('orig_fnr_mean')
            .drop('orig_fnr_mean', axis=1)
           )

### Threshold 0.5

In [None]:
%%time

# threshold = .50 

# Per-term negative rates on madlibs for every model family, with a fixed 0.5
# threshold passed for all models.
_raw_term_neg_rates_50 = model_bias_analysis.per_subgroup_negative_rates(madlibs, madlibs_terms, all_model_families_names, 0.5, 'label')

### Threshold using per-model equal error rate on overall madlibs dataset

In [None]:
%%time

# Compute the equal error rate for all models on the overall madlibs dataset in order to
# compute the false/true negative rates table at the EER for each model.

# Flattened list of all models, keeping family order and the order within each family.
_all_model_names = [
    model_name
    for family_names in all_model_families_names
    for model_name in family_names
]
_model_eers_madlibs = model_bias_analysis.per_model_eer(madlibs, 'label', _all_model_names)

In [None]:
%%time
# Same per-term negative-rate table as the fixed-threshold cell, but passing the
# per-model EER result in the threshold position instead of 0.5.
_raw_term_neg_rates_madlibs_eer = model_bias_analysis.per_subgroup_negative_rates(
    madlibs, madlibs_terms, all_model_families_names, _model_eers_madlibs, 'label')

In [None]:
# Show the available column names; the plotting cells below select columns by name.
_raw_term_neg_rates_madlibs_eer.columns

In [None]:
# True negative rates: TN / (TN + FP)

# Column holding each family's per-model TNR values.
orig_tnr = 'wiki_cnn_v3_10_tnr_values'
random_tnr = 'wiki_debias_random_cnn_v3_10_tnr_values'
debias_tnr = 'wiki_debias_cnn_v3_10_tnr_values'
we_debias_tnr = 'cnn_debias_tox_v3_debiased_WE_10_tnr_values'
we_wiki_tnr = 'we_wiki_cnn_10_tnr_values'

# Rows are ordered by the debiased-dataset family's TNR column for every plot.
_raw_term_neg_rates_madlibs_eer_sorted = _raw_term_neg_rates_madlibs_eer.sort_values(debias_tnr)

_tnr_plots = [
    ('original', orig_tnr),
    ('random', random_tnr),
    ('debias', debias_tnr),
    ('we_debias', we_debias_tnr),
    ('we_wiki', we_wiki_tnr),
]
for title, tnr_values_col in _tnr_plots:
    model_bias_analysis.per_subgroup_scatterplots(
        _raw_term_neg_rates_madlibs_eer_sorted,
        'subgroup',
        tnr_values_col,
        y_lim=(0, 1.02),
        title='Per-term true negative rates for ' + title,
        file_name='madlibs')

In [None]:
# False positive rates: 1 - TNR. This is just the above graphs except flipped to show false positives instead of true negatives.

# _term_neg_rates_madlibs_eer_tnr_sorted = _term_neg_rates_madlibs_eer.sort_values('orig_tnr_mean')

_fpr_plots = [
    ('original model', orig_tnr),
    ('random treatment', random_tnr),
    ('debiased dataset treatment', debias_tnr),
    ('debiased dataset and word embeddings treatment', we_debias_tnr),
    ('debiased word embeddings treatment', we_wiki_tnr),
]
for title, tnr_values_col in _fpr_plots:
    # One record per term: every TNR value of the family is flipped into an FPR.
    term_fpr_values = [
        {'subgroup': row['subgroup'],
         'fpr_values': [1 - tnr for tnr in row[tnr_values_col]]}
        for _i, row in _raw_term_neg_rates_madlibs_eer_sorted.iterrows()
    ]
    fpr_df = pd.DataFrame(term_fpr_values)
    model_bias_analysis.per_subgroup_scatterplots(
        fpr_df,
        'subgroup',
        'fpr_values',
        y_lim=(0, 1.02),
        title='Per-term false positive rates for ' + title,
        file_name='madlibs_' + tnr_values_col)

In [None]:
# False negative rates: FN / (FN + TP). 1 - TPR.

# Column holding each family's per-model FNR values.
orig_fnr = 'wiki_cnn_v3_10_fnr_values'
random_fnr = 'wiki_debias_random_cnn_v3_10_fnr_values'
debias_fnr = 'wiki_debias_cnn_v3_10_fnr_values'
we_debias_fnr = 'cnn_debias_tox_v3_debiased_WE_10_fnr_values'
we_wiki_fnr = 'we_wiki_cnn_10_fnr_values'

# Should we use the same ordering as the true negative rate plots?
# For now rows are re-sorted by the debiased-dataset family's FNR column.
_raw_term_neg_rates_madlibs_eer_sorted = _raw_term_neg_rates_madlibs_eer.sort_values(debias_fnr)

_fnr_plots = [
    ('original model', orig_fnr),
    ('random treatment', random_fnr),
    ('debiased dataset treatment', debias_fnr),
    ('debiased dataset and word embedding treatment', we_debias_fnr),
    ('debiased word embedding treatment', we_wiki_fnr),
]
for title, fnr_values_col in _fnr_plots:
    model_bias_analysis.per_subgroup_scatterplots(
        _raw_term_neg_rates_madlibs_eer_sorted,
        'subgroup',
        fnr_values_col,
        y_lim=(0, 1.02),
        title='Per-term false negative rates for ' + title,
        file_name='madlibs')

### Threshold using per-model equal error rate on Wikipedia test set

The EERs computed on the wikipedia test set are similar, and so we don't see much difference in the per-term negative rates plots.

In [None]:
# %%time

# # Compute the equal error rate for all models on the wikipedia dataset.

# _model_eers_wiki = model_bias_analysis.per_model_eer(wiki_test, 'label', _all_model_names)

In [None]:
# %%time
# _raw_term_neg_rates_wiki_eer = model_bias_analysis.per_subgroup_negative_rates(
#     madlibs, madlibs_terms, all_model_families_names, _model_eers_wiki, 'label')
# _term_neg_rates_wiki_eer = _raw_term_neg_rates_wiki_eer.rename(columns=column_renamer)

In [None]:
# # True negative rates: TN / (TN + FP)

# _term_neg_rates_wiki_eer_tnr_sorted = _term_neg_rates_wiki_eer.sort_values('orig_tnr_mean')

# for title, tnr_values_col in [('original model', 'orig_tnr_values'),
#                               ('random treatment', 'random_tnr_values'),
#                               ('debiasing treatment', 'debias_tnr_values')]:
#     model_bias_analysis.per_subgroup_scatterplots(
#         _term_neg_rates_wiki_eer_tnr_sorted, 'subgroup', tnr_values_col, y_lim=(0, 1.02),
#         title='Per-term true negative rates for ' + title, file_name='wiki')

In [None]:
# # False negative rates: FN / (FN + TP). 1 - TPR.

# # Should we use the same ordering as the true negative rate plots?
# _term_neg_rates_wiki_eer_fnr_sorted = _term_neg_rates_wiki_eer.sort_values('orig_fnr_mean')

# for title, fnr_values_col in [('original model', 'orig_fnr_values'),
#                               ('random treatment', 'random_fnr_values'),
#                               ('debiasing treatment', 'debias_fnr_values')]:
#     model_bias_analysis.per_subgroup_scatterplots(
#         _term_neg_rates_wiki_eer_fnr_sorted, 'subgroup', fnr_values_col, y_lim=(0, 1.02),
#         title='Per-term false negative rates for ' + title, file_name='wiki')

In [None]:
# all_model_families_names print

In [None]:
# Per-term FNR difference from overall (per the function name) on madlibs, at a fixed 0.5 threshold.
# NOTE(review): the meaning of the trailing positional `False` is not visible here -- confirm in model_bias_analysis.
model_bias_analysis.per_subgroup_fnr_diff_from_overall(madlibs, madlibs_terms, all_model_families_names, .5, False)

In [None]:
# Per-term TNR difference from overall (per the function name) on madlibs, at a fixed 0.5 threshold.
# NOTE(review): the meaning of the trailing positional `False` is not visible here -- confirm in model_bias_analysis.
model_bias_analysis.per_subgroup_tnr_diff_from_overall(madlibs, madlibs_terms, all_model_families_names, .5, False)

In [None]:
# Per-term AUC difference from overall (per the function name) on madlibs; no threshold argument is passed.
model_bias_analysis.per_subgroup_auc_diff_from_overall(madlibs, madlibs_terms, all_model_families_names, False)

# new

In [None]:
# NOTE(review): this call has exactly the same arguments as the AUC-diff cell just before
# the "# new" heading -- confirm whether both are needed.
model_bias_analysis.per_subgroup_auc_diff_from_overall(madlibs, madlibs_terms, all_model_families_names, False)

In [None]:
# Same FNR-diff computation as the fixed-threshold cell, but passing the per-model
# EER result (`_model_eers_madlibs`) in the threshold position instead of 0.5.
model_bias_analysis.per_subgroup_fnr_diff_from_overall(madlibs, madlibs_terms, all_model_families_names, _model_eers_madlibs,False)

In [None]:
# Same TNR-diff computation as the fixed-threshold cell, but passing the per-model
# EER result (`_model_eers_madlibs`) in the threshold position instead of 0.5.
model_bias_analysis.per_subgroup_tnr_diff_from_overall(madlibs, madlibs_terms, all_model_families_names, _model_eers_madlibs, False)

In [None]:
# Print the first madlibs row to inspect which columns are present.
# (A stray bare name that raised NameError was removed from the top of this cell.)
for _index, first_row in madlibs.iterrows():
    print(first_row)
    break