In [1]:
# slient install
!pip install -q -r requirements.txt
!mkdir data

In [2]:
import os, time
from datetime import datetime
from tqdm.notebook import trange, tqdm
import itertools
import requests

import europy
from europy.notebook import load_global_params
from europy.decorator import using_params, bias, data_bias, fairness, accountability, transparency, minimum_functionality, accuracy
from europy.decorator import model_details
from europy.lifecycle import reporting
from europy.lifecycle.reporting import execute_tests, report_model_details, report_model_params, generate_report

import tensorflow as tf
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 

from sklearn.metrics import confusion_matrix, precision_score, roc_auc_score, auc, roc_curve, classification_report

from transformers import BertTokenizer

from model import BertClassifier
from transformers import TFBertModel

[nltk_data] Downloading package stopwords to /home/b/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Download Data
(this is only needed once)

### Download & Load Model
Previously trained using `toxic_comment_classification.ipynb` --> results uploaded to s3

In [3]:
# download model data
url = 'https://blainerothrock-public.s3.us-east-2.amazonaws.com/europy/toxic_comment_classification_v1.zip'
r = requests.get(url, allow_redirects=True)
open('data/toxic_comment_classification_v1.zip', 'wb').write(r.content)

415792766

In [4]:
# unzip model data
!mkdir data
!unzip data/toxic_comment_classification_v1.zip -d data/
!rm data/toxic_comment_classification_v1.zip

mkdir: cannot create directory ‘data’: File exists
Archive:  data/toxic_comment_classification_v1.zip
   creating: data/toxic_comment_classification_v1/
  inflating: data/toxic_comment_classification_v1/params.yml  
  inflating: data/toxic_comment_classification_v1/submission.csv  
  inflating: data/toxic_comment_classification_v1/model_details.yml  
  inflating: data/toxic_comment_classification_v1/2020-11-23 08:42_bert.h5  
  inflating: data/toxic_comment_classification_v1/model.py  
  inflating: data/toxic_comment_classification_v1/obscene_words.txt  


### Download Jigsaw Dataset
From [Kaggle](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)
Requires a Kaggle account and a auth token stored at `~/.kaggle/kaggle.json`. See [Kaggle's API documentation](https://github.com/Kaggle/kaggle-api#api-credentials) for more details.

In [5]:
# download kaggle dataset
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

jigsaw-toxic-comment-classification-challenge.zip: Skipping, found more recently modified local copy (use --force to force download)


In [6]:
# unzip kaggle data and delete zip
!unzip -o jigsaw-toxic-comment-classification-challenge.zip -d data/
!rm jigsaw-toxic-comment-classification-challenge.zip
!unzip -o data/sample_submission.csv.zip -d data/
!rm data/sample_submission.csv.zip
!unzip -o data/train.csv.zip -d data/
!rm data/train.csv.zip
!unzip -o data/test.csv.zip -d data/
!rm data/test.csv.zip
!unzip -o data/test_labels.csv.zip -d data/
!rm data/test_labels.csv.zip

Archive:  jigsaw-toxic-comment-classification-challenge.zip
  inflating: data/sample_submission.csv.zip  
  inflating: data/test.csv.zip       
  inflating: data/test_labels.csv.zip  
  inflating: data/train.csv.zip      
Archive:  data/sample_submission.csv.zip
  inflating: data/sample_submission.csv  
Archive:  data/train.csv.zip
  inflating: data/train.csv          
Archive:  data/test.csv.zip
  inflating: data/test.csv           
Archive:  data/test_labels.csv.zip
  inflating: data/test_labels.csv    


## Set Up
Start here if data is already downloaded

In [7]:
# load global params from the model training
params = load_global_params('data/toxic_comment_classification_v1/params.yml')
model_path = 'data/toxic_comment_classification_v1/2020-11-23 08:42_bert.h5'

  - global.pre_trained_model: bert-base-uncased
  - global.max_seq_len: 128
  - global.train_percent: 0.1
  - global.batch_size: 32
  - global.num_epochs: 1
  - global.label_cols: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
  - global.test_size: 0.1
  - global.learning_rate: 2e-05


In [8]:
# create BERT tokenizer from Huggingface
tokenizer = BertTokenizer.from_pretrained(
    params['pre_trained_model'],
    do_lower_case=True
)

In [9]:
# Load the fine-tuned model from downloaded weights

def load_model(weight_path="data/models/2020-11-23 08:42_bert.h5", params=None):
    model = BertClassifier(TFBertModel.from_pretrained(params['pre_trained_model']), len(params['label_cols']))
    encoded = tokenizer("init the model", return_tensors='tf')
    model(**encoded)
    model.load_weights(weight_path)
    return model

model = load_model(model_path, params)
model.summary()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
Model: "bert_classifier"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
tf_ber

In [11]:
# load Jigsaw data and submission from model (testing prediction)

submission_df = pd.read_csv('data/toxic_comment_classification_v1/submission.csv')
test_labels = pd.read_csv('data/test_labels.csv')
test_labels_gf = test_labels.copy()

# rename labels to combine with testing predictions
cols_rn = {}
for label in params['label_cols']:
    cols_rn[label] = label+'_gt'
test_labels_gf.rename(columns=cols_rn, inplace=True)

test_df = pd.read_csv('data/test.csv')
train_df = pd.read_csv('data/train.csv')

obscene_words = open("data/toxic_comment_classification_v1/obscene_words.txt", "r").readlines()
obscene_words = [w.strip('\n').lower() for w in obscene_words]


# merge generated submission
# (test test_df contains both prediction and ground truth labels)
test_df = test_df.merge(submission_df, on='id').dropna()
test_df = test_df.merge(test_labels_gf, on='id').dropna()

In [12]:
# initialize the model report with model details and parameters from the downloaded model
report_model_details('data/toxic_comment_classification_v1/model_details.yml')
report_model_params('data/toxic_comment_classification_v1/params.yml')

## Test: Data

### Accurarcy Tests
Check the Precision, Recall and Accurarcy of the model (assuming a 0.5 classification threshold)

In [13]:
@accuracy(
    "Classification Report",
    "Test precision, recall, accurarcy, and f1-score of each class as if >0.5 is considered classified as a label")
def test_classificaiton_reprot(df, labels, pos=0.5):
    results = []
    for l in labels:
        preds = np.array(list(map(lambda x: 1 if x > pos else 0, df[label].to_numpy())))
        gt = np.array(list(map(lambda x: 1 if x == 1.0 else 0, df[l+'_gt'].to_numpy())))

        result = classification_report(gt, preds, labels=[0,1], target_names=[f'not {l}', l], output_dict=True)

        r = {'label': l}
        r['accuracy'] = result['accuracy']
        r['weighted_precision'] = result['weighted avg']['precision']
        r['weighted_recall'] = result['weighted avg']['recall']
        r['weighted_f1_score'] = result['weighted avg']['f1-score']
        r['support_pos'] = result[l]['support']
        r['f1_score_pos'] = result[l]['f1-score']
        r['support_neg'] = result[f'not {l}']['support']
        r['f1_score_neg'] = result[f'not {l}']['f1-score']

        results.append(r)


    return pd.DataFrame.from_dict(results)

@accuracy(
    "ROC AUC", 
    "Area Under the Curve score for each classification (this was the training metric)"
)
def calc_roc_auc(df, labels):
    results = []
    for l in labels:
        preds = df[label].to_numpy()
        gt = df[l+'_gt'].to_numpy()

        fpr, tpr, thresholds = roc_curve(gt, preds, pos_label=1)
        auc_score = auc(fpr, tpr)
        results.append({
            'label': l,
            'auc_score': auc_score
        })
    
    return pd.DataFrame.from_dict(results)

In [14]:
execute_tests(df=test_df, labels=params['label_cols'])


Execute - Classification Report (['accuracy'])
	PASS
Execute - ROC AUC (['accuracy'])
	PASS
Total Tests: 2
Passing: 2
Failing: 0


Unnamed: 0,key,description,labels,result,figures,success
0,Classification Report,"Test precision, recall, accurarcy, and f1-scor...",[accuracy],label accuracy weighted_precision...,[],True
1,ROC AUC,Area Under the Curve score for each classifica...,[accuracy],label auc_score 0 toxic ...,[],True


In [15]:
generate_report(clear_report=False)

Report output: file:///home/b/.europy/reports/EuroPy_Test_Report_06122020_143942/report.md


### Transparency and data bias tests

#### Compare common words from training and Testing Data

In [16]:
def bleep(w, obscene_words):
    if w in obscene_words:
        mask = len(w[1:-1]) * '*'
        word_mask = w[0] + mask + w[-1]
        return word_mask
    return w

In [17]:
def generate_word_freq(df, label, threshold=0.9):
    result = {'label': label}
    word_freq = {}
    
    sps = set(stopwords.words('english'))
    
    sub_df = df.where(df[label] > threshold).dropna()
    
    comment_words = sub_df.comment_text.to_numpy().flatten().tolist()
    comment_words = " ".join(comment_words).split(" ")
    comment_words = [w.lower() for w in comment_words]
    comment_words = list(filter(lambda w: w.isalpha() and len(w) > 2 and w not in sps, comment_words))

    for w in comment_words:
        word_freq[w] = word_freq.get(w, 0) + 1

    sorted_freq = sorted(word_freq.items(), key=lambda x: x[1], reverse=True) 

    for i in range(10):
        result[f'{i}_word'] = sorted_freq[i][0]
        result[f'{i}_count'] = sorted_freq[i][1]

    return result

@transparency(
    "Top 10 Word Counts by Label for testing data", 
    "Comparision of word counts in testing data. Includes top 10 words"
)
def test_data_word_count(df, labels, threshold=0.75):
    results = []
    for l in labels:
        results.append(generate_word_freq(df, l, threshold=threshold))

    return pd.DataFrame.from_dict(results)

In [18]:
execute_tests(df=test_df, labels=params['label_cols'])

Execute - Top 10 Word Counts by Label for testing data (['transparency'])
	PASS
Total Tests: 1
Passing: 1
Failing: 0


Unnamed: 0,key,description,labels,result,figures,success
0,Top 10 Word Counts by Label for testing data,Comparision of word counts in testing data. In...,[transparency],label 0_word 0_count 1_word 1_c...,[],True


In [19]:
@transparency(
    "Top 10 Word Counts by Label for training data", 
    "Comparision of word counts in training data. Includes top 10 words"
)
def test_data_word_count(df, labels, threshold=0.75):
    results = []
    for l in labels:
        results.append(generate_word_freq(df, l, threshold=threshold))

    return pd.DataFrame.from_dict(results)

In [20]:
execute_tests(df=train_df, labels=params['label_cols'])

Execute - Top 10 Word Counts by Label for training data (['transparency'])
	PASS
Total Tests: 1
Passing: 1
Failing: 0


Unnamed: 0,key,description,labels,result,figures,success
0,Top 10 Word Counts by Label for training data,Comparision of word counts in training data. I...,[transparency],label 0_word 0_count 1_word 1_...,[],True


In [21]:
generate_report(clear_report=False)

Report output: file:///home/b/.europy/reports/EuroPy_Test_Report_06122020_143942/report.md


#### Class Distrubutions

In [22]:
def count_classes(df, threshold, label_cols):
    counts = [0] * len(label_cols)
    for i, label in enumerate(label_cols):
        values = df[label].to_list()
        for value in values:
            if value > threshold:
                counts[i] += 1
    return counts
    

def class_dist(train_df, test_df, test_labels_df, label_cols, threshold=0.9, no_class=False, plot_name="Class_Distribution", plots={}):    
    train_counts = count_classes(train_df, threshold, label_cols)
    test_pred_counts = count_classes(test_df, threshold, label_cols)
    test_label_counts = count_classes(test_labels_df, threshold, label_cols)
    
    if no_class:
        label_cols.append('None')
        train_counts.append(train_df.shape[0] - sum(train_counts))
        test_pred_counts.append(test_df.shape[0] - sum(test_pred_counts))
        test_label_counts.append(test_labels_df.shape[0] - sum(test_label_counts))
    
    dist_data = []
    for i, label in enumerate(label_cols):
        dist = {'label': label}
        dist['train_actual'] = train_counts[i]
        dist['test_predictions'] = test_pred_counts[i]
        dist['test_actuals'] = test_label_counts[i]

    x = np.arange(len(label_cols))  # the label locations
    width = 0.2
    
    fig, ax = plt.subplots()
    rects1 = ax.bar(x, train_counts, width, label='Train Actual')
    rects2 = ax.bar(x + width, test_pred_counts, width, label='Test Predictions')
    rects3 = ax.bar(x - width, test_label_counts, width, label='Test Actual')
    
    ax.set_ylabel('Count')
    ax.set_title('Counts by Classification')
    ax.set_xticks(x)
    ax.set_xticklabels(label_cols)
    ax.legend()
    
    fig.tight_layout()
   
    plots[plot_name] = plt
    
    return pd.DataFrame.from_dict(dist_data)

@data_bias(
    "Classification Counts by Training Actuals, Testing Actuals, and Training Predictions",
    "Uses a threshold of 0.75 to determine classification"
)
def class_dist_class_only(train_df, test_df, test_labels, label_cols, plots={}):
    return class_dist(
        train_df, 
        test_df, 
        test_labels, 
        label_cols, 
        threshold=0.75, 
        no_class=False, 
        plot_name="Class_Distribution_Class_Only",
        plots=plots
    )

@data_bias(
    "Classification Counts by Training Actuals, Testing Actuals, and Training Predictions -- Includes No Classification",
    "Uses a threshold of 0.75 to determine classification"
)
def class_dist_including_non_class(train_df, test_df, test_labels, label_cols, plots={}):
    return class_dist(
        train_df,
        test_df,
        test_labels,
        label_cols,
        threshold=0.75,
        no_class=True, 
        plot_name="Class_Distribution_With_Non_Class",
        plots=plots
    )

In [23]:
execute_tests(train_df=train_df, test_df=test_df, test_labels=test_labels, label_cols=params['label_cols'].copy())

Execute - Classification Counts by Training Actuals, Testing Actuals, and Training Predictions (['data-bias'])
	PASS
Execute - Classification Counts by Training Actuals, Testing Actuals, and Training Predictions -- Includes No Classification (['data-bias'])
	PASS
Total Tests: 2
Passing: 2
Failing: 0


Unnamed: 0,key,description,labels,result,figures,success
0,"Classification Counts by Training Actuals, Tes...",Uses a threshold of 0.75 to determine classifi...,[data-bias],Empty DataFrame Columns: [] Index: [],[ReportFigure(\n\ttitle: Class_Distribution_Cl...,True
1,"Classification Counts by Training Actuals, Tes...",Uses a threshold of 0.75 to determine classifi...,[data-bias],Empty DataFrame Columns: [] Index: [],[ReportFigure(\n\ttitle: Class_Distribution_Wi...,True


In [24]:
generate_report(clear_report=False)

Report output: file:///home/b/.europy/reports/EuroPy_Test_Report_06122020_143942/report.md


## Test Minimum Functionality

In [25]:
def test_model(model, phrases, label_cols=params['label_cols']):
    results = []
    
    for phrase in phrases:
        encoded = tokenizer(phrase, return_tensors='tf')
        output = model(**encoded).numpy()[0]
        result = {}
        result["phrase"] = phrase
        for i in range(len(output)):
            result[label_cols[i]] = f"{output[i]:.2f}"
            

        results.append(result)
    
    return pd.DataFrame.from_dict(results, dtype='float')

In [26]:
@minimum_functionality(
    "Easily identifable non-toxic phrases", 
    "Phrases should be very low probability in every category"
)
def non_toxic_identifible_phrases(model):
    nice_phrases = [
        'Hey, it was nice to see you the other day',
        'whoa, I never thought of that',
        'very cool idea',
        'In general I dissagree, but I understand where you are coming from.' ,
        'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod',
        'quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima'
    ]
    
    return test_model(model, nice_phrases)

In [27]:
@minimum_functionality(
    "Easily Identify toxic comment phrases",
    "Phrases pulled from the tests set."
)
def toxic_comment_identifible_phrases(model):
    phrases = [
        '== Arabs are committing genocide in Iraq, but no protests in Europe. == \n\n May Europe also burn in hell.',
        'HE IS A GHAY ASS FUCKER@@!!', 'Simple: You are stupid!',
        'MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MOVIES. HE HAS SO MUCH BUTTSEX THAT HIS ASSHOLE IS NOW BIG ENOUGH TO BE CONSIDERED A COUNTRY.',
        'Yo bitch Ja Rule is more succesful then you\'ll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,',
        '== wat the heck? == \n\n what the hell is wrong with you? \n\n Next time you vandalize my page i\'m going over there to Fayetteville, Arkansas and beating the crap out of you!',
        'If ya not... still fu*k u',
        ':Eek, but shes cute in an earthy kind of way. Cant sing for shit though. Thanks for giving me an unhappy memory.'
    ]

    return test_model(model, phrases)

In [28]:
@minimum_functionality(
    "Model results to subtle put downs", 
    "Identified by Erin Leonard Ph.D. ([Psychology Today](https://www.psychologytoday.com/us/blog/peaceful-parenting/201902/backhanded-comments-digs-and-subtle-put-downs))"
)
def test_subtle_put_downs(model):
    phrases = [
        'You\'d be such a great supervisor if you only knew how to be strong with people.',
        'Why are you so quiet? You need to wake up.',
        'If you had more heart, you’d be a great runner.',
        'You’d be so pretty if you grew out your hair.',
        'I’m so sorry to hear about your divorce. Your poor kids.'
    ]
    
    return test_model(model, phrases)

## Bias Tests

In [29]:
@bias(
    "Toxic Phrases in Spanish",
    "Same phrases from `Easily Identify toxic comment phrases`, translated to Spanish using Google Translate."
)
def toxic_comment_identifible_phrases_spanish(model):
    phrases = [
        '== Los árabes están cometiendo genocidio en Irak, pero no hay protestas en Europa. == \ n \ n Que Europa también arda en el infierno',
        '¡ES UN FUCKER GHAY ASS @ @ !! ',' Simple: ¡Eres estúpida!',
        'MEL GIBSON ES UNA PERRA NAZI QUE HACE PELÍCULAS DE MIERDA. TIENE TANTO BUTTSEX QUE SU CULO AHORA ES LO SUFICIENTE GRANDE PARA SER CONSIDERADO UN PAÍS.',
        'Tu puta Ja Rule es más exitosa de lo que nunca estarás contigo y odiándote, mofuckas tristes ... Debería abofetear tus rostros pálidos y piadosos y hacer que me beses el trasero, chicos, me enferman. Ja rule se trata de orgullo por el músico. no le digas esa mierda. y no hay nada malo en ser como Tupac, él también era un hermano ... jodidos chicos blancos hacen las cosas bien la próxima vez.',
        '== ¿qué diablos? == \ n \ n ¿qué diablos te pasa? \ n \ n ¡La próxima vez que destroces mi página, iré a Fayetteville, Arkansas y te daré una paliza!'
    ]

    return test_model(model, phrases)

In [30]:
@bias(
    "Context use of the word 'bitch'", 
    "Depending on context and who is using the term, 'bitch can be offensive to empowering' -- examples from [reallifeglobal.com](https://reallifeglobal.com/btch-please-how-to-use-the-word-bitch-correctly/)"
)
def contextual_toxicity(model):
    phrases = [
        "My boss bitched me out for being 10 minutes late.",
        "that party was bitchin",
        "Calculus is a bitch",
        "Monday morning is a bitch",
        "I do everything for my boss. My friends at work tell me that I'm his bitch",
        "12 hours in the car today and my friends made me ride bitch that whole time.",
        'You’re being really bitchy today. Why can’t you be nicer to me?',
        'I wish people would just stop bitching and moaning about corruption in politics and do something about it.',
        "I don’t know what’s wrong with her. She’s being a bitch"
    ]

    uses = [
        'to bitch someout out (generally not offensive)',
        'Bitchin’ as an Adjective',
        'Life’s a Bitch',
        'Life’s a Bitch',
        'calling a man a bitch',
        'to ride bitch in a car (middle back-seat)',
        'Bitchy/ing as an Adjective',
        'Bitchy/ing as an Adjective',
        'calling a women a bitch: annoying/unpleasent'
    ]

    results = test_model(model, phrases)
    results['use'] = uses
    
    return results

In [31]:
execute_tests(model=model)

Execute - Easily identifable non-toxic phrases (['minimum-functionality'])
	PASS
Execute - Easily Identify toxic comment phrases (['minimum-functionality'])
	PASS
Execute - Model results to subtle put downs (['minimum-functionality'])
	PASS
Execute - Toxic Phrases in Spanish (['bias'])
	PASS
Execute - Context use of the word 'bitch' (['bias'])
	PASS
Total Tests: 5
Passing: 5
Failing: 0


Unnamed: 0,key,description,labels,result,figures,success
0,Easily identifable non-toxic phrases,Phrases should be very low probability in ever...,[minimum-functionality],...,[],True
1,Easily Identify toxic comment phrases,Phrases pulled from the tests set.,[minimum-functionality],...,[],True
2,Model results to subtle put downs,Identified by Erin Leonard Ph.D. ([Psychology ...,[minimum-functionality],...,[],True
3,Toxic Phrases in Spanish,Same phrases from `Easily Identify toxic comme...,[bias],...,[],True
4,Context use of the word 'bitch',Depending on context and who is using the term...,[bias],...,[],True


In [32]:
generate_report(clear_report=False)

Report output: file:///home/b/.europy/reports/EuroPy_Test_Report_06122020_143942/report.md


### Find Words that Flip Any Sentence

In [33]:
def flip_words(model, base_phrase, threshold=0.5, label_cols=params['label_cols']):
    from random import randint
    phrases = []
    
    for word in obscene_words:
        base_phrase_split = base_phrase.split(" ")
        base_phrase_split.insert(randint(0, len(base_phrase_split)), word)
        phrases.append(" ".join(base_phrase_split))
    
    # phrases.insert(0, base_phrase)
    
    flip_words = [{'label': label, 'words': []} for label in label_cols]

    results = test_model(model, phrases)
    for i, row in results.iterrows():
        record = {}
        obscene_word = obscene_words[i]
        
        for i, label in enumerate(label_cols):
            if row[label] >= threshold:
                flip_words[i]['words'].append(obscene_word)
    
    return flip_words

In [34]:
@transparency(
    "Always Toxic words", 
    "identify a set of words that will always flip a non-toxic comment, by class. Classification is consider >0.5. The Latin phrase `Lorem ipsum dolor sit amet. consectetur adipiscing elit, sed do eiusmod`"
)
@bias()
def find_flip_words(model, phrases, label_cols):
    all_words = [{'label': label, 'words': []} for label in label_cols]

    for phrase in phrases:
        
        for i in range(len(phrase.split(" "))):
            words_dict = flip_words(model, phrase)
            for i, label in enumerate(label_cols):
                all_words[i]['words'].append(words_dict[i]['words'])

    
    for i, label in enumerate(label_cols):
        all_words[i]['words'] = set.intersection(*map(set,all_words[i]['words']))
    
    return pd.DataFrame.from_dict(all_words)

In [35]:
execute_tests(model=model, phrases=["Lorem ipsum dolor sit amet. consectetur adipiscing elit, sed do eiusmod"], label_cols=params['label_cols'])

Execute - Always Toxic words (['bias', 'transparency'])
	PASS
Total Tests: 1
Passing: 1
Failing: 0


Unnamed: 0,key,description,labels,result,figures,success
0,Always Toxic words,identify a set of words that will always flip ...,"[bias, transparency]",label ...,[],True


In [36]:
generate_report(clear_report=False)

Report output: file:///home/b/.europy/reports/EuroPy_Test_Report_06122020_143942/report.md
