In [3]:
from detoxify import Detoxify
import pandas as pd

# each model takes in either a string or a list of strings

results = Detoxify('original').predict('example text')

results = Detoxify('unbiased').predict(['example text 1','example text 2'])


In [6]:
Detoxify('original').predict('example text')

{'toxicity': 0.00064783124,
 'severe_toxicity': 0.0001209842,
 'obscene': 0.00018694326,
 'threat': 0.000116240895,
 'insult': 0.0001811189,
 'identity_attack': 0.00014001914}

In [5]:
Detoxify('unbiased').predict(['example text 1','example text 2'])

{'toxicity': [0.00041021101060323417, 0.0004120281373616308],
 'severe_toxicity': [1.5820104408703628e-06, 1.5051891750772484e-06],
 'obscene': [2.8287167879170738e-05, 2.6851148504647426e-05],
 'identity_attack': [7.005838415352628e-05, 7.465355156455189e-05],
 'insult': [8.426110434811562e-05, 8.405766129726544e-05],
 'threat': [2.3083515770849772e-05, 2.2790431103203446e-05],
 'sexual_explicit': [1.4899213056196459e-05, 1.3704860975849442e-05]}

In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

MODEL_NAME = 'unitary/unbiased-toxic-roberta' # change this to your model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [34]:
text = 'I think this problem should be solved now! there is no reason to keep it open any longer. I do not agree with your reasoning.'

inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
outputs = model(**inputs)

probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

# Apply threshold
threshold = 0.5
predictions = (probabilities > threshold).int()

# Get labels
predicted_labels = [model.config.id2label[i] for i, predicted in enumerate(predictions[0]) if predicted == 1]
predicted_labels

[]

In [30]:
model, model.config.id2label

(RobertaForSequenceClassification(
   (roberta): RobertaModel(
     (embeddings): RobertaEmbeddings(
       (word_embeddings): Embedding(50265, 768, padding_idx=1)
       (position_embeddings): Embedding(514, 768, padding_idx=1)
       (token_type_embeddings): Embedding(1, 768)
       (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): RobertaEncoder(
       (layer): ModuleList(
         (0-11): 12 x RobertaLayer(
           (attention): RobertaAttention(
             (self): RobertaSelfAttention(
               (query): Linear(in_features=768, out_features=768, bias=True)
               (key): Linear(in_features=768, out_features=768, bias=True)
               (value): Linear(in_features=768, out_features=768, bias=True)
               (dropout): Dropout(p=0.1, inplace=False)
             )
             (output): RobertaSelfOutput(
               (dense): Linear(in_features=768, out_features=768, bia

In [32]:
LABELS = model.config.id2label

# Create a list of dictionaries, each containing a label and its corresponding score
results = [{'label': LABELS[i], 'score': prob.item()} for i, prob in enumerate(probabilities[0])]

# Sort the results by score in descending order
results.sort(key=lambda x: x['score'], reverse=True)

# Wrap the results in an additional list to match the desired output
results = [results]

# Print the results
print(results)

[[{'label': 'toxicity', 'score': 0.6187502145767212}, {'label': 'obscene', 'score': 0.3156442940235138}, {'label': 'insult', 'score': 0.06359092146158218}, {'label': 'sexual_explicit', 'score': 0.0010141676757484674}, {'label': 'identity_attack', 'score': 0.00018176126468461007}, {'label': 'male', 'score': 0.00016716725076548755}, {'label': 'threat', 'score': 0.00015431440260726959}, {'label': 'jewish', 'score': 0.00011388254642952234}, {'label': 'homosexual_gay_or_lesbian', 'score': 8.736200106795877e-05}, {'label': 'black', 'score': 8.387407433474436e-05}, {'label': 'psychiatric_or_mental_illness', 'score': 6.513501284644008e-05}, {'label': 'severe_toxicity', 'score': 5.325994425220415e-05}, {'label': 'female', 'score': 3.6531579098664224e-05}, {'label': 'christian', 'score': 2.74693902611034e-05}, {'label': 'white', 'score': 2.1288897187332623e-05}, {'label': 'muslim', 'score': 8.408659596170764e-06}]]
