In [1]:
import pandas as pd


# Read data/toxicbias_train.csv and keep only the rows whose 'rationale'
# column equals 'prejudice against muslims'.
df_muslim = pd.read_csv("data/toxicbias_train.csv").loc[
    lambda frame: frame['rationale'] == 'prejudice against muslims'
]

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from src.Helpers import *
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import pandas as pd
import torch
import time


def standardise_results(results):
    """Return a new dict holding only the three NLI scores, in a fixed order.

    Args:
        results: Mapping that provides the keys 'contradiction', 'neutral'
            and 'entailment'. Any other keys are ignored.

    Returns:
        dict: ``{'contradiction': ..., 'neutral': ..., 'entailment': ...}``
        with the values looked up in ``results``.

    Raises:
        Whatever ``results[key]`` raises when one of the three keys is
        missing (``KeyError`` for a plain dict).
    """
    ordered_keys = ('contradiction', 'neutral', 'entailment')
    return {key: results[key] for key in ordered_keys}


def convert_probabilities(probabilities, label_mapping):
    """Turn the first row of a probability batch into labelled percentages.

    Args:
        probabilities: Object exposing ``tolist()`` that returns a nested
            list; only element ``[0]`` (the first row) is used.
        label_mapping: Iterable of label names, paired positionally with the
            values of that first row.

    Returns:
        dict: label name -> probability scaled to a percentage and rounded
        to one decimal place.
    """
    first_row = probabilities.tolist()[0]
    percentages = {}
    for label, value in zip(label_mapping, first_row):
        percentages[label] = round(float(value) * 100, 1)
    return percentages



def get_random_samples(csv_filename, num_samples, random_state=None):
    """Load a CSV file and return a random subset of its rows.

    Args:
        csv_filename: Path (or anything ``pd.read_csv`` accepts) of the CSV.
        num_samples: Number of rows to draw, without replacement.
        random_state: Optional seed forwarded to ``DataFrame.sample``. Pass a
            fixed value to get the same rows on every call; the default
            ``None`` keeps the original unseeded behaviour.

    Returns:
        pandas.DataFrame: ``num_samples`` rows drawn from the file.

    Raises:
        ValueError: Raised by pandas when ``num_samples`` exceeds the number
            of rows in the file.
    """
    df = pd.read_csv(csv_filename)
    return df.sample(n=num_samples, random_state=random_state)


def benchmark_test(premise, hypothesis, model, num_runs):
    """Measure the average wall-clock time of ``model(premise, hypothesis)``.

    Args:
        premise: First argument forwarded to ``model`` on every run.
        hypothesis: Second argument forwarded to ``model`` on every run.
        model: Callable invoked as ``model(premise, hypothesis)``; its return
            value is discarded.
        num_runs: Number of timed calls; must be at least 1.

    Returns:
        tuple: ``(device, avg_execution_time)`` where ``device`` is the
        ``torch.device`` detected here ("cuda" if available, else "cpu") and
        ``avg_execution_time`` is the mean duration in seconds.

    Raises:
        ValueError: If ``num_runs`` is not positive (there is nothing to
            average).
    """
    if num_runs <= 0:
        raise ValueError("num_runs must be a positive integer")

    # The device is only reported back; this function does not move `model`.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    timings = []
    for _ in range(num_runs):
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        model(premise, hypothesis)
        timings.append(time.perf_counter() - start_time)
    # NOTE(review): if `model` launches CUDA work without synchronising, the
    # timings may exclude GPU completion — confirm against the model helpers.

    avg_execution_time = sum(timings) / len(timings)
    return device, avg_execution_time


In [92]:
# First ten comments of the filtered frame, coerced to plain Python strings.
premises = list(map(str, df_muslim['comment_text'].head(10).values))

# Hypothesis statements that are scored against the premises.
hypotheses = [
    'The use of violence is encouraged or desired against a religion.',
    'A religion should be discriminated against.',
    'Members of a religion should be excluded and marginalized.',
    'Members of a religion should remain in their respective countries.',
    'Members of a religion are compared to animals.',
    'A religion or members of a religion are compared to devils or demons.',
    'The integration of a religion should be avoided in a country or society.',
    'A religion or members of a religion are stupid.',
    'A religion is inferior when compared to another religion.',
    'Religious practices of a particular faith should be restricted or banned in public spaces.',
    'Religious attire or symbols should be prohibited.',
    'A religion promotes violence or intolerance.',
    'Members of a religion are a threat.',
    'Members of a religion are not trustworthy or loyal.',
]


In [5]:
# Relax pandas' display limits: no cap on the number of columns or rows shown,
# and up to 500 characters per cell before the text is truncated.
for _option, _value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.max_colwidth', 500),
):
    pd.set_option(_option, _value)

In [93]:
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Assuming you have already defined DEVICE and the convert_probabilities and standardise_results functions.

def bart_nli_batched(premises, hypotheses):
    """Score every (premise, hypothesis) pair with facebook/bart-large-mnli.

    The model and tokenizer are loaded on every call and the model is moved
    to ``DEVICE``. Pairs are evaluated one at a time.

    Args:
        premises: A premise or a list/tuple of premises. Anything that is not
            a list or tuple is wrapped in a single-element list. Premises are
            used as dictionary keys, so they must be hashable.
        hypotheses: A hypothesis or a list/tuple of hypotheses (wrapped the
            same way).

    Returns:
        dict: premise -> ``(labels, probabilities_list)`` where ``labels`` is
        the predicted label per hypothesis and ``probabilities_list`` holds,
        per hypothesis, the dict produced by ``standardise_results`` from the
        percentages computed by ``convert_probabilities``. A premise that
        occurs more than once keeps only its last result.
    """
    model_name = 'facebook/bart-large-mnli'
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model.to(DEVICE)

    # Convert scalar inputs to lists
    if not isinstance(premises, (list, tuple)):
        premises = [premises]
    if not isinstance(hypotheses, (list, tuple)):
        hypotheses = [hypotheses]

    # Label names by logit index.
    # NOTE(review): assumed to match the checkpoint's id2label order — confirm.
    label_mapping = ['contradiction', 'neutral', 'entailment']
    results = {}

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for premise in premises:
            labels = []
            probabilities_list = []

            for hypothesis in hypotheses:
                # Tokenize the input pair
                inputs = tokenizer(premise, hypothesis, return_tensors='pt', padding=True, truncation=True)

                # Run the input through the model
                logits = model(**inputs.to(DEVICE)).logits

                # Get probabilities and the winning label for the pair
                probabilities = torch.softmax(logits, dim=1)
                # .item() yields a plain int instead of indexing with a tensor.
                label = label_mapping[probabilities.argmax(dim=1).item()]

                percentages = convert_probabilities(probabilities, label_mapping)

                labels.append(label)
                probabilities_list.append(standardise_results(percentages))

            results[premise] = (labels, probabilities_list)

    return results


In [94]:
# Work on a copy so the filtered source frame stays untouched.
df = df_muslim.copy()
results = bart_nli_batched(premises, hypotheses)

# Look up each comment in the results; comments that were not scored get
# (None, None) so they can be dropped afterwards.
labels_column = []
probabilities_column = []
not_scored = (None, None)

for premise in df['comment_text']:
    labels, probabilities = results.get(premise, not_scored)
    labels_column.append(labels)
    probabilities_column.append(probabilities)

df['labels'] = labels_column
df['probabilities'] = probabilities_column

# Keep only the rows that actually received labels.
filtered_df = df.dropna(subset=['labels'])



In [97]:
# Display the rows that kept a 'labels' value after the dropna above.
filtered_df

Unnamed: 0,comment_text,identity_attack_x,bias,target,category,rationale,labels,probabilities
1,This whole tragedy is the face and body and physical manifestation of fundamental Islam. Striving to conquer subdue and to bring under control. Do not blame the world. Blame fundamental Islam for this tragedy.,0.5,bias,muslims,religion,prejudice against muslims,"[entailment, contradiction, contradiction, contradiction, contradiction, contradiction, contradiction, contradiction, contradiction, neutral, contradiction, entailment, entailment, contradiction]","[{'contradiction': 2.3, 'neutral': 21.4, 'entailment': 76.3}, {'contradiction': 97.3, 'neutral': 2.6, 'entailment': 0.2}, {'contradiction': 95.7, 'neutral': 4.0, 'entailment': 0.3}, {'contradiction': 98.7, 'neutral': 1.1, 'entailment': 0.2}, {'contradiction': 81.8, 'neutral': 16.9, 'entailment': 1.3}, {'contradiction': 47.0, 'neutral': 43.5, 'entailment': 9.5}, {'contradiction': 77.5, 'neutral': 22.1, 'entailment': 0.4}, {'contradiction': 68.3, 'neutral': 31.0, 'entailment': 0.8}, {'contradi..."
4,Another Muslim commits a terrorist act and democrats will once again blame the NRA. NYC New Jersey Minnesota Orlando San Bernardino. Get it yet?,0.5,bias,muslims,religion,prejudice against muslims,"[entailment, contradiction, contradiction, contradiction, contradiction, contradiction, neutral, contradiction, contradiction, neutral, neutral, entailment, entailment, neutral]","[{'contradiction': 7.1, 'neutral': 22.3, 'entailment': 70.6}, {'contradiction': 89.3, 'neutral': 10.4, 'entailment': 0.3}, {'contradiction': 77.7, 'neutral': 22.0, 'entailment': 0.3}, {'contradiction': 54.3, 'neutral': 45.5, 'entailment': 0.2}, {'contradiction': 73.7, 'neutral': 23.4, 'entailment': 3.0}, {'contradiction': 54.7, 'neutral': 42.3, 'entailment': 3.0}, {'contradiction': 28.1, 'neutral': 71.6, 'entailment': 0.2}, {'contradiction': 59.3, 'neutral': 39.5, 'entailment': 1.2}, {'contr..."
8,AntiMuslim protesters shout hateful rhetoric. Muslim fundamentalist nutjobs murder innocent people. Not quite the same thing is it?,0.8,bias,muslims,religion,prejudice against muslims,"[entailment, contradiction, neutral, neutral, neutral, neutral, contradiction, contradiction, neutral, neutral, neutral, entailment, entailment, neutral]","[{'contradiction': 21.8, 'neutral': 30.8, 'entailment': 47.4}, {'contradiction': 74.2, 'neutral': 24.1, 'entailment': 1.7}, {'contradiction': 37.5, 'neutral': 60.5, 'entailment': 2.0}, {'contradiction': 48.7, 'neutral': 50.1, 'entailment': 1.2}, {'contradiction': 13.4, 'neutral': 84.0, 'entailment': 2.6}, {'contradiction': 10.7, 'neutral': 79.1, 'entailment': 10.1}, {'contradiction': 59.9, 'neutral': 39.8, 'entailment': 0.3}, {'contradiction': 73.9, 'neutral': 25.0, 'entailment': 1.1}, {'con..."
17,I learned that those who are both religious and righteous are some of the most dangerous people on this earth. Yes radical Islam is very dangerous. NeverHillary,0.7,bias,muslims,religion,prejudice against muslims,"[neutral, contradiction, neutral, neutral, contradiction, contradiction, neutral, contradiction, contradiction, neutral, neutral, neutral, entailment, neutral]","[{'contradiction': 13.8, 'neutral': 73.6, 'entailment': 12.7}, {'contradiction': 94.1, 'neutral': 5.8, 'entailment': 0.1}, {'contradiction': 27.1, 'neutral': 72.0, 'entailment': 0.9}, {'contradiction': 22.6, 'neutral': 75.7, 'entailment': 1.7}, {'contradiction': 71.9, 'neutral': 25.6, 'entailment': 2.4}, {'contradiction': 61.2, 'neutral': 36.1, 'entailment': 2.7}, {'contradiction': 0.7, 'neutral': 96.1, 'entailment': 3.1}, {'contradiction': 93.3, 'neutral': 6.2, 'entailment': 0.5}, {'contrad..."
20,Wow how times have changed All hail the muslim. It wasnt so very long ago that Christians were told to remove all crosses from government properties in Hawaii and even some privately owned churches were heartily challenged. Now we have foreign born muslims on the front page and their right to be is heavily protected. Go figure.,0.8,bias,muslims,religion,prejudice against muslims,"[contradiction, contradiction, contradiction, neutral, contradiction, contradiction, contradiction, contradiction, contradiction, contradiction, contradiction, neutral, contradiction, contradiction]","[{'contradiction': 78.6, 'neutral': 21.0, 'entailment': 0.3}, {'contradiction': 93.8, 'neutral': 6.1, 'entailment': 0.2}, {'contradiction': 99.1, 'neutral': 0.9, 'entailment': 0.0}, {'contradiction': 47.8, 'neutral': 49.9, 'entailment': 2.3}, {'contradiction': 96.0, 'neutral': 3.4, 'entailment': 0.6}, {'contradiction': 93.5, 'neutral': 5.3, 'entailment': 1.2}, {'contradiction': 89.0, 'neutral': 11.0, 'entailment': 0.0}, {'contradiction': 70.1, 'neutral': 29.3, 'entailment': 0.6}, {'contradic..."
56,PRips This sounds like a muslim trying to sound intelligent. Read the post again.,0.5,bias,muslims,religion,prejudice against muslims,"[neutral, contradiction, contradiction, contradiction, contradiction, contradiction, neutral, contradiction, neutral, neutral, neutral, neutral, neutral, neutral]","[{'contradiction': 36.2, 'neutral': 58.9, 'entailment': 5.0}, {'contradiction': 87.9, 'neutral': 11.9, 'entailment': 0.2}, {'contradiction': 85.3, 'neutral': 14.6, 'entailment': 0.2}, {'contradiction': 50.1, 'neutral': 49.5, 'entailment': 0.4}, {'contradiction': 77.0, 'neutral': 20.6, 'entailment': 2.4}, {'contradiction': 64.6, 'neutral': 33.2, 'entailment': 2.1}, {'contradiction': 10.1, 'neutral': 89.6, 'entailment': 0.2}, {'contradiction': 73.2, 'neutral': 25.7, 'entailment': 1.2}, {'contr..."
57,I got your point NeoNazi GOOD Muslim BAD...MAGA,0.685714,bias,muslims,religion,prejudice against muslims,"[entailment, contradiction, contradiction, neutral, contradiction, entailment, neutral, contradiction, entailment, neutral, neutral, neutral, entailment, neutral]","[{'contradiction': 2.1, 'neutral': 43.7, 'entailment': 54.2}, {'contradiction': 81.0, 'neutral': 18.0, 'entailment': 0.9}, {'contradiction': 85.8, 'neutral': 14.0, 'entailment': 0.2}, {'contradiction': 41.1, 'neutral': 58.7, 'entailment': 0.2}, {'contradiction': 49.7, 'neutral': 34.9, 'entailment': 15.4}, {'contradiction': 8.1, 'neutral': 31.9, 'entailment': 60.1}, {'contradiction': 31.1, 'neutral': 68.7, 'entailment': 0.2}, {'contradiction': 79.0, 'neutral': 19.6, 'entailment': 1.3}, {'cont..."
59,He was sick with hatred of white people who were his victims and should have never been aloud in this country cant you see hes a middle eastern muslim.,0.8,bias,muslims,religion,prejudice against muslims,"[neutral, contradiction, neutral, entailment, contradiction, neutral, neutral, contradiction, contradiction, entailment, neutral, entailment, neutral, contradiction]","[{'contradiction': 32.7, 'neutral': 33.9, 'entailment': 33.4}, {'contradiction': 82.0, 'neutral': 13.8, 'entailment': 4.2}, {'contradiction': 35.1, 'neutral': 51.8, 'entailment': 13.2}, {'contradiction': 0.9, 'neutral': 15.1, 'entailment': 83.9}, {'contradiction': 64.5, 'neutral': 33.2, 'entailment': 2.3}, {'contradiction': 46.5, 'neutral': 50.0, 'entailment': 3.5}, {'contradiction': 3.8, 'neutral': 59.3, 'entailment': 36.8}, {'contradiction': 62.6, 'neutral': 36.7, 'entailment': 0.7}, {'con..."
68,Since all do it...none are guilty or all guilty to the same degree? Maybe a couple of nuclear bombs should be dropped on a Muslim country.,0.7,bias,muslims,religion,prejudice against muslims,"[entailment, entailment, neutral, contradiction, contradiction, contradiction, contradiction, contradiction, contradiction, neutral, neutral, neutral, entailment, neutral]","[{'contradiction': 1.6, 'neutral': 11.9, 'entailment': 86.6}, {'contradiction': 3.3, 'neutral': 18.4, 'entailment': 78.3}, {'contradiction': 12.4, 'neutral': 52.4, 'entailment': 35.2}, {'contradiction': 99.9, 'neutral': 0.1, 'entailment': 0.0}, {'contradiction': 48.7, 'neutral': 31.5, 'entailment': 19.8}, {'contradiction': 65.9, 'neutral': 30.1, 'entailment': 4.1}, {'contradiction': 83.4, 'neutral': 15.8, 'entailment': 0.9}, {'contradiction': 71.6, 'neutral': 27.3, 'entailment': 1.1}, {'cont..."
71,Its not absurd to say a philosophy which calls for the death or enslavement of everyone else except Muslims has an influence. Islam is problematic as a belief system. All the Abrahamic religions are nonsense. STOP SPREADING LIES I blame the media for giving credence to such nonsense supernatural garbage clearly not real.,0.515152,bias,muslims,religion,prejudice against muslims,"[entailment, contradiction, neutral, neutral, contradiction, neutral, neutral, neutral, contradiction, neutral, neutral, entailment, entailment, neutral]","[{'contradiction': 3.5, 'neutral': 7.1, 'entailment': 89.4}, {'contradiction': 88.3, 'neutral': 10.4, 'entailment': 1.4}, {'contradiction': 29.8, 'neutral': 61.6, 'entailment': 8.6}, {'contradiction': 35.1, 'neutral': 62.1, 'entailment': 2.9}, {'contradiction': 69.1, 'neutral': 28.6, 'entailment': 2.3}, {'contradiction': 22.6, 'neutral': 63.7, 'entailment': 13.8}, {'contradiction': 1.7, 'neutral': 97.8, 'entailment': 0.4}, {'contradiction': 37.9, 'neutral': 54.3, 'entailment': 7.8}, {'contra..."


In [80]:
def bart_nli_batched(premises, hypotheses, batch_size=32):
    """Score premise/hypothesis pairs chunk by chunk with facebook/bart-large-mnli.

    Premises and hypotheses are sliced into chunks of ``batch_size``; within
    each (premise chunk, hypothesis chunk) combination every pair is still
    evaluated individually.

    Args:
        premises: A premise or a list/tuple of premises (other values are
            wrapped in a single-element list).
        hypotheses: A hypothesis or a list/tuple of hypotheses (wrapped the
            same way).
        batch_size: Chunk size used for slicing both inputs.

    Returns:
        tuple: ``(batch_results_labels, batch_results_probabilities)``. Both
        are lists with one entry per (premise chunk, hypothesis chunk)
        combination, in iteration order (premise chunks outermost). Each
        entry is a list per premise of the chunk, holding one item per
        hypothesis of the chunk: the predicted label, or the percentage dict
        from ``convert_probabilities``. When all inputs fit in one chunk the
        outer lists have exactly one entry.
    """
    model_name = 'facebook/bart-large-mnli'
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    # The inputs are moved to DEVICE below, so the model must live there too.
    model.to(DEVICE)

    # Convert scalar inputs to lists
    if not isinstance(premises, (list, tuple)):
        premises = [premises]
    if not isinstance(hypotheses, (list, tuple)):
        hypotheses = [hypotheses]

    # Label names by logit index.
    # NOTE(review): assumed to match the checkpoint's id2label order — confirm.
    label_mapping = ['contradiction', 'neutral', 'entailment']

    # Accumulate across ALL premise chunks; creating these inside the outer
    # loop would discard every chunk but the last (and leave them undefined
    # for empty input).
    batch_results_labels = []
    batch_results_probabilities = []

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for i in range(0, len(premises), batch_size):
            batch_premises = premises[i:i + batch_size]

            for j in range(0, len(hypotheses), batch_size):
                batch_hypotheses = hypotheses[j:j + batch_size]
                batch_labels = []
                batch_probabilities_list = []

                for premise in batch_premises:
                    labels = []
                    probabilities_list = []

                    for hypothesis in batch_hypotheses:
                        # Tokenize the input pair
                        inputs = tokenizer(premise, hypothesis, return_tensors='pt', padding=True, truncation=True)

                        # Run the input through the model
                        logits = model(**inputs.to(DEVICE)).logits

                        # Get probabilities and the winning label for the pair
                        probabilities = torch.softmax(logits, dim=1)
                        label = label_mapping[probabilities.argmax(dim=1).item()]

                        # convert probabilities to percentages
                        probabilities = convert_probabilities(probabilities, label_mapping)

                        labels.append(label)
                        probabilities_list.append(probabilities)

                    batch_labels.append(labels)
                    batch_probabilities_list.append(probabilities_list)

                batch_results_labels.append(batch_labels)
                batch_results_probabilities.append(batch_probabilities_list)

    return batch_results_labels, batch_results_probabilities


In [81]:
# Unpack the two returned values (labels first, probabilities second),
# assuming the two-value bart_nli_batched defined just above is the active one.
l, p = bart_nli_batched(premises, hypotheses)


In [88]:
# Print the nested label lists unpacked from bart_nli_batched.
print(l)


[[['entailment', 'contradiction', 'contradiction', 'contradiction', 'contradiction', 'contradiction', 'contradiction', 'contradiction', 'contradiction', 'neutral', 'neutral', 'entailment', 'entailment', 'neutral'], ['entailment', 'contradiction', 'contradiction', 'contradiction', 'neutral', 'contradiction', 'contradiction', 'neutral', 'contradiction', 'neutral', 'neutral', 'entailment', 'entailment', 'neutral'], ['entailment', 'contradiction', 'neutral', 'contradiction', 'neutral', 'neutral', 'neutral', 'neutral', 'entailment', 'neutral', 'neutral', 'neutral', 'entailment', 'neutral'], ['entailment', 'neutral', 'entailment', 'contradiction', 'contradiction', 'neutral', 'neutral', 'contradiction', 'contradiction', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral'], ['contradiction', 'contradiction', 'contradiction', 'contradiction', 'contradiction', 'contradiction', 'contradiction', 'contradiction', 'contradiction', 'neutral', 'contradiction', 'neutral', 'neutral', 'contradiction'],

In [62]:
def bart_nli_batched(premises, hypotheses, batch_size=32):
    """Score every (premise, hypothesis) pair in true batches.

    The cross product of premises and hypotheses is encoded as sentence
    pairs and sent through facebook/bart-large-mnli ``batch_size`` pairs at
    a time; the per-pair results are then regrouped by premise.

    Args:
        premises: A premise or a list/tuple of premises (other values are
            wrapped in a single-element list). Premises are used as dict
            keys, so they must be hashable; duplicates are scored once.
        hypotheses: A hypothesis or a list/tuple of hypotheses (wrapped the
            same way).
        batch_size: Number of (premise, hypothesis) pairs per forward pass.

    Returns:
        dict: premise -> list with one ``(label, probabilities)`` tuple per
        hypothesis, in hypothesis order. ``probabilities`` is the softmax row
        (a tensor of three values) for that pair.
    """
    model_name = 'facebook/bart-large-mnli'
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    # The inputs are moved to DEVICE below, so the model must live there too.
    model.to(DEVICE)

    # Convert scalar inputs to lists
    if not isinstance(premises, (list, tuple)):
        premises = [premises]
    if not isinstance(hypotheses, (list, tuple)):
        hypotheses = [hypotheses]

    # Label names by logit index.
    # NOTE(review): assumed to match the checkpoint's id2label order — confirm.
    label_mapping = ['contradiction', 'neutral', 'entailment']

    # De-duplicate while preserving order so each key gets exactly one list.
    unique_premises = list(dict.fromkeys(premises))
    results = {premise: [] for premise in unique_premises}

    # Premise-major cross product: all hypotheses of premise 0, then premise 1, ...
    pairs = [
        (premise, hypothesis)
        for premise in unique_premises
        for hypothesis in hypotheses
    ]

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for start in range(0, len(pairs), batch_size):
            chunk = pairs[start:start + batch_size]

            # Encode as sentence pairs: first texts and second texts are
            # parallel lists of equal length. (batch_encode_plus does not take
            # the second texts as its second positional argument.)
            batch_inputs = tokenizer(
                [premise for premise, _ in chunk],
                [hypothesis for _, hypothesis in chunk],
                return_tensors='pt',
                padding=True,
                truncation=True,
            )

            # Run the batch through the model
            logits = model(**batch_inputs.to(DEVICE)).logits

            # Get probabilities and labels for the batch
            probabilities = torch.softmax(logits, dim=1)
            predicted = probabilities.argmax(dim=1).tolist()

            # Route each row back to the premise it was computed for.
            for (premise, _), label_index, row in zip(chunk, predicted, probabilities):
                results[premise].append((label_mapping[label_index], row))

    return results
