In [2]:
import sys
sys.path.append('src/')
import pandas as pd
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
import evaluate

# pandas display settings: show every column and row, and up to 500 characters per cell
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.max_colwidth', 500),
):
    pd.set_option(_option_name, _option_value)

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(DEVICE)

cpu


## Read data

In [4]:
df_train = pd.read_csv("./data/toxicbias_train.csv")
df_test = pd.read_csv("./data/toxicbias_test.csv")

# Training rows labelled 'bias' are the positive examples.
df_bias = df_train.loc[df_train['bias'] == 'bias']

# The neutral pool is the training neutrals plus the neutral rows of the test file.
df_test_neutral = df_test.loc[df_test['bias'] == 'neutral']
df_neutral = pd.concat(
    [df_train.loc[df_train['bias'] == 'neutral'], df_test_neutral],
    ignore_index=True,
)


## Split up dataframe by category

In [5]:
# Collect every distinct, non-empty category name found in the comma-separated
# 'category' column of the biased rows, skipping the placeholder 'none'.
unique_categories = {
    name.strip()
    for names in df_bias['category'].str.split(',')
    for name in names
    if name.strip() and name.strip().lower() != 'none'
}

unique_categories_list = sorted(unique_categories)

In [6]:
# Index labels of the df_bias rows that belong to each category. Collecting labels
# and building each frame once replaces the row-by-row DataFrame.append, which is
# quadratic and no longer exists in pandas 2.x.
_row_labels_by_category = {category: [] for category in unique_categories_list}

def split_categories_and_add_rows(row):
    """Record a df_bias row under every known category listed in its 'category' field.

    Args:
        row: one row of df_bias as passed by DataFrame.apply(axis=1); its
            'category' value is a comma-separated string of category names.

    Only the row's index label (row.name) is stored, not the row object itself,
    so the result does not depend on whether apply hands out fresh Series objects.
    """
    for category in row['category'].split(','):
        category = category.strip()  # Remove leading/trailing spaces
        if category in _row_labels_by_category:
            _row_labels_by_category[category].append(row.name)

df_bias.apply(split_categories_and_add_rows, axis=1)

# One dataframe per category, rows in their df_bias order with a fresh 0..n-1 index.
# A row that lists several categories appears in each of the matching frames.
categorical_dfs = {
    category: df_bias.loc[labels].reset_index(drop=True)
    for category, labels in _row_labels_by_category.items()
}

# Print the shape of each category dataframe
for category, category_df in categorical_dfs.items():
    print(f"Dataframe for {category}:")
    print(f"Shape: {category_df.shape}")
    print("\n")

Dataframe for gender:
Shape: (293, 6)


Dataframe for lgbtq:
Shape: (453, 6)


Dataframe for political:
Shape: (172, 6)


Dataframe for race:
Shape: (1768, 6)


Dataframe for religion:
Shape: (1257, 6)




## Split training and test data

In [7]:
training_data = {}
test_data = {}

# Split every category dataframe with a fixed seed. test_size=0.8 places 80% of the
# rows in the test split and leaves the remaining 20% as training data.
for category, category_frame in categorical_dfs.items():
    training_data[category], test_data[category] = train_test_split(
        category_frame, test_size=0.8, random_state=42
    )

In [8]:
# Report the size of each category's training split.
for category, train_df in training_data.items():
    print(
        'Training data',
        f"Dataframe for {category}:",
        f"Shape: {train_df.shape}",
        "\n",
        sep="\n",
    )

Training data
Dataframe for gender:
Shape: (58, 6)


Training data
Dataframe for lgbtq:
Shape: (90, 6)


Training data
Dataframe for political:
Shape: (34, 6)


Training data
Dataframe for race:
Shape: (353, 6)


Training data
Dataframe for religion:
Shape: (251, 6)




In [9]:
# Report the size of each category's test split.
for category, test_df in test_data.items():
    print(
        'Test data',
        f"Dataframe for {category}:",
        f"Shape: {test_df.shape}",
        "\n",
        sep="\n",
    )

Test data
Dataframe for gender:
Shape: (235, 6)


Test data
Dataframe for lgbtq:
Shape: (363, 6)


Test data
Dataframe for political:
Shape: (138, 6)


Test data
Dataframe for race:
Shape: (1415, 6)


Test data
Dataframe for religion:
Shape: (1006, 6)




## Hypotheses

In [10]:
# Show five randomly chosen gender training rows; no random_state is passed,
# so the displayed rows are not pinned by this cell.
training_data['gender'].sample(5)

Unnamed: 0,comment_text,identity_attack_x,bias,target,category,rationale
249,Women are paid less than men. That is a fact. Your comment makes no sense.,0.259259,bias,women,gender,prejudice against women
130,How do we fight agaisnt women who use sexual favours to get things they want from men?,0.5,bias,women,gender,prejudice against women
99,Why are most conservatives against actual people? No heart? Gays shouldnt have any rights Women are too stupid to control their own bodies. But love the orange toddler,0.608696,bias,"gays,women","lgbtq,gender",prejudice against gays and women
191,This is not and never has been a case of white mans abuses of natives for sex but of natives abuses of white man for profit.,0.5,bias,"whites,men","race,gender",prejudice against white men
280,Nothing is true for everybody. But we need to have rules and common practice is to establish them based on what is commonly understood not the exceptions like effeminate men.,0.5,bias,men,gender,prejudice against men


In [13]:
# NLI hypothesis sentences per bias category (15 per category). The run_* helpers
# look up hypotheses[category] and pair every comment (premise) with each sentence.
hypotheses = {
  'gender' : [
    'A person belonging to a gender lacks intelligence.',
    'A person belonging to a gender is a criminal.',
    'A person belonging to a gender is violent.',
    'Violence is encouraged against a gender.',
    'A person belonging to a gender is abusive.',
    'A person belonging to a gender should be blamed.',
    'A person belonging to a gender is toxic.',
    'A gender is unable to do some jobs.',
    'A gender is unable to do something as well as another gender.',
    'A person belonging to a gender is effeminate.',
    'A person belonging to a gender is compared to an object.',
    'A person belonging to a gender uses or should be used for sex.',
    'A person belonging to a gender has an advantage compared to others.',
    'A person belonging to a gender is misogynist.',
    'A person belonging to a gender is disliked.',
  ],
  'lgbtq' : [
    'Lgbtq persons are believed to have a condition.',
    'Lgbtq should not be part of society.',
    'Violence is encouraged against lgbtq persons.',
    'Lgbtq persons are compared to animals.',
    'Lgbtq persons are believed to have an agenda.',
    'Lgbtq persons should be excluded.',
    'Lgbtq persons should stay home.',
    'Lgbtq persons commit crimes.',
    'Lgbtq persons get preferred to other persons.',
    'Lgbtq persons are seen as not being normal.', 
    'Lgbtq persons should not be allowed to marry.', 
    'Lgbtq persons have a disease.',
    'Lgbtq persons are seen as being disgusting.',
    'Lgbtq persons are not tolerated or desired.',
    'Lgbtq persons are treated differently.',

  ],
  'political' : [ 
      'A person belonging to a political group is stupid.',
      'A person belonging to a political group is crazy.',
      'Things are worse due to a political group.',
      'Members of a political group are lgbtq.',
      'Members of a political group are hypocritical.',
      'Members of a political group dislike a gender or lgbtq.',
      'Members of a political group commit crimes.',
      'Members of a political group are violent.',
      'Members of a political group are hateful.',
      'Members of a political group are compared to Nazis.',
      'Members of a political group are racist.',
      'A person belonging to a political group has a disease.',
      'Members of a political group are taking control.',
      'Members of a political group prefer minorities.',
      'Members of a political group hate their country or race.',
  ],
  'race' : [
    'Members of a race commit crimes.',
    'Members of a race are violent.',
    'Members of a race are compared to animals.',
    'The use of violence is encouraged or desired against a race.',
    'Members of a race should remain in their respective countries.',
    'The integration of a race should be avoided in a country or society.',
    'Members of a race are a threat to others.',
    'A race is inferior when compared to another race.',
    'A race is preferred to another.',
    'Members of a race are stupid.',
    'There are too many members of a race.',
    'Members of a race should be excluded.',
    'Members of a race should go back to their homes.',
    'Members of a race are refused.',
    'Members of a race are racist.',
  ],
  'religion' : [
    'The use of violence is encouraged or desired against a religion.',
    'A religion should be discriminated against.',
    'Members of a religion should be excluded and marginalized.',
    'Members of a religion should remain in their respective countries.',
    'Members of a religion are compared to animals.',
    'A religion or members of a religion are compared to devils or demons.',
    'The integration of a religion should be avoided in a country or society.',
    'A religion or members of a religion are stupid.',
    'A religion is inferior when compared to another religion.',
    'A religion is preferred to another.',
    'Religious practices of a particular faith should be restricted or banned in public spaces.',
    'Religious attire or symbols should be prohibited.',
    'A religion promotes violence or intolerance.',
    'Members of a religion are a threat.',
    'Members of a religion are not trustworthy or loyal.',
  ],
}

## Add neutral cases

In [14]:
def add_rows_to_dataframes(dataframes_dict, neutral_pool=None, seed=42):
    """Balance every dataframe in ``dataframes_dict`` with an equal number of neutral rows.

    For each entry, as many rows as the dataframe holds are sampled without
    replacement from the neutral pool and appended; the entry is then replaced
    by the combined frame (index reset to 0..n-1). Neutral rows already handed
    out are not reused until the remaining pool is too small for the next
    dataframe, at which point the pool is refilled. A dataframe larger than the
    whole pool is first cut down to the pool size by dropping random rows.

    Args:
        dataframes_dict: mapping of key -> DataFrame; updated in place (the
            values are replaced, the original frames are not modified).
        neutral_pool: DataFrame to draw neutral rows from. Defaults to the
            notebook-level ``df_neutral``.
        seed: seed of the private random generator used for all draws.

    Returns:
        None.

    NOTE(review): every call starts again from the full pool with the same
    seed, so two separate calls (e.g. one for the test dict and one for the
    training dict) can draw overlapping neutral rows.
    """
    # A private generator yields the same draws as seeding the global NumPy RNG
    # with `seed`, without altering random state used elsewhere in the notebook.
    rng = np.random.RandomState(seed)

    pool = df_neutral if neutral_pool is None else neutral_pool
    remaining_neutral = pool.copy()
    max_neutral_length = len(pool)

    for key, df in dataframes_dict.items():
        # Refill the pool when it cannot cover this dataframe any more.
        if len(remaining_neutral) < len(df):
            remaining_neutral = pool.copy()

        # More bias rows than neutral rows exist at all: drop the excess at
        # random. `drop` returns a new frame, so the caller's frame stays intact.
        if len(df) > max_neutral_length:
            rows_to_drop = rng.choice(df.index, size=len(df) - len(remaining_neutral), replace=False)
            df = df.drop(rows_to_drop)

        # Draw the neutral rows and remove them from the pool so later
        # dataframes receive different ones.
        sampled_neutral = remaining_neutral.sample(n=len(df), replace=False, random_state=rng)
        remaining_neutral = remaining_neutral.drop(sampled_neutral.index)

        dataframes_dict[key] = pd.concat([df, sampled_neutral], ignore_index=True)


In [15]:
add_rows_to_dataframes(test_data)
add_rows_to_dataframes(training_data)

In [16]:
for category in test_data:
  print(test_data[category]['bias'].value_counts())

bias       235
neutral    235
Name: bias, dtype: int64
neutral    363
bias       363
Name: bias, dtype: int64
bias       138
neutral    138
Name: bias, dtype: int64
bias       998
neutral    998
Name: bias, dtype: int64
bias       998
neutral    998
Name: bias, dtype: int64


In [17]:
for category in training_data:
  print(training_data[category]['bias'].value_counts())

bias       58
neutral    58
Name: bias, dtype: int64
bias       90
neutral    90
Name: bias, dtype: int64
neutral    34
bias       34
Name: bias, dtype: int64
neutral    353
bias       353
Name: bias, dtype: int64
bias       251
neutral    251
Name: bias, dtype: int64


## Deberta v3 model

In [18]:
def convert_probabilities_batched(probabilities, label_mapping):
    """Turn rows of class probabilities into per-label percentage dictionaries.

    Args:
        probabilities: object exposing ``tolist()`` that yields one list of
            probabilities per row.
        label_mapping: label names, paired positionally with each row's values.

    Returns:
        One dict per row mapping label name -> probability * 100, rounded to
        one decimal place.
    """
    converted = []
    for row in probabilities.tolist():
        percentages = {}
        for score, label in zip(row, label_mapping):
            percentages[label] = round(score * 100, 1)
        converted.append(percentages)
    return converted

In [19]:
def deberta_v3_nli_batched_hypotheses(premises, hypotheses):
    """Score every premise against all hypotheses with the mDeBERTa-v3 NLI checkpoint.

    The model and tokenizer are loaded on each call and moved to DEVICE. Each
    premise is paired with every hypothesis and evaluated as one batch.

    Args:
        premises: sequence of premise strings.
        hypotheses: list of hypothesis strings.

    Returns:
        Dict keyed by premise text; each value is ``(labels, probabilities)``
        where ``labels`` holds the highest-scoring label per hypothesis and
        ``probabilities`` is the output of convert_probabilities_batched.
        Identical premise strings share a single entry.
    """
    checkpoint = 'MoritzLaurer/mDeBERTa-v3-base-mnli-xnli'
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)
    model.to(DEVICE)

    # Position i of this list names output class i of the model's logits.
    label_mapping = ["entailment", "neutral", "contradiction"]
    hypothesis_count = len(hypotheses)
    results = {}

    with torch.no_grad():
        for premise in premises:
            # Repeat the premise so it is paired with each hypothesis in one batch.
            inputs = tokenizer([premise] * hypothesis_count, hypotheses, return_tensors='pt', padding=True, truncation=True)

            logits = model(**inputs.to(DEVICE)).logits
            probabilities = torch.softmax(logits, dim=1)

            labels = [label_mapping[row.argmax().item()] for row in probabilities]
            results[premise] = (labels, convert_probabilities_batched(probabilities, label_mapping))
    return results


In [20]:
def run_deberta_model_add_labels(category,data_type='training'):
    """Run the DeBERTa NLI model over one category and attach its predictions.

    Args:
        category: key into ``training_data`` / ``test_data`` and ``hypotheses``.
        data_type: ``'training'`` (default) or ``'test'``; selects which split
            is scored.

    Returns:
        A copy of the selected dataframe with two added columns: ``labels``
        (predicted label per hypothesis) and ``probabilities`` (per-hypothesis
        label percentages) for each comment.

    Raises:
        ValueError: if ``data_type`` is neither ``'training'`` nor ``'test'``.
    """
    # Pick the split explicitly; an unknown value previously left the
    # dataframe variable unassigned and failed later with UnboundLocalError.
    if data_type == 'test':
        source = test_data
    elif data_type == 'training':
        source = training_data
    else:
        raise ValueError(f"data_type must be 'training' or 'test', got {data_type!r}")

    # Work on a copy so the stored split is left untouched.
    df_copy = source[category].copy()

    # Run DeBERTa NLI on the comments of the specified category
    results = deberta_v3_nli_batched_hypotheses(list(df_copy['comment_text']), hypotheses[category])

    # Results are keyed by comment text; look each row up and fall back to
    # None when a comment has no entry.
    labels_column = []
    probabilities_column = []

    for premise in df_copy['comment_text']:
        labels, probabilities = results.get(premise, (None, None))
        labels_column.append(labels)
        probabilities_column.append(probabilities)

    df_copy['labels'] = labels_column
    df_copy['probabilities'] = probabilities_column

    return df_copy

In [21]:
# Score the training split of every category with the DeBERTa NLI model.
df_pol, df_race, df_gender, df_religion, df_lgbtq = (
    run_deberta_model_add_labels(category_name, 'training')
    for category_name in ('political', 'race', 'gender', 'religion', 'lgbtq')
)



KeyboardInterrupt: 

In [None]:
# Stack the per-category result frames (df_pol ... df_lgbtq must already exist
# from the DeBERTa training run) into one frame with a fresh index.
dfs = [df_pol, df_race, df_gender, df_religion, df_lgbtq]

combined_df = pd.concat(dfs, ignore_index=True)

# NOTE(review): the file name spells "trainng"; kept as-is because other code may read this path.
combined_df.to_csv('results/training/debertav3/combined_deberta_trainng_results.csv')

## Bart model

In [None]:
def bart_nli_batched_hypotheses(premises, hypotheses):
    """Score every premise against all hypotheses with facebook/bart-large-mnli.

    The model and tokenizer are loaded on each call and moved to DEVICE. Each
    premise is paired with every hypothesis and evaluated as one batch.

    Args:
        premises: sequence of premise strings.
        hypotheses: list of hypothesis strings.

    Returns:
        Dict keyed by premise text; each value is ``(labels, probabilities)``
        where ``labels`` holds the highest-scoring label per hypothesis and
        ``probabilities`` is the output of convert_probabilities_batched.
        Identical premise strings share a single entry.
    """
    checkpoint = 'facebook/bart-large-mnli'
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)
    model.to(DEVICE)

    # Position i of this list names output class i of the model's logits.
    label_mapping = ['contradiction', 'neutral', 'entailment']
    hypothesis_count = len(hypotheses)
    results = {}

    with torch.no_grad():
        for premise in premises:
            # Repeat the premise so it is paired with each hypothesis in one batch.
            inputs = tokenizer([premise] * hypothesis_count, hypotheses, return_tensors='pt', padding=True, truncation=True)

            logits = model(**inputs.to(DEVICE)).logits
            probabilities = torch.softmax(logits, dim=1)

            labels = [label_mapping[row.argmax().item()] for row in probabilities]
            results[premise] = (labels, convert_probabilities_batched(probabilities, label_mapping))
    return results


In [None]:
def run_bart_model_add_labels(category,data_type='training'):
    """Run the BART NLI model over one category and attach its predictions.

    Args:
        category: key into ``training_data`` / ``test_data`` and ``hypotheses``.
        data_type: ``'training'`` (default) or ``'test'``; selects which split
            is scored.

    Returns:
        A copy of the selected dataframe with two added columns: ``labels``
        (predicted label per hypothesis) and ``probabilities`` (per-hypothesis
        label percentages) for each comment.

    Raises:
        ValueError: if ``data_type`` is neither ``'training'`` nor ``'test'``.
    """
    # Pick the split explicitly; an unknown value previously left the
    # dataframe variable unassigned and failed later with UnboundLocalError.
    if data_type == 'test':
        source = test_data
    elif data_type == 'training':
        source = training_data
    else:
        raise ValueError(f"data_type must be 'training' or 'test', got {data_type!r}")

    # Work on a copy so the stored split is left untouched.
    df_copy = source[category].copy()

    # Run BART NLI on the comments of the specified category. The function is
    # resolved by name at call time, so whichever bart_nli_batched_hypotheses
    # definition was executed last is the one used.
    results = bart_nli_batched_hypotheses(list(df_copy['comment_text']), hypotheses[category])

    # Results are keyed by comment text; look each row up and fall back to
    # None when a comment has no entry.
    labels_column = []
    probabilities_column = []

    for premise in df_copy['comment_text']:
        labels, probabilities = results.get(premise, (None, None))
        labels_column.append(labels)
        probabilities_column.append(probabilities)

    df_copy['labels'] = labels_column
    df_copy['probabilities'] = probabilities_column

    return df_copy

In [None]:
# Score the training split of every category with the BART NLI model.
df_pol, df_race, df_gender, df_religion, df_lgbtq = (
    run_bart_model_add_labels(category_name, 'training')
    for category_name in ('political', 'race', 'gender', 'religion', 'lgbtq')
)

In [None]:
# Write each category's BART training results to its own CSV file.
for csv_path, frame in {
    'results/training/bart-large/political_training_results.csv': df_pol,
    'results/training/bart-large/race_training_results.csv': df_race,
    'results/training/bart-large/gender_training_results.csv': df_gender,
    'results/training/bart-large/religion_training_results.csv': df_religion,
    'results/training/bart-large/lgbtq_training_results.csv': df_lgbtq,
}.items():
    frame.to_csv(csv_path)

## Get test results

In [None]:
# Score the test split of every category with the DeBERTa NLI model.
df_pol, df_race, df_gender, df_religion, df_lgbtq = (
    run_deberta_model_add_labels(category_name, 'test')
    for category_name in ('political', 'race', 'gender', 'religion', 'lgbtq')
)

In [None]:
# Write each category's DeBERTa test results to its own CSV file.
for csv_path, frame in {
    'results/test/debertav3/political_labels_deberta_test.csv': df_pol,
    'results/test/debertav3/race_labels_deberta_test.csv': df_race,
    'results/test/debertav3/gender_labels_deberta_test.csv': df_gender,
    'results/test/debertav3/religion_labels_deberta_test.csv': df_religion,
    'results/test/debertav3/lgbtq_labels_deberta_test.csv': df_lgbtq,
}.items():
    frame.to_csv(csv_path)

# Stack all categories into one frame and save the combined file as well.
dfs = [df_pol, df_race, df_gender, df_religion, df_lgbtq]
combined_df = pd.concat(dfs, ignore_index=True)
combined_df.to_csv('results/test/debertav3/combined_test_results_deberta.csv')

In [None]:
# Score the test split of every category with the BART NLI model.
df_pol, df_race, df_gender, df_religion, df_lgbtq = (
    run_bart_model_add_labels(category_name, 'test')
    for category_name in ('political', 'race', 'gender', 'religion', 'lgbtq')
)

In [None]:
# Write each category's BART test results to its own CSV file.
for csv_path, frame in {
    'results/test/bart-large/political_test_results.csv': df_pol,
    'results/test/bart-large/race_test_results.csv': df_race,
    'results/test/bart-large/gender_test_results.csv': df_gender,
    'results/test/bart-large/religion_test_results.csv': df_religion,
    'results/test/bart-large/lgbtq_test_results.csv': df_lgbtq,
}.items():
    frame.to_csv(csv_path)

# Stack all categories into one frame and save the combined file as well.
dfs = [df_pol, df_race, df_gender, df_religion, df_lgbtq]
combined_df = pd.concat(dfs, ignore_index=True)
combined_df.to_csv('results/test/bart-large/combined_test_results.csv')


## Fine tuning

In [None]:
import random

# NOTE(review): this list repeats the 'race' hypotheses but is not read by the
# loop below, which draws from hypotheses['religion'] instead — confirm which
# one is intended.
sentences = [
    "Members of a race commit crimes.",
    "Members of a race are violent.",
    "Members of a race are compared to animals.",
    "The use of violence is encouraged or desired against a race.",
    "Members of a race should remain in their respective countries.",
    "The integration of a race should be avoided in a country or society.",
    "Members of a race are a threat to others.",
    "A race is inferior when compared to another race.",
    "A race is preferred to another.",
    "Members of a race are stupid.",
    "There are too many members of a race.",
    "Members of a race should be excluded.",
    "Members of a race should go back to their homes.",
    "Members of a race are refused.",
    "Members of a race are racist."
]

# Number of random hypothesis sentences to print.
num_sentences_to_print = 1300

# Print randomly chosen religion hypotheses, one per line. `random` is not
# seeded here, so the printed sequence changes from run to run.
for _ in range(num_sentences_to_print):
    random_sentence = random.choice(hypotheses['religion'])
    print(random_sentence)


In [3]:
input_file = "data/fine_tuning_balanced_religion.csv"

# Load the fine-tuning examples and discard rows that have no hypothesis.
fine_tuning = pd.read_csv(input_file).dropna(subset=['hypothesis'])


In [4]:
# Collect the rows that are missing a label, a hypothesis or a premise
_required_columns = ['label', 'hypothesis', 'premise']
null_rows = fine_tuning[fine_tuning[_required_columns].isnull().any(axis=1)]

In [5]:
# Checkpoint that the fine-tuning model is loaded from.
model_name = 'facebook/bart-large-mnli'

In [6]:
## Hold out 10% of the fine-tuning examples for validation (fixed seed)
train_dataset, val_dataset = train_test_split(fine_tuning, test_size=0.1, random_state=42)

# Make sure the labels are plain integers
train_dataset["label"] = list(map(int, train_dataset["label"]))
val_dataset["label"] = list(map(int, val_dataset["label"]))

# Wrap both splits as Hugging Face Dataset objects
train_data = Dataset.from_dict(train_dataset)
eval_data = Dataset.from_dict(val_dataset)

In [7]:
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')

def tokenize_function(examples):
    """Tokenize a batch of NLI examples as (premise, hypothesis) pairs.

    Args:
        examples: batch mapping with "premise" and "hypothesis" entries, each a
            list of strings of equal length.

    Returns:
        The tokenizer output for the sentence pairs, padded to the model's
        maximum length and truncated where necessary.

    The premise is passed as the first sequence and the hypothesis as the
    second, matching how the inference helpers in this notebook call the
    tokenizer. Encoding the hypothesis alone would give the model no premise
    to judge it against.

    NOTE(review): rows with a missing premise are not dropped before this
    point (only 'hypothesis' is), and would fail here — confirm the data has none.
    """
    return tokenizer(
        examples["premise"],
        examples["hypothesis"],
        padding="max_length",
        truncation=True,
    )

train_data = train_data.map(tokenize_function, batched=True)
eval_data = eval_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/943 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

In [8]:
# Load the sequence-classification model for the checkpoint named in model_name.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
# Set up Trainer and TrainingArguments
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",  # Evaluate once at the end of every epoch
    per_device_train_batch_size=1,  # Small batch size to limit memory use
    # Mixed precision needs a CUDA device; requesting it unconditionally makes
    # TrainingArguments fail on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    max_grad_norm=1.0,  # Gradient clipping threshold
    eval_steps=100,  # Only consulted for step-based evaluation; unused with "epoch"
    save_total_limit=1,  # Keep at most one checkpoint on disk
)

# Metric definition and computation
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)`` supplied by the Trainer.
            ``predictions`` is either the logits array itself or a tuple whose
            first element is the logits array (the model may return extra
            outputs alongside the logits).

    Returns:
        The result of the accuracy metric for the arg-max predictions.
    """
    logits, labels = eval_pred
    # Unwrap only when the model returned several outputs; indexing a bare
    # logits array with [0] would select the first example instead.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Initialize Trainer with updated args
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


In [1]:
# Persist the fine-tuned model and its tokenizer. `model` and `tokenizer` must
# already exist in the kernel (they are created in the fine-tuning cells);
# running this cell on a fresh kernel raises NameError.
model.save_pretrained("fine-tuning/bart_fine_tuned_model")
tokenizer.save_pretrained("fine-tuning/bart_fine_tuned_tokenizer")

NameError: name 'model' is not defined

## Run fine-tuned model

In [9]:
def bart_nli_batched_hypotheses(premises, hypotheses):
    """Score every premise against all hypotheses with the locally saved fine-tuned BART model.

    This definition reuses the name of the earlier BART helper in this
    notebook, so once this cell has run, run_bart_model_add_labels uses the
    fine-tuned weights. The model and tokenizer are loaded from the
    'fine-tuning/' directories on each call and moved to DEVICE.

    Args:
        premises: sequence of premise strings.
        hypotheses: list of hypothesis strings.

    Returns:
        Dict keyed by premise text; each value is ``(labels, probabilities)``
        where ``labels`` holds the highest-scoring label per hypothesis and
        ``probabilities`` is the output of convert_probabilities_batched.
        Identical premise strings share a single entry.
    """
    model = AutoModelForSequenceClassification.from_pretrained('fine-tuning/bart_fine_tuned_model')
    tokenizer = AutoTokenizer.from_pretrained('fine-tuning/bart_fine_tuned_tokenizer', use_fast=True)
    model.to(DEVICE)

    # Position i of this list names output class i of the model's logits.
    label_mapping = ['contradiction', 'neutral', 'entailment']
    hypothesis_count = len(hypotheses)
    results = {}

    with torch.no_grad():
        for premise in premises:
            # Repeat the premise so it is paired with each hypothesis in one batch.
            inputs = tokenizer([premise] * hypothesis_count, hypotheses, return_tensors='pt', padding=True, truncation=True)

            logits = model(**inputs.to(DEVICE)).logits
            probabilities = torch.softmax(logits, dim=1)

            labels = [label_mapping[row.argmax().item()] for row in probabilities]
            results[premise] = (labels, convert_probabilities_batched(probabilities, label_mapping))
    return results


In [None]:
# Score the religion test split. run_bart_model_add_labels calls
# bart_nli_batched_hypotheses by name, so the fine-tuned model is only used if
# the cell redefining that function has been executed beforehand.
df_religion = run_bart_model_add_labels('religion', 'test')

# Save the fine-tuned model's predictions for the religion test split.
df_religion.to_csv('results/fine_tuning/bart-large/fine_tuning_religion_test_results.csv')