# Adversarial Debiasing

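- AIF360 toolkit (source code): https://github.com/Trusted-AI/AIF360
- Paper, "Mitigating Unwanted Biases with Adversarial Learning" (Zhang, Lemoine, Mitchell, 2018): https://arxiv.org/abs/1801.07593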



@misc{aif360-oct-2018,
    title = "{AI Fairness} 360:  An Extensible Toolkit for Detecting, Understanding, and Mitigating Unwanted Algorithmic Bias",
    author = {Rachel K. E. Bellamy and Kuntal Dey and Michael Hind and
	Samuel C. Hoffman and Stephanie Houde and Kalapriya Kannan and
	Pranay Lohia and Jacquelyn Martino and Sameep Mehta and
	Aleksandra Mojsilovic and Seema Nagar and Karthikeyan Natesan Ramamurthy and
	John Richards and Diptikalyan Saha and Prasanna Sattigeri and
	Moninder Singh and Kush R. Varshney and Yunfeng Zhang},
    month = oct,
    year = {2018},
    url = {https://arxiv.org/abs/1810.01943}
}

In [3]:
%matplotlib inline
# Load all necessary packages
import sys
sys.path.append("../")

from datasets import load_dataset

from aif360.datasets import StructuredDataset
from aif360.metrics import ClassificationMetric
from aif360.metrics.utils import compute_boolean_conditioning_vector

from aif360.algorithms.inprocessing.adversarial_debiasing import AdversarialDebiasing

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.metrics import accuracy_score

import tensorflow.compat.v1 as tf
import numpy as np

# `tf` is the `tensorflow.compat.v1` API here; eager execution is switched off
# before the session below is created.
tf.disable_eager_execution()

# Session object that the AdversarialDebiasing constructor call receives as `sess=`.
# NOTE(review): the session is never closed in the visible cells.
sess = tf.Session()

In [10]:
from datasets import load_dataset

def _median_label(example):
    """Build the ``label`` column for one HateXplain record.

    Reads the per-annotator votes in ``example["annotators"]["label"]`` and
    returns ``{"label": <median of the votes, truncated to int>}``.  With
    three votes (as in the sample record printed by this cell) the median
    equals the majority vote whenever at least two annotators agree.
    """
    votes = example["annotators"]["label"]
    return {"label": int(np.median(votes))}


# Load the training split and attach the derived integer label to every record.
train_dataset = load_dataset("hatexplain", split="train").map(_median_label)

# Print the first record to check the schema and the new "label" field.
print(train_dataset[0])

Reusing dataset hatexplain (C:\Users\Ahmad\.cache\huggingface\datasets\hatexplain\plain_text\1.0.0\df474d8d8667d89ef30649bf66e9c856ad8305bef4bc147e8e31cbdf1b8e0249)
100%|██████████| 15383/15383 [00:04<00:00, 3162.12ex/s]

{'id': '23107796_gab', 'annotators': {'label': [0, 2, 2], 'annotator_id': [203, 204, 233], 'target': [['Hindu', 'Islam'], ['Hindu', 'Islam'], ['Hindu', 'Islam', 'Other']]}, 'rationales': [[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'post_tokens': ['u', 'really', 'think', 'i', 'would', 'not', 'have', 'been', 'raped', 'by', 'feral', 'hindu', 'or', 'muslim', 'back', 'in', 'india', 'or', 'bangladesh', 'and', 'a', 'neo', 'nazi', 'would', 'rape', 'me', 'as', 'well', 'just', 'to', 'see', 'me', 'cry'], 'label': 2}





In [31]:
def _load_labeled_split(split):
    """Load one HateXplain split and attach an integer ``label`` column.

    The label is the median of the per-annotator votes stored in
    ``example["annotators"]["label"]``, truncated to ``int``.  Deriving it
    here, identically for every split, keeps train/test/validation
    consistent; a plain ``load_dataset`` call returns records without a
    top-level ``label`` field.

    Parameters
    ----------
    split : str
        Split name understood by ``load_dataset`` ("train", "test",
        "validation").

    Returns
    -------
    The loaded split with the extra ``label`` column.
    """
    return load_dataset("hatexplain", split=split).map(
        lambda e: {"label": int(np.median(e["annotators"]["label"]))}
    )


train_dataset = _load_labeled_split("train")
test_dataset = _load_labeled_split("test")
val_dataset = _load_labeled_split("validation")

# Human-readable class names.
# NOTE(review): presumably indexed by the integer label (0, 1, 2) -- confirm
# against the dataset card.
class_names = ["hate speech", "normal", "offensive"]

# Left empty in this cell; kept because other cells may fill or read it.
targets = []

# Target-community names treated as protected groups.
# NOTE(review): the output recorded beneath this cell lists the same names
# plus 'Men'; confirm that leaving 'Men' out of this literal is intentional.
protected_groups = {'Homosexual', 'Indian', 'Refugee', 'Hispanic', 'Asexual', 'Islam', 'African', 'Hindu', 'Disability', 'Other', 'Asian', 'Christian', 'Minority', 'Indigenous', 'Bisexual', 'Heterosexual', 'Buddhism', 'None', 'Arab', 'Nonreligious', 'Caucasian', 'Economic', 'Women', 'Jewish'}

# Tokenization is disabled: no `tokenizer` is defined in the visible cells.
#train_tokens = tokenizer(train_dataset['post_tokens'], padding=True,is_split_into_words=True, return_tensors="pt")
#test_tokens = tokenizer(test_dataset['post_tokens'], padding=True,is_split_into_words=True, return_tensors="pt")
#val_tokens = tokenizer(val_dataset['post_tokens'], padding=True,is_split_into_words=True, return_tensors="pt")

Reusing dataset hatexplain (C:\Users\Ahmad\.cache\huggingface\datasets\hatexplain\plain_text\1.0.0\df474d8d8667d89ef30649bf66e9c856ad8305bef4bc147e8e31cbdf1b8e0249)
Reusing dataset hatexplain (C:\Users\Ahmad\.cache\huggingface\datasets\hatexplain\plain_text\1.0.0\df474d8d8667d89ef30649bf66e9c856ad8305bef4bc147e8e31cbdf1b8e0249)
Reusing dataset hatexplain (C:\Users\Ahmad\.cache\huggingface\datasets\hatexplain\plain_text\1.0.0\df474d8d8667d89ef30649bf66e9c856ad8305bef4bc147e8e31cbdf1b8e0249)


{'Homosexual', 'Indian', 'Refugee', 'Hispanic', 'Asexual', 'Islam', 'African', 'Hindu', 'Disability', 'Other', 'Men', 'Asian', 'Christian', 'Minority', 'Indigenous', 'Bisexual', 'Heterosexual', 'Buddhism', 'None', 'Arab', 'Nonreligious', 'Caucasian', 'Economic', 'Women', 'Jewish'}


In [None]:
# Instantiate AIF360's AdversarialDebiasing with debiasing switched on,
# under the variable scope 'debiased_classifier' and using the `sess` session.
# NOTE(review): `privileged_groups` and `unprivileged_groups` are not assigned
# in any visible cell -- confirm they are defined before this cell is run.
debiased_model = AdversarialDebiasing(
    privileged_groups=privileged_groups,
    unprivileged_groups=unprivileged_groups,
    scope_name='debiased_classifier',
    debias=True,
    sess=sess,
)