In [3]:
import pandas as pd
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
import torch
import os
from datasets import load_dataset 
from nlpaug.util import Action
import spacy 
from spacy import displacy
from langchain.agents import create_pandas_dataframe_agent, load_tools
from langchain.llms import OpenAI
from getpass import getpass

In [4]:
# Probe sentence containing a demographic term. Later cells pass either the
# whole sentence (spaCy NER) or its fifth space-separated token (the nlpaug
# augmenters) to the tools under evaluation.
text = "I think that all African-Americans should deserve jail time."

In [5]:
# Echo the probe sentence as plain text to confirm its exact wording.
print(text)

I think that all African-Americans should deserve jail time.


In [6]:
# Try nlpaug's antonym augmenter on the demographic token alone.
augmentor = naw.AntonymAug()
# Index 4 is the fifth space-separated token of `text` ("African-Americans");
# this hard-coded index only holds for the exact probe sentence defined earlier.
augmentor.augment(text.split(" ")[4])

['African-Americans']

Neither synonym nor antonym augmentation works well for race-related terms: the token comes back unchanged. Next, try substitution based on contextual word embeddings.

In [7]:
# Contextual word-embedding substitution backed by the 'roberta-base' model.
aug = naw.ContextualWordEmbsAug(model_path="roberta-base", action="substitute")
# As in the antonym experiment, only the token at index 4 of the probe sentence
# is augmented.
# NOTE(review): a single token gives the model no surrounding sentence, which
# may be why the recorded output is unchanged; confirm by passing `text`.
aug.augment(text.split(" ")[4])

['African-Americans']

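The contextual-embedding augmenter is not very successful either (the term is returned unchanged), so the substitution terms need to be supplied manually.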

In [62]:
# Prompt for the API key interactively so it is never written into the notebook
# source; it is stored in the process environment under OPENAI_API_KEY.
# NOTE(review): presumably the LangChain OpenAI wrapper reads this variable; confirm.
os.environ['OPENAI_API_KEY'] = getpass("OpenAI Key:")

OpenAI Key: ········


In [65]:
# LangChain `OpenAI` LLM wrapper (imported from langchain.llms). Despite the
# variable name this is a plain LLM object, not a LangChain agent.
# temperature=0 is presumably chosen for near-deterministic output.
agent = OpenAI(temperature=0)

In [None]:
# Ask the LLM to propose candidate ethnicity and race terms.
agent("generate 50 ethnicities and races")

In [8]:
# Substitution vocabulary: ethnicity / nationality terms intended to be swapped
# in for detected group mentions when generating augmented entries.
# Order is preserved exactly; the list consists of two roughly alphabetical runs.
# NOTE(review): some entries appear to overlap (e.g. "Roma"/"Romani",
# "Amazigh"/"Berber", "Chinese"/"Han Chinese"); confirm whether that is intended.
bias_elements = [
    # First run: African ... Zulu
    "African", "Afro-Caribbean", "Afro-Latino", "Ainu", "Amazigh",
    "Ashkenazi Jewish", "Australian Aboriginal", "Basque", "Bantu", "Bengali",
    "Berber", "Black", "Chinese", "Cree", "Cuban",
    "Druze", "Dutch", "English", "Filipino", "Finnish",
    "French", "Gaelic", "German", "Greek", "Guatemalan",
    "Han Chinese", "Hausa", "Hawaiian", "Hispanic/Latino", "Hmong",
    "Hopi", "Inuit", "Italian", "Japanese", "Jewish",
    "Karen", "Khmer", "Korean", "Maori", "Maya",
    "Mongolian", "Native American", "Nigerian", "Nubian", "Pashtun",
    "Persian", "Quechua", "Romani", "Sami", "Somali",
    "Tajik", "Tamil", "Tatar", "Thai", "Tibetan",
    "Tuareg", "Turkish", "Uighur", "Ukrainian", "Vietnamese",
    "Yakut", "Yoruba", "Zulu",
    # Second run: Albanian ... Swedish
    "Albanian", "Arab", "Armenian", "Assyrian", "Aymara",
    "Balinese", "Bashkir", "Belizean", "Bolivian", "Bosniak",
    "Bulgarian", "Cambodian", "Cameroonian", "Catalan", "Chamorro",
    "Chechen", "Cherokee", "Chuvash", "Coptic", "Corsican",
    "Crimean Tatar", "Croatian", "Czech", "Danish", "Dinka",
    "Ecuadorian", "Estonian", "Ethiopian", "Fijian", "Georgian",
    "Gujarati", "Haitian", "Hazaras", "Ibo", "Icelandic",
    "Indigenous Australian", "Indigenous Malaysian", "Iraqi", "Iroquois", "Kurdish",
    "Latvian", "Lebanese", "Lithuanian", "Macedonian", "Malay",
    "Maldivian", "Maltese", "Maasai", "Mende", "Mien",
    "Mizrahi Jewish", "Monguor", "Moroccan", "Navajo", "Nenets",
    "Nepali", "Norwegian", "Pakistani", "Palestinian", "Papua New Guinean",
    "Parsi", "Peruvian", "Polish", "Portuguese", "Punjabi",
    "Roma", "Samoan", "Scots", "Sindhi", "Slovak",
    "Slovene", "Sorbian", "Sudanese", "Swedish",
]

In [9]:
# Load the "ccdv/govreport-summarization" dataset via Hugging Face `datasets`.
base_dataset = load_dataset("ccdv/govreport-summarization")
# Bare expression: displays the available splits, their features and row counts.
base_dataset

DatasetDict({
    train: Dataset({
        features: ['report', 'summary'],
        num_rows: 17517
    })
    validation: Dataset({
        features: ['report', 'summary'],
        num_rows: 973
    })
    test: Dataset({
        features: ['report', 'summary'],
        num_rows: 973
    })
})

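Procedure:
- Iterate through each data entry and run spaCy named-entity recognition to find mentions of race or ethnicity.
- When a mention is found, copy the entry and substitute a term from the list of bias elements to generate a new entry.
- Each matching entry is thereby expanded into several augmented entries, increasing the size of the dataset.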

In [10]:
# Load spaCy's "en_core_web_sm" pipeline; it is used in the following cells for
# named-entity recognition.
ner = spacy.load("en_core_web_sm")

In [11]:
# Run the pipeline on the probe sentence; detected entities are read from
# `seg.ents` in the next cell.
seg = ner(text)

In [12]:
# Print every detected entity span together with its entity label.
for word in seg.ents:
    print(word.text, word.label_)

African-Americans NORP


In [13]:
# Look up spaCy's human-readable description of the NORP label that was
# reported for the probe sentence.
spacy.explain("NORP")

'Nationalities or religious or political groups'

In [22]:
# Switch the dataset to pandas output. This changes `base_dataset` itself, so
# any later indexing of its splits is affected as well (hidden state to keep in
# mind when re-running cells out of order).
base_dataset.set_format(type='pandas')

# Materialise the full train split.
# NOTE(review): the recorded output suggests `df` is a pandas DataFrame with
# 'report' and 'summary' columns; confirm.
df = base_dataset['train'][:]

In [27]:
# Preview the frame without the 'summary' column. drop() returns a new frame and
# the result is not assigned, so `df` itself still contains 'summary'.
df.drop(columns=['summary'])

Unnamed: 0,report
0,The structure of the armed forces is based on ...
1,Most income derived from private sector busine...
2,There are some similarities in how Medicare pa...
3,IRS’s mission is to provide America’s taxpayer...
4,"In 1991, we reported that, historically, INS l..."
...,...
17512,Some Members of Congress have expressed intere...
17513,"Since November 1986, the Commemorative Works A..."
17514,"On May 27, 2015, the Army Corps of Engineers (..."
17515,Section 1512 applies to the obstruction of fed...


In [None]:
def raceDetection:
    for entry in 