In [1]:
import pandas as pd
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
import torch
import os
from datasets import load_dataset 
from nlpaug.util import Action
import spacy 
from spacy import displacy
from langchain.agents import create_pandas_dataframe_agent, load_tools
from langchain.llms import OpenAI
from getpass import getpass

In [2]:
# Sample sentence containing a racial term; the cells below use it to probe each augmenter and the NER model.
text = "I think that all African-Americans should deserve jail time."

In [3]:
print(text)

I think that all African-Americans should deserve jail time.


In [4]:
# Try antonym substitution on the racial term alone:
# index 4 of the space-split sentence is "African-Americans".
augmentor = naw.AntonymAug()
augmentor.augment(text.split(" ")[4])

['African-Americans']

Neither synonym nor antonym augmentation works well on racial terms: the word comes back unchanged. Try contextual word embeddings instead.

In [5]:
# Try contextual word-embedding substitution (roberta-base) on the same single word.
# NOTE(review): only the isolated word is passed to augment(), so the model
# sees none of the surrounding sentence — confirm this is intended.
aug = naw.ContextualWordEmbsAug(
    model_path='roberta-base', action="substitute")
aug.augment(text.split(" ")[4])

['African-Americans']

Contextual embedding substitution also leaves the term unchanged, so the racial terms have to be substituted manually.

In [62]:
# Prompt for the key interactively and keep it in the process environment,
# so it is never written into the notebook source.
os.environ['OPENAI_API_KEY'] = getpass("OpenAI Key:")

OpenAI Key: ········


In [65]:
# temperature=0 is requested, presumably to keep the generated list as repeatable as possible.
# NOTE(review): despite its name, `agent` is the plain OpenAI LLM wrapper, not a LangChain agent.
agent = OpenAI(temperature=0)

In [None]:
# Ask the LLM for candidate ethnicity/race terms.
# NOTE(review): this cell has no execution count or saved output, so whether the
# hard-coded list in the next cell came from this prompt cannot be told from the notebook — confirm.
agent("generate 50 ethnicities and races")

In [6]:
# Replacement terms for the bias augmentation: datasetAugmentation() substitutes
# each of these for every NORP entity it finds in a report.
# NOTE(review): several entries appear to name the same or overlapping groups
# (e.g. "Romani"/"Roma", "Khmer"/"Cambodian", "Chinese"/"Han Chinese") — confirm
# whether that duplication is intended, since each entry multiplies the output size.
bias_elements = [
    "African",
    "Afro-Caribbean",
    "Afro-Latino",
    "Ainu",
    "Amazigh",
    "Ashkenazi Jewish",
    "Australian Aboriginal",
    "Basque",
    "Bantu",
    "Bengali",
    "Berber",
    "Black",
    "Chinese",
    "Cree",
    "Cuban",
    "Druze",
    "Dutch",
    "English",
    "Filipino",
    "Finnish",
    "French",
    "Gaelic",
    "German",
    "Greek",
    "Guatemalan",
    "Han Chinese",
    "Hausa",
    "Hawaiian",
    "Hispanic/Latino",
    "Hmong",
    "Hopi",
    "Inuit",
    "Italian",
    "Japanese",
    "Jewish",
    "Karen",
    "Khmer",
    "Korean",
    "Maori",
    "Maya",
    "Mongolian",
    "Native American",
    "Nigerian",
    "Nubian",
    "Pashtun",
    "Persian",
    "Quechua",
    "Romani",
    "Sami",
    "Somali",
    "Tajik",
    "Tamil",
    "Tatar",
    "Thai",
    "Tibetan",
    "Tuareg",
    "Turkish",
    "Uighur",
    "Ukrainian",
    "Vietnamese",
    "Yakut",
    "Yoruba",
    "Zulu",
    "Albanian",
    "Arab",
    "Armenian",
    "Assyrian",
    "Aymara",
    "Balinese",
    "Bashkir",
    "Belizean",
    "Bolivian",
    "Bosniak",
    "Bulgarian",
    "Cambodian",
    "Cameroonian",
    "Catalan",
    "Chamorro",
    "Chechen",
    "Cherokee",
    "Chuvash",
    "Coptic",
    "Corsican",
    "Crimean Tatar",
    "Croatian",
    "Czech",
    "Danish",
    "Dinka",
    "Ecuadorian",
    "Estonian",
    "Ethiopian",
    "Fijian",
    "Georgian",
    "Gujarati",
    "Haitian",
    "Hazaras",
    "Ibo",
    "Icelandic",
    "Indigenous Australian",
    "Indigenous Malaysian",
    "Iraqi",
    "Iroquois",
    "Kurdish",
    "Latvian",
    "Lebanese",
    "Lithuanian",
    "Macedonian",
    "Malay",
    "Maldivian",
    "Maltese",
    "Maasai",
    "Mende",
    "Mien",
    "Mizrahi Jewish",
    "Monguor",
    "Moroccan",
    "Navajo",
    "Nenets",
    "Nepali",
    "Norwegian",
    "Pakistani",
    "Palestinian",
    "Papua New Guinean",
    "Parsi",
    "Peruvian",
    "Polish",
    "Portuguese",
    "Punjabi",
    "Roma",
    "Samoan",
    "Scots",
    "Sindhi",
    "Slovak",
    "Slovene",
    "Sorbian",
    "Sudanese",
    "Swedish",
]

In [7]:
# Load the GovReport summarization dataset; the bare name on the last line
# displays its splits and features.
base_dataset = load_dataset("ccdv/govreport-summarization")
base_dataset

DatasetDict({
    train: Dataset({
        features: ['report', 'summary'],
        num_rows: 17517
    })
    validation: Dataset({
        features: ['report', 'summary'],
        num_rows: 973
    })
    test: Dataset({
        features: ['report', 'summary'],
        num_rows: 973
    })
})

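Procedure:
- Iterate through each data entry and run spaCy named entity recognition to find mentions of racial or ethnic groups (spaCy labels these `NORP`).
- For each mention found, copy the entry once per term in the list of bias elements, substituting that term for the mention, to generate new entries.
- Each new entry is then augmented further (by summarization and by contextual sentence augmentation), which increases the size of the dataset.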

In [8]:
# Load spaCy's small English pipeline; the notebook uses it for the entities it reports via `.ents`.
ner = spacy.load("en_core_web_sm")

In [9]:
# Run the pipeline on the sample sentence to see which entities it detects.
seg = ner(text)

In [10]:
# Show every detected entity of the sample sentence next to its label.
for ent in seg.ents:
    print(ent.text, ent.label_)

African-Americans NORP


In [11]:
# Look up what the NORP label stands for.
spacy.explain("NORP")

'Nationalities or religious or political groups'

In [12]:
# Switch the dataset's output format to pandas (this mutates base_dataset in place).
base_dataset.set_format(type='pandas')

# Work on only the first training row while prototyping.
df = base_dataset['train'][:1]

In [13]:
# Keep only the report text. errors='ignore' makes this cell safe to re-run:
# without it a second execution raises KeyError because 'summary' is already gone.
df = df.drop(columns=['summary'], errors='ignore')

In [14]:
def raceDetection(reports=None, nlp=None):
    """Count the NORP entity mentions found across a collection of reports.

    spaCy explains ``NORP`` as "Nationalities or religious or political
    groups", so the count covers more than race/ethnicity mentions alone.

    Args:
        reports: Iterable of report strings. Defaults to the ``report`` column
            of the notebook-level ``df``.
        nlp: Callable that takes a string and returns an object whose ``ents``
            items expose ``label_``. Defaults to the notebook-level ``ner``.

    Returns:
        int: Total number of entity mentions labelled ``NORP``; a term that
        appears several times is counted once per mention.
    """
    # Fall back to the notebook globals only when the caller supplies nothing,
    # so the original zero-argument call keeps working.
    if reports is None:
        reports = df.report
    if nlp is None:
        nlp = ner

    return sum(
        1
        for entry in reports
        for entity in nlp(entry).ents
        if entity.label_ == "NORP"
    )

def _augmented_texts(result):
    """Normalise an augmenter's ``augment`` result to a list of strings."""
    return [result] if isinstance(result, str) else list(result)


def datasetAugmentation(frame=None, nlp=None, races=None, summarizer=None, augmenter=None):
    """Append bias-substituted and augmented copies of each report to a frame.

    For every report, each distinct NORP entity text is replaced in turn by
    every term in ``races``. Each substituted report is appended, followed by
    its summarised version and its sentence-augmented version.

    Args:
        frame: DataFrame with a ``report`` column; rows are appended in place.
            Defaults to the notebook-level ``df``.
        nlp: Callable returning an object whose ``ents`` items expose ``text``
            and ``label_``. Defaults to the notebook-level ``ner``.
        races: Iterable of replacement terms. Defaults to ``bias_elements``.
        summarizer: Object with ``augment(text)``; defaults to
            ``nas.AbstSummAug()``.
        augmenter: Object with ``augment(text)``; defaults to
            ``nas.ContextualWordEmbsForSentenceAug()``.

    Returns:
        int: Number of rows appended to ``frame``.
    """
    if frame is None:
        frame = df
    if nlp is None:
        nlp = ner
    if races is None:
        races = bias_elements
    if summarizer is None:
        # Abstractive summariser (the "content downshift" step).
        summarizer = nas.AbstSummAug()
    if augmenter is None:
        # Contextual sentence augmenter (the "vocabulary upshift" step).
        augmenter = nas.ContextualWordEmbsForSentenceAug()

    # Snapshot the reports first: rows are appended to the same frame later,
    # and generated rows must never be fed back in as inputs.
    source_entries = list(frame.report)
    generated = []

    for entry in source_entries:
        seg = nlp(entry)

        # str.replace already rewrites every occurrence of a term, so repeated
        # mentions of the same entity text would only yield duplicate rows.
        # dict.fromkeys de-duplicates while keeping first-seen order.
        norp_terms = list(dict.fromkeys(
            ent.text for ent in seg.ents if ent.label_ == "NORP"
        ))

        for term in norp_terms:
            for race in races:
                newEntry = entry.replace(term, race)
                generated.append(newEntry)

                # The notebook's earlier output shows augment() returning a
                # list; a bare string is accepted as well.
                generated.extend(_augmented_texts(summarizer.augment(newEntry)))
                generated.extend(_augmented_texts(augmenter.augment(newEntry)))

    # Append only after all generation is done, in the original row order
    # (substituted, summarised, augmented).
    for new_text in generated:
        frame.loc[len(frame.index)] = new_text

    return len(generated)

In [None]:
# Pull out the report text of the row labelled 0 and display it.
entry = df.at[0, 'report']
entry

In [None]:
# Build an abstractive summarizer from the facebook/bart-large-cnn checkpoint.
# NOTE(review): this rebinds `aug`, which an earlier cell bound to a ContextualWordEmbsAug instance.
aug = nas.AbstSummAug(model_path='facebook/bart-large-cnn')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]