In [24]:
import pandas as pd
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
import torch
import os
from datasets import load_dataset 
from nlpaug.util import Action
import spacy 
from spacy import displacy
from langchain.llms import OpenAI
from getpass import getpass

In [2]:
bias_elements = [
    "African",
    "Asian",
    "Caucasian",
    "Hispanic",
    "Native American",
    "Pacific Islander",
    "Arab",
    "Jewish",
    "Indian",
    "Chinese",
    "Japanese",
    "Korean",
    "Vietnamese",
    "Mexican",
    "Irish",
    "Italian",
    "German",
    "Russian",
    "Greek",
    "Swedish"
]

In [None]:
base_dataset = load_dataset("ccdv/govreport-summarization")
base_dataset

  table = cls._concat_blocks(blocks, axis=0)


DatasetDict({
    train: Dataset({
        features: ['report', 'summary'],
        num_rows: 17517
    })
    validation: Dataset({
        features: ['report', 'summary'],
        num_rows: 973
    })
    test: Dataset({
        features: ['report', 'summary'],
        num_rows: 973
    })
})

Procedure:
- Iterate through each data entry, and do spaCy named entity recognition for race.
- Once found, take entry, and copy in word from list of bias elements to generate new entry.
- Each entry will then be augmented in size 

In [4]:
ner = spacy.load("en_core_web_lg")

In [5]:
text = "I hate Indians"

In [6]:
seg = ner(text)

In [7]:
for word in seg.ents:
    print(word.text, word.label_)

Indians NORP


In [8]:
spacy.explain("NORP")

'Nationalities or religious or political groups'

In [27]:
base_dataset.set_format(type='pandas')

df = base_dataset['train'][:50]

In [28]:
df = df.drop(['summary'], axis=1)

In [33]:
df['report'][3]

'IRS’s mission is to provide America’s taxpayers top-quality service by helping them to understand and meet their tax responsibilities and to enforce the law with integrity and fairness to all. During fiscal year 2015, IRS collected more than $3.3 trillion; processed more than 243 million tax returns and other forms; and issued more than $403 billion in tax refunds. IRS employs about 90,000 people in its Washington, D.C., headquarters and at more than 550 offices in all 50 states, U.S. territories, and some U.S. embassies and consulates. Each filing season IRS provides assistance to tens of millions of taxpayers over the phone, through written correspondence, online, and face-to-face. The scale of these operations alone presents challenges. In carrying out its mission, IRS relies extensively on computerized information systems, which it must effectively secure to protect sensitive financial and taxpayer data for the collection of taxes, processing of tax returns, and enforcement of fed

In [11]:
def dataSplit(dset):
    
    chunk_size = 512
    splitList = []
    
    for entry in dset.report:
        
        for i in range(0, len(entry), chunk_size):
            chunk = entry[i:i + chunk_size]
            splitList.append(chunk)

    sp = pd.DataFrame(splitList)
    sp.columns = ['report']
    return sp

In [12]:
def datasetAugmentation(df):

    subData = dataSplit(df)

    #Initialize summarizer
    sum = nas.AbstSummAug(model_path='facebook/bart-large-cnn')

    #Initialize sentence augmentation
    aug = naw.ContextualWordEmbsAug(model_path='distilroberta-base', aug_p = 100)
    
    #Iterate throguh each entry.
    for entry in subData.report:

        #for each entry do Named Entity Recognition on each word
        seg = ner(entry)

        #find if any of the words pertain to a race
        for word in seg.ents:
            
            if word.label_ == "NORP":

                #if a word pertains to race, substitute in each entry of the bias elements
                for race in bias_elements:
                    newEntry = entry.replace(word.text, race)
                    subData.loc[len(subData.index)] = newEntry

                    #Do content downshift through summarization of new entry (abstractive summarization), and add
                    newEntrySum = sum.augment(newEntry)
                    subData.loc[len(subData.index)] = newEntrySum

                    #Do vocabulary upshift through augmentation, and add 
                    newEntryComp = aug.augment(newEntry)
                    subData.loc[len(subData.index)] = newEntryComp


    return subData

In [13]:
l = ['i think African-Americans people are stupid']
oE = pd.DataFrame(l)
oE.columns = ['report']

In [14]:
seg = ner(l[0])
for word in seg.ents:
    print(word.text, word.label_)

African-Americans NORP


In [None]:
data

NameError: name 'data' is not defined

This works on a small dataset in a timely fashion. Let's now try with the actual, large-scale dataset

In [13]:
actualData = datasetAugmentation(df)

In [None]:
actualData
actualData.to_csv()

In [None]:
actualData2 = datasetAugmentation(df)
actualData2

In [None]:
actualData2.to_csv()