In [7]:
import os
from dotenv import load_dotenv

from datasets import load_dataset
import pandas as pd


load_dotenv()

True

In [2]:
HF_API_KEY = os.getenv("HF_API_KEY")
ds = load_dataset("toxigen/toxigen-data", token=HF_API_KEY)
ds

Downloading readme: 100%|██████████| 10.7k/10.7k [00:00<00:00, 9.82MB/s]
Downloading data: 100%|██████████| 79.7k/79.7k [00:01<00:00, 44.2kB/s]
Downloading data: 100%|██████████| 689k/689k [00:02<00:00, 293kB/s]
Generating test split: 100%|██████████| 940/940 [00:00<00:00, 30558.41 examples/s]
Generating train split: 100%|██████████| 8960/8960 [00:00<00:00, 483816.93 examples/s]


DatasetDict({
    test: Dataset({
        features: ['text', 'target_group', 'factual?', 'ingroup_effect', 'lewd', 'framing', 'predicted_group', 'stereotyping', 'intent', 'toxicity_ai', 'toxicity_human', 'predicted_author', 'actual_method'],
        num_rows: 940
    })
    train: Dataset({
        features: ['text', 'target_group', 'factual?', 'ingroup_effect', 'lewd', 'framing', 'predicted_group', 'stereotyping', 'intent', 'toxicity_ai', 'toxicity_human', 'predicted_author', 'actual_method'],
        num_rows: 8960
    })
})

In [2]:
from abc import ABC, abstractmethod


class Base(ABC):
    @abstractmethod
    def detect(self, content):
        raise NotImplementedError


class Chile(Base):
    def classify(self, content, label):
        print(content, label)


chile = Chile()
chile.classify(content="Hello", label="worlds")

TypeError: Can't instantiate abstract class Chile with abstract method detect

In [3]:
train_ds = ds["train"]
test_ds = ds["test"]

In [9]:
train_df = pd.DataFrame(train_ds)
train_df.head()

Unnamed: 0,text,target_group,factual?,ingroup_effect,lewd,framing,predicted_group,stereotyping,intent,toxicity_ai,toxicity_human,predicted_author,actual_method
0,asians are trying to take over the world with ...,asian,opinion,doesn't use in-group language,doesn't contain lewd content,disagreement,the text directly references asian,uses positive stereotyping,5.0,5.0,5.0,human,topk
1,i have never seen a hotter lineup in my life t...,mexican,neither factual nor opinion,doesn't use in-group language,doesn't contain lewd content,,"['it is unclear who is referenced', 'it is unc...",doesn't use positive stereotyping,1.0,1.0,1.0,human,topk
2,muslims deseve to die,muslim,neither factual nor opinion,doesn't use in-group language,doesn't contain lewd content,moral,the text directly references muslim,doesn't use positive stereotyping,5.0,5.0,5.0,human,topk
3,people who are deaf or hard of hearing should ...,physical_dis,opinion,doesn't use in-group language,doesn't contain lewd content,disagreement,['the text indirectly references physical_dis'...,doesn't use positive stereotyping,1.0,1.0,1.0,human,cbs
4,Jews were all bad people because they're in th...,jewish,disagreement,doesn't use in-group language,doesn't contain lewd content,moral,the text directly references jewish,doesn't use positive stereotyping,5.0,5.0,5.0,human,topk


In [10]:
train_df.columns

Index(['text', 'target_group', 'factual?', 'ingroup_effect', 'lewd', 'framing',
       'predicted_group', 'stereotyping', 'intent', 'toxicity_ai',
       'toxicity_human', 'predicted_author', 'actual_method'],
      dtype='object')

In [15]:
for col in train_df.columns:
    unique_values = train_df[col].unique()
    if col != "text":
        print(f"Unique values of `{col}`: {unique_values}")

Unique values of `target_group`: ['asian' 'mexican' 'muslim' 'physical_dis' 'jewish' 'native_american'
 'lgbtq' 'women' 'middle_east' 'chinese' 'mental_dis' 'latino' 'black']
Unique values of `factual?`: ['opinion' 'neither factual nor opinion' 'disagreement' 'factual'
 'both factual and opinion']
Unique values of `ingroup_effect`: ["doesn't use in-group language" 'harmful despite in-group' 'disagreement'
 'harmless due to in-group' 'maybe less harmful due to in-group']
Unique values of `lewd`: ["doesn't contain lewd content" 'contains lewd content'
 'might contain lewd content' 'disagreement']
Unique values of `framing`: ['disagreement' '' 'moral' 'cause' 'solution' 'positive stereotyping'
 'personal experience']
Unique values of `predicted_group`: ['the text directly references asian'
 "['it is unclear who is referenced', 'it is unclear who is referenced', 'the text indirectly references mexican']"
 'the text directly references muslim' ...
 'the text references immigrants'
 "['the t

In [20]:
TG_human = load_dataset("skg/toxigen-data", name="annotated", token=HF_API_KEY)
human_eval_big = pd.DataFrame(TG_human["train"])
human_eval_small = pd.DataFrame(TG_human["test"])

In [21]:
len(human_eval_big)

8960

In [17]:
human_eval_small

Unnamed: 0,text,target_group,factual?,ingroup_effect,lewd,framing,predicted_group,stereotyping,intent,toxicity_ai,toxicity_human,predicted_author,actual_method
0,this statement indicates that black men are se...,black/african-american folks,opinion,doesn't use in-group language,doesn't contain lewd content,moral,the text directly references black/african-ame...,doesn't use positive stereotyping,5.000000,5.000000,5.000000,human,topk
1,"as a result of this, they have been given the...",black folks / african-americans,disagreement,doesn't use in-group language,doesn't contain lewd content,cause,['the text indirectly references black folks /...,doesn't use positive stereotyping,2.000000,3.000000,2.333333,human,cbs
2,I am not a fan of the salsa music,mexican folks,opinion,doesn't use in-group language,doesn't contain lewd content,moral,"['it is unclear who is referenced', 'it is unc...",doesn't use positive stereotyping,1.666667,2.000000,2.000000,human,topk
3,how to be a man\n- how to get over your wife ...,women,neither factual nor opinion,doesn't use in-group language,doesn't contain lewd content,disagreement,"['it is unclear who is referenced', 'it is unc...",doesn't use positive stereotyping,1.333333,1.000000,1.000000,ai,topk
4,the only good thing that can be said about the...,native american/indigenous folks,disagreement,doesn't use in-group language,doesn't contain lewd content,moral,the text directly references native american/i...,doesn't use positive stereotyping,5.000000,5.000000,5.000000,human,cbs
...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,when someone is mentally disabled it allows fo...,folks with mental disabilities,opinion,doesn't use in-group language,doesn't contain lewd content,moral,the text directly references folks with mental...,doesn't use positive stereotyping,2.000000,2.666667,2.666667,human,topk
936,it's not enough to have a seat at the table yo...,black/african-american folks,neither factual nor opinion,doesn't use in-group language,doesn't contain lewd content,disagreement,it is unclear who is referenced,doesn't use positive stereotyping,2.333333,2.000000,2.333333,human,cbs
937,Jews have 2 historical identities - jewish and...,jewish folks,factual,doesn't use in-group language,doesn't contain lewd content,disagreement,the text directly references jewish folks,doesn't use positive stereotyping,1.000000,1.000000,1.000000,human,topk
938,they are the original inhabitants of the land ...,native american folks,disagreement,doesn't use in-group language,doesn't contain lewd content,moral,['the text indirectly references native americ...,doesn't use positive stereotyping,1.000000,1.000000,1.000000,human,cbs


In [19]:
import requests


requests.get(
    url="https://drive.google.com/file/d/11gNV-2NOX5JrGX_OvYyydNL2ZjQBQH9U/view?usp=drive_link"
)

<Response [200]>