In [2]:
!pip install SAGEDbias==0.0.11

from saged import SAGEDData, SourceFinder, Scraper
import pandas as pd
import re
from tqdm import tqdm

domains_and_categories = {
    "nationalities": ["British people", "American people", "Indian people"],
    "religion": ["Christianity", "Islam", "Hinduism", "Atheism"],
    "gender": ["Male", "Female", "Non-binary", "Transgender"],
    "lgbtq": ["Gay", "Lesbian", "Bisexual", "Transgender"],
    "languages": ["English speakers", "Spanish speakers", "Mandarin speakers"],
    "age_groups": ["Children", "Teenagers", "Adults", "Seniors"]
}


rows = []


for domain, categories in tqdm(domains_and_categories.items(), desc="Processing Domains"):
    for category in categories:
        print(f"Processing: {domain} - {category}")

        try:

            keywords_data = SAGEDData.create_data(domain, category, "keywords")

            keywords_to_add = {
                "nationalities": ["Brit", "UK", "USA", "Indian"],
                "religion": ["Faith", "Belief", "God", "Spirituality"],
                "gender": ["He", "She", "They", "Identity"],
                "lgbtq": ["Pride", "Equality", "Love", "Freedom"],
                "languages": ["Speaking", "Fluent", "Native", "Learning"],
                "age_groups": ["Youth", "Adulting", "Mature", "Senior Citizen"],
            }.get(domain, [])

            for keyword in keywords_to_add:
                keywords_data.add(keyword=keyword)
            source_finder = SourceFinder(keywords_data)
            wiki_sources = source_finder.find_scrape_urls_on_wiki(top_n=2, scrape_backlinks=2)

            if not wiki_sources.data:
                print(f"No wiki sources found for category: {category}")
                continue

            scraper = Scraper(wiki_sources)
            scraper.scrape_in_page_for_wiki_with_buffer_files()
            scraped_sentences_data = scraper.scraped_sentence_to_saged_data()

            if not scraped_sentences_data.data:
                print(f"No data found for category: {category}")
                continue
            if 'keywords' not in scraped_sentences_data.data[0]:
                print(f"No keywords data for category: {category}")
                continue

            for keyword, keyword_data in scraped_sentences_data.data[0]['keywords'].items():
                if 'scraped_sentences' not in keyword_data:
                    print(f"No scraped sentences for keyword: {keyword}")
                    continue

                scraped_sentences = keyword_data.get('scraped_sentences', [])
                for sentence, _ in scraped_sentences:
                    stereotype_type = domain.capitalize()
                    category_label = category
                    label = "Biased" if domain in ["nationalities", "gender", "lgbtq", "age_groups"] else "Neutral"

                    marked_text = sentence
                    for word in keywords_to_add:
                        marked_text = re.sub(r'\b' + re.escape(word) + r'\b', f'==={word}===', marked_text, flags=re.IGNORECASE)

                    rows.append({
                        'stereotype_type': stereotype_type,
                        'text': sentence,
                        'text_with_marker': marked_text,
                        'category': category_label,
                        'data_source': 'scraped_wikipedia',
                        'label': label
                    })

        except Exception as e:
            print(f"Error processing {domain} - {category}: {e}")
            continue

df = pd.DataFrame(rows)
output_file = "scraped_bias_dataset.csv"
df.to_csv(output_file, index=False)
print(f"Scraped and processed data saved to '{output_file}'")




Processing Domains:   0%|          | 0/6 [00:00<?, ?it/s]

Processing: nationalities - British people
Searching Wikipedia for topic: British people
Found Wikipedia page: British people
Searching similar forelinks for British people



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  7.52it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  7.93it/s]


Searching similar backlinks for British people



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  9.11it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  7.00it/s]

Scraping through URL:   0%|          | 0/3 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:02,  1.04keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:01<00:01,  1.16keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:02<00:00,  1.21keyword/s][A[A

Scraping in page: 100%|██████████| 4/4 [00:03<00:00,  1.05keyword/s]

Scraping through URL:  33%|███▎      | 1/3 [00:03<00:07,  3.81s/url][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:00,  3.10keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  2.97keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:00<00:00,  3.02keyword/s][A[A

Scraping in page: 100%|██████

Processing: nationalities - American people
Searching Wikipedia for topic: American people
Found Wikipedia page: Americans
Searching similar forelinks for American people



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  9.48it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  8.03it/s]


Searching similar backlinks for American people



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  7.43it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  5.50it/s]

Scraping through URL:   0%|          | 0/3 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:00,  6.63keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  6.39keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:00<00:00,  6.33keyword/s][A[A

Scraping in page: 100%|██████████| 4/4 [00:00<00:00,  6.30keyword/s]

Scraping through URL:  33%|███▎      | 1/3 [00:00<00:01,  1.56url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:02,  1.24keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:02<00:02,  1.05s/keyword][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:02<00:00,  1.06keyword/s][A[A

Scraping in page: 100%|██████

Error processing nationalities - American people: 'scraped_sentences'
Processing: nationalities - Indian people
Searching Wikipedia for topic: Indian people
Found Wikipedia page: Indian people
Searching similar forelinks for Indian people



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  9.46it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  7.56it/s]


Searching similar backlinks for Indian people



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  9.04it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  8.23it/s]

Scraping through URL:   0%|          | 0/3 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:01<00:05,  1.71s/keyword][A[A

Scraping in page:  50%|█████     | 2/4 [00:03<00:02,  1.48s/keyword][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:04<00:01,  1.33s/keyword][A[A

Scraping in page: 100%|██████████| 4/4 [00:05<00:00,  1.39s/keyword]

Scraping through URL:  33%|███▎      | 1/3 [00:05<00:11,  5.57s/url][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:02,  1.11keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:01<00:01,  1.11keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:03<00:01,  1.13s/keyword][A[A

Scraping in page: 100%|██████

Error processing nationalities - Indian people: 'scraped_sentences'
Processing: religion - Christianity
Searching Wikipedia for topic: Christianity
Found Wikipedia page: Christianity
Searching similar forelinks for Christianity



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  8.02it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  7.80it/s]


Searching similar backlinks for Christianity



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s]

Scraping through URL:   0%|          | 0/3 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:00,  4.13keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  4.02keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:00<00:00,  3.93keyword/s][A[A

Scraping in page: 100%|██████████| 4/4 [00:01<00:00,  3.92keyword/s]

Scraping through URL:  33%|███▎      | 1/3 [00:01<00:02,  1.03s/url][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:01<00:05,  1.81s/keyword][A[A

Scraping in page:  50%|█████     | 2/4 [00:03<00:03,  1.56s/keyword][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:05<00:01,  1.71s/keyword][A[A

Scraping in page: 100%|██████

Processing: religion - Islam
Searching Wikipedia for topic: Islam
Found Wikipedia page: Islam
Searching similar forelinks for Islam



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  5.28it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  6.56it/s]


Searching similar backlinks for Islam



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  2.11it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s]

Scraping through URL:   0%|          | 0/3 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:00,  3.00keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  2.90keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:01<00:00,  2.89keyword/s][A[A

Scraping in page: 100%|██████████| 4/4 [00:01<00:00,  2.86keyword/s]

Scraping through URL:  33%|███▎      | 1/3 [00:01<00:02,  1.41s/url][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:01<00:03,  1.20s/keyword][A[A

Scraping in page:  50%|█████     | 2/4 [00:02<00:02,  1.20s/keyword][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:04<00:01,  1.43s/keyword][A[A

Scraping in page: 100%|██████

Processing: religion - Hinduism
Searching Wikipedia for topic: Hinduism
Found Wikipedia page: Hinduism
Searching similar forelinks for Hinduism



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  7.45it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  8.04it/s]


Searching similar backlinks for Hinduism



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  1.67it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  2.66it/s]

Scraping through URL:   0%|          | 0/3 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:01<00:05,  1.90s/keyword][A[A

Scraping in page:  50%|█████     | 2/4 [00:03<00:03,  1.66s/keyword][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:05<00:01,  1.81s/keyword][A[A

Scraping in page: 100%|██████████| 4/4 [00:06<00:00,  1.72s/keyword]

Scraping through URL:  33%|███▎      | 1/3 [00:06<00:13,  6.88s/url][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:00,  4.32keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  4.24keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:00<00:00,  4.17keyword/s][A[A

Scraping in page: 100%|██████

Processing: religion - Atheism
Searching Wikipedia for topic: Atheism
Found Wikipedia page: Atheism
Searching similar forelinks for Atheism



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  5.43it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  5.72it/s]


Searching similar backlinks for Atheism



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  9.47it/s]

Scraping through URL:   0%|          | 0/5 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:00,  4.63keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  4.47keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:00<00:00,  4.15keyword/s][A[A

Scraping in page: 100%|██████████| 4/4 [00:00<00:00,  4.21keyword/s]

Scraping through URL:  20%|██        | 1/5 [00:00<00:03,  1.04url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:00,  4.84keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  4.95keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:00<00:00,  4.99keyword/s][A[A

Scraping in page: 100%|██████████| 4/4 [00:00<00:00,  4.92keyword/s]

Scraping through UR

Processing: gender - Male
Searching Wikipedia for topic: Male
Found Wikipedia page: Male
Searching similar forelinks for Male



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  6.80it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  6.35it/s]


Searching similar backlinks for Male



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  8.77it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  8.22it/s]

Scraping through URL:   0%|          | 0/3 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:01,  2.38keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  2.32keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:01<00:00,  2.34keyword/s][A[A

Scraping in page: 100%|██████████| 4/4 [00:01<00:00,  2.34keyword/s]

Scraping through URL:  33%|███▎      | 1/3 [00:01<00:03,  1.72s/url][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:01,  2.59keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  2.56keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:01<00:00,  2.53keyword/s][A[A

Scraping in page: 100%|██████

Processing: gender - Female
Searching Wikipedia for topic: Female
Found Wikipedia page: Female
Searching similar forelinks for Female



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  5.81it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  5.97it/s]


Searching similar backlinks for Female



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  3.55it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  4.72it/s]

Scraping through URL:   0%|          | 0/3 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:02,  1.37keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:01<00:02,  1.00s/keyword][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:02<00:00,  1.14keyword/s][A[A

Scraping in page: 100%|██████████| 4/4 [00:03<00:00,  1.07keyword/s]

Scraping through URL:  33%|███▎      | 1/3 [00:03<00:07,  3.75s/url][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:00,  3.76keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  3.47keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:00<00:00,  3.58keyword/s][A[A

Scraping in page: 100%|██████

Processing: gender - Non-binary
Searching Wikipedia for topic: Non-binary
Found Wikipedia page: Non-binary gender
Searching similar forelinks for Non-binary



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  9.82it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  9.21it/s]


Searching similar backlinks for Non-binary



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  5.94it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  6.00it/s]

Scraping through URL:   0%|          | 0/3 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:01<00:04,  1.39s/keyword][A[A

Scraping in page:  50%|█████     | 2/4 [00:02<00:02,  1.10s/keyword][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:03<00:01,  1.02s/keyword][A[A

Scraping in page: 100%|██████████| 4/4 [00:03<00:00,  1.01keyword/s]

Scraping through URL:  33%|███▎      | 1/3 [00:03<00:07,  3.98s/url][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:01,  1.59keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:01<00:01,  1.11keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:02<00:00,  1.29keyword/s][A[A

Scraping in page: 100%|██████

Processing: gender - Transgender
Searching Wikipedia for topic: Transgender
Found Wikipedia page: Transgender
Searching similar forelinks for Transgender



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  3.48it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  4.80it/s]


Searching similar backlinks for Transgender



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  2.72it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  2.61it/s]

Scraping through URL:   0%|          | 0/2 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:01<00:03,  1.09s/keyword][A[A

Scraping in page:  50%|█████     | 2/4 [00:02<00:02,  1.14s/keyword][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:03<00:01,  1.11s/keyword][A[A

Scraping in page: 100%|██████████| 4/4 [00:04<00:00,  1.15s/keyword]

Scraping through URL:  50%|█████     | 1/2 [00:04<00:04,  4.61s/url][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:01,  2.24keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  2.23keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:01<00:00,  2.20keyword/s][A[A

Scraping in page: 100%|██████

Processing: lgbtq - Gay
Searching Wikipedia for topic: Gay
Found Wikipedia page: Gay
Searching similar forelinks for Gay



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  3.83it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  4.69it/s]


Searching similar backlinks for Gay



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  8.27it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  5.60it/s]

Scraping through URL:   0%|          | 0/4 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:01<00:04,  1.38s/keyword][A[A

Scraping in page:  50%|█████     | 2/4 [00:02<00:02,  1.08s/keyword][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:03<00:00,  1.01keyword/s][A[A

Scraping in page: 100%|██████████| 4/4 [00:03<00:00,  1.00keyword/s]

Scraping through URL:  25%|██▌       | 1/4 [00:04<00:12,  4.00s/url][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:02,  1.08keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:01<00:01,  1.58keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:01<00:00,  1.85keyword/s][A[A

Scraping in page: 100%|██████

Processing: lgbtq - Lesbian
Searching Wikipedia for topic: Lesbian
Found Wikipedia page: Lesbian
Searching similar forelinks for Lesbian



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  9.62it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  8.67it/s]


Searching similar backlinks for Lesbian



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  4.66it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  4.17it/s]

Scraping through URL:   0%|          | 0/3 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:00,  4.41keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  4.23keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:00<00:00,  4.22keyword/s][A[A

Scraping in page: 100%|██████████| 4/4 [00:00<00:00,  4.21keyword/s]

Scraping through URL:  33%|███▎      | 1/3 [00:00<00:01,  1.05url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:01<00:03,  1.24s/keyword][A[A

Scraping in page:  50%|█████     | 2/4 [00:01<00:01,  1.06keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:02<00:00,  1.18keyword/s][A[A

Scraping in page: 100%|██████

Processing: lgbtq - Bisexual
Searching Wikipedia for topic: Bisexual
Found Wikipedia page: Bisexuality
Searching similar forelinks for Bisexual



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  8.34it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  8.11it/s]


Searching similar backlinks for Bisexual



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  3.26it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  3.88it/s]

Scraping through URL:   0%|          | 0/2 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:00,  3.05keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  3.11keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:00<00:00,  3.20keyword/s][A[A

Scraping in page: 100%|██████████| 4/4 [00:01<00:00,  3.18keyword/s]

Scraping through URL:  50%|█████     | 1/2 [00:01<00:01,  1.27s/url][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:02,  1.47keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:01<00:01,  1.46keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:02<00:00,  1.12keyword/s][A[A

Scraping in page: 100%|██████

Processing: lgbtq - Transgender
Searching Wikipedia for topic: Transgender
Found Wikipedia page: Transgender
Searching similar forelinks for Transgender



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  8.12it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  7.65it/s]


Searching similar backlinks for Transgender



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  3.83it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  3.01it/s]

Scraping through URL:   0%|          | 0/2 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:02,  1.20keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:02<00:02,  1.13s/keyword][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:03<00:00,  1.00keyword/s][A[A

Scraping in page: 100%|██████████| 4/4 [00:03<00:00,  1.04keyword/s]

Scraping through URL:  50%|█████     | 1/2 [00:03<00:03,  3.85s/url][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:01,  2.24keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  2.16keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:01<00:00,  2.15keyword/s][A[A

Scraping in page: 100%|██████

Processing: languages - English speakers
Searching Wikipedia for topic: English speakers
Found Wikipedia page: English language
Searching similar forelinks for English speakers



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  6.37it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  5.68it/s]


Searching similar backlinks for English speakers



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  1.43it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s]

Scraping through URL:   0%|          | 0/3 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:01<00:03,  1.07s/keyword][A[A

Scraping in page:  50%|█████     | 2/4 [00:02<00:02,  1.29s/keyword][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:03<00:01,  1.18s/keyword][A[A

Scraping in page: 100%|██████████| 4/4 [00:05<00:00,  1.28s/keyword]

Scraping through URL:  33%|███▎      | 1/3 [00:05<00:10,  5.12s/url][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:01,  1.79keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:01<00:01,  1.71keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:01<00:00,  1.72keyword/s][A[A

Scraping in page: 100%|██████

Processing: languages - Spanish speakers
Searching Wikipedia for topic: Spanish speakers
Found Wikipedia page: Hispanophone
Searching similar forelinks for Spanish speakers



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  6.71it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  7.29it/s]


Searching similar backlinks for Spanish speakers



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  7.42it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  7.17it/s]

Scraping through URL:   0%|          | 0/3 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:01,  1.98keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:01<00:01,  1.17keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:02<00:00,  1.44keyword/s][A[A

Scraping in page: 100%|██████████| 4/4 [00:02<00:00,  1.53keyword/s]

Scraping through URL:  33%|███▎      | 1/3 [00:02<00:05,  2.62s/url][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:02,  1.00keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:02<00:02,  1.26s/keyword][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:03<00:01,  1.18s/keyword][A[A

Scraping in page: 100%|██████

Processing: languages - Mandarin speakers
Searching Wikipedia for topic: Mandarin speakers
Error processing languages - Mandarin speakers: too many values to unpack (expected 2)
Processing: age_groups - Children
Searching Wikipedia for topic: Children
Found Wikipedia page: Child
Searching similar forelinks for Children



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  7.63it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  7.87it/s]


Searching similar backlinks for Children



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  4.18it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  4.01it/s]

Scraping through URL:   0%|          | 0/2 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:01,  2.13keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  2.09keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:01<00:00,  2.11keyword/s][A[A

Scraping in page: 100%|██████████| 4/4 [00:01<00:00,  2.03keyword/s]

Scraping through URL:  50%|█████     | 1/2 [00:01<00:01,  1.98s/url][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:00,  5.43keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  5.44keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:00<00:00,  5.22keyword/s][A[A

Scraping in page: 100%|██████

Processing: age_groups - Teenagers
Searching Wikipedia for topic: Teenagers
Found Wikipedia page: Adolescence
Searching similar forelinks for Teenagers



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  7.65it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  8.22it/s]


Searching similar backlinks for Teenagers



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  7.74it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  6.31it/s]

Scraping through URL:   0%|          | 0/3 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:01<00:03,  1.22s/keyword][A[A

Scraping in page:  50%|█████     | 2/4 [00:02<00:02,  1.03s/keyword][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:03<00:00,  1.02keyword/s][A[A

Scraping in page: 100%|██████████| 4/4 [00:03<00:00,  1.04keyword/s]

Scraping through URL:  33%|███▎      | 1/3 [00:03<00:07,  3.85s/url][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:00,  6.02keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  6.19keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:00<00:00,  6.22keyword/s][A[A

Scraping in page: 100%|██████

Processing: age_groups - Adults
Searching Wikipedia for topic: Adults
Found Wikipedia page: Adult
Searching similar forelinks for Adults



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  6.74it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  6.82it/s]


Searching similar backlinks for Adults



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  9.51it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  6.29it/s]

Scraping through URL:   0%|          | 0/2 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:00,  4.13keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  4.10keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:00<00:00,  4.10keyword/s][A[A

Scraping in page: 100%|██████████| 4/4 [00:00<00:00,  4.07keyword/s]

Scraping through URL:  50%|█████     | 1/2 [00:00<00:00,  1.01url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:01,  2.91keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  2.94keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:01<00:00,  2.90keyword/s][A[A

Scraping in page: 100%|██████

Processing: age_groups - Seniors
Searching Wikipedia for topic: Seniors
Found Wikipedia page: Senior
Searching similar forelinks for Seniors



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  9.40it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  7.90it/s]


Searching similar backlinks for Seniors



Depth 1/1:   0%|          | 0/2 [00:00<?, ?it/s][A
Depth 1/1:  50%|█████     | 1/2 [00:00<00:00,  8.81it/s][A
Depth 1/1: 100%|██████████| 2/2 [00:00<00:00,  8.51it/s]

Scraping through URL:   0%|          | 0/3 [00:00<?, ?url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:00,  3.24keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  4.21keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:00<00:00,  4.66keyword/s][A[A

Scraping in page: 100%|██████████| 4/4 [00:00<00:00,  4.55keyword/s]

Scraping through URL:  33%|███▎      | 1/3 [00:00<00:01,  1.12url/s][A

Scraping in page:   0%|          | 0/4 [00:00<?, ?keyword/s][A[A

Scraping in page:  25%|██▌       | 1/4 [00:00<00:00,  4.28keyword/s][A[A

Scraping in page:  50%|█████     | 2/4 [00:00<00:00,  4.19keyword/s][A[A

Scraping in page:  75%|███████▌  | 3/4 [00:00<00:00,  4.19keyword/s][A[A

Scraping in page: 100%|██████

Error processing age_groups - Seniors: 'scraped_sentences'
Scraped and processed data saved to 'scraped_bias_dataset.csv'





In [3]:
!pip install SAGEDbias==0.0.11



In [2]:
!pip install SAGEDbias==0.0.11
!pip install transformers
!pip install docling
!pip install ollama

import os
import pandas as pd
from saged import KeywordFinder, SourceFinder, Scraper, SAGEDData
from transformers import pipeline
import re
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import time
from functools import wraps

# Step 1: Define race-related domains and categories
domains_and_categories = {
    "race_ethnicity": [
        "White people",
        "Black people",
        "Asian people",
        "Hispanic people",
        "Native American people",
        "Middle Eastern people",
        "African people",
        "European people",
        "Mixed race people"
    ]
}

# Race list for generating and highlighting keywords
race_list = [
    "White", "Black", "Asian", "Hispanic",
    "Native American", "Middle Eastern",
    "African", "European", "Mixed"
]

# Step 2: Define LLM-based keyword generation class
class HuggingFaceChatPipeline:
    def __init__(self, model_name="Qwen/Qwen2.5-1.5B-Instruct"):
        self.chat_pipeline = pipeline(
            "text-generation",
            model=model_name,
            tokenizer=model_name,
            device_map="auto",
            torch_dtype="auto"
        )

    def invoke(self, user_prompt, system_prompt="You are a helpful assistant."):
        prompt = f"{system_prompt}\n\nUser: {user_prompt}\n\nAssistant:"
        response = self.chat_pipeline(
            prompt,
            max_length=512,
            num_return_sequences=1,
            pad_token_id=self.chat_pipeline.tokenizer.eos_token_id
        )[0]["generated_text"]
        response_cleaned = response.replace(prompt, "").strip()
        return response_cleaned

# Step 3: Retry Logic for Resilient Scraping
def retry(max_attempts, delay):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_attempts):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    if attempt < max_attempts - 1:
                        time.sleep(delay)
                    else:
                        raise e
        return wrapper
    return decorator

@retry(max_attempts=3, delay=5)
def safe_scrape(scraper):
    scraper.scrape_in_page_for_wiki_with_buffer_files()

# Step 4: Processing Function
def process_category(category, domain, race_list, model):
    rows = []
    try:
        keyword_finder = KeywordFinder(category, domain)

        # Generate Keywords
        keyword_finder.find_keywords_by_embedding_on_wiki(n_keywords=3)
        keywords_data_embeddings = keyword_finder.keywords_to_saged_data()

        keyword_finder.find_keywords_by_llm_inquiries(
            generation_function=model.invoke, n_keywords=3, n_run=2
        )
        keywords_data_llm = keyword_finder.keywords_to_saged_data()

        # Combine keywords
        keywords_data = keywords_data_embeddings.merge(merge_list=[keywords_data_llm])

        # Scrape Sources
        source_finder = SourceFinder(keywords_data)
        wiki_sources = source_finder.find_scrape_urls_on_wiki(top_n=2, scrape_backlinks=0)

        if not wiki_sources.data:
            print(f"No wiki sources found for category: {category}")
            return rows

        scraper = Scraper(wiki_sources)
        safe_scrape(scraper)
        scraped_sentences_data = scraper.scraped_sentence_to_saged_data()

        # Process Scraped Sentences
        for keyword, keyword_data in scraped_sentences_data.data[0]["keywords"].items():
            scraped_sentences = keyword_data.get("scraped_sentences", [])
            for sentence, _ in scraped_sentences:
                marked_text = sentence
                for word in race_list:
                    marked_text = re.sub(rf"\b{re.escape(word)}\b", f"==={word}===", marked_text, flags=re.IGNORECASE)

                rows.append({
                    "stereotype_type": domain.capitalize(),
                    "text": sentence,
                    "text_with_marker": marked_text,
                    "category": category,
                    "data_source": "scraped_wikipedia",
                    "label": "Biased"
                })
    except Exception as e:
        print(f"Error processing category {category}: {e}")
    return rows

# Step 5: Main Script with Parallel Processing
def main():
    model = HuggingFaceChatPipeline(model_name="Qwen/Qwen2.5-1.5B-Instruct")
    all_rows = []

    with ThreadPoolExecutor(max_workers=4) as executor:  # Adjust max_workers based on your CPU
        futures = [
            executor.submit(process_category, category, domain, race_list, model)
            for domain, categories in domains_and_categories.items()
            for category in categories
        ]
        for future in tqdm(futures, desc="Processing categories"):
            all_rows.extend(future.result())

    # Save results to CSV
    output_file = "optimized_race_bias_dataset.csv"
    df = pd.DataFrame(all_rows)
    df.to_csv(output_file, index=False)
    print(f"Dataset saved to '{output_file}'.")

if __name__ == "__main__":
    main()


Initiating the embedding model...Initiating the embedding model...
Initiating the embedding model...

Initiating the embedding model...


Processing categories:   0%|          | 0/9 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/64 [00:00<?, ?it/s]

Batches:   0%|          | 0/89 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/41 [00:00<?, ?it/s]


Calculating similarities:   0%|          | 0/404 [00:00<?, ?it/s][A
Calculating similarities:  33%|███▎      | 133/404 [00:00<00:00, 1329.92it/s][A
Calculating similarities:  66%|██████▌   | 266/404 [00:00<00:00, 1229.23it/s][A
Calculating similarities: 100%|██████████| 404/404 [00:00<00:00, 1116.26it/s]
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Calculating similarities:   0%|          | 0/1284 [00:00<?, ?it/s][A
Calculating similarities:  11%|█         | 136/1284 [00:00<00:00, 1349.96it/s][A
Calculating similarities:  21%|██        | 271/1284 [00:00<00:00, 1112.88it/s][A
Calculating similarities:  34%|███▍      | 434/1284 [00:00<00:00, 1321.33it/s][A
Calculating similarities:  44%|████▍     | 570/1284 [00:00<00:00, 1285.56it/s][A
Calculating similarities:  55%|█████▍    | 701/1284 [00:00<00:00, 1273.28it/s][A
Calculating similarities:  65%|██████▍   | 830/1284 [00:00<00:00, 1203.63it/s][A

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Calculating similarities:  74%|███████▍  | 952/1284 [00:00<00:00, 1128.77it/s][A

Calculating similarities:   0%|          | 0/2043 [00:00<?, ?it/s][A[A
Calculating similarities:  84%|████████▍ | 1079/1284 [00:00<00:00, 1157.47it/s][A

Calculating similarities:   6%|▌         | 122/2043 [00:00<00:01, 1214.36it/s][A[A
Calculating similarities:  93%|█████████▎| 1196/1284 [00:01<00:00, 1062.23it/s][A

Calculating similarities: 100%|██████████| 1284/1284 [00:01<00:00, 1152.54it/s]


Calculating similarities:  18%|█▊        | 359/2043 [00:00<00:01, 1086.71it/s][A[A

Calculating similarities:  23%|██▎       | 477/2043 [00:00<00:01, 1116.60it/s][A[A

Calculating similarities:  29%|██▉       | 589/2043 [00:00<00:01, 1102.52it/s][A[A

Calculating similarities:  34%|███▍      | 703/2043 [00:00<00:01, 1107.73it/s][A[A

Calculating similarities:  41%|████      | 839/2043 [00:00<00:01, 1181.82it/s][A[A

Calculating similarities:  47%|████▋     | 960/2043 [00:00<00:00, 1190.34it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Calculating similarities:   0%|          | 0/2830 [00:00<?, ?it/s][A
Calculating similarities:   9%|▉         | 250/2830 [00:00<00:01, 2360.64it/s][A
Calculating similarities:  17%|█▋        | 487/2830 [00:00<00:01, 2326.45it/s][A
Calculating similarities:  25%|██▌       | 720/2830 [00:00<00:01, 2064.30it/s][A
Calculating similarities:  33%|███▎      | 930/2830 [00:00<00:00, 1925.61it/s][A
Calculating similarities:  40%|███▉      | 1128/2830 [00:00<00:00, 1933.12it/s][A
Calculating similarities:  47%|████▋     | 1323/2830 [00:00<00:00, 1628.48it/s][A
Calculating similarities:  53%|█████▎    | 1493/2830 [00:00<00:00, 1490.45it/s][A
Calculating similarities:  59%|█████▉    | 1669/2830 [00:00<00:00, 1490.36it/s][A
Calculating similarities:  66%|██████▋   | 1878/2830 [00:01<00:00, 1647.62it/s][A
Calculating similarities:  72%|███████▏  | 2050/2830 [00:01<00:00, 1649.66it/s][A
Calculating similarities:  80%|███████▉  | 2250/2830 [00:01<00:00, 1710.28it/s][A
Calculating similari

Response: ['Caucasian', 'European', 'Saxon', 'German', 'British', 'Irish', 'Scandinavian']




finding keywords by LLM:  50%|█████     | 1/2 [00:51<00:51, 51.10s/run][A[A

Response: ['Mexican Americans', 'Puerto Ricans', 'Cuban Americans', 'Dominicans', 'Guatemalans', 'Haitians', 'Peruvians', 'Nicaraguans', 'Colombians']



finding keywords by LLM: 100%|██████████| 2/2 [01:41<00:00, 50.61s/run]

Response: ['White', 'European ancestry', 'Caucasian', 'Semi-civilized']
final set of keywords:
['British', 'Celtic', 'Irish', 'Latinate', 'Teutonic', 'Eastern', 'Continental', 'Northern', 'European', 'German', 'Southern', 'Mediterranean', 'Semi-civilized', 'Western', 'Caucasian', 'Scandinavian', 'Atlantic', 'Saxon', 'Nordic', 'white']



Processing categories:  11%|█         | 1/9 [02:24<19:13, 144.19s/it]

Error processing category White people: SAGEDData.merge() missing 1 required positional argument: 'domain'
Initiating the embedding model...


Batches:   0%|          | 0/124 [00:00<?, ?it/s]





finding keywords by LLM:  50%|█████     | 1/2 [01:04<01:04, 64.19s/run][A[A[A[A

Response: ['Chinese', 'Indian', 'Korean', 'Japanese', 'Thai', 'Vietnamese', 'Filipino', 'Indonesian', 'Laotian', 'Malaysian']


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Calculating similarities:   0%|          | 0/3943 [00:00<?, ?it/s][A
Calculating similarities:   4%|▍         | 173/3943 [00:00<00:02, 1650.05it/s][A
Calculating similarities:   9%|▉         | 346/3943 [00:00<00:02, 1562.01it/s][A
Calculating similarities:  14%|█▍        | 553/3943 [00:00<00:02, 1594.38it/s][A
Calculating similarities:  19%|█▉        | 745/3943 [00:00<00:01, 1709.54it/s][A
Calculating similarities:  24%|██▍       | 952/3943 [00:00<00:01, 1802.97it/s][A
Calculating similarities:  29%|██▉       | 1151/3943 [00:00<00:01, 1855.45it/s][A
Calculating similarities:  34%|███▍      | 1348/3943 [00:00<00:01, 1873.74it/s][A
Calculating similarities:  39%|███▉      | 1537/3943 [00:00<00:01, 1856.17it/s][A
Calculating similarities:  44%|████▍     | 1739/3943 [00:00<00:01, 1834.02it/s][A
Calculating similarities:  50%|████▉     | 1959/3943 [00:01<00:01, 1874.87it/s][A
Calculating similarities:  55%|█████▌    | 2180/3943 [00:01<00:00, 1938.45it/s][A
Calculating similarit

Response: ['African Americans', 'Caribbean Blacks', 'West Indians', 'Black North Africans', 'Afro-Latinos', 'Sub-Saharan African Diaspora']






finding keywords by LLM: 100%|██████████| 2/2 [01:43<00:00, 51.54s/run]

Response: ['Asian heritage', 'East or Southeast Asian origins', 'Distinctive physical features such as light skin tone, small eyes, and slightly pointed ears', 'High level of intelligence and academic achievement', 'Strong work ethic and dedication to success']
final set of keywords:
['Korean', 'Malaysian', 'ethnography', 'Thai', 'Laotian', 'Indonesian', 'Filipino', 'African', 'Chinese', 'Vietnamese', 'Pacific Islander', 'ethnic', 'Asian', 'Indian', 'Strong work ethic and dedication to success', 'Japanese', 'High level of intelligence and academic achievement', 'Distinctive physical features such as light skin tone, small eyes, and slightly pointed ears']





Error processing category Asian people: SAGEDData.merge() missing 1 required positional argument: 'domain'
Initiating the embedding model...


Batches:   0%|          | 0/8 [00:00<?, ?it/s]



finding keywords by LLM: 100%|██████████| 2/2 [02:18<00:00, 69.19s/run]

Invocation failed at iteration 1: invalid syntax (<string>, line 0)
final set of keywords:
['Mexican', 'Cuban', 'Latino', 'Nicaraguans', 'Colombians', 'Puerto Ricans', 'Peruvians', 'Hispanic', 'Guatemalans', 'Haitians', 'Dominicans']





Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Calculating similarities:   0%|          | 0/246 [00:00<?, ?it/s][A
Calculating similarities: 100%|██████████| 246/246 [00:00<00:00, 1693.11it/s]


Error processing category Hispanic people: SAGEDData.merge() missing 1 required positional argument: 'domain'
Initiating the embedding model...


Batches:   0%|          | 0/31 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Calculating similarities:   0%|          | 0/981 [00:00<?, ?it/s][A
Calculating similarities:  12%|█▏        | 113/981 [00:00<00:00, 1114.40it/s][A
Calculating similarities:  28%|██▊       | 274/981 [00:00<00:00, 1401.12it/s][A
Calculating similarities:  42%|████▏     | 415/981 [00:00<00:00, 1399.14it/s][A
Calculating similarities:  57%|█████▋    | 555/981 [00:00<00:00, 1379.25it/s][A
Calculating similarities:  71%|███████   | 695/981 [00:00<00:00, 1374.62it/s][A
Calculating similarities:  85%|████████▍ | 833/981 [00:00<00:00, 1284.87it/s][A
Calculating similarities: 100%|██████████| 981/981 [00:00<00:00, 1322.32it/s]

finding keywords by LLM:   0%|          | 0/2 [00:00<?, ?run/s][A


finding keywords by LLM: 100%|██████████| 2/2 [02:32<00:00, 76.26s/run]

Invocation failed at iteration 1: invalid syntax (<string>, line 0)
final set of keywords:
['Zulu', 'Afro-American', 'Mandinka', 'Yoruba', 'African', 'West Indians', 'Afro-Latinos', 'Akan', 'Negro', 'Black', 'Hausa']



Processing categories:  22%|██▏       | 2/9 [03:44<12:26, 106.58s/it]

Error processing category Black people: SAGEDData.merge() missing 1 required positional argument: 'domain'
Initiating the embedding model...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Calculating similarities: 100%|██████████| 19/19 [00:00<00:00, 938.27it/s]

finding keywords by LLM:  50%|█████     | 1/2 [00:36<00:36, 36.77s/run][A

Response: ['Algonquian', 'Athabaskan', 'Iroquoian', 'Siouan', 'Apache', 'Cree', 'Dakota', 'Navajo', 'Ojibwe', 'Mohawk', 'Pawnee', 'Sarsi']




finding keywords by LLM:   0%|          | 0/2 [00:00<?, ?run/s][A[A


finding keywords by LLM:   0%|          | 0/2 [00:00<?, ?run/s][A[A[A
finding keywords by LLM: 100%|██████████| 2/2 [01:08<00:00, 34.22s/run]

Response: ['Indigenous peoples', 'Native American heritage', 'Historical tribes or nations', 'Traditional cultures and languages']
final set of keywords:
['Iroquoian', 'Sarsi', 'Pawnee', 'Historical tribes or nations', 'Algonquian', 'ethnicity', 'Ojibwe', 'Cree', 'Apache', 'Dakota', 'Traditional cultures and languages', 'Siouan', 'Indian', 'Navajo', 'Indigenous', 'American', 'Mohawk', 'Athabaskan']



Processing categories:  56%|█████▌    | 5/9 [04:27<02:42, 40.53s/it] 

Error processing category Native American people: SAGEDData.merge() missing 1 required positional argument: 'domain'
Initiating the embedding model...


Batches:   0%|          | 0/42 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Calculating similarities:   0%|          | 0/1330 [00:00<?, ?it/s][A
Calculating similarities:  10%|█         | 136/1330 [00:00<00:00, 1335.43it/s][A
Calculating similarities:  22%|██▏       | 291/1330 [00:00<00:00, 1449.04it/s][A
Calculating similarities:  38%|███▊      | 506/1330 [00:00<00:00, 1726.75it/s][A
Calculating similarities:  51%|█████     | 679/1330 [00:00<00:00, 1708.61it/s][A
Calculating similarities:  65%|██████▍   | 861/1330 [00:00<00:00, 1735.47it/s][A
Calculating similarities:  82%|████████▏ | 1089/1330 [00:00<00:00, 1829.37it/s][A
Calculating similarities: 100%|██████████| 1330/1330 [00:00<00:00, 1708.78it/s]



finding keywords by LLM:  50%|█████     | 1/2 [00:38<00:38, 38.28s/run][A[A[A

Invocation failed at iteration 0: unterminated string literal (detected at line 6) (<string>, line 6)




finding keywords by LLM:  50%|█████     | 1/2 [00:44<00:44, 44.91s/run][A[A

Response: ['African Americans', 'Angolan', 'Beninese', 'Brazilians', 'Cameroonian', 'Congolese', 'Djiboutian', 'Eritrean', 'Ghanaian', 'Guinean', 'Haitian', 'Ivorian', 'Jamaican', 'Kenyan', 'Lesotho', 'Malagasy', 'Mali', 'Mauritanian', 'Moroccan', 'Namibian', 'Nigerian', 'Papua New Guinean', 'Rwandan', 'Senegalese', 'Sierra Leone', 'Somali', 'South African', 'Tanzanian', 'Ugandan', 'Zambian']



finding keywords by LLM:   0%|          | 0/2 [00:00<?, ?run/s][A


finding keywords by LLM: 100%|██████████| 2/2 [01:04<00:00, 32.09s/run]

Response: ['Religious diversity', 'Distinct cultural heritage', 'Historical connections with neighboring countries', 'Multilingualism', 'Culinary traditions', 'Folk art and crafts', 'Traditional clothing styles', 'Geographical location']
final set of keywords:
['Persian', 'Culinary traditions', 'Folk art and crafts', 'Turkish', 'African', 'Religious diversity', 'Distinct cultural heritage', 'Arabian', 'Traditional clothing styles', 'Geographical location', 'Historical connections with neighboring countries', 'Multilingualism', 'Middle Eastern people']



Processing categories:  67%|██████▋   | 6/9 [05:03<01:58, 39.35s/it]

Error processing category Middle Eastern people: SAGEDData.merge() missing 1 required positional argument: 'domain'



finding keywords by LLM:  50%|█████     | 1/2 [00:23<00:23, 23.39s/run][A

Response: ['Caucasians', 'Saxons', 'Romani', 'Gypsies', 'Indo-Europeans', 'Slavs', 'Russians', 'Poles', 'Czechs', 'Slovaks']





finding keywords by LLM:   0%|          | 0/2 [00:00<?, ?run/s][A[A[A

finding keywords by LLM: 100%|██████████| 2/2 [01:31<00:00, 45.83s/run]

Response: ['Black skin color', 'Sub-Saharan origin', 'Diverse cultural heritage', 'High resilience and adaptability', 'Strong community ties', 'Significant influence on global music', 'Unique language groups such as Yoruba, Hausa, Swahili']
final set of keywords:
['Namibian', 'Eritrean', 'Diverse cultural heritage', 'Ethiopian', 'Ghanaian', 'Brazilians', 'High resilience and adaptability', 'Unique language groups such as Yoruba, Hausa, Swahili', 'Strong community ties', 'Tanzanian', 'Mali', 'Jamaican', 'Zambian', 'Mauritanian', 'Lesotho', 'Black skin color', 'African', 'Ugandan', 'Sub-Saharan origin', 'Angolan', 'Senegalese', 'Moroccan', 'Guinean', 'Malagasy', 'Ivorian', 'Kenyan', 'Sierra Leone', 'Djiboutian', 'Cameroonian', 'Haitian', 'Beninese', 'Significant influence on global music', 'ethnics', 'Rwandan', 'Nigerian', 'Congolese']



Processing categories:  78%|███████▊  | 7/9 [05:27<01:10, 35.29s/it]

Error processing category African people: SAGEDData.merge() missing 1 required positional argument: 'domain'





finding keywords by LLM:  50%|█████     | 1/2 [00:25<00:25, 25.53s/run][A[A[A

Response: ['Multiracial', 'Biracial', 'Heteroracial', 'X-race', 'Transracial', 'Tri-racial', 'Quadri-racial', 'Quinti-racial', 'Septi-racial', 'Octo-racial']



finding keywords by LLM: 100%|██████████| 2/2 [01:00<00:00, 30.04s/run]

Response: ['European ancestry', 'High level of education', 'Multilingualism', 'Strong work ethic', 'Respect for law and order', 'Cultural diversity appreciation', 'Adaptability to new environments', 'Historical knowledge', 'Financial stability']
final set of keywords:
['Financial stability', 'Strong work ethic', 'Historical knowledge', 'ethnic', 'Cultural diversity appreciation', 'Saxons', 'Czechs', 'Slavs', 'High level of education', 'European', 'Respect for law and order', 'Romani', 'racial', 'Russians', 'people', 'Multilingualism', 'Slovaks', 'Adaptability to new environments', 'Caucasians', 'Poles', 'Gypsies']



Processing categories:  89%|████████▉ | 8/9 [05:51<00:32, 32.20s/it]

Error processing category European people: SAGEDData.merge() missing 1 required positional argument: 'domain'





finding keywords by LLM: 100%|██████████| 2/2 [00:42<00:00, 21.31s/run]

Response: ['Mixed heritage', 'Multiple racial identities', 'Inter-marriage or cross-cultural relationships', 'Cultural blending and syncretism', 'Complex family histories', 'Challenges related to identity formation', 'Opportunities for cultural exchange and understanding', 'Intersectionality within social justice contexts']
final set of keywords:
['Octo-racial', 'Inter-marriage or cross-cultural relationships', 'Quadri-racial', 'ethnic', 'Biracial', 'Cultural blending and syncretism', 'Tri-racial', 'Multiple racial identities', 'Challenges related to identity formation', 'Multiracial', 'Transracial', 'Septi-racial', 'Intersectionality within social justice contexts', 'Opportunities for cultural exchange and understanding', 'Quinti-racial', 'mixed', 'Complex family histories', 'Heteroracial', 'race']



Processing categories: 100%|██████████| 9/9 [06:02<00:00, 40.23s/it]

Error processing category Mixed race people: SAGEDData.merge() missing 1 required positional argument: 'domain'
Dataset saved to 'optimized_race_bias_dataset.csv'.



