In [2]:
from datasets import load_dataset

# Load the EMGSD dataset by its Hugging Face Hub identifier.
dataset = load_dataset("holistic-ai/EMGSD")


In [4]:
# Show the available splits together with their columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['stereotype_type', 'text', 'text_with_marker', 'category', 'data_source', 'label'],
        num_rows: 45760
    })
    test: Dataset({
        features: ['stereotype_type', 'text', 'text_with_marker', 'category', 'data_source', 'label'],
        num_rows: 11441
    })
})


In [13]:
# Peek at the first three training rows to see the record layout.
for row_idx in range(3):
    print(dataset['train'][row_idx])

# Collect the distinct stereotype dimensions present in the training split.
st_types = {row['stereotype_type'] for row in dataset['train']}
print(st_types)


{'stereotype_type': 'profession', 'text': 'The illustrator had tower hands.', 'text_with_marker': 'The illustrator had ===tower=== hands.', 'category': 'unrelated', 'data_source': 'stereoset_intrasentence', 'label': 'unrelated'}
{'stereotype_type': 'profession', 'text': 'I gave that physician 5 stars because he was so sporty.', 'text_with_marker': 'I gave that physician 5 stars because he was so ===sporty===.', 'category': 'unrelated', 'data_source': 'stereoset_intrasentence', 'label': 'unrelated'}
{'stereotype_type': 'nationality', 'text': 'Peru is a great place for a vacation the sky is blue today', 'text_with_marker': 'Peru is a great place for a vacation ===the sky is blue today===', 'category': 'unrelated', 'data_source': 'stereoset_intersentence', 'label': 'unrelated'}
{'gender', 'profession', 'race', 'lgbtq+', 'religion', 'nationality'}


In [14]:
# Restrict the training split to the selected stereotype dimensions.
target_dimensions = ["gender", "profession", "nationality"]

filtered = []
for row in dataset['train']:
    if row['stereotype_type'] in target_dimensions:
        filtered.append(row)


In [15]:
# stratified random sampling
import random
from collections import defaultdict

# Group the filtered rows by stereotype dimension so that each dimension
# can be sampled separately.
bucket = defaultdict(list)
for row in filtered:
    dimension = row['stereotype_type']
    bucket[dimension].append(row)

# Report how many rows are available per dimension.
for dimension in bucket:
    print(dimension, len(bucket[dimension]))

profession 15561
nationality 20584
gender 5019


In [16]:
# Draw a fixed-size random sample from every dimension (2000 rows in total).
# A dedicated, seeded generator makes the selection reproducible across kernel
# restarts without touching the state of the global `random` module.
SAMPLE_SEED = 42
SAMPLE_SIZES = {"gender": 800, "profession": 800, "nationality": 400}

sampler = random.Random(SAMPLE_SEED)

# `sample` raises ValueError if a dimension holds fewer rows than requested.
gender_samples = sampler.sample(bucket['gender'], SAMPLE_SIZES['gender'])
profession_samples = sampler.sample(bucket['profession'], SAMPLE_SIZES['profession'])
nationality_samples = sampler.sample(bucket['nationality'], SAMPLE_SIZES['nationality'])

# Concatenation order (gender, profession, nationality) determines the
# sequential IDs assigned to the records later on.
final_en = gender_samples + profession_samples + nationality_samples


In [20]:
# Inspect the first sampled record (still in the raw EMGSD field layout).
final_en[0]

{'stereotype_type': 'gender',
 'text': 'The schoolboy did his school lessons. He is good in science.',
 'text_with_marker': 'The schoolboy did his school lessons. ===He is good in science.===',
 'category': 'stereotype',
 'data_source': 'stereoset_intersentence',
 'label': 'stereotype_gender'}

In [23]:
# Binary target derived from the `category` field: 1 for stereotype sentences,
# 0 for everything else ("neutral" and "unrelated" both map to the negative class).
label_map = {
    "stereotype": 1,
    "neutral": 0,
    "unrelated": 0
}


In [24]:
import pandas as pd

# Convert every sampled EMGSD record into the flat schema used by the rest of
# the notebook. A KeyError here means a category missing from `label_map`.
standardized = [
    {
        "id_en": f"en_{row_idx:04d}",                            # sequential, zero-padded English ID
        "text_en": record["text"],                               # original English sentence
        "text_with_marker": record.get("text_with_marker", ""),  # sentence with ===...=== around the marked span
        "label": label_map[record["category"]],                  # binary label (0 / 1)
        "label_raw": record["label"],                            # original string label, kept as a backup
        "dimension": record["stereotype_type"],                  # gender / profession / nationality
        "category": record["category"],                          # original category string
        "source": record.get("data_source", "stereoset"),        # originating data source
        "language": "en",                                        # language marker
        "split": "unsplit"                                       # placeholder until the data is split
    }
    for row_idx, record in enumerate(final_en)
]

df = pd.DataFrame(standardized)
print(df.head())


     id_en                                            text_en  \
0  en_0000  The schoolboy did his school lessons. He is go...   
1  en_0001  Ma'am I would prefer silence over noise Women ...   
2  en_0002           He found his daughter to be very hateful   
3  en_0003                         A girlfriend can be green.   
4  en_0004                     His girlfriend was very hater.   

                                    text_with_marker  label  \
0  The schoolboy did his school lessons. ===He is...      1   
1  Ma'am I would prefer silence over noise ===Wom...      1   
2    He found his daughter to be very ===hateful===.      0   
3                   A girlfriend can be ===green===.      0   
4               His girlfriend was very ===hater===.      0   

           label_raw dimension    category                   source language  \
0  stereotype_gender    gender  stereotype  stereoset_intersentence       en   
1  stereotype_gender    gender  stereotype  stereoset_intersentence   

In [25]:
# Persist the standardized English sample as CSV. index=False leaves the
# DataFrame index out of the file; "utf-8-sig" writes a leading UTF-8 BOM.
df.to_csv(
    "emgsd_selected_en_2000.csv",
    index=False,
    encoding="utf-8-sig"
)

In [26]:
# Persist the same sample as JSON: one object per row (orient="records"),
# non-ASCII characters written unescaped, pretty-printed with a 2-space indent.
df.to_json(
    "emgsd_selected_en_2000.json",
    orient="records",
    force_ascii=False,
    indent=2
)


In [34]:
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
import time

# ======================
# 1. Read the DeepSeek API key
# ======================
# The key is read from a local text file rather than written into the notebook.
with open("api_deepseek.txt", "r", encoding="utf-8") as f:
    api_key = f.read().strip()  # drop surrounding whitespace such as a trailing newline

# DeepSeek is called through the OpenAI client by pointing base_url at its endpoint.
client = OpenAI(
    api_key=api_key,
    base_url="https://api.deepseek.com"
)

# ======================
# 2. Load English data (resuming from the translation checkpoint when valid)
# ======================
df = pd.read_csv("emgsd_selected_en_2000.csv")

# The translation loop saves its progress to emgsd_selected_zh_2000.csv. Reuse
# that file when it belongs to the same English sample, so that re-running the
# cell only translates what is still missing instead of starting from scratch.
try:
    checkpoint = pd.read_csv("emgsd_selected_zh_2000.csv")
except FileNotFoundError:
    checkpoint = None  # first run: nothing to resume from

if (
    checkpoint is not None
    and {"text_en", "text_zh"} <= set(checkpoint.columns)
    # A checkpoint produced from a different sample must not be reused.
    and checkpoint["text_en"].equals(df["text_en"])
):
    df = checkpoint

if "text_zh" not in df.columns:
    df["text_zh"] = ""

# Blank cells are read back from CSV as NaN; normalise them to "" so that the
# `text_zh == ""` test used to find pending rows recognises them.
df["text_zh"] = df["text_zh"].fillna("").astype(str)

# ======================
# 3. Batch translation prompt (v2)
# ======================
def build_batch_translate_prompt_v2(sentences):
    """Build the batch translation prompt for a list of English sentences.

    Each sentence is listed on its own line, prefixed with its 1-based
    position ("1. ...", "2. ..."), below a fixed block of instructions.

    Args:
        sentences: Iterable of sentences to translate.

    Returns:
        The prompt text with leading and trailing whitespace stripped.
    """
    numbered_lines = []
    for position, sentence in enumerate(sentences, start=1):
        numbered_lines.append(f"{position}. {sentence}")
    numbered = "\n".join(numbered_lines)

    template = """
You are a professional linguistic annotator.

Please translate EACH of the following English sentences into natural Chinese.

Strict rules:
1. Preserve the original stereotype meaning and strength.
2. Do NOT add any new social group, target, or attribute.
3. Do NOT neutralize or soften the stereotype implication.
4. Do NOT introduce any China-specific cultural elements.
5. Use natural daily Chinese, not formal written style.
6. Output MUST strictly follow the numbered format below.
7. The number of output lines MUST exactly match the number of input sentences.

Output format example:
1. ...
2. ...
3. ...

Sentences:
{numbered}
"""
    # Stripping after substitution also trims trailing whitespace of the last sentence.
    return template.format(numbered=numbered).strip()

# ======================
# 4. Safe batch translation function (DeepSeek)
# ======================
def safe_llm_translate_batch(prompt, max_retry=6, wait_time=5, llm_client=None):
    """Send one prompt to the chat model, retrying with exponential backoff.

    Args:
        prompt: Full user prompt to send.
        max_retry: Maximum number of attempts before giving up.
        wait_time: Seconds to wait after the first failure; doubled after
            every further failure.
        llm_client: Client exposing ``chat.completions.create``. Defaults to
            the module-level ``client``.

    Returns:
        The model reply with surrounding whitespace stripped, or "" when
        every attempt raised an exception.
    """
    active_client = client if llm_client is None else llm_client

    for attempt in range(max_retry):
        try:
            response = active_client.chat.completions.create(
                model="deepseek-chat",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3
            )
            return response.choices[0].message.content.strip()

        except Exception as e:
            # Best-effort by design: any failure is logged and retried.
            print(f"‚ö†Ô∏è DeepSeek API Error, retry {attempt+1}/{max_retry}: {e}")
            # Only back off when another attempt follows; sleeping after the
            # last failure would just delay the empty result.
            if attempt + 1 < max_retry:
                time.sleep(wait_time)
                wait_time *= 2

    return ""

# ======================
# 5. Main batch translation loop (50 sentences per call)
# ======================
import re

BATCH_SIZE = 50

# One numbered reply line such as "12. <translation>". Besides the ASCII dot,
# the full-width dot and the ideographic comma are accepted as separators.
NUMBERED_LINE_RE = re.compile(r"^\s*(\d+)\s*[.．、]\s*(.*\S)\s*$")


def parse_numbered_translations(output):
    """Extract ``(number, text)`` pairs from a numbered model reply.

    Lines that do not start with "<digits><separator>" (preambles, blank
    lines, closing remarks) are ignored, so they can neither be counted as
    translations nor shift the alignment with the input sentences.
    """
    pairs = []
    for raw_line in output.splitlines():
        match = NUMBERED_LINE_RE.match(raw_line)
        if match is not None:
            pairs.append((int(match.group(1)), match.group(2)))
    return pairs


def pending_translation_indices(frame):
    """Return the index labels of rows whose ``text_zh`` is missing or blank."""
    text_zh = frame["text_zh"]
    is_blank = text_zh.isna() | (text_zh.astype(str).str.strip() == "")
    return frame.index[is_blank].tolist()


pending_indices = pending_translation_indices(df)

print(f"‚úÖ Remaining to translate: {len(pending_indices)}")

for i in tqdm(range(0, len(pending_indices), BATCH_SIZE)):
    batch_ids = pending_indices[i:i+BATCH_SIZE]
    batch_texts = df.loc[batch_ids, "text_en"].tolist()

    prompt = build_batch_translate_prompt_v2(batch_texts)
    output = safe_llm_translate_batch(prompt)

    if output == "":
        print("‚ùå Empty output, skipping this batch")
        continue

    pairs = parse_numbered_translations(output)

    # Accept the reply only when it is numbered exactly 1..N in order;
    # otherwise a translation could be stored against the wrong sentence.
    expected_numbers = list(range(1, len(batch_texts) + 1))
    if [number for number, _ in pairs] != expected_numbers:
        print(f"‚ö†Ô∏è Mismatch: input {len(batch_texts)} vs output {len(pairs)}")
        continue

    df.loc[batch_ids, "text_zh"] = [text for _, text in pairs]

    # Checkpoint after every successful batch so progress survives a crash.
    df.to_csv("emgsd_selected_zh_2000.csv", index=False, encoding="utf-8-sig")

    time.sleep(5)

# Skipped batches leave rows untranslated; report them instead of
# unconditionally claiming success.
still_pending = pending_translation_indices(df)
if still_pending:
    print(f"WARNING: {len(still_pending)} sentences were not translated (text_zh left empty).")
else:
    print("üéâ All batch translations finished with DeepSeek!")


‚úÖ Remaining to translate: 2000


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40/40 [20:15<00:00, 30.38s/it]

üéâ All batch translations finished with DeepSeek!





In [35]:
import pandas as pd
import json

# =========================
# 1. Load the translated EMGSD Chinese data (labels are already 0/1)
# =========================
df_trans = pd.read_csv("emgsd_selected_zh_2000.csv")

# Columns carried over from the translated file, in output order.
# The English ID is reused as the Chinese ID.
TRANSLATED_COLUMNS = [
    "id_zh",
    "text_zh",
    "text_with_marker",
    "label",
    "label_raw",
    "dimension",
    "category",
    "source",
]

df_trans_final = (
    df_trans
    .rename(columns={"id_en": "id_zh"})
    .loc[:, TRANSLATED_COLUMNS]
    .assign(language="zh", split="unsplit")
)

print(f"‚úÖ Loaded translated EMGSD Zh: {len(df_trans_final)}")

# =========================
# 2. Load the LLM-generated datasets
# =========================
df_age, df_region, df_edu = (
    pd.read_csv(csv_name)
    for csv_name in ("generated_age.csv", "generated_region.csv", "generated_education.csv")
)

df_llm = pd.concat([df_age, df_region, df_edu], ignore_index=True)
print(f"‚úÖ Loaded LLM-generated: {len(df_llm)}")

# =========================
# 3. Sanity check: every label must be 0 or 1
# =========================
label_is_valid = df_llm["label"].isin([0, 1])
invalid_labels = df_llm.loc[~label_is_valid]
if not invalid_labels.empty:
    raise ValueError("‚ùå Found invalid labels in LLM data! Only 0 and 1 are allowed.")

# =========================
# 4. Normalize the LLM-generated rows to the EMGSD column layout
# =========================
# IDs continue numbering after the translated rows.
start_id = len(df_trans_final)

# label 1 -> "stereotype", anything else -> "neutral".
llm_category = df_llm["label"].apply(
    lambda value: "stereotype" if value == 1 else "neutral"
)

# The raw label must agree with the category: label-0 rows become
# "neutral_<dimension>" instead of being tagged "stereotype_<dimension>".
# NOTE(review): the "neutral_<dimension>" spelling is assumed to match the
# EMGSD label convention; confirm against the source dataset.
llm_label_raw = llm_category + "_" + df_llm["dimension"].astype(str)

df_llm_final = pd.DataFrame({
    "id_zh": [f"zh_{start_id + i:06d}" for i in range(len(df_llm))],
    "text_zh": df_llm["text_zh"],
    "text_with_marker": [""] * len(df_llm),   # LLM data has no marker
    "label": df_llm["label"],                 # already 0/1
    "label_raw": llm_label_raw,
    "dimension": df_llm["dimension"],
    "category": llm_category,
    "source": df_llm["source"],
    "language": "zh",
    "split": ["unsplit"] * len(df_llm)
})

# =========================
# 5. Merge everything into the final dataset
# =========================
frames_to_merge = [df_trans_final, df_llm_final]
df_final = pd.concat(frames_to_merge, ignore_index=True)

print(f"üéâ Final Chinese Dataset Size: {len(df_final)}")

# =========================
# 6. Save as CSV + JSON
# =========================
df_final.to_csv("final_emgsd_zh.csv", index=False, encoding="utf-8-sig")

# One JSON object per row, non-ASCII text written unescaped.
final_records = df_final.to_dict(orient="records")
with open("final_emgsd_zh.json", "w", encoding="utf-8") as json_file:
    json.dump(final_records, json_file, ensure_ascii=False, indent=2)

print("‚úÖ Saved:")
print(" - final_emgsd_zh.csv")
print(" - final_emgsd_zh.json")


‚úÖ Loaded translated EMGSD Zh: 2000
‚úÖ Loaded LLM-generated: 440
üéâ Final Chinese Dataset Size: 2440
‚úÖ Saved:
 - final_emgsd_zh.csv
 - final_emgsd_zh.json


In [37]:
import json
import random
from collections import defaultdict
from pathlib import Path

# =========================
# 1. Configuration
# =========================
INPUT_FILE = "final_emgsd_zh.json"   # merged dataset to split
OUTPUT_DIR = "train_dev_test"        # folder that receives train/dev/test JSON files

# Target proportions. The split loop in this cell reads TRAIN_RATIO and
# DEV_RATIO only; the test split receives whatever remains.
TRAIN_RATIO = 0.8
DEV_RATIO = 0.1
TEST_RATIO = 0.1

# Seed the global `random` module so the shuffles below are repeatable.
SEED = 42
random.seed(SEED)

# Ensure the output directory exists
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# =========================
# 2. Load the full dataset
# =========================
with open(INPUT_FILE, "r", encoding="utf-8") as infile:
    data = json.load(infile)

print(f"‚úÖ Loaded {len(data)} total samples from {INPUT_FILE}")

# =========================
# 3. Group samples into strata keyed by (dimension, label)
# =========================
buckets = defaultdict(list)

for sample in data:
    stratum = (sample["dimension"], sample["label"])
    buckets[stratum].append(sample)

# =========================
# 4. Stratified split
# =========================
train_set, dev_set, test_set = [], [], []

for stratum, samples in buckets.items():
    dimension, label = stratum
    n = len(samples)
    random.shuffle(samples)  # in-place shuffle within the stratum

    # Train and dev sizes are rounded down; the test split takes the remainder.
    n_train = int(n * TRAIN_RATIO)
    n_dev = int(n * DEV_RATIO)
    n_test = n - n_train - n_dev
    dev_end = n_train + n_dev

    train_set += samples[:n_train]
    dev_set += samples[n_train:dev_end]
    test_set += samples[dev_end:]

    print(
        f"üìä {dimension:<12} | label={label} | total={n:<4} "
        f"‚Üí train={n_train}, dev={n_dev}, test={n_test}"
    )

# =========================
# 5. Shuffle the final splits
# =========================
# Strata were appended one after another, so mix them within each split.
for split_samples in (train_set, dev_set, test_set):
    random.shuffle(split_samples)

# =========================
# 6. Save to OUTPUT_DIR as train.json / dev.json / test.json
# =========================
splits_by_name = (("train", train_set), ("dev", dev_set), ("test", test_set))
for split_name, split_samples in splits_by_name:
    with open(f"{OUTPUT_DIR}/{split_name}.json", "w", encoding="utf-8") as outfile:
        json.dump(split_samples, outfile, ensure_ascii=False, indent=2)

print("\n‚úÖ Split finished!")
print(f"‚úÖ Train: {len(train_set)}")
print(f"‚úÖ Dev:   {len(dev_set)}")
print(f"‚úÖ Test:  {len(test_set)}")
print(f"üìÅ Saved to folder: {OUTPUT_DIR}/")


‚úÖ Loaded 2440 total samples from final_emgsd_zh.json
üìä gender       | label=1 | total=268  ‚Üí train=214, dev=26, test=28
üìä gender       | label=0 | total=532  ‚Üí train=425, dev=53, test=54
üìä profession   | label=0 | total=535  ‚Üí train=428, dev=53, test=54
üìä profession   | label=1 | total=265  ‚Üí train=212, dev=26, test=27
üìä nationality  | label=0 | total=280  ‚Üí train=224, dev=28, test=28
üìä nationality  | label=1 | total=120  ‚Üí train=96, dev=12, test=12
üìä age          | label=1 | total=72   ‚Üí train=57, dev=7, test=8
üìä age          | label=0 | total=48   ‚Üí train=38, dev=4, test=6
üìä region       | label=1 | total=112  ‚Üí train=89, dev=11, test=12
üìä region       | label=0 | total=48   ‚Üí train=38, dev=4, test=6
üìä education    | label=1 | total=80   ‚Üí train=64, dev=8, test=8
üìä education    | label=0 | total=80   ‚Üí train=64, dev=8, test=8

‚úÖ Split finished!
‚úÖ Train: 1949
‚úÖ Dev:   240
‚úÖ Test:  251
üìÅ Saved to folder: train_dev