# Extract dialect prompt columns

In [2]:
# ----------------- configurable folder paths -----------------
from pathlib import Path

SRC_DIR  = Path("/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/detailed")
DEST_DIR = Path("/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/detailed_translate")
# -------------------------------------------------------------

import pandas as pd

# Create the destination folder (and any missing parents) if it doesn’t exist
DEST_DIR.mkdir(parents=True, exist_ok=True)

# Process every *.csv file in the source folder
for csv_path in SRC_DIR.glob("*.csv"):
    # Read the file, keep only the Dialect_Prompt column
    df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
    dialect_only = df[["Dialect_Prompt"]]

    # Save under the same filename inside DEST_DIR
    out_path = DEST_DIR / csv_path.name
    dialect_only.to_csv(out_path, index=False)

    print(f"✅ {csv_path.name} → {out_path.relative_to(DEST_DIR.parent)}")


✅ aae.csv → detailed_translate/aae.csv
✅ bre.csv → detailed_translate/bre.csv
✅ che.csv → detailed_translate/che.csv
✅ ine.csv → detailed_translate/ine.csv
✅ sge.csv → detailed_translate/sge.csv


# Check for Overlap and Duplicates

In [22]:
import os
import pandas as pd
from collections import defaultdict

# Define paths (you can modify these as needed)
CONCISE_DIR = "/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/concise"
DETAILED_DIR = "/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/detailed"

# Dictionary to store results
overlap_results = defaultdict(list)

# List all CSV files in the concise folder
csv_filenames = [f for f in os.listdir(CONCISE_DIR) if f.endswith(".csv")]

# Iterate through each CSV file
for filename in csv_filenames:
    # Read the two corresponding CSV files
    concise_df = pd.read_csv(os.path.join(CONCISE_DIR, filename))
    detailed_df = pd.read_csv(os.path.join(DETAILED_DIR, filename))
    
    # Get the set of "Dialect_Prompt" values from the detailed file for fast lookup
    detailed_prompts = set(detailed_df["Dialect_Prompt"].dropna())
    
    # Iterate through concise "Dialect_Prompt" values and check for overlaps
    for idx, row in concise_df.iterrows():
        prompt = row["Dialect_Prompt"]
        if prompt in detailed_prompts:
            overlap_results[filename].append((prompt, idx))  # store the prompt and its row index

# Print results aggregated by file name
for fname, matches in overlap_results.items():
    print(f"--- Overlaps in file: {fname} ---")
    for prompt, idx in matches:
        print(f"[Row {idx}] {prompt}")
    print()


In [29]:
import os
import pandas as pd
from collections import defaultdict

# --- CONFIGURATION ---
INPUT_DIR = "/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/concise"

# --- SCRIPT ---
duplicate_prompts_by_file = defaultdict(list)

for filename in os.listdir(INPUT_DIR):
    if filename.endswith(".csv"):
        filepath = os.path.join(INPUT_DIR, filename)
        df = pd.read_csv(filepath)

        if "Dialect_Prompt" in df.columns:
            # Identify duplicate prompts
            duplicated = df[df.duplicated(subset=["Dialect_Prompt"], keep=False)]
            if not duplicated.empty:
                duplicate_prompts_by_file[filename] = duplicated["Dialect_Prompt"].unique().tolist()

# --- OUTPUT ---
if duplicate_prompts_by_file:
    for file, prompts in duplicate_prompts_by_file.items():
        print(f"📁 {file} has {len(prompts)} duplicate Dialect_Prompt entries:")
        for prompt in prompts:
            print(f"   - {prompt}")
        print()
else:
    print("✅ No duplicate Dialect_Prompt entries found in any CSV file.")


✅ No duplicate Dialect_Prompt entries found in any CSV file.


In [30]:
import os
import pandas as pd
from collections import defaultdict

# --- CONFIGURATION ---
INPUT_DIR = "/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/detailed"

# --- SCRIPT ---
duplicate_prompts_by_file = defaultdict(list)

for filename in os.listdir(INPUT_DIR):
    if filename.endswith(".csv"):
        filepath = os.path.join(INPUT_DIR, filename)
        df = pd.read_csv(filepath)

        if "Dialect_Prompt" in df.columns:
            # Identify duplicate prompts
            duplicated = df[df.duplicated(subset=["Dialect_Prompt"], keep=False)]
            if not duplicated.empty:
                duplicate_prompts_by_file[filename] = duplicated["Dialect_Prompt"].unique().tolist()

# --- OUTPUT ---
if duplicate_prompts_by_file:
    for file, prompts in duplicate_prompts_by_file.items():
        print(f"📁 {file} has {len(prompts)} duplicate Dialect_Prompt entries:")
        for prompt in prompts:
            print(f"   - {prompt}")
        print()
else:
    print("✅ No duplicate Dialect_Prompt entries found in any CSV file.")


✅ No duplicate Dialect_Prompt entries found in any CSV file.


# Gather Rewritten Dialect Prompt

In [3]:
import pandas as pd
from pathlib import Path

# Define base paths
csv_dir = Path("/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/rewrite_concise")
prompt_base = Path("/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/image/concise")

# Get all CSV files
csv_files = sorted(csv_dir.glob("*.csv"))

for csv_file in csv_files:
    dialect_name = csv_file.stem  # e.g., "aae", "sge"
    df = pd.read_csv(csv_file)

    updated_prompts = []
    for original_prompt in df["Dialect_Prompt"]:
        prompt_path = (
            prompt_base / dialect_name / "dalle3" / "dialect_imgs" / original_prompt / "revised_prompt.txt"
        )
        if prompt_path.exists():
            new_prompt = prompt_path.read_text(encoding="utf-8").strip()
        else:
            new_prompt = original_prompt  # fallback to original if file not found
        updated_prompts.append(new_prompt)

    df["Dialect_Prompt"] = updated_prompts
    df.to_csv(csv_file, index=False)

"Done updating all CSVs with revised prompts."



OSError: [Errno 36] File name too long: '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/image/concise/aae/dalle3/dialect_imgs/A little baby of Hispanic descent, with rosy cheeks, sparkling hazel eyes and curly black hair, swaddled in a vibrant yellow blanket. The baby is laying on a soft, plush, azure baby mat, with colorful toys scattered around. An environment filled with warmth, safety, and joy./revised_prompt.txt'

In [None]:
import pandas as pd
from pathlib import Path

# Define base paths
csv_dir = Path("/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/rewrite_detailed")
prompt_base = Path("/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/image/detailed")

# Get all CSV files
csv_files = sorted(csv_dir.glob("*.csv"))

for csv_file in csv_files:
    dialect_name = csv_file.stem  # e.g., "aae", "sge"
    df = pd.read_csv(csv_file)

    updated_prompts = []
    for original_prompt in df["Dialect_Prompt"]:
        prompt_path = (
            prompt_base / dialect_name / "dalle3" / "dialect_imgs" / original_prompt / "revised_prompt.txt"
        )
        if prompt_path.exists():
            new_prompt = prompt_path.read_text(encoding="utf-8").strip()
        else:
            new_prompt = original_prompt  # fallback to original if file not found
        updated_prompts.append(new_prompt)

    df["Dialect_Prompt"] = updated_prompts
    df.to_csv(csv_file, index=False)

"Done updating all CSVs with revised prompts."


✅ All CSV files updated with cleaned revised prompts (no quotes).


# Gather Rewritten SAE Prompt

In [None]:
import pandas as pd
from pathlib import Path

# Define base paths
csv_dir = Path("/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/rewrite_concise")
prompt_base = Path("/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/image/concise")

# Get all CSV files
csv_files = sorted(csv_dir.glob("*.csv"))

for csv_file in csv_files:
    dialect_name = csv_file.stem  # e.g., "aae", "sge"
    df = pd.read_csv(csv_file)

    updated_prompts = []
    for original_prompt in df["SAE_Prompt"]:
        prompt_path = (
            prompt_base / dialect_name / "dalle3" / "sae_imgs" / original_prompt / "revised_prompt.txt"
        )
        if prompt_path.exists():
            new_prompt = prompt_path.read_text(encoding="utf-8").strip()
        else:
            new_prompt = original_prompt  # fallback to original if file not found
        updated_prompts.append(new_prompt)

    df["SAE_Prompt"] = updated_prompts
    df.to_csv(csv_file, index=False)

"Done updating all CSVs with revised prompts."



'Done updating all CSVs with revised prompts.'

In [None]:
import pandas as pd
from pathlib import Path

# Define base paths
csv_dir = Path("/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/rewrite_detailed")
prompt_base = Path("/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/image/detailed")

# Get all CSV files
csv_files = sorted(csv_dir.glob("*.csv"))

for csv_file in csv_files:
    dialect_name = csv_file.stem  # e.g., "aae", "sge"
    df = pd.read_csv(csv_file)

    updated_prompts = []
    for original_prompt in df["SAE_Prompt"]:
        prompt_path = (
            prompt_base / dialect_name / "dalle3" / "sae_imgs" / original_prompt / "revised_prompt.txt"
        )
        if prompt_path.exists():
            new_prompt = prompt_path.read_text(encoding="utf-8").strip()
        else:
            new_prompt = original_prompt  # fallback to original if file not found
        updated_prompts.append(new_prompt)

    df["SAE_Prompt"] = updated_prompts
    df.to_csv(csv_file, index=False)

"Done updating all CSVs with revised prompts."

