# Extract dialect prompt columns

In [2]:
# ----------------- imports -----------------
from pathlib import Path

import pandas as pd

# ----------------- configurable folder paths -----------------
SRC_DIR  = Path("/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/detailed")
DEST_DIR = Path("/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/detailed_translate")
# -------------------------------------------------------------

# Purpose of this cell: for every CSV in SRC_DIR, write a same-named CSV into
# DEST_DIR that contains only the "Dialect_Prompt" column.

# Create the destination folder (and any missing parents) if it doesn't exist
DEST_DIR.mkdir(parents=True, exist_ok=True)

# Path.glob yields entries in arbitrary (filesystem-dependent) order; sorting
# makes the processing order and the printed log reproducible across runs.
csv_paths = sorted(SRC_DIR.glob("*.csv"))

# An empty source folder would otherwise finish silently and look like success.
if not csv_paths:
    print(f"No *.csv files found in {SRC_DIR}")

for csv_path in csv_paths:
    # dtype=str + keep_default_na=False keep every cell as the literal text
    # found in the file (no numeric inference, no "NA" -> NaN conversion).
    # usecols restricts parsing to the one column that is actually written out;
    # pandas raises ValueError here if the file has no "Dialect_Prompt" column.
    dialect_only = pd.read_csv(
        csv_path,
        dtype=str,
        keep_default_na=False,
        usecols=["Dialect_Prompt"],
    )

    # Save under the same filename inside DEST_DIR
    out_path = DEST_DIR / csv_path.name
    dialect_only.to_csv(out_path, index=False)

    # Report the output path relative to DEST_DIR's parent folder
    print(f"✅ {csv_path.name} → {out_path.relative_to(DEST_DIR.parent)}")


✅ aae.csv → detailed_translate/aae.csv
✅ bre.csv → detailed_translate/bre.csv
✅ che.csv → detailed_translate/che.csv
✅ ine.csv → detailed_translate/ine.csv
✅ sge.csv → detailed_translate/sge.csv


# Check for overlapping prompts between the concise and detailed sets

In [6]:
import os
from collections import defaultdict

import pandas as pd

# Define paths (you can modify these as needed)
CONCISE_DIR = "/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/concise"
DETAILED_DIR = "/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/detailed"

# Purpose of this cell: for each CSV present in CONCISE_DIR, find the
# "Dialect_Prompt" values that also occur in the same-named CSV in DETAILED_DIR.

# Maps a CSV filename to a list of (prompt, row_index) tuples, one per concise
# row whose prompt also appears in the detailed file. Files without any
# overlap get no entry.
overlap_results = defaultdict(list)

# os.listdir returns names in arbitrary order; sort so the report is reproducible.
csv_filenames = sorted(f for f in os.listdir(CONCISE_DIR) if f.endswith(".csv"))

for filename in csv_filenames:
    detailed_path = os.path.join(DETAILED_DIR, filename)

    # A concise file without a detailed counterpart should not abort the whole
    # comparison; report it and move on to the next file.
    if not os.path.isfile(detailed_path):
        print(f"Skipping {filename}: no matching file in {DETAILED_DIR}")
        continue

    # Read the two corresponding CSV files
    concise_df = pd.read_csv(os.path.join(CONCISE_DIR, filename))
    detailed_df = pd.read_csv(detailed_path)

    # Set of non-missing detailed prompts for fast membership lookup
    detailed_prompts = set(detailed_df["Dialect_Prompt"].dropna())

    # Vectorised membership test instead of a row-by-row iterrows() loop.
    # Missing concise prompts never match because NaN was dropped from the set.
    concise_prompts = concise_df["Dialect_Prompt"]
    shared_prompts = concise_prompts[concise_prompts.isin(detailed_prompts)]

    # Only touch the defaultdict when there is something to record, so files
    # without overlaps do not show up as empty sections in the report.
    if not shared_prompts.empty:
        overlap_results[filename].extend(
            (prompt, idx) for idx, prompt in shared_prompts.items()
        )

# Print results aggregated by file name
for fname, matches in overlap_results.items():
    print(f"--- Overlaps in file: {fname} ---")
    for prompt, idx in matches:
        print(f"[Row {idx}] {prompt}")
    print()


--- Overlaps in file: aae.csv ---
[Row 18] a man holla at a girl
[Row 19] a man holla at a woman
[Row 20] a guy holla at a woman
[Row 21] a guy holla at his homie
[Row 22] a holla from the audience
[Row 24] my homie
[Row 27] me and my homie
[Row 28] a homie from childhood
[Row 30] me and my crew
[Row 42] a picture of my bae
[Row 52] a wallet overflowing with dough
[Row 61] a man aiming his piece
[Row 75] a woman wearing grills
[Row 81] a van with light tint
[Row 105] a gat inside a drawer
[Row 121] a hammer with a silencer
[Row 127] some bussin fries
[Row 145] a group picture with the fam
[Row 173] two people with nappiness
[Row 201] two people with dreads

--- Overlaps in file: bre.csv ---
[Row 15] a loo with blue tiles
[Row 19] a plate of bangers and eggs
[Row 48] a pair of purple trousers
[Row 77] a petrol station sign
[Row 102] a school caretaker
[Row 110] a modern electric cooker
[Row 124] a hoarding lit up at night
[Row 126] a bright red hoover in a store
[Row 127] a hoover with 