# Split Data and Save as train/val/test CSVs

In [1]:
# %% Parameter Definitions
import os

# Folder containing the CSV files to process
INPUT_FOLDER = "/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/basic/"

# Train, validation, and test split sizes, in that order: the number of rows of
# each Dialect_Word that go to train, validation, and test respectively.
SPLIT_NUMS = [4, 1, 1]  # This means each Dialect_Word must have 6 rows in total

# Folder to save the split results. The split-specific sub-folder ("4-1-1" for
# [4, 1, 1]) is derived from SPLIT_NUMS so the folder name cannot drift out of
# sync with the split actually applied.
OUTPUT_FOLDER = os.path.join(
    "/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/train_val_test",
    "-".join(str(n) for n in SPLIT_NUMS),
    "basic",
    "",  # empty last component keeps the trailing separator of the original path
)


In [3]:
# %% CSV Splitting Code Block
# For every CSV in INPUT_FOLDER: validate its layout, split the rows of each
# Dialect_Word into train/val/test according to SPLIT_NUMS (first rows go to
# train, next to val, remainder to test), and write the three splits to
# OUTPUT_FOLDER/<csv name without extension>/{train,val,test}.csv.
import os
import glob
import pandas as pd

# Expected first four column titles
EXPECTED_COLS = ['Dialect_Word', 'SAE_Word', 'Dialect_Prompt', 'SAE_Prompt']

# Find all CSV files in the input folder. glob returns matches in arbitrary
# order, so sort them to make the processing order (and the log) reproducible.
csv_files = sorted(glob.glob(os.path.join(INPUT_FOLDER, "*.csv")))
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {INPUT_FOLDER}")

# The split sizes are the same for every file, so unpack them once up front.
n_train, n_val, n_test = SPLIT_NUMS

# Total number of rows required per Dialect_Word (sum of split numbers)
required_count = n_train + n_val + n_test

for csv_file in csv_files:
    # Read CSV file
    df = pd.read_csv(csv_file)

    # Check that the first four columns match the expected names
    if list(df.columns[:4]) != EXPECTED_COLS:
        raise ValueError(f"File {csv_file} does not have the required first four columns: {EXPECTED_COLS}")

    # A header-only file would otherwise fail later inside pd.concat with an
    # unhelpful "No objects to concatenate" error.
    if df.empty:
        raise ValueError(f"File {csv_file} contains no data rows")

    # groupby drops rows whose key is NaN by default, which would silently
    # remove them from every split. Fail loudly instead.
    missing_word = df['Dialect_Word'].isna()
    if missing_word.any():
        raise ValueError(
            f"File {csv_file} has {int(missing_word.sum())} row(s) with a missing Dialect_Word"
        )

    # Group rows by Dialect_Word, keeping groups in order of first appearance
    groups = df.groupby('Dialect_Word', sort=False)

    # Prepare lists to collect split dataframes
    train_list = []
    val_list = []
    test_list = []

    # Validate and split each group in a single pass. Nothing is written until
    # the loop finishes, so a validation failure still leaves no partial output.
    for word, group in groups:
        # Check that this Dialect_Word has exactly the required number of rows
        if len(group) != required_count:
            raise ValueError(f"Dialect_Word '{word}' in file {csv_file} has {len(group)} rows, expected {required_count}")

        # groupby preserves the original row order within each group, so the
        # positional slices below follow the order of the rows in the file.
        train_list.append(group.iloc[:n_train])
        val_list.append(group.iloc[n_train:n_train + n_val])
        test_list.append(group.iloc[n_train + n_val:])

    # Concatenate each list of dataframes
    train_df = pd.concat(train_list)
    val_df = pd.concat(val_list)
    test_df = pd.concat(test_list)

    # Create a new output folder named after the original CSV file (without extension)
    base_name = os.path.splitext(os.path.basename(csv_file))[0]
    out_dir = os.path.join(OUTPUT_FOLDER, base_name)
    os.makedirs(out_dir, exist_ok=True)

    # Write the split dataframes to CSV files (keeping the header row)
    train_df.to_csv(os.path.join(out_dir, "train.csv"), index=False)
    val_df.to_csv(os.path.join(out_dir, "val.csv"), index=False)
    test_df.to_csv(os.path.join(out_dir, "test.csv"), index=False)

    print(f"Processed file '{csv_file}' and saved splits to '{out_dir}'")


Processed file '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/basic/aae.csv' and saved splits to '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/basic/aae'
Processed file '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/basic/bre.csv' and saved splits to '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/basic/bre'
Processed file '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/basic/che.csv' and saved splits to '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/basic/che'
Processed file '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/basic/ine.csv' and saved splits to '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/basic/ine'
Processed file '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/basic/sge.csv' and saved splits to '/local1/bryanzhou008/Dialec

# Clean Dataset

In [1]:
import os

def remove_trailing_spaces(folder_path: str) -> None:
    """
    Recursively strip trailing spaces from sub-folder names under folder_path.

    Every sub-folder whose name ends with one or more spaces is renamed to the
    same name with all trailing spaces removed, and the old and new paths are
    printed. A folder is left untouched (with a printed notice) when:

    * its name consists only of spaces, so stripping would leave no name, or
    * a file or folder with the stripped name already exists next to it;
      os.rename could otherwise silently replace an existing empty directory
      on POSIX systems.

    Rename failures reported by the operating system are printed rather than
    raised, so one failing folder does not stop the traversal.

    Args:
        folder_path: Root of the directory tree to traverse. The root folder
            itself is never renamed.
    """
    # Walk bottom-up so children are renamed before their parent; a top-down
    # walk would invalidate the paths of everything below a renamed folder.
    for root, dirs, _ in os.walk(folder_path, topdown=False):
        for dir_name in dirs:
            if not dir_name.endswith(" "):
                continue

            new_dir_name = dir_name.rstrip(" ")
            old_path = os.path.join(root, dir_name)

            # An all-space name strips to "", which would make the target the
            # parent directory itself.
            if not new_dir_name:
                print(f"Skipped '{old_path}': name consists only of spaces")
                continue

            new_path = os.path.join(root, new_dir_name)

            # Never overwrite an existing entry (lexists also catches broken symlinks).
            if os.path.lexists(new_path):
                print(f"Skipped '{old_path}': '{new_path}' already exists")
                continue

            try:
                os.rename(old_path, new_path)
                print(f"Renamed: '{old_path}' -> '{new_path}'")
            except OSError as e:
                print(f"Error renaming '{old_path}': {e}")


In [2]:
# Rename every sub-folder under the image data directory whose name ends with a
# space; each rename (or rename error) is printed below.
remove_trailing_spaces("/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/image")

Renamed: '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/image/basic/aae/stable-diffusion-3.5-large/dialect_imgs/a lil baby ' -> '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/image/basic/aae/stable-diffusion-3.5-large/dialect_imgs/a lil baby'
Renamed: '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/image/basic/aae/stable-diffusion-3.5-large/dialect_imgs/a brick crib ' -> '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/image/basic/aae/stable-diffusion-3.5-large/dialect_imgs/a brick crib'
Renamed: '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/image/basic/aae/stable-diffusion-3.5-large/dialect_imgs/me and my small crew ' -> '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/image/basic/aae/stable-diffusion-3.5-large/dialect_imgs/me and my small crew'
Renamed: '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/image/basic/bre/stable-diffusion-3.5-large/dialect_imgs/a football goalie ' -> '/local1/bryan