In [13]:
import csv
import os
import re # For parsing filenames if needed, though stem is often enough
from pathlib import Path

import pandas as pd

print("Libraries imported.")

Libraries imported.


In [15]:
# --- Paths and Filenames ---
# Every path may be overridden with an environment variable so the notebook can
# run on another machine without editing this cell. When the variable is unset
# the original hard-coded value is used, so existing runs behave the same.
CORPUS_CSV_PATH = os.environ.get(
    "SSMT_CORPUS_CSV", "text_corpus_10k_final.csv"
)  # Input corpus CSV
TEXTGRID_DIR_PATH = os.environ.get(
    "SSMT_TEXTGRID_DIR",
    r"C:\Users\vempa\OneDrive\Desktop\ssmt_project_NEW\aligned_output_10k_GT\aligned_output\speaker1",
)  # Folder containing the TextGrid files
SKIPPED_LOG_PATH = os.environ.get(
    "SSMT_SKIPPED_LOG", "skipped_files.log"
)  # Log file generated by the previous script
OUTPUT_CLEANED_CSV_PATH = os.environ.get(
    "SSMT_CLEANED_CSV", "text_corpus_UGCE1_10k_cleaned.csv"
)  # Output for the cleaned corpus

# --- Filename Pattern Configuration ---
# TextGrid files are expected to be named <prefix><zero-padded number>, e.g. file_000003
TEXTGRID_FILE_PREFIX = "file_"
TEXTGRID_NUM_DIGITS = 6 # Number of digits in the filename (e.g., 000003)

# --- Column Names ---
INDEX_COLUMN = "original_index" # Name of the index column in the corpus CSV

print("Configuration set.")

Configuration set.


In [17]:
# Collect the basenames ('file_XXXXXX') of every TextGrid file present in the
# configured directory. The result, `existing_basenames`, is what the corpus
# rows are later matched against.
textgrid_dir = Path(TEXTGRID_DIR_PATH)
existing_basenames = set() # Use a set for fast lookups

if not textgrid_dir.is_dir():
    # Fail loudly instead of printing and carrying on: with an empty set every
    # corpus row would be filtered out and an empty cleaned CSV written.
    raise FileNotFoundError(
        f"Error: TextGrid directory not found at '{TEXTGRID_DIR_PATH}'. Please check the path."
    )

print(f"Scanning TextGrid directory: {textgrid_dir}...")
# Pattern: literal prefix + exactly N ASCII digits + '.textgrid'.
# - re.escape keeps a prefix containing regex metacharacters literal.
# - [0-9] (not \d) mirrors the ASCII digits the basename formatter produces.
# - Only the extension is case-insensitive, via the scoped flag (?i:...);
#   a differently-cased prefix could never equal a generated basename anyway.
file_pattern = re.compile(
    rf"({re.escape(TEXTGRID_FILE_PREFIX)}[0-9]{{{TEXTGRID_NUM_DIGITS}}})\.(?i:textgrid)"
)

count = 0
for item in textgrid_dir.iterdir():
    if not item.is_file():
        continue
    # fullmatch anchors both ends and, unlike '$', rejects a trailing newline
    match = file_pattern.fullmatch(item.name)
    if match:
        base_name = match.group(1) # Get the 'file_XXXXXX' part
        existing_basenames.add(base_name)
        count += 1
print(f"Found {count} files matching the pattern in the directory.")
# print(f"Sample existing basenames: {list(existing_basenames)[:5]}") # Optional: print sample

Scanning TextGrid directory: C:\Users\vempa\OneDrive\Desktop\ssmt_project_NEW\aligned_output_10k_GT\aligned_output\speaker1...
Found 9248 files matching the pattern in the directory.


In [19]:
# Read the log of files the previous step skipped and collect their basenames
# ('file_XXXXXX') into `skipped_basenames`. A missing or unreadable log is
# treated as "nothing was skipped" (best effort), with a message printed.
skipped_log_file = Path(SKIPPED_LOG_PATH)
skipped_basenames = set() # Use a set

if not skipped_log_file.is_file():
    print(f"Warning: Skipped files log not found at '{SKIPPED_LOG_PATH}'. Assuming no files were skipped.")
else:
    print(f"Reading skipped files log: {skipped_log_file}...")
    # A valid entry is the prefix followed by exactly N ASCII digits. Checking
    # only prefix and length would let names like 'file_12ab56' through.
    basename_pattern = re.compile(
        rf"{re.escape(TEXTGRID_FILE_PREFIX)}[0-9]{{{TEXTGRID_NUM_DIGITS}}}"
    )
    try:
        # 'utf-8-sig' reads plain UTF-8 unchanged but also strips a leading BOM;
        # with plain 'utf-8' a BOM stays glued to the first entry, which then
        # fails validation and is silently not excluded.
        with open(skipped_log_file, 'r', encoding='utf-8-sig') as f:
            for line in f:
                filename = line.strip()
                if not filename:
                    continue  # ignore blank lines
                # The logged name may include an extension; .stem removes it
                base_name = Path(filename).stem
                if basename_pattern.fullmatch(base_name):
                    skipped_basenames.add(base_name)
                else:
                    print(f"  Warning: Skipping malformed line in log: {filename}")
        print(f"Found {len(skipped_basenames)} unique basenames in the skipped files log.")
        # print(f"Sample skipped basenames: {list(skipped_basenames)[:5]}") # Optional: print sample
    except Exception as e:
        print(f"Error reading skipped files log '{SKIPPED_LOG_PATH}': {e}")
        print("Proceeding without skipped file information.")
        skipped_basenames = set() # Reset to empty set on error

Reading skipped files log: skipped_files.log...
Found 77 unique basenames in the skipped files log.


In [23]:
# Load the corpus CSV, keep only the rows whose TextGrid file exists in the
# scanned directory and is not listed in the skipped log, and write the result
# to OUTPUT_CLEANED_CSV_PATH. Errors are reported with a message, not raised.
corpus_file = Path(CORPUS_CSV_PATH)
if not corpus_file.is_file():
    print(f"Error: Corpus CSV file not found at '{CORPUS_CSV_PATH}'. Cannot proceed.")
else:
    print(f"\nLoading corpus CSV: {corpus_file}...")
    try:
        df = pd.read_csv(corpus_file)
        print(f"Loaded corpus with {len(df)} rows.")

        if INDEX_COLUMN not in df.columns:
            print(f"Error: Index column '{INDEX_COLUMN}' not found in the CSV.")
            print(f"Available columns are: {list(df.columns)}")
        else:
            # --- Create the expected TextGrid basename for each row ---
            print("Generating expected TextGrid basenames for filtering...")
            # pandas reads an integer column as float64 as soon as one cell is
            # blank, and the 'd' format code rejects floats. Coerce to numbers
            # and only format rows holding a whole number.
            index_values = pd.to_numeric(df[INDEX_COLUMN], errors="coerce")
            has_valid_index = index_values.notna() & (index_values % 1 == 0)
            invalid_count = int((~has_valid_index).sum())
            if invalid_count:
                print(f"  Warning: {invalid_count} rows have a missing or non-integer '{INDEX_COLUMN}' value and will be dropped.")

            # Standalone Series aligned to df's index; df is not modified, so a
            # corpus column that happens to share the name is left intact.
            # Rows without a valid index become NaN and match no basename.
            expected_basename = (
                index_values[has_valid_index]
                .astype("int64")
                .map(lambda idx: f"{TEXTGRID_FILE_PREFIX}{idx:0{TEXTGRID_NUM_DIGITS}d}")
                .reindex(df.index)
            )

            # --- Apply Filters ---
            print("Applying filters...")
            # Condition 1: The expected basename MUST exist in the directory
            mask_exists = expected_basename.isin(existing_basenames)
            # Condition 2: The expected basename must NOT be in the skipped list
            mask_not_skipped = ~expected_basename.isin(skipped_basenames) # Use ~ for NOT

            # Combine masks: Row must satisfy BOTH conditions
            final_mask = mask_exists & mask_not_skipped

            # Apply the combined mask (copy so df_cleaned is independent of df)
            df_cleaned = df[final_mask].copy()

            print(f"\nFiltering results:")
            print(f" - Original rows: {len(df)}")
            print(f" - Rows matching existing TextGrids: {mask_exists.sum()}")
            print(f" - Rows matching non-skipped TextGrids: {mask_not_skipped.sum()}")
            print(f" - Rows kept after filtering (Exist AND Not Skipped): {len(df_cleaned)}")


            # --- Save the cleaned DataFrame ---
            output_file = Path(OUTPUT_CLEANED_CSV_PATH)
            print(f"\nSaving cleaned corpus to: {output_file}...")
            # Quote every field so embedded commas/newlines in text survive
            df_cleaned.to_csv(
                output_file,
                index=False,
                encoding='utf-8',
                quoting=csv.QUOTE_ALL,
                doublequote=True,
                escapechar='\\'
            )
            print("Cleaned corpus saved successfully.")

    except Exception as e:
        print(f"An error occurred during CSV loading or processing: {e}")

print("\nScript finished.")


Loading corpus CSV: text_corpus_10k_final.csv...
Loaded corpus with 10000 rows.
Generating expected TextGrid basenames for filtering...
Applying filters...

Filtering results:
 - Original rows: 10000
 - Rows matching existing TextGrids: 9248
 - Rows matching non-skipped TextGrids: 9923
 - Rows kept after filtering (Exist AND Not Skipped): 9171

Saving cleaned corpus to: text_corpus_UGCE1_10k_cleaned.csv...
Cleaned corpus saved successfully.

Script finished.
