In [1]:
!pip install textgrid

Collecting textgrid
  Downloading TextGrid-1.6.1.tar.gz (9.4 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: textgrid
  Building wheel for textgrid (setup.py): started
  Building wheel for textgrid (setup.py): finished with status 'done'
  Created wheel for textgrid: filename=TextGrid-1.6.1-py3-none-any.whl size=10153 sha256=712648e98c1c98b929dce6f4f2c9658427ed98e8e5ee43282d05bc795662dd0a
  Stored in directory: c:\users\vempa\appdata\local\pip\cache\wheels\ce\86\7b\5766bd19fa4b4554667dd186e614b5a438ab14eec9c5a3642a
Successfully built textgrid
Installing collected packages: textgrid
Successfully installed textgrid-1.6.1


In [11]:
import os
from pathlib import Path
import textgrid # Uses the textgrid.py library
import re # For more robust filename checking

# --- Configuration ---
# Names of the TextGrid tiers to read; change them if your alignments use other names.
WORD_TIER_NAME: str = "words"
PHONE_TIER_NAME: str = "phones"

# Token written between the phoneme groups of consecutive words.
WORD_BOUNDARY_MARKER: str = "<WB>"

# Phone-tier labels dropped from the output (e.g. noise / silence markers).
# Empty labels are ignored anyway, so only non-empty symbols need listing here.
EXCLUDE_PHONES: set = {"spn"}

# Text encoding of the output file(s).
OUTPUT_ENCODING: str = "utf-8"

# --- Missing File Check Configuration ---
# Expected files are named <prefix><zero-padded number>, e.g. file_000003.
MISSING_FILE_PREFIX: str = "file_"
MISSING_FILE_START_NUM: int = 3     # first expected number (inclusive)
MISSING_FILE_END_NUM: int = 9999    # last expected number (inclusive)
MISSING_FILE_NUM_DIGITS: int = 6    # width of the zero-padded number
# --- End Configuration ---


print("Configuration loaded.")

Configuration loaded.


In [13]:
def find_and_print_missing_files(input_path: Path):
    """
    Print which expected TextGrid files are absent from ``input_path``.

    The expected base names are ``MISSING_FILE_PREFIX`` followed by every number
    from ``MISSING_FILE_START_NUM`` to ``MISSING_FILE_END_NUM`` (both inclusive),
    zero-padded to ``MISSING_FILE_NUM_DIGITS`` digits. A base name counts as
    present when the directory contains a regular file named
    ``<base name>.textgrid`` (matched case-insensitively). Sub-directories are
    not searched.

    Args:
        input_path: Directory to inspect.

    Returns:
        None. The report is written to stdout; at most the first 100 missing
        names are listed, followed by a count of the remainder. If
        ``input_path`` is not a directory an error line is printed instead.
    """
    print("\n--- Missing File Check ---")
    if not input_path.is_dir():
        print(f"Error: Input path '{input_path}' is not a valid directory for checking.")
        print("--- End Missing File Check ---\n")
        return

    def basename_for(number: int) -> str:
        # Prefix + number padded with leading zeros to the configured width.
        return f"{MISSING_FILE_PREFIX}{number:0{MISSING_FILE_NUM_DIGITS}d}"

    expected_basenames = {
        basename_for(number)
        for number in range(MISSING_FILE_START_NUM, MISSING_FILE_END_NUM + 1)
    }

    print(f"Expecting {len(expected_basenames)} files matching pattern '{MISSING_FILE_PREFIX}{'X'*MISSING_FILE_NUM_DIGITS}'")
    print(f"Range: {basename_for(MISSING_FILE_START_NUM)} to {basename_for(MISSING_FILE_END_NUM)}")

    # The prefix is escaped so that regex metacharacters in it (".", "(", "+", ...)
    # are matched literally. IGNORECASE already covers every casing of the
    # extension, and fullmatch() anchors both ends of the file name.
    file_pattern = re.compile(
        rf"{re.escape(MISSING_FILE_PREFIX)}(\d{{{MISSING_FILE_NUM_DIGITS}}})\.textgrid",
        re.IGNORECASE,
    )

    actual_basenames = set()
    for item in input_path.iterdir():
        if not item.is_file():
            continue
        match = file_pattern.fullmatch(item.name)
        if match:
            # Rebuild the canonical base name (configured prefix + matched digits)
            # so differently-cased file names compare equal to the expected set.
            actual_basenames.add(f"{MISSING_FILE_PREFIX}{match.group(1)}")

    missing_basenames = sorted(expected_basenames - actual_basenames)

    if not missing_basenames:
        print("Result: No missing files found in the specified range and pattern.")
    else:
        print(f"Result: Found {len(missing_basenames)} missing files:")
        # Cap the listing so a mostly-empty folder does not flood the output.
        max_print = 100
        for missing_name in missing_basenames[:max_print]:
            print(f"  - {missing_name}")
        if len(missing_basenames) > max_print:
            print(f"  - ... (and {len(missing_basenames) - max_print} more)")

    print("--- End Missing File Check ---\n")

print("Missing file check function defined.")

Missing file check function defined.


In [15]:
def extract_phoneme_sequence(tg_path: Path) -> str:
    """
    Build the word-delimited phoneme string for one TextGrid file.

    Every interval of the word tier (``WORD_TIER_NAME``) whose label is not
    blank collects the labels of the phone-tier (``PHONE_TIER_NAME``) intervals
    whose temporal midpoint lies in ``[word start, word end)``. Labels are
    stripped of surrounding whitespace; blank labels and labels listed in
    ``EXCLUDE_PHONES`` are dropped. The phones of one word are joined with
    single spaces, and the word groups are joined with
    ``" <WORD_BOUNDARY_MARKER> "``. Words that end up with no phones are omitted.

    Args:
        tg_path: Location of the TextGrid file to read.

    Returns:
        The phoneme sequence, or ``""`` when the file cannot be loaded (an
        error line is printed), when either tier is missing or is not an
        ``IntervalTier``, or when no word yields any phone.
    """
    try:
        grid = textgrid.TextGrid.fromFile(str(tg_path))
    except Exception as e:
        print(f"Error loading TextGrid file {tg_path.name}: {e}")
        return ""

    # Tier names are compared case-sensitively; if a name occurs more than
    # once, the last tier carrying it is the one used.
    word_tier = None
    phone_tier = None
    for candidate in grid:
        if candidate.name == WORD_TIER_NAME:
            word_tier = candidate
        elif candidate.name == PHONE_TIER_NAME:
            phone_tier = candidate

    # Both tiers must exist and be interval tiers; otherwise give up silently.
    both_usable = (
        isinstance(word_tier, textgrid.IntervalTier)
        and isinstance(phone_tier, textgrid.IntervalTier)
    )
    if not both_usable:
        return ""

    groups = []  # one space-joined phone string per word that has phones

    for word in word_tier:
        # Blank word labels mark silence / padding: nothing to emit for them.
        if not word.mark.strip():
            continue

        start = word.minTime
        end = word.maxTime
        phones_of_word = []

        for phone in phone_tier:
            label = phone.mark.strip()

            # A phone belongs to the word that contains its midpoint, which
            # tolerates small disagreements between word and phone boundaries.
            midpoint = (phone.minTime + phone.maxTime) / 2
            # Second clause: a zero-length word still claims a zero-length
            # phone located at exactly the same instant.
            inside = (start <= midpoint < end) or (
                start == end == phone.minTime == phone.maxTime
            )

            if not inside or not label or label in EXCLUDE_PHONES:
                continue
            phones_of_word.append(label)

        if phones_of_word:
            groups.append(" ".join(phones_of_word))

    # Joining an empty list yields "", which is the "nothing found" result.
    return f" {WORD_BOUNDARY_MARKER} ".join(groups)

print("Helper function defined.")

Helper function defined.


In [23]:
# Cell 5: batch-process a folder of TextGrid files (replaces the earlier version of this cell)

def process_folder(input_folder_str: str, output_file_str: str, skipped_log_file_str: str = None):
    """
    Convert every matching TextGrid in a folder into one phoneme line each.

    Steps, in order:
      1. Run ``find_and_print_missing_files`` on the input folder.
      2. Create the parent directories of the output file (and of the skipped
         log, if requested).
      3. Collect the files named ``<MISSING_FILE_PREFIX><digits>.textgrid``
         (extension and prefix matched case-insensitively, exactly
         ``MISSING_FILE_NUM_DIGITS`` digits) and sort them by their number.
      4. Only if at least one file was found, open the output file for writing
         and append one line per file for which ``extract_phoneme_sequence``
         returns a non-empty string. Files yielding ``""`` are counted as
         skipped and produce no output line.
      5. Print a summary and, if requested, write the skipped file names to the
         log file.

    The output file is deliberately not opened until step 4, so a run that
    finds nothing to process leaves an existing output file untouched.

    Args:
        input_folder_str: Path string to the folder containing TextGrid files.
        output_file_str: Path string to the file where the sequences are written.
        skipped_log_file_str: Optional path string to a file that receives the
            names of skipped TextGrids, one per line. ``None`` or an empty
            string disables the log file.

    Returns:
        None. All results are reported through stdout and the written files.
        Exceptions raised while listing, reading, or writing are caught and
        printed; the summary is then printed for whatever was done so far.
    """
    input_path = Path(input_folder_str)
    output_path = Path(output_file_str)
    skipped_log_path = Path(skipped_log_file_str) if skipped_log_file_str else None

    # Report gaps in the expected numbering before doing any work.
    find_and_print_missing_files(input_path)

    if not input_path.is_dir():
        print(f"Error: Input path '{input_folder_str}' is not a valid directory. Aborting processing.")
        return

    # Ensure the destination directories exist.
    try:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        if skipped_log_path:
            skipped_log_path.parent.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        print(f"Could not create output directory {output_path.parent} or {skipped_log_path.parent if skipped_log_path else ''}. Error: {e}")
        print("Please ensure the path to the output file(s) is valid. Aborting processing.")
        return

    processed_count = 0
    skipped_files_list = []  # names of files that produced no sequence

    print(f"Starting TextGrid processing for folder: {input_path}")
    try:
        # The prefix is escaped so regex metacharacters in it match literally;
        # IGNORECASE covers every casing of the extension.
        file_pattern = re.compile(
            rf"{re.escape(MISSING_FILE_PREFIX)}(\d{{{MISSING_FILE_NUM_DIGITS}}})\.textgrid",
            re.IGNORECASE,
        )

        # Discover the inputs BEFORE opening the output file: opening with 'w'
        # truncates it, which must not happen when there is nothing to write.
        numbered_files = []
        for item in input_path.iterdir():
            match = file_pattern.fullmatch(item.name)
            if match and item.is_file():
                numbered_files.append((int(match.group(1)), item))

        # Numeric (not lexicographic) order gives a stable, predictable output.
        numbered_files.sort(key=lambda pair: pair[0])

        if not numbered_files:
            print(f"Warning: No files matching the pattern '{MISSING_FILE_PREFIX}{'X'*MISSING_FILE_NUM_DIGITS}.(TEXTGRID|textgrid)' found in {input_path} for processing.")
            return

        print(f"Found {len(numbered_files)} files matching pattern to process.")

        with open(output_path, 'w', encoding=OUTPUT_ENCODING) as outfile:
            for _, tg_file in numbered_files:
                phoneme_sequence = extract_phoneme_sequence(tg_file)

                if phoneme_sequence:  # only non-empty sequences get a line
                    outfile.write(phoneme_sequence + '\n')
                    processed_count += 1
                else:
                    skipped_files_list.append(tg_file.name)

    except Exception as e:
        # Best effort: report the failure, then still print the summary and
        # the skipped files gathered so far.
        print(f"\nAn error occurred during file processing/writing: {e}")

    print(f"\nTextGrid processing complete.")
    print(f"Successfully processed and wrote sequences for {processed_count} files.")

    # --- Log Skipped Files ---
    skipped_count = len(skipped_files_list)
    if skipped_count > 0:
        print(f"Skipped {skipped_count} files during processing (due to errors loading or no valid sequences found):")
        # Console shows only the first 50; the log file (if any) gets them all.
        console_limit = 50
        for fname in skipped_files_list[:console_limit]:
            print(f"  - {fname}")
        if skipped_count > console_limit:
            print(f"  - ... (and {skipped_count - console_limit} more)")

        if skipped_log_path:
            try:
                print(f"Writing list of all skipped files to: {skipped_log_path}")
                with open(skipped_log_path, 'w', encoding=OUTPUT_ENCODING) as logfile:
                    for fname in skipped_files_list:
                        logfile.write(fname + '\n')
            except Exception as e:
                print(f"  Error writing skipped files log: {e}")
    else:
        print("Skipped 0 files during processing.")

    print(f"\nOutput written to: {output_path}")

print("Main processing function (with skipped file logging) defined.")

Main processing function (with skipped file logging) defined.


In [25]:
# --- Paths for this run: edit all three before executing ---
# Folder that contains the TextGrid files to convert.
input_folder_path = r"C:\Users\vempa\OneDrive\Desktop\ssmt_project_NEW\aligned_output_10k_GT\aligned_output\speaker1"
# Name/path of the file that receives the phoneme sequences.
output_file_path = "corpus1.phones"
# File that receives the names of skipped TextGrids; set to None to disable it.
skipped_log_file_output = "skipped_files.log"
# --- End of paths ---

# process_folder runs the missing-file check first, then the extraction.
process_folder(
    input_folder_path,
    output_file_path,
    skipped_log_file_str=skipped_log_file_output,
)

print("\nScript finished.")


--- Missing File Check ---
Expecting 9997 files matching pattern 'file_XXXXXX'
Range: file_000003 to file_009999
Result: Found 752 missing files:
  - file_000142
  - file_000490
  - file_000502
  - file_000505
  - file_000515
  - file_000521
  - file_000525
  - file_000682
  - file_000714
  - file_000727
  - file_000734
  - file_000736
  - file_000740
  - file_000741
  - file_000747
  - file_000751
  - file_000752
  - file_000753
  - file_000763
  - file_000764
  - file_000765
  - file_000766
  - file_000767
  - file_000768
  - file_000769
  - file_000771
  - file_000973
  - file_000979
  - file_000984
  - file_001018
  - file_001036
  - file_001087
  - file_001088
  - file_001091
  - file_001092
  - file_001094
  - file_001103
  - file_001111
  - file_001120
  - file_001123
  - file_001128
  - file_001132
  - file_001165
  - file_001175
  - file_001351
  - file_001409
  - file_001498
  - file_001531
  - file_001549
  - file_001561
  - file_001631
  - file_001640
  - file_001662
  - f