# Phone-word boundary alignment
Written: 20231004  
This tool aligns the phone and word annotations provided in the Buckeye dataset. The dataset does not provide this alignment directly: phones and words are annotated separately, so the phones belonging to each word have to be recovered from their time stamps.

In [37]:
import pandas as pd
import os
from paths import word_seg_anno_log_path, phone_seg_anno_log_path, bsc_path

In [41]:
# Phone-word alignment check.
#
# For every word in the word log, collect the phones of the same recording
# whose time span lies inside the word's time span, and record whether that
# phone sequence equals the word's annotated 'produced_segments'.

# Columns of the output table, in the order they are written to results.csv.
RESULT_COLUMNS = [
    'rec',
    'start_time_word',
    'end_time_word',
    'produced_segments',
    'phonemes',
    'phoneme_endtimes',
    'match_status',
]


def _split_segments(cell):
    """Return the whitespace-separated labels in *cell*, or [] if it is missing."""
    # pandas reads an empty CSV field as NaN (a float), which has no .split().
    if pd.isna(cell):
        return []
    # split() with no argument already ignores leading/trailing whitespace.
    return str(cell).split()


def align_words_to_phones(wordlog_df, phonelog_df):
    """Compare each word's produced segments with the phones inside its span.

    Args:
        wordlog_df: DataFrame with the columns 'rec', 'start_time',
            'end_time' and 'produced_segments' (space-separated labels).
        phonelog_df: DataFrame with the columns 'rec', 'start_time',
            'end_time' and 'token'.

    Returns:
        A list with one dict per row of *wordlog_df*, keyed by
        RESULT_COLUMNS. 'phonemes' and 'phoneme_endtimes' are the
        space-joined tokens / end times of the phones that have the same
        'rec' and satisfy ``start_time >= word start`` and
        ``end_time <= word end``, in phone-log order. 'match_status' is 1
        when that token list equals the word's produced segments and 0
        otherwise. A word with no produced segments and no phones inside
        its span therefore counts as a match.

    Neither input frame is modified.
    """
    # Partition the phone log by recording once, so each word only scans the
    # phones of its own recording instead of the whole table.
    phones_by_rec = {
        rec: group for rec, group in phonelog_df.groupby('rec', sort=False)
    }

    word_fields = zip(
        wordlog_df['rec'],
        wordlog_df['start_time'],
        wordlog_df['end_time'],
        wordlog_df['produced_segments'],
    )

    results = []
    for rec, start_time_word, end_time_word, segments_cell in word_fields:
        produced_segments = _split_segments(segments_cell)

        rec_phones = phones_by_rec.get(rec)
        if rec_phones is None:
            # No phone rows at all for this recording.
            phonemes, phoneme_endtimes = [], []
        else:
            # Keep only phones fully contained in the word's time span.
            inside_word = rec_phones[
                (rec_phones['start_time'] >= start_time_word)
                & (rec_phones['end_time'] <= end_time_word)
            ]
            phonemes = inside_word['token'].tolist()
            phoneme_endtimes = inside_word['end_time'].tolist()

        results.append({
            'rec': rec,
            'start_time_word': start_time_word,
            'end_time_word': end_time_word,
            'produced_segments': ' '.join(produced_segments),
            'phonemes': ' '.join(phonemes),
            'phoneme_endtimes': ' '.join(map(str, phoneme_endtimes)),
            # Same labels in the same order -> 1, anything else -> 0.
            'match_status': int(phonemes == produced_segments),
        })
    return results


if __name__ == "__main__":
    # Read the corpus-wide logs. The per-recording logs (e.g. 's0101a.csv'
    # under word_seg_anno_log_path / phone_seg_anno_log_path) can be read the
    # same way to check a single recording.
    wordlog_df = pd.read_csv(os.path.join(bsc_path, 'word_log.csv'))
    phonelog_df = pd.read_csv(os.path.join(bsc_path, 'phone_log.csv'))

    # Not needed by align_words_to_phones; kept so the word log still carries
    # this column for any later use.
    wordlog_df['produced_segments_clean'] = wordlog_df['produced_segments'].str.strip()

    results = align_words_to_phones(wordlog_df, phonelog_df)

    # Pin the columns so the CSV has a header even when the word log is empty.
    result_df = pd.DataFrame(results, columns=RESULT_COLUMNS)

    # Save the results to a CSV file
    result_df.to_csv('results.csv', index=False)
