# Phone-word boundary alignment
Written: 20231004  
This tool aligns the phone-level and word-level annotations of the Buckeye dataset. The dataset does not provide this alignment directly: phones and words are annotated separately, so the phones belonging to each word must be recovered from their time stamps.

In [1]:
import pandas as pd
import os
import math
from paths import word_seg_anno_log_path, phone_seg_anno_log_path, bsc_path

In [41]:
# Align every word annotation with the phone annotations that fall inside its
# time span, and record whether those phones match the word's
# `produced_segments` transcription exactly.
#
# Output: results.csv with one row per word row and the columns
#   rec, start_time_word, end_time_word, produced_segments, phonemes,
#   phoneme_endtimes, match_status
# where match_status is 1 only when the phone tokens found inside the word's
# time span are exactly the produced segments, in the same order; otherwise 0.

# Step 1: Read the word-level and phone-level annotation logs into DataFrames.
# Alternative inputs kept for reference: the same directories also hold a
# log named 's0101a.csv'.
# wordlog_df = pd.read_csv(os.path.join(word_seg_anno_log_path, 's0101a.csv'))
# phonelog_df = pd.read_csv(os.path.join(phone_seg_anno_log_path, 's0101a.csv'))
wordlog_df = pd.read_csv(os.path.join(word_seg_anno_log_path, 'log.csv'))
phonelog_df = pd.read_csv(os.path.join(phone_seg_anno_log_path, 'log.csv'))

# Missing values stay NaN after .str.strip(); whitespace-only values become "".
wordlog_df['produced_segments_clean'] = wordlog_df['produced_segments'].str.strip()

# Step 2: Split the phone log by recording once, so that each word is only
# compared against the phones of its own recording instead of masking the
# whole phone log for every word. Row order inside each group is preserved.
phones_by_rec = {
    rec_name: rec_phones
    for rec_name, rec_phones in phonelog_df.groupby('rec', sort=False)
}

# Step 3-6: Iterate through wordlog_df and perform the checks
results = []

for idx, word_row in wordlog_df.iterrows():
    rec = word_row['rec']
    start_time_word = word_row['start_time']
    end_time_word = word_row['end_time']
    segments_clean = word_row['produced_segments_clean']

    # `segments_clean` is a scalar here (a str, or a float NaN for missing
    # values), so it has no `.isna()` method; pd.isna handles both cases.
    # The NaN test comes first so the string comparison only sees strings.
    if pd.isna(segments_clean) or segments_clean == "":
        # No transcription to compare against: store an empty, unmatched row.
        results.append({
            'rec': rec,
            'start_time_word': start_time_word,
            'end_time_word': end_time_word,
            'produced_segments': '',
            'phonemes': '',
            'phoneme_endtimes': '',
            'match_status': 0
        })
        continue

    produced_segments = segments_clean.split()

    # Step 4: Keep the phones of this recording that lie fully inside the
    # word's [start_time, end_time] interval.
    rec_phones = phones_by_rec.get(rec)
    if rec_phones is None:
        # The recording has no phone rows at all.
        phonemes = []
        phoneme_endtimes = []
    else:
        phonelog_filtered = rec_phones[
            (rec_phones['start_time'] >= start_time_word)
            & (rec_phones['end_time'] <= end_time_word)
        ]
        phonemes = phonelog_filtered['token'].tolist()
        phoneme_endtimes = phonelog_filtered['end_time'].tolist()

    # Step 5: The word matches only if the phone sequence is exactly the
    # produced-segment sequence (same tokens, same order, same length).
    match_status = 1 if phonemes == produced_segments else 0

    # Step 6: Store the results
    results.append({
        'rec': rec,
        'start_time_word': start_time_word,
        'end_time_word': end_time_word,
        'produced_segments': ' '.join(produced_segments),
        'phonemes': ' '.join(phonemes),
        'phoneme_endtimes': ' '.join(map(str, phoneme_endtimes)),
        'match_status': match_status
    })

# Create a DataFrame from the results
result_df = pd.DataFrame(results)

# Save the results to a CSV file
result_df.to_csv('results.csv', index=False)


In [9]:
# Reload the word log and append a whitespace-stripped copy of
# `produced_segments` as `produced_segments_clean`.
wordlog_df = pd.read_csv(
    os.path.join(word_seg_anno_log_path, 'log.csv')
).assign(
    produced_segments_clean=lambda frame: frame['produced_segments'].str.strip()
)

In [31]:
# Word rows whose cleaned produced-segment transcription is missing (NaN).
wordlog_df.loc[wordlog_df["produced_segments_clean"].isna()]

Unnamed: 0,rec,idx,start_time,end_time,token,duration,n_frames,theory_segments,produced_segments,produced_segments_clean
87711,s1301b,681,264.77477,265.407669,filed,0.632899,10127,VBN\n,,
88424,s1302a,24,10.863223,11.165419,that,0.302196,4835,DT\n,,
88775,s1302a,375,140.357933,140.690459,Adam,0.332526,5320,NNP\n,,
93718,s1303b,945,400.652968,400.970684,how,0.317716,5084,WRB\n,,
231722,s3301b,856,379.82897,379.985604,to,0.156634,2506,,,


In [34]:
# Word rows that do have a transcription: drop rows where the cleaned
# produced segments are either the empty string or missing (NaN).
wordlog_df[
    ~(
        (wordlog_df["produced_segments_clean"] == "")
        | wordlog_df["produced_segments_clean"].isna()
    )
]

Unnamed: 0,rec,idx,start_time,end_time,token,duration,n_frames,theory_segments,produced_segments,produced_segments_clean
0,s0101a,0,32.216575,32.622045,okay,0.405470,6488,ow k ey,k ay,k ay
1,s0101a,1,44.617996,44.946848,um,0.328852,5262,ah m,ah m,ah m
2,s0101a,2,45.355708,45.501487,i'm,0.145779,2333,ay m,ay m,ay m
3,s0101a,3,47.307796,47.530873,lived,0.223077,3569,l ih v d,l ah v d,l ah v d
4,s0101a,4,47.530873,47.658958,in,0.128085,2049,ih n,ih n,ih n
...,...,...,...,...,...,...,...,...,...,...
284642,s4004a,201,84.380602,84.457425,it,0.076823,1229,ih t,ah dx,ah dx
284643,s4004a,202,84.457425,84.553457,or,0.096032,1536,ow r,er,er
284644,s4004a,203,84.553457,84.827827,not,0.274370,4390,n aa t,n aa t,n aa t
284645,s4004a,204,90.183453,90.477742,all,0.294289,4709,aa l,ow,ow


In [26]:
# For every cleaned produced-segment value that is not a string, print
# whether it is NaN.
for segment_value in wordlog_df["produced_segments_clean"]:
    if isinstance(segment_value, str):
        continue
    print(math.isnan(segment_value))

True
True
True
True
True


In [35]:
# NOTE(review): hand-typed ratio (~0.307). Where 87280 and 284619 come from is
# not shown in this notebook -- presumably counts derived from results.csv /
# match_status. TODO confirm and compute the ratio from result_df instead of
# hard-coding the numbers.
87280/284619

0.3066555641049965