In [16]:
import pandas as pd
import os
import torchaudio
import numpy as np
import glob

from ssd_paths import *
from sound_proc import *
from mio import *
from misc_tools import *
from misc_progress_bar import draw_progress_bar

In [4]:
# Read the SSB0016 table from the phone-extract directory.
# os.path.join is used so the path does not depend on the directory
# constant ending in a path separator.
guide = pd.read_csv(os.path.join(as_phones_extract_path, 'SSB0016.csv'))

In [30]:
# guide["rec"].unique().tolist()

## Regenerate anno set data

In [2]:
# Display the names of the entries currently present in as_use_path.
os.listdir(as_use_path)

['phone_anno_test.csv',
 'phone_anno_test.mfcc',
 'phone_anno_validation.csv',
 'phone_anno_validation.mfcc',
 'phone_random_test.csv',
 'phone_random_test.mfcc',
 'phone_random_train.csv',
 'phone_random_train.mfcc',
 'phone_random_validation.csv',
 'phone_random_validation.mfcc']

In [30]:
# Load the annotated test guide table.
# os.path.join is used so the path does not depend on as_use_path
# ending in a path separator.
guide = pd.read_csv(os.path.join(as_use_path, 'phone_anno_test.csv'))

In [31]:
# Distinct 7-character prefixes of the recording names, in order of first appearance.
guide["rec"].str[:7].unique().tolist()

['SSB0016', 'SSB0043', 'SSB0139', 'SSB0273', 'SSB0375', 'SSB0394']

In [32]:
# Alternative speaker selection, kept for reference (currently disabled):
# annospeakers = ['SSB0033', 'SSB0261', 'SSB0382', 'SSB0393', 'SSB0395', 'SSB0415']
# Speaker prefixes whose recordings are collected and cut further down;
# these are the same prefixes found in the "rec" column of phone_anno_test.csv.
annospeakers = ['SSB0016', 'SSB0043', 'SSB0139', 'SSB0273', 'SSB0375', 'SSB0394']

In [33]:
def open_and_cut(wave_path, wave_name, filtered_df, params, no_real_cut=False):
    """Cut one recording into its annotated segments and build the matching log table.

    Args:
        wave_path: Path of the audio file; passed to ``torchaudio.load``.
        wave_name: Recording identifier. Only rows of ``filtered_df`` whose
            ``"rec"`` value equals this name are used.
        filtered_df: Annotation table with at least the columns ``"rec"``,
            ``"start_time"``, ``"end_time"``, ``"duration"`` and ``"token"``.
        params: Unused; kept so existing callers keep working.
        no_real_cut: When True the audio file is neither opened nor cut.
            The returned cut list is then empty and ``n_frames`` is taken
            from an ``"n_frames"`` column of ``filtered_df`` when there is
            one, otherwise it is NaN (unknown). Without this handling the
            flag led to a ``NameError`` because the cut list and frame
            counts were never assigned.

    Returns:
        tuple: ``(cut_recs, df)`` where ``cut_recs`` is whatever
        ``Sound_Proc.cut_rec`` returned (or ``[]`` when ``no_real_cut`` is
        True) and ``df`` is a DataFrame with one row per segment and the
        columns ``rec``, ``idx`` (zero-padded 8-digit running number),
        ``start_time``, ``end_time``, ``token``, ``duration``, ``n_frames``.
    """
    # Restrict the annotation table to the rows of this recording.
    rec_rows = filtered_df[filtered_df["rec"] == wave_name]
    flat_starts = rec_rows["start_time"].to_numpy()
    flat_ends = rec_rows["end_time"].to_numpy()
    c_duration = rec_rows["duration"].to_numpy()
    tokens = rec_rows["token"].to_numpy()

    if not no_real_cut:
        sp = Sound_Proc()

        # The second return value of torchaudio.load is not needed here.
        rec, _ = torchaudio.load(wave_path)

        cut_recs = sp.cut_rec(rec, flat_starts, flat_ends)

        # NOTE: a very small proportion of the cuts end up with zero frames
        # (cause not yet known). The size of dimension 1 of every cut is
        # logged as n_frames so those cuts can be filtered out later.
        cut_n_frames = np.array([cut_rec.shape[1] for cut_rec in cut_recs])
    else:
        # No audio is touched: nothing to return as cuts, and frame counts
        # are only known if the annotation table already carries them.
        cut_recs = []
        if "n_frames" in rec_rows.columns:
            cut_n_frames = rec_rows["n_frames"].to_numpy()
        else:
            cut_n_frames = np.full(len(rec_rows), np.nan)

    # Assemble the per-segment log; the scalar 'rec' is broadcast to every row.
    data = {
        'rec': wave_name,
        "idx": list(map("{:08d}".format, range(len(c_duration)))),
        'start_time': flat_starts,
        'end_time': flat_ends,
        'token': tokens,
        'duration': c_duration,
        'n_frames': cut_n_frames,
    }
    df = pd.DataFrame(data)

    return cut_recs, df

In [34]:
# Load log.csv from the phone-extract directory through the project's token
# filter. NOTE(review): keepSIL=False presumably drops silence tokens;
# confirm against filter_tokens_and_get_df.
filtered_df = filter_tokens_and_get_df(
    os.path.join(as_phones_extract_path, "log.csv"),
    keepSIL=False,
)

In [35]:
# Collect, speaker by speaker, the .wav file names (relative to as_wav_path)
# that start with one of the selected speaker prefixes.
my_work_pool = []
for speaker in annospeakers:
    my_work_pool += glob.glob(f"{speaker}*.wav", root_dir=as_wav_path)


In [37]:
total = len(my_work_pool)

# For every collected recording: cut it according to filtered_df, then hand
# the cuts and their log table to save_cut_waves_and_log.
for idx, rec_name in enumerate(my_work_pool):
    # The file name without extension is the recording id matched against "rec".
    rec_raw, ext = os.path.splitext(rec_name)
    draw_progress_bar(idx, total)
    cut_recs, corr_df = open_and_cut(
        wave_path=os.path.join(as_wav_path, rec_name),
        wave_name=rec_raw,
        filtered_df=filtered_df,
        params=None,
        no_real_cut=False,
    )
    save_cut_waves_and_log(
        save_dir=as_phone_seg_anno_new_path,
        log_dir=as_phone_seg_anno_new_log_path,
        cut_list=cut_recs,
        corr_df=corr_df,
    )



In [28]:
def csv_bind(log_dir):
    """Merge the per-recording CSV logs in ``log_dir`` into one ``log.csv``.

    Every entry of ``log_dir`` whose name starts with an uppercase ``'S'``
    and ends with ``'.csv'`` is read in sorted file-name order. The rows are
    stacked with a fresh integer index and written to ``log_dir/log.csv``
    without the index column.

    Args:
        log_dir: Directory holding the individual CSV logs; also the
            destination of the merged file.
    """
    # Pick the per-recording logs; 'log.csv' itself never matches the 'S' prefix.
    log_names = [
        name
        for name in os.listdir(log_dir)
        if name.startswith('S') and name.endswith('.csv')
    ]
    log_names.sort()

    # Load each log and stack them in file-name order.
    frames = [pd.read_csv(os.path.join(log_dir, name)) for name in log_names]
    merged = pd.concat(frames, ignore_index=True)

    # Write the merged table next to the individual logs.
    merged.to_csv(os.path.join(log_dir, 'log.csv'), index=False)

In [29]:
# Merge the per-recording logs of the new annotation set into one log.csv.
csv_bind(log_dir=as_phone_seg_anno_new_log_path)