# Sibilant + stop Deaspiration Phenomenon Selection

Here we want to work out how to select only those instances (words) containing the target sequences. One problem is that we don't have the exact recording files at that level of granularity: we only have cut words and cut phones, whereas our target spans something like two or three phones.

However, considering that our target is not very long, I am thinking of finding all valid instances and integrating them into recordings. Then, each time we train, we read from the integrated recordings.

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import torch
import torchaudio
from torch import nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader, random_split
import matplotlib.pyplot as plt
import random
from IPython.display import Audio
from tqdm import tqdm
import seaborn as sns
import pandas as pd
from collections import Counter
from sklearn.manifold import TSNE   # one type of clustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
from itertools import combinations
from sklearn.decomposition import PCA
from scipy.linalg import block_diag
import pickle
from scipy import stats
from model_padding import generate_mask_from_lengths_mat, mask_it
from paths import *
from misc_my_utils import *
from model_loss import *
from model_model import CTCPredNetV1 as TheLearner
from model_dataset import WordDatasetPath as ThisDataset
from model_dataset import Normalizer, DeNormalizer, TokenMap
from model_dataset import MelSpecTransformDB as TheTransform
from model_dataset import DS_Tools
from reshandler import DictResHandler
from misc_progress_bar import draw_progress_bar
from test_bnd_detect_tools import *
from misc_tools import PathUtils as PU
from misc_tools import AudioCut, ARPABET
import re

In [2]:
# Recording directory used by this notebook.
# NOTE(review): the name suggests the cut-word audio of the *training* split, while the guide
# loaded further down is the validation one — confirm this is intended.
rec_dir = train_cut_word_

# Guide CSVs for the three data splits, all located directly under `src_`.
train_guide_path, valid_guide_path, test_guide_path = (
    os.path.join(src_, guide_name)
    for guide_name in ("guide_train.csv", "guide_validation.csv", "guide_test.csv")
)

In [3]:
# Load the validation guide table and keep only the rows whose stress-free segment label
# is not one of "sil", "sp" or "spn".
# NOTE(review): the earlier note on this cell said the filter may be unnecessary because
# wuid is only assigned to encoded words; it is kept as-is — confirm before removing it.
guide_file = pd.read_csv(valid_guide_path).loc[
    lambda table: ~table["segment_nostress"].isin(["sil", "sp", "spn"])
]

In [69]:
# One (phoneme string, wuid) tuple per word, ordered by wuid: the word's stress-marked
# segments joined by single spaces in row order, e.g. ("CH AE1 P T ER0", "103-1240-0000-0000").
# Only the "segment" column is aggregated, so the grouping column is never read inside a
# groupby.apply callback (deprecated since pandas 2.2) and no per-row iterrows loop is needed.
words_guide_str = [
    (phonemes, wuid)
    for wuid, phonemes in guide_file.groupby("wuid")["segment"].agg(" ".join).items()
]

In [102]:
def regex_span_to_list_indices(phoneme_str, pattern):
    """Translate regex match spans on a phoneme string into phoneme-list positions.

    Args:
        phoneme_str: phoneme labels separated by single spaces, e.g. "T A S T P A".
        pattern: regular expression searched in `phoneme_str` with `re.finditer`.

    Returns:
        A list with one `(first, last)` tuple per match, in match order. Both values are
        0-based positions in `phoneme_str.split()`, and `last` is inclusive. A trailing
        space consumed by the match does not extend `last` to the following phoneme.
    """
    # boundaries[k] is the character offset at which phoneme k starts; every phoneme is
    # counted together with one separating space, so the final entry is len(phoneme_str) + 1.
    boundaries = [0]
    running_offset = 0
    for token in phoneme_str.split():
        running_offset += len(token) + 1
        boundaries.append(running_offset)

    def first_boundary_index(accepts):
        # Index of the first boundary accepted by the predicate.
        return next(idx for idx, bound in enumerate(boundaries) if accepts(bound))

    spans = []
    for found in list(re.finditer(pattern, phoneme_str)):
        char_start, char_end = found.span()
        # The phoneme holding the first matched character is the one just before the first
        # boundary lying strictly beyond that character.
        first = first_boundary_index(lambda bound: bound > char_start) - 1
        # char_end is exclusive, so a boundary equal to char_end still closes the phoneme before it.
        last = first_boundary_index(lambda bound: bound >= char_end) - 1
        spans.append((first, last))
    return spans

In [134]:
# Sibilant + stop target: the character "S", a space, one of P/T/K, a space, and a next
# character that is not "R". The trailing space means the stop cannot be the last phoneme.
sibstop_pattern = "S [PTK] (?!R)"
# Plain-stop counterpart: P/T/K at the very start of the string, a space, and a next
# character that is not "R".
stop_pattern = "^[PTK] (?!R)"

In [135]:
# Toy check: in "T A S T P A" the only sibilant + stop hit is "S T", i.e. the phonemes at
# list positions 2 and 3.
phoneme_str = "T A S T P A"
match_indices = regex_span_to_list_indices(
    phoneme_str=phoneme_str,
    pattern=sibstop_pattern,
)
print(match_indices)

[(2, 3)]


In [138]:
# Despite the *_indices names, the first two lists hold wuids (word identifiers), not
# integer positions.
# wuids of the words whose phoneme string matches the word-initial stop pattern.
stop_indices = [
    wuid
    for phonemes, wuid in words_guide_str
    if re.search(stop_pattern, phonemes)
]
# wuids of the words containing at least one sibilant + stop match.
sibstop_indices = [
    wuid
    for phonemes, wuid in words_guide_str
    if re.search(sibstop_pattern, phonemes)
]
# For those same words, in the same order: the list of (first, last) phoneme positions of
# every sibilant + stop match inside the word.
sibstop_subidx = [
    regex_span_to_list_indices(phonemes, sibstop_pattern)
    for phonemes, _ in words_guide_str
    if re.search(sibstop_pattern, phonemes)
]

In [137]:
# Number of matching words per pattern: (word-initial stop, sibilant + stop).
(len(stop_indices), len(sibstop_indices))

(10729, 2606)

In [144]:
# All phoneme rows that belong to a word containing a sibilant + stop match.
sibstops = guide_file.loc[lambda table: table["wuid"].isin(sibstop_indices)]

In [159]:
# Spot check on one word: of its rows whose in_id is 1 or 2, show the second one.
(
    sibstops
    .loc[lambda table: table["wuid"].eq("103-1240-0001-0036") & table["in_id"].isin((1, 2))]
    .iloc[1]
)

segment                                               AE0
file                                        103-1240-0001
id                                                    130
startTime                                           10.49
endTime                                             10.57
nSample                                              1280
word_id                                              36.0
word                                              cascade
in_id                                                 2.0
segment_nostress                                       AE
stress_type                                             0
phone_path          103/1240/0001/103-1240-0001-0130.flac
word_path           103/1240/0001/103-1240-0001-0036.flac
speaker                                               103
word_startTime                                      10.36
word_endTime                                         11.0
word_nSample                                      10240.0
wuid          

In [None]:
def generate_table(df, name_list, target_idx_list=None):
    """Build one table row per sibilant + stop occurrence.

    Args:
        df: guide table with one row per phoneme. The columns read here are "wuid",
            "segment_nostress", "phone_path", "startTime", "endTime" and "speaker".
        name_list: wuids of the words to process.
        target_idx_list: parallel to `name_list`. Element i lists the `(start, end)` spans
            found in word i, as returned by `regex_span_to_list_indices`: 0-based positions
            among the word's rows in `df` (row order), with `end` inclusive. `start` is
            taken as the sibilant and `end` as the stop. Positions are used instead of
            "in_id" values because "in_id" starts at 1 in the guide table.

    Returns:
        A pandas DataFrame with the columns "sibilant", "stop", "sibilant_path",
        "stop_path", "sibilant_startTime", "sibilant_endTime", "stop_startTime",
        "stop_endTime", "speaker" and "wuid". It has the columns but no rows when there
        is nothing to report.

    Raises:
        ValueError: if `target_idx_list` is None or its length differs from `name_list`.
        KeyError: if a wuid from `name_list` has no rows in `df`.
        IndexError: if a span does not fit inside its word.
    """
    if target_idx_list is None:
        raise ValueError("target_idx_list is required: pass one list of (start, end) spans per word")
    name_list = list(name_list)
    target_idx_list = list(target_idx_list)
    if len(name_list) != len(target_idx_list):
        raise ValueError(
            f"name_list has {len(name_list)} entries but target_idx_list has {len(target_idx_list)}"
        )

    columns = [
        "sibilant",
        "stop",
        "sibilant_path",
        "stop_path",
        "sibilant_startTime",
        "sibilant_endTime",
        "stop_startTime",
        "stop_endTime",
        "speaker",
        "wuid",
    ]

    # Split the table by word once instead of re-filtering the whole frame for every name.
    wanted_rows = df[df["wuid"].isin(name_list)]
    phonemes_by_word = {wuid: rows for wuid, rows in wanted_rows.groupby("wuid", sort=False)}

    records = []
    for name, spans in zip(name_list, target_idx_list):
        if name not in phonemes_by_word:
            raise KeyError(f"no phoneme rows found for wuid {name!r}")
        word_phonemes = phonemes_by_word[name]
        # One word may contain several matches; each one becomes its own row.
        for start, end in spans:
            if not 0 <= start <= end < len(word_phonemes):
                raise IndexError(
                    f"span ({start}, {end}) does not fit word {name!r} "
                    f"with {len(word_phonemes)} phonemes"
                )
            sib = word_phonemes.iloc[start]
            stop = word_phonemes.iloc[end]
            records.append({
                "sibilant": sib["segment_nostress"],
                "stop": stop["segment_nostress"],
                "sibilant_path": sib["phone_path"],
                "stop_path": stop["phone_path"],
                "sibilant_startTime": sib["startTime"],
                "sibilant_endTime": sib["endTime"],
                "stop_startTime": stop["startTime"],
                "stop_endTime": stop["endTime"],
                "speaker": sib["speaker"],
                "wuid": name,
            })
    # Passing `columns` keeps the schema stable even when no record was collected.
    return pd.DataFrame(records, columns=columns)

In [141]:
# Display the guide table as loaded and filtered earlier in this notebook.
guide_file

Unnamed: 0,segment,file,id,startTime,endTime,nSample,word_id,word,in_id,segment_nostress,stress_type,phone_path,word_path,speaker,word_startTime,word_endTime,word_nSample,wuid
1,CH,103-1240-0000,1,0.44,0.57,2080,0.0,chapter,1.0,CH,SNA,103/1240/0000/103-1240-0000-0001.flac,103/1240/0000/103-1240-0000-0000.flac,103,0.44,0.81,5920.0,103-1240-0000-0000
2,AE1,103-1240-0000,2,0.57,0.63,960,0.0,chapter,2.0,AE,1,103/1240/0000/103-1240-0000-0002.flac,103/1240/0000/103-1240-0000-0000.flac,103,0.44,0.81,5920.0,103-1240-0000-0000
3,P,103-1240-0000,3,0.63,0.70,1120,0.0,chapter,3.0,P,SNA,103/1240/0000/103-1240-0000-0003.flac,103/1240/0000/103-1240-0000-0000.flac,103,0.44,0.81,5920.0,103-1240-0000-0000
4,T,103-1240-0000,4,0.70,0.75,800,0.0,chapter,4.0,T,SNA,103/1240/0000/103-1240-0000-0004.flac,103/1240/0000/103-1240-0000-0000.flac,103,0.44,0.81,5920.0,103-1240-0000-0000
5,ER0,103-1240-0000,5,0.75,0.81,960,0.0,chapter,5.0,ER,0,103/1240/0000/103-1240-0000-0005.flac,103/1240/0000/103-1240-0000-0000.flac,103,0.44,0.81,5920.0,103-1240-0000-0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
383565,IH1,909-131045-0043,164,14.70,14.79,1440,35.0,opinion,3.0,IH,1,909/131045/0043/909-131045-0043-0164.flac,909/131045/0043/909-131045-0043-0035.flac,909,14.53,15.08,8800.0,909-131045-0043-0035
383566,N,909-131045-0043,165,14.79,14.86,1120,35.0,opinion,4.0,N,SNA,909/131045/0043/909-131045-0043-0165.flac,909/131045/0043/909-131045-0043-0035.flac,909,14.53,15.08,8800.0,909-131045-0043-0035
383567,Y,909-131045-0043,166,14.86,14.94,1280,35.0,opinion,5.0,Y,SNA,909/131045/0043/909-131045-0043-0166.flac,909/131045/0043/909-131045-0043-0035.flac,909,14.53,15.08,8800.0,909-131045-0043-0035
383568,AH0,909-131045-0043,167,14.94,14.98,640,35.0,opinion,6.0,AH,0,909/131045/0043/909-131045-0043-0167.flac,909/131045/0043/909-131045-0043-0035.flac,909,14.53,15.08,8800.0,909-131045-0043-0035
