# Sibilant + stop Deaspiration Phenomenon Selection

Here we want to work out how we can select only those instances (words) with only the target sequences. But one problem is that we don't have the exact recording files at that level of granularity. We only have cut words and cut phones, whereas our target is something like two or three phones long. This is a problem. 

However, considering that our target is not very long, I am thinking of finding all valid instances and integrating them into recordings. Then, each time we train, we read from the integrated recordings. 

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import torch
import torchaudio
from torch import nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader, random_split
import matplotlib.pyplot as plt
import random
from IPython.display import Audio
from tqdm import tqdm
import seaborn as sns
import pandas as pd
from collections import Counter
from sklearn.manifold import TSNE   # one type of clustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
from itertools import combinations
from sklearn.decomposition import PCA
from scipy.linalg import block_diag
import pickle
from scipy import stats
from model_padding import generate_mask_from_lengths_mat, mask_it
from paths import *
from misc_my_utils import *
from model_loss import *
from model_model import CTCPredNetV1 as TheLearner
from model_dataset import WordDatasetPath as ThisDataset
from model_dataset import Normalizer, DeNormalizer, TokenMap
from model_dataset import MelSpecTransformDB as TheTransform
from model_dataset import DS_Tools
from reshandler import DictResHandler
from misc_progress_bar import draw_progress_bar
from test_bnd_detect_tools import *
from misc_tools import PathUtils as PU
from misc_tools import AudioCut, ARPABET
import re

In [3]:
# Cut-word recording directory (taken from the train_cut_word_ path constant).
rec_dir = train_cut_word_
# One guide CSV per data split, all located directly under src_.
train_guide_path, valid_guide_path, test_guide_path = (
    os.path.join(src_, guide_name)
    for guide_name in ("guide_train.csv", "guide_validation.csv", "guide_test.csv")
)

In [4]:
# Load the validation guide and drop every row whose "segment_nostress" is
# "sil", "sp" or "spn". Per the original note this filter is not strictly
# necessary (only encoded words are said to carry a wuid), but it is applied
# anyway.
guide_file = pd.read_csv(valid_guide_path).loc[
    lambda frame: ~frame["segment_nostress"].isin(["sil", "sp", "spn"])
]

In [5]:
# One (phoneme_string, wuid) pair per word: the word's "segment" labels joined
# by single spaces in row order, so regex patterns can be run over the string.
# The groups are iterated directly instead of going through
# groupby.apply + iterrows: apply had to read the grouping column "wuid" from
# inside the sub-frame (deprecated since pandas 2.2), and iterrows builds a
# Series per row just to read one field.
# Groups are yielded sorted by wuid, the same order apply(...).tolist() gave.
words_guide_str = [
    (" ".join(word_rows["segment"]), wuid)
    for wuid, word_rows in guide_file.groupby("wuid")
]

In [6]:
def regex_span_to_list_indices(phoneme_str, pattern):
    """Map regex match spans in a phoneme string to phoneme list indices.

    Args:
        phoneme_str: Whitespace-separated phoneme labels, e.g. "S T AA1 R".
        pattern: Regular expression searched in ``phoneme_str``.

    Returns:
        A list with one ``(first, last)`` tuple per match, where ``first`` and
        ``last`` are 0-based, inclusive positions in ``phoneme_str.split()``.
        A match that begins on a separator is attributed to the phoneme before
        that separator (or to the first phoneme if there is none before it).
        An empty match at offset 0 yields ``last == -1``.
    """
    from bisect import bisect_left, bisect_right

    # Real character offset at which each phoneme starts. Using the actual
    # offsets (rather than rebuilding them from the token lengths) keeps the
    # mapping right when the string has leading or repeated whitespace.
    starts = [token.start() for token in re.finditer(r"\S+", phoneme_str)]

    match_indices = []
    for match in re.finditer(pattern, phoneme_str):
        start, end = match.span()
        # Last phoneme that starts at or before the first matched character.
        list_start = max(bisect_right(starts, start) - 1, 0)
        # Last phoneme that starts before the (exclusive) end of the match.
        list_end = bisect_left(starts, end) - 1
        match_indices.append((list_start, list_end))
    return match_indices

In [62]:
# Earlier attempt, kept for reference (superseded by the patterns below):
#   stop_pattern = '(?!S) [PTK] (?!R)'
#   sibstop_pattern = 'S [PTK] (?!R)'
#   Xstop_pattern = stop_pattern
#
# The patterns run over space-joined phoneme strings. The vowel is matched by
# its first letter only: the class lists single letters, but every vowel label
# starts with one of them, so all vowels are covered.
# The sibstop and Xstop patterns span pre + stop + vowel; the word-initial
# stop pattern (anchored with ^) spans stop + vowel only.
# NOTE(review): the original comment said "for Xstop, we don't need the pre" -
# confirm which set that remark was meant for.
_STOP = '[PTK]'
_VOWEL_FIRST_LETTER = '[AOEIUY]'
Xstop_pattern = f'[^S] {_STOP} {_VOWEL_FIRST_LETTER}'
stop_pattern = f'^{_STOP} {_VOWEL_FIRST_LETTER}'
sibstop_pattern = f'S {_STOP} {_VOWEL_FIRST_LETTER}'

It seems that we hit the mark by a fluke (wai da zheng zhao): although the way of selecting would be quite wrong if we added any more phonemes, the selected results seem quite right. The only problem is that the ST sequences seem to have been included in the XT set as well. 

Note the difference between stop and Xstop. Stop is at the beginning of a word (but sadly these were not used in previous runs), and Xstop is in the middle of a word.

In [63]:
def _matching_words(pattern):
    """Return (wuids, spans) for the words whose phoneme string matches pattern.

    ``wuids`` lists the wuid of every matching word in ``words_guide_str``
    order; ``spans`` is parallel to it and holds, per word, the phoneme index
    spans returned by ``regex_span_to_list_indices``.
    """
    wuids, spans = [], []
    for phoneme_str, wuid in words_guide_str:
        if re.search(pattern, phoneme_str):
            wuids.append(wuid)
            spans.append(regex_span_to_list_indices(phoneme_str, pattern))
    return wuids, spans


# *_indices: wuid of each matching word; *_subidx: its matched phoneme spans.
stop_indices, stop_subidx = _matching_words(stop_pattern)
Xstop_indices, Xstop_subidx = _matching_words(Xstop_pattern)
sibstop_indices, sibstop_subidx = _matching_words(sibstop_pattern)

Here we have the risk of including /t/ for ST but excluding it in XT. 

New selection: this time, we select only those stops preceding vowels. Therefore, during evaluation, we need to account for the vowels. 

In [64]:
# Number of matching words per set, in the order (stop, Xstop, sibstop).
tuple(len(names) for names in (stop_indices, Xstop_indices, sibstop_indices))

(9859, 7050, 2361)

In [65]:
# Spans of the first matching word of each set, in the order (stop, Xstop, sibstop).
tuple(spans[0] for spans in (stop_subidx, Xstop_subidx, sibstop_subidx))

([(0, 1)], [(2, 4)], [(2, 4)])

In the new version, we also include the following vowels as part of the training set. This will introduce more noise, but if the training is also successful, we can check the attention performance towards both sides. 

In [67]:
def generate_table(df, name_list, target_idx_list, has_pre=True):
    """Collect the phones of every matched target sequence into one table.

    Args:
        df: Guide table with one row per phone. The columns read here are
            "wuid", "in_id", "segment_nostress", "phone_path", "startTime",
            "endTime" and "speaker".
        name_list: wuid of each selected word.
        target_idx_list: Parallel to ``name_list``; for each word, a list of
            ``(start, end)`` spans given as 0-based, inclusive phone
            positions. ``in_id`` starts from 1, so 1 is added before lookup.
        has_pre: If True, each span holds pre + stop + vowel. If False, it
            holds stop + vowel only and the four "pre*" columns are filled
            with empty strings.

    Returns:
        A DataFrame with one row per span and the columns "pre", "stop",
        "vowel", "pre_path", "stop_path", "vowel_path", "pre_startTime",
        "pre_endTime", "stop_startTime", "stop_endTime", "vowel_startTime",
        "vowel_endTime", "speaker" (taken from the stop row) and "wuid".

    Raises:
        IndexError: If a span selects fewer phones than the roles it must fill.
    """
    # (output column suffix, guide column) for each per-phone field.
    suffix_to_source = (
        ("", "segment_nostress"),
        ("_path", "phone_path"),
        ("_startTime", "startTime"),
        ("_endTime", "endTime"),
    )
    all_roles = ("pre", "stop", "vowel")
    roles = all_roles if has_pre else all_roles[1:]

    # Keys are inserted in the column order of the output table.
    table = {
        column: []
        for column in (
            "pre", "stop", "vowel",
            "pre_path", "stop_path", "vowel_path",
            "pre_startTime", "pre_endTime",
            "stop_startTime", "stop_endTime",
            "vowel_startTime", "vowel_endTime",
            "speaker", "wuid",
        )
    }

    # Row positions of every word, computed once, so that looking a word up
    # does not rescan the whole guide table for each name.
    positions_by_word = df.groupby("wuid", sort=False).indices

    for name, target_idx in zip(name_list, target_idx_list):
        # This is one word; there might be multiple matching spans in it.
        word_phonemes = df.iloc[positions_by_word.get(name, [])]
        for target in target_idx:
            # Shift to 1-based in_id; the span includes both start and end.
            in_ids = range(target[0] + 1, target[1] + 2)
            phones = word_phonemes[word_phonemes["in_id"].isin(in_ids)]
            # Roles are assigned by position, so order the phones by in_id
            # instead of trusting the physical row order of the guide table.
            phones = phones.sort_values("in_id", kind="stable")
            if len(phones) < len(roles):
                raise IndexError(
                    f"word {name!r}: span {tuple(target)} selects {len(phones)} "
                    f"phone(s), expected at least {len(roles)}"
                )
            picked = {role: phones.iloc[k] for k, role in enumerate(roles)}
            for role in all_roles:
                phone = picked.get(role)
                for suffix, source in suffix_to_source:
                    # A missing role (pre when has_pre is False) is left blank.
                    table[role + suffix].append("" if phone is None else phone[source])
            table["speaker"].append(picked["stop"]["speaker"])
            table["wuid"].append(name)

    return pd.DataFrame(table)

In [68]:
# Sibilant + stop + vowel sequences (the span includes the preceding phone).
sibstopsdf = generate_table(
    df=guide_file,
    name_list=sibstop_indices,
    target_idx_list=sibstop_subidx,
    has_pre=True,
)

In [70]:
# Non-S phone + stop + vowel sequences.
# This may not be used, because the variation in aspiration is too large.
Xstopsdf = generate_table(
    df=guide_file,
    name_list=Xstop_indices,
    target_idx_list=Xstop_subidx,
    has_pre=True,
)

In [72]:
# Word-initial stop + vowel sequences; these spans have no preceding phone.
stopsdf = generate_table(
    df=guide_file,
    name_list=stop_indices,
    target_idx_list=stop_subidx,
    has_pre=False,
)

In [74]:
# Tag every table with the label of the set it came from.
for labelled_table, phi_label in (
    (sibstopsdf, "ST"),
    (Xstopsdf, "XT"),
    (stopsdf, "T"),
):
    labelled_table["phi_type"] = phi_label

In [75]:
# Write each table as its own guide CSV under src_, without the index column.
for guide_table, csv_name in (
    (stopsdf, "phi-T-guide.csv"),
    (sibstopsdf, "phi-ST-guide.csv"),
    (Xstopsdf, "phi-XT-guide.csv"),
):
    guide_table.to_csv(os.path.join(src_, csv_name), index=False)