# Sibilant + stop Deaspiration Phenomenon Selection

Here we want to work out how we can select only those instances (words) with only target seqs. But one problem is that we don't have teh exact recording files on that granularity level. We only have cut words and cut phones. But our target is something like two or three phones. This is a problem. 

However, considering that our target is not very long, I am thinking of finding all valid instances and integrate them into recordings. Then each time we train, read from the integrated recordings. 

In [160]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import torch
import torchaudio
from torch import nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader, random_split
import matplotlib.pyplot as plt
import random
from IPython.display import Audio
from tqdm import tqdm
import seaborn as sns
import pandas as pd
from collections import Counter
from sklearn.manifold import TSNE   # one type of clustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
from itertools import combinations
from sklearn.decomposition import PCA
from scipy.linalg import block_diag
import pickle
from scipy import stats
from model_padding import generate_mask_from_lengths_mat, mask_it
from paths import *
from misc_my_utils import *
from model_loss import *
from model_model import CTCPredNetV1 as TheLearner
from model_dataset import WordDatasetPath as ThisDataset
from model_dataset import Normalizer, DeNormalizer, TokenMap
from model_dataset import MelSpecTransformDB as TheTransform
from model_dataset import DS_Tools
from reshandler import DictResHandler
from misc_progress_bar import draw_progress_bar
from test_bnd_detect_tools import *
from misc_tools import PathUtils as PU
from misc_tools import AudioCut, ARPABET
import re

In [161]:
rec_dir = train_cut_word_
train_guide_path = os.path.join(src_, "guide_train.csv")
valid_guide_path = os.path.join(src_, "guide_validation.csv")
test_guide_path = os.path.join(src_, "guide_test.csv")

In [162]:
# read in guide file
guide_file = pd.read_csv(valid_guide_path)
# filtering out is not necessary, since we only include wuid for encoded words
guide_file = guide_file[~guide_file["segment_nostress"].isin(["sil", "sp", "spn"])]

In [163]:
# words_guide = guide_file.groupby('wuid').apply(lambda x: ([row["segment"] for index, row in x.iterrows()]).tolist()
words_guide_str = guide_file.groupby('wuid').apply(lambda x: (" ".join([row["segment"] for index, row in x.iterrows()]), x["wuid"].iloc[0])).tolist()

In [164]:
def regex_span_to_list_indices(phoneme_str, pattern):
    # Split the string into a list of phonemes
    phonemes = phoneme_str.split()
    # Calculate the cumulative lengths including spaces (add 1 for each space)
    cumulative_lengths = [0]  # Start with 0 for the first phoneme
    for phoneme in phonemes:
        # Add the length of the current phoneme and a space (except for the last one)
        cumulative_lengths.append(cumulative_lengths[-1] + len(phoneme) + 1)
    # Find all matches using re.finditer
    matches = list(re.finditer(pattern, phoneme_str))
    # Map regex span indices to phoneme list indices
    match_indices = []
    for match in matches:
        start, end = match.span()
        # Find the phoneme list index corresponding to the start of the match
        list_start = next(i for i, length in enumerate(cumulative_lengths) if length > start) - 1
        # Find the phoneme list index corresponding to the end of the match (subtract 1 because end is exclusive)
        list_end = next(i for i, length in enumerate(cumulative_lengths) if length >= end) - 1
        match_indices.append((list_start, list_end))
    return match_indices

In [165]:
stop_pattern = '^[PTK] (?!R)'
sibstop_pattern = 'S [PTK] (?!R)'

In [167]:
stop_indices = [name for i, (word, name) in enumerate(words_guide_str) if re.search(stop_pattern, word)]
sibstop_indices = [name for i, (word, name) in enumerate(words_guide_str) if re.search(sibstop_pattern, word)]
sibstop_subidx = [regex_span_to_list_indices(word, sibstop_pattern) for i, (word, name) in enumerate(words_guide_str) if re.search(sibstop_pattern, word)]

In [168]:
len(stop_indices), len(sibstop_indices)

(10729, 2606)

In [199]:
def generate_table(df, name_list, target_idx_list=None): 
    sibilant_list = []
    sibilant_path = []
    sibilant_startTime = []
    sibilant_endTime = []
    stop_list = []
    stop_path = []
    stop_startTime = []
    stop_endTime = []
    speaker_list = []
    wuid_list = []
    if target_idx_list is None:
        col_dict = {
            "segment_nostress": "stop", 
            "startTime": "stop_startTime", 
            "endTime": "stop_endTime", 
            "phone_path": "stop_path", 
            "speaker": "speaker", 
            "wuid": "wuid"
        }
        word_phonemes = df[df["wuid"].isin(name_list)]
        word_phonemes["wuid_g"] = word_phonemes["wuid"]
        outdf = word_phonemes.groupby("wuid_g").first().reset_index()
        outdf = outdf[col_dict.keys()]
        outdf = outdf.rename(columns=col_dict)
    else:
        for name, target_idx in zip(name_list, target_idx_list): 
            # this is one word, there might be multiple matching cases
            word_phonemes = df[df["wuid"] == name]
            for target in target_idx: 
                target = [i + 1 for i in target]
                target_phonemes = word_phonemes[word_phonemes["in_id"].isin(target)]
                # print(target_phonemes)
                sib = target_phonemes.iloc[0]
                stop = target_phonemes.iloc[1]
                sibilant_list.append(sib["segment_nostress"])
                sibilant_path.append(sib["phone_path"])
                sibilant_startTime.append(sib["startTime"])
                sibilant_endTime.append(sib["endTime"])

                stop_list.append(stop["segment_nostress"])
                stop_path.append(stop["phone_path"])
                stop_startTime.append(stop["startTime"])
                stop_endTime.append(stop["endTime"])

                speaker_list.append(stop["speaker"])
                wuid_list.append(name)
        out_dict = {
            "sibilant": sibilant_list, 
            "stop": stop_list, 
            "sibilant_path": sibilant_path, 
            "stop_path": stop_path, 
            "sibilant_startTime": sibilant_startTime, 
            "sibilant_endTime": sibilant_endTime, 
            "stop_startTime": stop_startTime,
            "stop_endTime": stop_endTime,
            "speaker": speaker_list,
            "wuid": wuid_list
        }
        outdf = pd.DataFrame(out_dict)
    return outdf

In [184]:
sibstopsdf = generate_table(guide_file, sibstop_indices, sibstop_subidx)

In [200]:
stopsdf = generate_table(guide_file, stop_indices)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  word_phonemes["wuid_g"] = word_phonemes["wuid"]


In [202]:
result_df = pd.concat([sibstopsdf, stopsdf], ignore_index=True, sort=False)

In [206]:
result_df["phi_type"] = np.where(result_df["sibilant"].isna(), "T", "ST")

In [212]:
tg = result_df[result_df["phi_type"] == "T"]
stg = result_df[result_df["phi_type"] == "ST"]

In [213]:
tg.to_csv(os.path.join(src_, "phi-T-guide.csv"), index=False)
stg.to_csv(os.path.join(src_, "phi-ST-guide.csv"), index=False)