# Install the dependencies needed later in this notebook


In [3]:
!pip install textgrid transformers librosa
!pip install speechbrain

Collecting textgrid
  Downloading TextGrid-1.5-py3-none-any.whl (10.0 kB)
Installing collected packages: textgrid
Successfully installed textgrid-1.5
Collecting speechbrain
  Downloading speechbrain-0.5.15-py3-none-any.whl (553 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m553.8/553.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl (16 kB)
Collecting sentencepiece (from speechbrain)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting ruamel.yaml>=0.17.28 (from hyperpyyaml->speechbrain)
  Downloading ruamel.yaml-0.18.5-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.4/116.4 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ruamel.yaml.

# Import modules

In [18]:
#import necessary libraries
from google.colab import drive, files
import torch
import os
import csv
from glob import glob
from textgrid import TextGrid, IntervalTier
from speechbrain.dataio.dataio import read_audio
import re
import copy
from collections import defaultdict
import pdb
from tqdm.auto import tqdm

# Mount the google drive

In [2]:
# Mount Google Drive at /content/drive; every project path defined below lives under it.
# NOTE(review): force_remount=True presumably forces a fresh mount when the cell is
# re-run with the drive already mounted — confirm against the google.colab docs.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Define Constants

In [6]:
# Project root on the mounted Google Drive. All other paths are derived from it so that
# relocating the project only requires editing this one line.
folder_path = '/content/drive/MyDrive/CS5647_Project'
# Make the project root the working directory and echo it as a sanity check.
os.chdir(folder_path)
current_directory = os.getcwd()
print("Current Working Directory after change:", current_directory)
# Dataset root: one sub-folder per speaker, each holding wav/annotation/transcript folders.
dataset_path = os.path.join(folder_path, 'dataset')
# Speakers to process.
speaker_ids = ['ASI', 'RRBI','SVBI','TNI', 'BWC', 'LXC', 'NCC', 'TXHC']
# Destination of the processed CSV (one row per utterance).
csv_data_path = os.path.join(folder_path, 'data', 'processed_data.csv')
# Sample rate (Hz) used to convert a sample count into a duration in seconds.
AUDIO_SAMPLE_RATE = 44100
# Text file listing the ARPA phoneme inventory (first token of each line).
phn_set = os.path.join(folder_path, 'arpa_phonemes')

Current Working Directory after change: /content/drive/MyDrive/CS5647_Project


# Read the predefined list of ARPA phonemes

In [7]:
def process_arpa_phoneme(path):
    """Read a phoneme inventory file and return its phoneme symbols.

    The first whitespace-separated token of every non-blank line is taken as a
    phoneme; anything after it on the line is ignored. Blank or whitespace-only
    lines (e.g. a trailing empty line) are skipped instead of raising IndexError.

    Args:
        path: Path to the phoneme list file.
    Returns:
        List of phoneme symbols in file order.
    """
    arpa_phonemes = []
    with open(path, 'r') as f:
        for line in f:
            items = line.split()
            if not items:
                # Nothing on this line: there is no first token to take.
                continue
            arpa_phonemes.append(items[0])
    return arpa_phonemes


In [8]:
# Phoneme inventory loaded once from `phn_set`; normalize_tier_mark checks every
# normalized label against this list.
ARPA_PHONEMES = process_arpa_phoneme(phn_set)

# Util functions


In [9]:
def is_sil(s: str) -> bool:
    """Tell whether a phoneme label denotes silence.

    The comparison is case-insensitive, and an empty label counts as silence.

    Args:
        s: A phoneme label.
    Returns:
        True when the label is a silence marker, False otherwise.
    """
    silence_labels = ("sil", "sp", "spn", "pau", "")
    return s.lower() in silence_labels

In [10]:
def normalize_phone(s: str, is_rm_annotation=True, is_phoneme_canonical=True,keep_artificial_sil=False) -> str:
    """Normalize a phoneme label to lower case, stress-free form.

    Also handles L2-ARCTIC annotations, which are comma-separated marks of the
    form "canonical,perceived,tag". The tag values handled here are 's', 'd'
    and 'a' (presumably substitution / deletion / addition).

    Args:
        s: A phoneme annotation.
        is_rm_annotation: [optional] If true, reduce an annotated mark to a single
            phoneme; otherwise return the cleaned annotation string as is.
        is_phoneme_canonical: [optional] If true, return the canonical phoneme;
            otherwise return the perceived phoneme.
        keep_artificial_sil: If true, keep the artificial sil produced by the way
            L2-ARCTIC was annotated; if false, drop it.
            e.g. when false, 'ah, sil, d' canonical: ah, perceived: None
                 when true, 'ah, sil, d' canonical: ah, perceived: sil
    Returns:
        The normalized phoneme, the cleaned annotation string (when
        is_rm_annotation is false), or None when the phone must be dropped
        (artificial sil, or an unrecognized tag). Silence labels, including
        an empty label, are returned as "sil". A plain 'ax' is mapped to 'ah';
        fields taken from an annotated mark are returned unmapped.
    Raises:
        ValueError: If an annotated mark has fewer than three fields while the
            tag is needed (is_rm_annotation true, keep_artificial_sil false).
    """
    # After lower-casing, drop every character that is not a letter or a comma
    # (stress digits, '*', spaces, ...).
    parse_tag = re.sub(r"[^a-z,]", "", s.lower())
    # is_sil() also matches the empty string, so blank marks come out as "sil".
    if is_sil(parse_tag):
        return "sil"

    fields = parse_tag.split(",")
    if len(fields) == 1:
        # Plain, un-annotated label.
        return 'ah' if fields[0] == 'ax' else fields[0]

    if not is_rm_annotation:
        return parse_tag

    if keep_artificial_sil:
        # Keep both sides of the annotation aligned: field 0 is canonical,
        # field 1 is perceived, whatever the tag says.
        return fields[0] if is_phoneme_canonical else fields[1]

    if len(fields) < 3:
        # The tag is required below; fail with a readable message instead of
        # a bare IndexError.
        raise ValueError("Input %s is invalid." % s)

    error_tag = fields[2]
    if is_phoneme_canonical:
        # 'a' entries have no canonical phone, so they are dropped (None).
        if error_tag in ('s', 'd'):
            return fields[0]
    else:
        # 'd' entries have no perceived phone, so they are dropped (None).
        if error_tag in ('s', 'a'):
            return fields[1]
    # Dropped phone, or a tag that is not one of 's', 'd', 'a'.
    return None


In [11]:
def normalize_tier_mark(tier: IntervalTier,
                        mode="NormalizePhoneCanonical", keep_artificial_sil=False) -> IntervalTier:
    """Normalize the marks of an IntervalTier.

    Works on a deep copy, so the input tier is left untouched. Intervals whose
    normalized mark is None are left out of the result.

    Args:
        tier: An IntervalTier object.
        mode: How each mark is normalized. One of
            "NormalizePhoneCanonical" (keep the canonical pronunciation),
            "NormalizePhonePerceived" (keep the perceived pronunciation),
            "NormalizePhoneAnnotation" (keep the annotation string).
        keep_artificial_sil: Forwarded to normalize_phone in the canonical and
            perceived modes.
    Returns:
        A new IntervalTier holding the mark-normalized intervals.
    Raises:
        ValueError: If the mode is unknown, if the mode is "NormalizeWord"
            (accepted by name but not implemented here), or if a normalized
            mark is neither in ARPA_PHONEMES nor "err".
    """
    if mode == "NormalizeWord":
        # Previously this passed validation and then crashed with
        # UnboundLocalError on the first interval.
        raise ValueError("Mode %s is not implemented." % mode)
    if mode not in {"NormalizePhoneCanonical",
                    "NormalizePhonePerceived",
                    "NormalizePhoneAnnotation"}:
        raise ValueError("Mode %s is not valid." % mode)

    # Build the lookup once rather than concatenating lists for every interval.
    valid_marks = set(ARPA_PHONEMES)
    valid_marks.add("err")

    tier_out = IntervalTier()
    for each_interval in copy.deepcopy(tier).intervals:
        original_mark = each_interval.mark
        if mode == "NormalizePhoneCanonical":
            # Only keep the canonical pronunciation.
            p = normalize_phone(original_mark, True, True, keep_artificial_sil)
        elif mode == "NormalizePhonePerceived":
            # Only keep the perceived pronunciation.
            p = normalize_phone(original_mark, True, False, keep_artificial_sil)
        else:
            # "NormalizePhoneAnnotation": keep the annotations.
            # NOTE(review): an annotated mark such as "ah,sil,d" is returned
            # whole here and cannot pass the phoneme check below — confirm
            # whether this mode is meant to be validated.
            p = normalize_phone(original_mark, False)

        if p is None:
            # The phone is dropped in this view (e.g. an artificial sil).
            continue
        if p == 'ax':
            p = 'ah'
        if p not in valid_marks:
            # Fail loudly with context instead of dropping into pdb.
            raise ValueError(
                "Phoneme %r (from mark %r) is not a known ARPA phoneme." % (p, original_mark)
            )
        each_interval.mark = p
        tier_out.addInterval(each_interval)
    return tier_out


In [12]:
def tier_to_list(tier):
    """Collect the `mark` of every interval in `tier`, in iteration order."""
    marks = []
    for entry in tier:
        marks.append(entry.mark)
    return marks

def remove_repetitive_sil(phone_list):
    """Collapse each run of consecutive "sil" entries into a single "sil".

    A "sil" is dropped exactly when the element right after it is also "sil",
    so only the last silence of every run survives, e.g.

        ["a", "sil", "sil", "sil", "b"]  ->  ["a", "sil", "b"]

    A new list is returned; the input is not modified.
    """
    last_index = len(phone_list) - 1
    kept = []
    for idx, phone in enumerate(phone_list):
        followed_by_sil = idx < last_index and phone_list[idx + 1] == "sil"
        if phone == "sil" and followed_by_sil:
            # Not the final silence of its run: skip it.
            continue
        kept.append(phone)
    return kept

In [13]:
def get_phonemes(tg, keep_artificial_sil=False, rm_repetitive_sil=True):
    """Extract canonical and perceived phoneme strings from a TextGrid.

    Args:
        tg: Object providing `getFirst("phones")`, which returns the phone tier.
        keep_artificial_sil: Forwarded to normalize_tier_mark. When true the two
            sequences are expected to have the same length, which is asserted.
        rm_repetitive_sil: When true, runs of consecutive "sil" are collapsed
            in both sequences.
    Returns:
        Tuple (canonical, perceived), each a space-joined phoneme string.
    """
    phones = tg.getFirst("phones")
    perceived_tier = normalize_tier_mark(
        phones, "NormalizePhonePerceived", keep_artificial_sil
    )
    canonical_tier = normalize_tier_mark(
        phones, "NormalizePhoneCanonical", keep_artificial_sil
    )
    canonical = tier_to_list(canonical_tier)
    perceived = tier_to_list(perceived_tier)

    if keep_artificial_sil:
        # With the artificial sils preserved, both views must line up one-to-one.
        assert len(canonical) == len(perceived)

    if rm_repetitive_sil:
        canonical = remove_repetitive_sil(canonical)
        perceived = remove_repetitive_sil(perceived)

    return " ".join(canonical), " ".join(perceived)


In [14]:
def process_annotation_data(tg, wav_file, text_file, spkr):
    """Build the CSV row describing one annotated utterance.

    Args:
        tg: Parsed TextGrid of the utterance (must contain a "phones" tier).
        wav_file: Path to the utterance audio; also used as the row "ID".
        text_file: Path to the transcript; only its first line is read.
        spkr: Speaker identifier stored as "spk_id".
    Returns:
        Dict with keys "ID", "wav", "duration", "spk_id", "canonical_aligned",
        "perceived_aligned", "perceived_train_target" and "wrd".
    """
    row_data = {}
    row_data['ID'] = wav_file
    row_data["wav"] = wav_file
    # Read the signal only to derive the duration in seconds.
    # NOTE(review): assumes every file is sampled at AUDIO_SAMPLE_RATE and that
    # len(signal) is the number of samples — TODO confirm.
    signal = read_audio(wav_file)
    row_data["duration"] = len(signal) / AUDIO_SAMPLE_RATE
    row_data["spk_id"] = spkr

    # Original human annotation: keep the artificial sils and do not collapse
    # repeated sils, so canonical and perceived stay aligned one-to-one.
    cano_phns_align, perc_phns_align = get_phonemes(
        tg, keep_artificial_sil=True, rm_repetitive_sil=False
    )
    row_data["canonical_aligned"] = cano_phns_align
    row_data["perceived_aligned"] = perc_phns_align

    # Training target: perceived phones with artificial and repeated sils removed.
    _, target_phns = get_phonemes(tg, keep_artificial_sil=False, rm_repetitive_sil=True)
    row_data["perceived_train_target"] = target_phns

    with open(text_file, "r") as reader:
        # Strip the line terminator so the CSV field does not end with a
        # newline (which would turn the row into a multi-line record).
        text = reader.readline().strip()
    row_data["wrd"] = text
    return row_data





# Preparing the processed L2-ARCTIC dataset


In [19]:
def prepare_l2arctic(base_dir, output_csv, speakers=None):
    """Write one CSV row per annotated utterance of the selected speakers.

    For every speaker, files are read from `<base_dir>/<speaker>/annotation`
    (`*.TextGrid`), `<base_dir>/<speaker>/wav` and `<base_dir>/<speaker>/transcript`;
    the wav and transcript share the TextGrid's base name.

    Args:
        base_dir: Root folder containing one sub-folder per speaker.
        output_csv: Path of the CSV to create (overwritten if it exists).
        speakers: Optional iterable of speaker ids; defaults to the module-level
            `speaker_ids`.
    """
    if speakers is None:
        speakers = speaker_ids

    print(f"Creating {output_csv}")
    # Make sure the destination folder exists before opening the file.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    fieldnames = ["ID", "wav", "duration", "spk_id", "canonical_aligned",
                  "perceived_aligned", "perceived_train_target", "wrd"]
    with open(output_csv, mode='w', newline="") as csv_f:
        csv_writer = csv.DictWriter(csv_f, fieldnames=fieldnames, delimiter=",",
                                    quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csv_writer.writeheader()

        for spkr in tqdm(speakers):
            wav_dir = os.path.join(base_dir, spkr, 'wav')
            annotation_dir = os.path.join(base_dir, spkr, 'annotation')
            transcript_dir = os.path.join(base_dir, spkr, 'transcript')

            # glob() returns files in arbitrary order; sort for a reproducible CSV.
            tg_files = sorted(glob(os.path.join(annotation_dir, "*.TextGrid")))
            for tg_file in tqdm(tg_files):
                tg = TextGrid()
                try:
                    tg.read(tg_file)
                except ValueError as err:
                    # Still best-effort, but report what was skipped and why.
                    print(f"Skipping unreadable TextGrid {tg_file}: {err}")
                    continue
                # splitext keeps dotted base names intact (split(".")[0] would truncate them).
                base_name = os.path.splitext(os.path.basename(tg_file))[0]
                wav_file = os.path.join(wav_dir, base_name + ".wav")
                text_file = os.path.join(transcript_dir, base_name + '.txt')
                row_data = process_annotation_data(tg, wav_file, text_file, spkr)
                csv_writer.writerow(row_data)
            print(f"Succescuffly created for {spkr}!!")

In [20]:
# Build the processed-data CSV from the dataset folder.
prepare_l2arctic(
    base_dir=dataset_path,
    output_csv=csv_data_path,
)

Creating /content/drive/MyDrive/CS5647_Project/data/processed_data.csv


  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

Succescuffly created for ASI!!


  0%|          | 0/150 [00:00<?, ?it/s]

Succescuffly created for RRBI!!


  0%|          | 0/150 [00:00<?, ?it/s]

Succescuffly created for SVBI!!


  0%|          | 0/150 [00:00<?, ?it/s]

Succescuffly created for TNI!!


  0%|          | 0/150 [00:00<?, ?it/s]

Succescuffly created for BWC!!


  0%|          | 0/150 [00:00<?, ?it/s]

Succescuffly created for LXC!!


  0%|          | 0/150 [00:00<?, ?it/s]

Succescuffly created for NCC!!


  0%|          | 0/150 [00:00<?, ?it/s]

Succescuffly created for TXHC!!


# Splitting the processed dataset into train, validation and test sets by speaker


In [22]:
import pandas as pd

# Read the CSV containing all data. `csv_data_path` is reused (instead of repeating
# the literal path) so this read cannot drift from the file prepare_l2arctic wrote.
all_data = pd.read_csv(csv_data_path)

In [23]:
# Speaker-level split: each speaker belongs to exactly one of train / val / test.
train_speaker_ids = ['ASI', 'RRBI', 'BWC', 'LXC']
val_speaker_ids = ['SVBI', 'NCC']
test_speaker_ids = ['TNI', 'TXHC']

# Guard against typos: a misspelled or duplicated id would otherwise silently drop
# a speaker from every split or leak it into two of them.
_assigned_speakers = train_speaker_ids + val_speaker_ids + test_speaker_ids
assert len(_assigned_speakers) == len(set(_assigned_speakers)), \
    "A speaker is assigned to more than one split."
assert set(_assigned_speakers) == set(speaker_ids), \
    "The split must cover exactly the speakers in speaker_ids."

In [24]:
# Create separate DataFrames for train, val, and test sets
train_data = all_data[all_data["spk_id"].isin(train_speaker_ids)]
val_data = all_data[all_data["spk_id"].isin(val_speaker_ids)]
test_data = all_data[all_data["spk_id"].isin(test_speaker_ids)]

# Save the split datasets to separate CSV files
train_data.to_csv("/content/drive/MyDrive/CS5647_Project/data/train_data.csv", index=False)
val_data.to_csv("/content/drive/MyDrive/CS5647_Project/data/val_data.csv", index=False)
test_data.to_csv("/content/drive/MyDrive/CS5647_Project/data/test_data.csv", index=False)

print("Train, validation, and test datasets created and saved as train_data.csv, val_data.csv, and test_data.csv respectively.")


Train, validation, and test datasets created and saved as train_data.csv, val_data.csv, and test_data.csv respectively.
