# Idea

### Режимы работы библиотеки:

1. На вход подаётся текст, на выходе получаем фонемы (разные варианты прочтения)
2. На вход подаем фонемы двух типов и получаем их разницу
3. Мы подаем текст/фонемы и логиты (вероятности каждой фонемы в каждом звуковом окне) и получаем транскрипцию

### Хотим учесть:

- cmudict + lextool для первичного преобразования текста в фонемы по словарю/эвристикам
- функции для работы с фонемами - трансляция 61-48-39, one-letter-encoding, diff функции и подсчет операций
- витерби для третьего режима
- эвристики на стыке слов и похожие звуки
- сравнение с фонемами тимита
- разные варианты произношения слова

На будущее:
- сравнить с https://github.com/bootphon/phonemizer на различных бэкендах - используют ли они тоже cmudict внутри или какой-то свой фонемайзер
- ударения — как в отдельных словах, так и в предложении целиком

### Начальная статистика сравнения словарных транскрипций с разметкой TIMIT (CER):

```
sum(stats["cer"])/len(stats["cer"])
0.24904557704602598

sorted(stats["cer"])[int(len(stats["cer"])/2)]
0.25
```

In [5]:
# 1. Translate the TIMIT transcripts into phonemes with cmudict

import torch
from glob import glob
from soundfile import read as read_sound

import editdistance

CMU_DICT = "/home/gazay/code/phns/phns/vendor/cmudict/cmudict.dict"
PHNS_MAP = "/home/gazay/code/berloga-dl/accent2/lenin_accent/utils/phones.61-48-39.map"


# Pronunciation lookup: word -> list of lowercase phonemes with the stress
# digits removed (e.g. "AH0" -> "ah").
cmu = {}
with open(CMU_DICT) as cmu_file:
    for line in cmu_file.read().split("\n"):
        # cmudict.dict entries may end with a "# comment"; cut it off so the
        # comment words are not stored as phonemes.
        fields = line.split("#", 1)[0].split()
        # A blank line (e.g. after the trailing newline) is still stored as
        # key '' with no phonemes: TimitDataset.__getitem__ can produce empty
        # tokens and looks them up in this dict.
        headword = fields[0] if fields else ""
        cmu[headword] = [
            "".join(ch for ch in phn.lower() if not ch.isdigit())
            for phn in fields[1:]
        ]
    
class TimitDataset(torch.utils.data.Dataset):
    """TIMIT utterances paired with dictionary-based pronunciations.

    Every item combines the reference phonemes read from the ``.PHN`` file
    with the phonemes obtained by looking each word of the ``.TXT`` prompt up
    in the module-level ``cmu`` dict, plus the edit-distance error rate
    between the two sequences.
    """

    # Punctuation removed from the prompt before the dictionary lookup; '-' is
    # turned into a space so both halves of a hyphenated word are looked up.
    _PUNCTUATION = str.maketrans({
        '.': None,
        '\n': None,
        '?': None,
        ',': None,
        ';': None,
        ':': None,
        '"': None,
        '!': None,
        '-': ' ',
    })

    def __init__(self, path):
        """Index every ``*.wav`` file found three directory levels below ``path``."""
        self.path = path
        self.wavs = glob(path + '/*/*/*/*.wav')

    def __len__(self):
        """Return the number of indexed wav files."""
        return len(self.wavs)

    def __read_phns__(self, path):
        """Read a ``.PHN`` file and return its remapped phonemes without 'sil'.

        Only the last space-separated field of every non-empty line is used.
        """
        with open(path) as phn_file:
            raw_phns = phn_file.read()
        phns = [line.split(' ')[-1] for line in raw_phns.split('\n') if line]
        phns = remap(phns)
        return [phn for phn in phns if phn != 'sil']

    def __read_txt__(self, path):
        """Read a ``.TXT`` file and return it without its first two fields."""
        with open(path) as txt_file:
            raw_text = txt_file.read()
        return raw_text.split(' ', 2)[-1]

    def __getitem__(self, idx):
        """Build the comparison record for utterance ``idx``.

        Returns:
            dict with keys ``wav`` (whatever ``read_sound`` returns),
            ``orig_phns`` / ``orig_cmu_phns`` (phoneme lists), ``phns`` /
            ``cmu_phns`` (the same sequences single-char encoded), ``text``
            (raw prompt) and ``cer``. ``cer`` is 0 when any word is missing
            from ``cmu`` or when the reference phoneme list is empty.
        """
        wav_path = self.wavs[idx]
        phn_path = wav_path.replace('.WAV.wav', '.PHN')
        txt_path = wav_path.replace('.WAV.wav', '.TXT')
        wav = read_sound(wav_path)
        phns = self.__read_phns__(phn_path)
        text = self.__read_txt__(txt_path)
        words = text.lower().translate(self._PUNCTUATION).split(' ')

        cmu_phns = []
        has_unknown_word = False
        for word in words:
            if not word:
                # Consecutive spaces (e.g. around a replaced '-') yield empty
                # tokens; they carry no phonemes and are not "unknown words".
                continue
            if word not in cmu:
                print("word not in dict: ", word)
                has_unknown_word = True
                continue
            cmu_phns.extend(cmu[word])

        if has_unknown_word or not phns:
            # The dictionary transcription is incomplete (or there is nothing
            # to compare against, which would otherwise divide by zero), so
            # the error rate is neutralised.
            cer = 0
        else:
            cer = editdistance.eval(phns, cmu_phns) / len(phns)

        return {
            "wav": wav,
            "orig_phns": phns,
            "orig_cmu_phns": cmu_phns,
            "phns": single_char_encode(phns),
            "text": text,
            "cmu_phns": single_char_encode(cmu_phns),
            "cer": cer,
        }

In [7]:
# for unknown words http://www.speech.cs.cmu.edu/tools/lextool.html

import os
import IPython
from diff_match_patch import diff_match_patch
# Single diff_match_patch instance; show_diff() uses it to compute and render diffs.
DIFFER = diff_match_patch()

def show_diff(r, t):
    """Display an HTML-rendered diff between the texts ``r`` and ``t``."""
    diffs = DIFFER.diff_main(r, t)
    pretty_html = DIFFER.diff_prettyHtml(diffs)
    rendered = IPython.display.HTML(pretty_html)
    display(rendered)
    
def diff(item):
    """Print ``item["text"]`` and show the diff of ``item["phns"]`` vs ``item["cmu_phns"]``."""
    prompt = item["text"]
    print(prompt)
    reference = item["phns"]
    dictionary_based = item["cmu_phns"]
    show_diff(reference, dictionary_based)
    
def load_phone_map(path=None):
    """Load the two phone-folding tables from a three-column map file.

    Only lines with exactly three whitespace-separated fields are used; any
    other line (blank, or a phone without a mapping) is ignored.

    Args:
        path: Map file to read. Defaults to the module-level ``PHNS_MAP``.

    Returns:
        Tuple ``(m61_48, m48_39)``: a dict from the first column to the
        second, and a dict from the second column to the third.
    """
    if path is None:
        path = PHNS_MAP
    m61_48, m48_39 = {}, {}
    with open(path, 'r') as fid:
        for line in fid:
            fields = line.strip().split()
            if len(fields) != 3:
                continue
            m61_48[fields[0]] = fields[1]
            m48_39[fields[1]] = fields[2]
    return m61_48, m48_39

# Load both folding tables once; the remap helpers below read them as globals.
m61_48, m48_39 = load_phone_map()

def remap_48_to_39(data):
    """Map each phoneme through ``m48_39``; phonemes missing from the table are dropped."""
    folded = []
    for phone in data:
        if phone in m48_39:
            folded.append(m48_39[phone])
    return folded

def remap_61_to_48(data):
    """Map each phoneme through ``m61_48``; phonemes missing from the table are dropped."""
    folded = []
    for phone in data:
        if phone in m61_48:
            folded.append(m61_48[phone])
    return folded

# TODO: document phonems in different models/datasets
def remap(data):
    """Fold a phoneme sequence through ``m61_48`` and then ``m48_39``.

    Special cases: 'q' is dropped, 'dx' becomes 'd', and 'sil' is passed
    through unchanged. Any other phoneme must be present in the tables,
    otherwise the lookup raises ``KeyError``.
    """
    remapped = []
    for phoneme in data:
        if phoneme == 'q':
            continue
        if phoneme == 'dx':
            # dx is missing from awni 39 phonemes
            remapped.append('d')
            continue
        if phoneme == 'sil':
            # in case we override phoneme target and use SIL symbol
            remapped.append(phoneme)
            continue
        remapped.append(m48_39[m61_48[phoneme]])
    return remapped

def single_char_encode(phns):
    """Encode a phoneme sequence as a string with one character per phoneme.

    Raises ``KeyError`` for a phoneme that is not in ``one_letter_encoding``.
    """
    symbols = []
    for phn in phns:
        symbols.append(one_letter_encoding[phn])
    return ''.join(symbols)


# Phoneme label -> unique single character, in alphabetical label order.
one_letter_encoding = {
    'aa': 'a', 'ae': '@', 'ah': 'A', 'ao': 'c',
    'aw': 'W', 'ax': 'x', 'ay': 'Y', 'b': 'b',
    'ch': 'C', 'cl': '-', 'd': 'd', 'dh': 'D',
    'dx': 'F', 'eh': 'E', 'el': 'L', 'en': 'N',
    'epi': '=', 'er': 'R', 'ey': 'e', 'f': 'f',
    'g': 'g', 'hh': 'h', 'ih': 'I', 'ix': 'X',
    'iy': 'i', 'jh': 'J', 'k': 'k', 'l': 'l',
    'm': 'm', 'n': 'n', 'ng': 'G', 'ow': 'o',
    'oy': 'O', 'p': 'p', 'r': 'r', 's': 's',
    'sh': 'S', 'sil': '_', 't': 't', 'th': 'T',
    'uh': 'U', 'uw': 'u', 'v': 'v', 'vcl': '+',
    'w': 'w', 'y': 'y', 'z': 'z', 'zh': 'Z',
}

In [8]:
ds = TimitDataset(path="/home/gazay/datasets/TIMIT")

In [9]:
item = next(iter(ds))
item["text"]

'Swing your arm as high as you can.\n'

In [None]:
# item["orig_phns"]

In [10]:
# Per-utterance statistics gathered over the whole dataset.
stats = {
    "len": [],  # length of the single-char encoded reference phoneme string
    "cer": []   # error rate reported by TimitDataset.__getitem__
}

# Iterating the dataset runs __getitem__ for every wav file, which also prints
# each prompt word that is missing from the dictionary.
for item in ds:
    stats["len"].append(len(item["phns"]))
    stats["cer"].append(item["cer"])

word not in dict:  motorists'
word not in dict:  morphophonemic
word not in dict:  nihilistic
word not in dict:  radiosterilization
word not in dict:  exhusband
word not in dict:  somebody'll
word not in dict:  smolderingly
word not in dict:  geocentricism
word not in dict:  unmagnified
word not in dict:  stirrin
word not in dict:  utopianism
word not in dict:  infuriation
word not in dict:  preprepared
word not in dict:  understandingly
word not in dict:  eventualities
word not in dict:  micrometeorites
word not in dict:  herdin'
word not in dict:  responsively
word not in dict:  demineralization
word not in dict:  herdin'
word not in dict:  unwaveringly
word not in dict:  cap'n
word not in dict:  mournfully
word not in dict:  andrei's
word not in dict:  autofluorescence
word not in dict:  fasciculations
word not in dict:  weatherstrip
word not in dict:  nonsystematic
word not in dict:  traditionalism
word not in dict:  chorused
word not in dict:  micrometeorite
word not in dict:  reu

In [11]:
show_diff(item["phns"], item["cmu_phns"])

In [12]:
sum(stats["cer"])/len(stats["cer"])

0.24904557704602637

In [13]:
sorted(stats["cer"])[int(len(stats["cer"])/2)]

0.25

In [14]:
stats["cer"][:10]

[0.2631578947368421,
 0.3548387096774194,
 0.23809523809523808,
 0.17857142857142858,
 0.1111111111111111,
 0.18181818181818182,
 0.1,
 0.225,
 0.25,
 0.19148936170212766]

In [None]:
cmu['spend']

In [15]:
item = ds.__getitem__(0)
diff(item)

Swing your arm as high as you can.



In [20]:
item['phns']

'swIGyRarmEzhYIzuk@n'

In [21]:
item['orig_phns']

['s',
 'w',
 'ih',
 'ng',
 'y',
 'er',
 'aa',
 'r',
 'm',
 'eh',
 'z',
 'hh',
 'ay',
 'ih',
 'z',
 'uw',
 'k',
 'ae',
 'n']

In [18]:
import numpy as np
import os
from glob import glob
from tqdm import tqdm

import sys
sys.path.append('../')
import phns

TIMIT_ROOT = "/home/gazay/datasets/TIMIT"

# Collect every TIMIT prompt together with its remapped reference phonemes.
data = []
for text_path in glob(TIMIT_ROOT + "/*/*/*/*.TXT"):
    phns_path = text_path.replace(".TXT", ".PHN")
    # Context managers close each file right away instead of leaving two open
    # handles per utterance to the garbage collector.
    with open(text_path) as text_file:
        # Drop the first two space-separated fields and keep the rest.
        text = text_file.read().split(" ", 2)[-1]
    with open(phns_path) as phns_file:
        phn_lines = phns_file.read().split("\n")
    # The phoneme label is the last space-separated field of each non-empty line.
    _phns = [line.split(" ")[-1] for line in phn_lines if line]
    _phns = phns.utils.remap(_phns)

    data.append({"text": text, "phns": _phns})

# Error rate of the closest generated pronunciation variant per utterance.
cers = []
for _item in tqdm(data):
    calculated_phns_variants = phns.from_text(_item["text"])
    if not calculated_phns_variants:
        continue

    best = phns.closest(_item["phns"], calculated_phns_variants)
    cers.append(best["cer"])
    # NOTE(review): this stops after the first utterance that yields variants;
    # presumably a debugging leftover -- confirm before removing.
    break

  0%|          | 0/6300 [00:00<?, ?it/s]


In [19]:
_item

{'text': 'She had your dark suit in greasy wash water all year.\n',
 'phns': ['sil',
  'sh',
  'iy',
  'hh',
  'ae',
  'sil',
  'd',
  'y',
  'er',
  'sil',
  'd',
  'aa',
  'r',
  'sil',
  'k',
  's',
  'uw',
  'sil',
  't',
  'ih',
  'n',
  'sil',
  'g',
  'r',
  'iy',
  's',
  'iy',
  'w',
  'aa',
  'sh',
  'sil',
  'w',
  'aa',
  'd',
  'er',
  'aa',
  'l',
  'y',
  'ih',
  'er',
  'sil']}