In [None]:
import os
import sys
import json
import logging
import multiprocessing


import gentle
import scipy.io.wavfile as sciwav

In [None]:
# Root of the extracted LibriSpeech archive; the 'dev-clean' split is read from inside it.
libri_root = '/home/arunav/Desktop/8th-semester/RE/lexical-stress-detection-master/dev-clean/LibriSpeech'

# Tab-separated manifest (id, relative wav path, transcript) written next to the
# corpus. Derived from libri_root so the two locations cannot drift apart.
out_file = libri_root + '/output.csv'


In [None]:
# Write the alignment manifest: one line per utterance of the 'dev-clean' split,
# formatted as "libri_<id>\t<relative wav path>\t<lower-cased transcript>".
top_dir = 'dev-clean'
split_root = os.path.join(libri_root, top_dir)

# The handle gets its own name so `out_file` stays the path string and this cell
# can be re-run; `with` closes the file even if a transcript line is malformed.
with open(out_file, 'w') as manifest:
    # A missing split directory yields an empty manifest, as before.
    # Sorting makes the manifest order reproducible across machines.
    speakers = sorted(os.listdir(split_root)) if os.path.isdir(split_root) else []

    for speaker in speakers:
        speaker_dir = os.path.join(split_root, speaker)
        if not os.path.isdir(speaker_dir):
            continue  # stray file next to the speaker folders

        for section in sorted(os.listdir(speaker_dir)):
            section_dir = os.path.join(speaker_dir, section)
            if not os.path.isdir(section_dir):
                continue  # stray file next to the section folders

            trans_file = os.path.join(section_dir, speaker + '-' + section + '.trans.txt')

            with open(trans_file, 'r') as t:
                for line in t:
                    # Strip only the line terminator; slicing off the last character
                    # would truncate a final line that has no trailing newline.
                    line = line.rstrip('\n')
                    if not line.strip():
                        continue

                    id_, transcript = line.split(' ', 1)
                    transcript = transcript.lower()

                    # Path is relative to libri_root and always '/'-separated because
                    # the alignment step rebuilds it with wav_root + '/' + path.
                    # NOTE(review): LibriSpeech is distributed as .flac; this assumes
                    # the audio was already converted to .wav - confirm.
                    audio_file_path = '/'.join((top_dir, speaker, section, id_ + '.wav'))

                    manifest.write('libri_' + id_ + '\t' + audio_file_path + '\t' + transcript + '\n')

In [None]:
# Words passed to gentle.ForcedAligner as its `disfluencies` argument.
DISFLUENCIES = {'uh', 'um'}  # set of disfluencies
# Gentle resources object, built once here and reused by every align_audio call.
RESOURCES = gentle.Resources()
# Thread count handed to the aligner: one per CPU reported by the OS.
N_THREADS = multiprocessing.cpu_count()

# Root logger at INFO, so the DEBUG-level messages from _on_progress are not emitted.
logging.getLogger().setLevel("INFO")

In [None]:

def _on_progress(p):
    for k, v in p.items():
        logging.debug("%s: %s" % (k, v))

In [None]:

def _get_key_val_pair(line):
    line_split = line[:-1].split()
    word = line_split[0]
    if word[-1] == ')':
        word = word.split('(')[0]

    word = word.lower()
    key = [word]
    val = []
    for phoneme in line_split[1:]:
        val.append(phoneme.lower())
        if phoneme[-1].isdigit():
            phoneme = phoneme[:-1]

        phoneme = phoneme.lower()
        key.append(phoneme)

    key = " ".join(key)
    val = tuple(val)
    return key, val


In [None]:
# Default location of the CMU pronouncing dictionary used by _create_dict.
_CMU_DICT_PATH = '/home/arunav/Desktop/8th-semester/RE/lexical-stress-detection-master/alignment/cmudict-0.7b.txt'


def _create_dict(cmu_path=_CMU_DICT_PATH):
    """Load the CMU dictionary into a lookup table keyed by word + unstressed phonemes.

    Args:
        cmu_path: path of the dictionary file; defaults to the project copy.

    Returns:
        dict mapping the key built by ``_get_key_val_pair`` to the tuple of
        phonemes with stress digits. When two entries produce the same key,
        the later line wins.
    """
    phoneme_alignment_dict = dict()

    # latin-1 maps every byte to a character, so decoding cannot fail.
    # NOTE(review): the upstream cmudict-0.7b release contains ISO-8859-1
    # accented entries and is not valid UTF-8 - confirm for the local copy.
    with open(cmu_path, 'r', encoding='latin-1') as cmu_file:
        for line in cmu_file:
            # Skip blank lines and the ';;;' comment header so they neither
            # crash the parser nor end up as bogus dictionary entries.
            if not line.strip() or line.startswith(';;;'):
                continue

            key, val = _get_key_val_pair(line)
            phoneme_alignment_dict[key] = val

    return phoneme_alignment_dict




In [None]:
def align_audio(wav_path, transcript):
    """Force-align ``transcript`` against the audio at ``wav_path`` using Gentle.

    Args:
        wav_path: path of the audio file handed to ``gentle.resampled``.
        transcript: text to align against the audio.

    Returns:
        The aligner result, round-tripped through its JSON form into plain
        Python objects.
    """
    with gentle.resampled(wav_path) as resampled_wav:
        print("starting alignment {}".format(wav_path))
        forced_aligner = gentle.ForcedAligner(
            RESOURCES,
            transcript,
            nthreads=N_THREADS,
            disfluency=False,
            conservative=False,
            disfluencies=DISFLUENCIES,
        )
        transcription = forced_aligner.transcribe(
            resampled_wav, progress_cb=_on_progress, logging=logging
        )
        alignment = json.loads(transcription.to_json())

    return alignment

In [None]:
# Single project root; every path below is derived from it so relocating the
# project means editing one line.
project_root = "/home/arunav/Desktop/8th-semester/RE/lexical-stress-detection-master"

# Directory that the relative wav paths in the manifest are resolved against.
wav_root = project_root + "/dev-clean/LibriSpeech"
# Manifest with tab-separated id, relative wav path and transcript.
input_csv = wav_root + "/output.csv"
# Directory receiving one sliced wav file per stressed vowel phoneme.
phoneme_path = project_root + "/dataset"
# Label file: one tab-separated row per sliced phoneme wav.
output_csv = project_root + "/output_csv.csv"

In [None]:
# For every utterance in the manifest: force-align it, then write each phoneme
# that carries a stress digit to its own wav file plus one row in the label file.
alignment_dict = _create_dict()

# The wav writer does not create missing directories, so make sure it exists.
os.makedirs(phoneme_path, exist_ok=True)

# `with` closes both files even when reading or aligning an utterance raises.
# The handles get their own names so they do not shadow the `out_file` path
# defined in an earlier cell.
with open(input_csv, 'r') as manifest_in, open(output_csv, 'w') as labels_out:
    for line in manifest_in:
        # Drop the line terminator so it does not end up inside the transcript.
        line = line.rstrip('\n')
        if not line:
            continue

        # maxsplit=2 keeps a tab inside the transcript from breaking the unpacking.
        id_, wav_file, transcript = line.split('\t', 2)
        wav_file = wav_root + '/' + wav_file
        sr, signal = sciwav.read(wav_file)
        print(transcript)
        alignment = align_audio(wav_file, transcript)

        for word in alignment['words']:
            if word['case'] != 'success':
                continue

            aligned_word = word['alignedWord']
            phones = word['phones']

            # Same key shape as _create_dict: lower-cased word followed by the
            # phone labels with everything after '_' removed.
            key = ' '.join([aligned_word.lower()] + [p['phone'].split('_')[0] for p in phones])
            phoneme_tuple = alignment_dict.get(key, ())

            if len(phoneme_tuple) == 0:
                print('word: {} not in dict, skipping...'.format(word))
                continue

            if len(phoneme_tuple) != len(phones):
                print('word: {} not aligned properly, skipping...'.format(word))
                continue

            # now map phonemes and slice wav; phones are laid end to end from the
            # word's start time, each one lasting its reported duration (seconds)
            phone_start = word['start']
            for i, phoneme in enumerate(phones):
                phone_end = phone_start + phoneme['duration']

                # check if vowel phoneme (dictionary entry ends in a stress digit)
                if phoneme_tuple[i][-1].isdigit():
                    file_name = id_ + '_' + aligned_word + '_' + phoneme_tuple[i] + '_' + \
                                str(int(phone_start * 1000)) + '_' + str(int(phone_end * 1000)) + '.wav'

                    # seconds -> sample indices of the signal read above
                    start_frame, end_frame = int(phone_start * sr), int(phone_end * sr)
                    sciwav.write(phoneme_path + '/' + file_name, sr, signal[start_frame:end_frame])
                    labels_out.write(file_name + '\t' + id_ + '\t' + aligned_word + '\t' + phoneme_tuple[i] + '\n')

                phone_start = phone_end

        print('done alignment and slicing for file: {}'.format(wav_file))

