In [1]:
import os
import pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
from collections import namedtuple
import sys
import nltk
from nltk.corpus import stopwords
import numpy as np
import shutil
from tqdm import tqdm
from collections import Counter

In [2]:
# Load the experiment configuration; the functions below read their paths and
# external tool locations from this dictionary.
with open("config.json") as config_handle:
    config = json.load(config_handle)

In [3]:
def check_and_create_dirs():
    """Create the experiment and output directories if they do not exist.

    The two directories are read from ``config["es"]["exp_path"]`` and
    ``config["es"]["out_path"]``. A message is printed for each directory
    that gets created, followed by a final ``...`` marker.
    """
    es_config = config["es"]
    required_dirs = [es_config[key] for key in ("exp_path", "out_path")]
    for directory in required_dirs:
        if os.path.exists(directory):
            continue
        print("creating folder: {0:s}".format(directory))
        os.makedirs(directory)
    print("...")
        

In [4]:
def get_wav_file_list(prefix, wav_path):
    """Return the ``.wav`` files found in ``wav_path`` as one newline-joined string.

    Args:
        prefix: Path component joined in front of every file name (the
            directory actually listed is ``wav_path``, not ``prefix``).
        wav_path: Directory whose entries are listed.

    Returns:
        A string with one ``os.path.join(prefix, name)`` entry per line, in
        the order returned by ``os.listdir``; there is no trailing newline,
        and the string is empty when no ``.wav`` file is present.
    """
    prefixed_names = []
    for entry in os.listdir(wav_path):
        if entry.endswith(".wav"):
            prefixed_names.append(os.path.join(prefix, entry))
    return "\n".join(prefixed_names)

In [5]:
def create_file_lst(file_lst_fname):
    """Write the wav file list for the train and test sets to ``file_lst_fname``.

    For each of the ``train`` and ``test`` sets, the ``.wav`` files found in
    ``config["es"][set]["wavs"]`` are written one per line, each prefixed with
    ``config["es"][set]["lst_file_prefix"]``.

    Args:
        file_lst_fname: Path of the list file to (over)write.
    """
    with open(file_lst_fname, "w") as out_f:
        for folder in ["train", "test"]:
            folder_config = config["es"][folder]
            wav_file_list_string = get_wav_file_list(folder_config["lst_file_prefix"],
                                                     folder_config["wavs"])
            # The joined string has no trailing newline; terminate each set
            # explicitly so the last train entry and the first test entry do
            # not end up fused on a single line.
            if wav_file_list_string:
                out_f.write(wav_file_list_string + "\n")

    print("Finished writing files.lst")

In [6]:
def create_plp(wav_fname, plp_fname):
    """Run the configured ``feacalc`` tool on one wav file.

    The executable is taken from ``config['base']["feacalc"]``. The exit
    status of the tool is not inspected.

    Args:
        wav_fname: Input wav file, passed as the final positional argument.
        plp_fname: Output file, passed after ``-o``.
    """
    feacalc_cmd = [config['base']["feacalc"]]
    feacalc_cmd += ["-plp", "12", "-cep", "13", "-dom", "cep"]
    feacalc_cmd += ["-deltaorder", "2", "-dither", "-frqaxis", "bark"]
    feacalc_cmd += ["-samplerate", "8000", "-win", "25", "-step", "10"]
    feacalc_cmd += ["-ip", "MSWAVE", "-rasta", "false", "-compress", "true"]
    feacalc_cmd += ["-op", "swappedraw", "-o", plp_fname, wav_fname]
    subprocess.call(feacalc_cmd)

    
def normalize_plp(plp_fname, vad_fname, plp_norm_fname):
    """Run the configured ``standfeat`` tool to standardize a feature file.

    The executable is taken from ``config['base']["standfeat"]`` and is
    invoked with ``-D 39``. Per the original note, standardization applies to
    the VAD regions only. The exit status of the tool is not inspected.

    Args:
        plp_fname: Input feature file (``-infile``).
        vad_fname: VAD file (``-vadfile``).
        plp_norm_fname: Output file (``-outfile``).
    """
    standfeat_cmd = [
        config['base']["standfeat"],
        "-D", "39",
        "-infile", plp_fname,
        "-outfile", plp_norm_fname,
        "-vadfile", vad_fname,
    ]
    subprocess.call(standfeat_cmd)

In [7]:
def create_and_normalize_plps():
    """Create and then normalize a PLP file for every id in ``segment_map``.

    NOTE(review): this function reads the globals ``segment_map``,
    ``merged_wavs_path``, ``merged_fa_vads_path``, ``plp_path`` and
    ``plp_norm_path``, none of which are defined in the visible part of this
    notebook -- confirm they are set before calling.

    Progress is printed for every 20th file. Existing output files are not
    skipped; both steps always run.
    """
    for index, file_id in enumerate(segment_map):
        report_progress = (index % 20 == 0)

        # Derive the four file names that belong to this id.
        wav_fname = os.path.join(merged_wavs_path, file_id + ".wav")
        vad_fname = os.path.join(merged_fa_vads_path, file_id + ".vad")
        plp_fname = os.path.join(plp_path, file_id + ".binary")
        plp_norm_fname = os.path.join(plp_norm_path, file_id + ".std.binary")

        if report_progress:
            print("plp for file %s " % file_id)
        create_plp(wav_fname, plp_fname)

        if report_progress:
            print("normalizing plp %s" % file_id)
        normalize_plp(plp_fname, vad_fname, plp_norm_fname)

    print("Completed!")

## Create LSH files

In [8]:
def create_lsh_proj_file(lsh_proj_fname):
    """Run the configured ``lsh_genproj`` tool to write a projection file.

    The executable is taken from ``config['base']["lsh_genproj"]`` and is
    invoked with ``-D 39 -S 64 -seed 1``. The exit status is not inspected.

    Args:
        lsh_proj_fname: Output path, passed after ``-projfile``.
    """
    genproj_cmd = [
        config['base']["lsh_genproj"],
        "-D", "39",
        "-S", "64",
        "-seed", "1",
        "-projfile", lsh_proj_fname,
    ]
    subprocess.call(genproj_cmd)

def create_lsh_file(plp_norm_fname, vad_fname, lsh_proj_fname, lsh_fname):
    """Run the configured ``lsh`` tool on one normalized feature file.

    The executable is taken from ``config['base']["lsh"]`` and is invoked
    with ``-D 39 -S 64``. The exit status is not inspected.

    Args:
        plp_norm_fname: Input feature file (``-featfile``).
        vad_fname: VAD file (``-vadfile``).
        lsh_proj_fname: Projection file (``-projfile``).
        lsh_fname: Output signature file (``-sigfile``).
    """
    lsh_cmd = [
        config['base']["lsh"],
        "-D", "39",
        "-S", "64",
        "-projfile", lsh_proj_fname,
        "-featfile", plp_norm_fname,
        "-sigfile", lsh_fname,
        "-vadfile", vad_fname,
    ]
    subprocess.call(lsh_cmd)

In [10]:
# def create_lsh_files():
#     # create lsh projection seed file
#     file_list = []
#     lsh_proj_fname = os.path.join(config["es"]["data_path"], "proj_S64xD39_seed1")
#     create_lsh_proj_file(lsh_proj_fname)
    
#     # create lsh files for both train and test sets    
#     for folder in ["train", "test"]:
#         if not os.path.exists(config['es']['lsh_path']):
#             os.makedirs(config['es']['lsh_path'])
#         if not os.path.exists(config['es']['plp_binary']):
#             os.makedirs(config['es']['plp_binary'])
        
#         sys.stderr.flush()
        
#         if folder == "train":
#             plp_key = "plp_npy"
#             plp_extn = ".plp.npy"
#         else:
#             plp_key = "plp_binary"
#             plp_extn = ".bin"
        
#         plp_files = [f for f in os.listdir(config['es'][folder][plp_key]) if f.endswith(plp_extn)]
#         with tqdm(total=len(plp_files)) as pbar:
#             for i, plp_file in enumerate(plp_files, start=1):
#                 plp_file_base = plp_file.replace(plp_extn, "")
#                 plp_fname = (os.path.join(config['es'][folder][plp_key], plp_file))
#                 plp_binary_fname = (os.path.join(config['es']['plp_binary'], 
#                                           "{0:s}.binary".format(plp_file_base)))
#                 plp_norm_fname = (os.path.join(config['es']['plp_binary'], 
#                                           "{0:s}.std.binary".format(plp_file_base)))
#                 vad_fname = (os.path.join(config['es'][folder]['vad_path'], 
#                                           "{0:s}.vad".format(plp_file_base)))
#                 lsh_fname = (os.path.join(config['es']['lsh_path'], 
#                                           "{0:s}.std.lsh64".format(plp_file_base)))
#                 #print(plp_file, plp_fname, plp_binary_fname, vad_fname, lsh_fname)
#                 if folder == "train":
#                     x = np.load(plp_fname)
#                     y = x.ravel()
#                     y.tofile(plp_binary_fname)
#                 else:
#                     shutil.copyfile(plp_fname, plp_binary_fname)
                
#                 normalize_plp(plp_binary_fname, vad_fname, plp_norm_fname)
                
#                 # create lsh
#                 create_lsh_file(plp_norm_fname, vad_fname, lsh_proj_fname, lsh_fname)
#                 file_list.append(plp_file_base)

#                 # update progress
#                 pbar.set_description("processing {0:s}".format(plp_file_base))
#                 pbar.update(1)
#     print("completed set {0:s}".format(folder))
#     return file_list

In [11]:
def create_lsh_files():
    """Create PLP, normalized PLP and LSH files for every train and test wav.

    A projection file ``proj_S64xD39_seed1`` is first written under
    ``config["es"]["data_path"]``. Then, for each ``.wav`` file in
    ``config["es"][set]["wavs"]`` (``set`` being ``train`` then ``test``):

    * ``<base>.binary`` and ``<base>.std.binary`` are written under
      ``config["es"]["plp_binary"]``;
    * ``<base>.std.lsh64`` is written under ``config["es"]["lsh_path"]``;
    * the VAD file is read from ``config["es"][set]["vad_path"]/<base>.vad``.

    Returns:
        list: The base names (file name without the ``.wav`` suffix) of all
        processed files, train first, then test.
    """
    es_config = config["es"]
    wav_extn = ".wav"
    file_list = []

    # create lsh projection seed file
    lsh_proj_fname = os.path.join(es_config["data_path"], "proj_S64xD39_seed1")
    create_lsh_proj_file(lsh_proj_fname)

    # The output directories are shared by both sets, so create them once
    # instead of re-checking on every loop iteration.
    for out_dir in (es_config["lsh_path"], es_config["plp_binary"]):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    # create lsh files for both train and test sets
    for folder in ["train", "test"]:
        # Flush stderr before a new progress bar is drawn.
        sys.stderr.flush()

        wav_dir = es_config[folder]["wavs"]
        vad_dir = es_config[folder]["vad_path"]
        wav_files = [f for f in os.listdir(wav_dir) if f.endswith(wav_extn)]
        with tqdm(total=len(wav_files)) as pbar:
            for wav_file in wav_files:
                # Strip only the trailing extension; str.replace would also
                # remove ".wav" occurring elsewhere in the name.
                wav_file_base = wav_file[:-len(wav_extn)]
                wav_file_fname = os.path.join(wav_dir, wav_file)
                plp_binary_fname = os.path.join(es_config["plp_binary"],
                                                "{0:s}.binary".format(wav_file_base))
                plp_norm_fname = os.path.join(es_config["plp_binary"],
                                              "{0:s}.std.binary".format(wav_file_base))
                vad_fname = os.path.join(vad_dir,
                                         "{0:s}.vad".format(wav_file_base))
                lsh_fname = os.path.join(es_config["lsh_path"],
                                         "{0:s}.std.lsh64".format(wav_file_base))

                create_plp(wav_file_fname, plp_binary_fname)
                normalize_plp(plp_binary_fname, vad_fname, plp_norm_fname)

                # create lsh
                create_lsh_file(plp_norm_fname, vad_fname, lsh_proj_fname, lsh_fname)
                file_list.append(wav_file_base)

                # update progress
                pbar.set_description("processing {0:s}".format(wav_file_base))
                pbar.update(1)
        # Report each set as it finishes (previously this ran only once,
        # after the loop, and so only ever named the last set).
        print("completed set {0:s}".format(folder))
    return file_list

## Create ZRTools discovery command files

In [12]:
def read_file_list():
    """Collect the ids of the available train and test feature files.

    Train ids are read line by line from ``config["es"]["filemap"]``; an id
    is kept only when ``<id>.plp.npy`` exists under
    ``config['es']['train']['plp_npy']``, otherwise a "not found" message is
    printed. Test ids are the names of the ``.bin`` files found in
    ``config['es']["test"]["plp_binary"]`` with ``.bin`` removed.

    Returns:
        dict: ``{"train": [...], "test": [...]}``.
    """
    train_ids = []
    # read training files
    with open(config["es"]["filemap"], "r") as in_f:
        for raw_line in in_f:
            fname = raw_line.strip()
            npy_fname = os.path.join(config['es']['train']['plp_npy'], fname) + ".plp.npy"
            if os.path.exists(npy_fname):
                train_ids.append(fname)
            else:
                print("file = {0:s} not found".format(fname))

    # read test files
    test_ids = [
        entry.replace(".bin", "")
        for entry in os.listdir(config['es']["test"]["plp_binary"])
        if entry.endswith(".bin")
    ]
    return {"train": train_ids, "test": test_ids}

In [13]:
def create_files_base(file_list):
    """Write every train and test file id, one per line, to the base list file.

    The output path is ``config["es"]["lst_file_base"]``.

    Args:
        file_list: Mapping with ``"train"`` and ``"test"`` lists of ids, as
            returned by ``read_file_list()``. Any other value (for example a
            plain list) is ignored and the ids are re-read with
            ``read_file_list()``, which keeps existing callers working.
    """
    # Previously the argument was always discarded and the file list re-read.
    has_both_sets = (isinstance(file_list, dict)
                     and "train" in file_list and "test" in file_list)
    if not has_both_sets:
        file_list = read_file_list()
    with open(config["es"]["lst_file_base"], "w") as out_f:
        for wav_file in (file_list["train"] + file_list["test"]):
            out_f.write(wav_file+'\n')
    print("Generated files.base")

In [14]:
def create_discovery_cmd_scripts(num_splits=1):
    """Write the pairwise discovery command files and the script that runs them.

    One ``scripts/plebdisc_filepair`` command line is generated for every
    ordered pair of file ids (train ids followed by test ids, as returned by
    ``read_file_list()``). The lines are distributed over at most
    ``num_splits`` files named ``disc_<k>.cmd`` inside
    ``config["es"]["exp_path"]``. A ``disc_split.txt`` file in the same
    directory gets one ``nice sh ... &`` line per command file; stdout of each
    is redirected to ``exp/<exp_name>/matches/out.<k>`` and stderr to
    ``/dev/null``.

    Args:
        num_splits: Maximum number of command files to create (>= 1).

    Raises:
        ValueError: If ``num_splits`` is smaller than 1.
    """
    if num_splits < 1:
        raise ValueError("num_splits must be >= 1, got {0}".format(num_splits))

    exp_path = config["es"]["exp_path"]
    file_list = read_file_list()
    wav_file_list = file_list["train"] + file_list["test"]
    exp_name = config["es"]["exp_name"]

    disc_file_split_base = "disc_{0:d}.cmd"
    disc_file_split = os.path.join(exp_path, disc_file_split_base)
    disc_split_file = os.path.join(exp_path, "disc_split.txt")
    num_files = len(wav_file_list)
    exp_local_path = os.path.join("exp", exp_name)
    cmd_string = "scripts/plebdisc_filepair \"{0:s}\" \"{1:s}\" {2:s} 39\n"

    total_lines = num_files * num_files
    # Ceiling division: floor division produced an extra, nearly empty file
    # whenever total_lines was not a multiple of num_splits, and a zero chunk
    # size (ZeroDivisionError below) when num_splits exceeded total_lines.
    lines_per_file = max(1, -(-total_lines // num_splits))
    smallfile = None
    curr_line = 0
    curr_file_num = 0

    try:
        for i in range(num_files):
            if i % 20 == 0:
                print("Progress: {0:d} out of: {1:d}".format(curr_line+1, total_lines))
            for j in range(num_files):
                # Start a new command file every lines_per_file lines.
                if curr_line % lines_per_file == 0:
                    if smallfile:
                        smallfile.close()
                    smallfile = open(disc_file_split.format(curr_file_num), "w")
                    curr_file_num += 1
                smallfile.write(cmd_string.format(wav_file_list[i],
                                                  wav_file_list[j],
                                                  exp_local_path))
                curr_line += 1
    finally:
        # Close the last command file even if writing failed part-way.
        if smallfile:
            smallfile.close()

    # Making a list of commands to execute the split disc list
    full_split_cmd_string = "nice sh {0:s} 1> {1:s} 2>{2:s} &\n"
    split_cmd = os.path.join(exp_local_path, "matches", "{0:s}.{1:d}")
    # stderr of every job is discarded
    split_cmd_err = "/dev/null"
    with open(disc_split_file, "w") as out_f:
        for i in range(curr_file_num):
            curr_split_file = os.path.join(exp_local_path, disc_file_split_base.format(i))
            split_cmd_out = split_cmd.format("out", i)
            out_f.write(full_split_cmd_string.format(curr_split_file,
                                                     split_cmd_out,
                                                     split_cmd_err))

    print("Completed - disc.cmd")

## Read transcripts and translations into a dictionary

In [15]:
def read_en_translations(file_list):
    """Map each file id to its English translation line and pickle the result.

    Lines of ``config["es"]["trans_file"]`` are paired positionally with the
    ids in ``file_list``; pairing stops at the shorter of the two, so surplus
    ids or surplus lines are silently dropped. The resulting dictionary is
    pickled to ``config["es"]["en_words"]``.

    Args:
        file_list: Iterable of file ids, in the same order as the lines of
            the translation file.

    Returns:
        dict: ``{file_id: stripped translation line}``.
    """
    en_words = {}
    with open(config["es"]["trans_file"], "r") as in_f:
        for fname, en_line in zip(file_list, in_f):
            en_words[fname] = en_line.strip()
    print("finished reading translations")
    # Use a context manager so the pickle file is flushed and closed
    # deterministically instead of leaking the handle.
    with open(config["es"]["en_words"], "wb") as out_f:
        pickle.dump(en_words, out_f)
    return en_words
    

## Check English translations

In [18]:
def check_english_translations():
    """Print quick sanity statistics about the English translation words.

    Reads the global ``en_words`` mapping, splits every value on whitespace
    and counts the resulting tokens. Prints the ten most frequent
    ``(word, count)`` pairs, then every ``(word, count)`` pair whose word
    contains an apostrophe.

    NOTE(review): ``en_words`` must already exist as a global; in the visible
    cells it is only assigned by a commented-out line -- confirm before use.
    """
    en_words_freq = Counter()
    for translation in en_words.values():
        en_words_freq.update(translation.split())

    by_count_desc = sorted(en_words_freq.items(), reverse=True, key=lambda t: t[1])
    print(by_count_desc[:10])

    with_apostrophe = [(w, f) for w, f in en_words_freq.items() if "'" in w]
    print(with_apostrophe)

### Main

In [22]:
# Pipeline driver: make sure the experiment/output folders exist, write the
# wav list file, then build the PLP and LSH files for the train and test sets.
check_and_create_dirs()
create_file_lst(config["es"]["lst_file"])
# create_lsh_files() returns a flat list of the wav base names it processed.
file_list = create_lsh_files()
# Later stages, currently disabled:
# file_list = read_file_list()
# create_files_base(file_list)
# create_discovery_cmd_scripts(num_splits=15)
# en_words = read_en_translations(file_list["train"])

processing 1.085:   1%|          | 4/434 [00:00<00:11, 36.96it/s]

...
Finished writing files.lst


processing 1.175: 100%|██████████| 434/434 [00:10<00:00, 41.92it/s]
processing 1: 100%|██████████| 10/10 [00:11<00:00,  1.05s/it]

completed set test





In [None]:
def main():
    """Write the base file list and the discovery command scripts.

    The file list is read here rather than taken from a ``file_list`` global
    left behind by another cell, so the function also works on a fresh kernel.
    """
    # Earlier pipeline stages, left disabled:
    #     check_and_create_dirs()
    #     create_file_lst(config["es"]["lst_file"])
    #     file_list = create_lsh_files()
    file_list = read_file_list()
    create_files_base(file_list)
    create_discovery_cmd_scripts(num_splits=15)
    # Disabled follow-up stage:
    #     en_words = read_en_translations(file_list["train"])
    print("maining")

if __name__ == "__main__":
    main()

In [None]:
def generate_wavs(train_test_str="test"):
    """Convert the source wavs of one set with the configured ``sox`` tool.

    Two passes are made, both calling ``config['base']["sox"]``:

    1. every ``.wav`` in ``../ainu/ainu-<set>-wavs/`` is written to
       ``../ainu/ainu-<set>-mono-wavs/`` with the ``remix 1-2`` effect
       (presumably a mix-down of channels 1 and 2 -- confirm against sox docs);
    2. every ``.wav`` in the mono directory is written to
       ``../ainu/<set>-wavs/`` with signed-integer encoding, 16 bits,
       1 channel, rate 8000 and ``--no-dither``.

    The two output directories are created when missing. Exit statuses of
    the ``sox`` calls are not inspected.

    Args:
        train_test_str: Name of the set used in the directory names;
            defaults to ``"test"``, the value that used to be hard-coded.
    """
    dual_wav_path = "../ainu/ainu-{0:s}-wavs/".format(train_test_str)
    mono_wav_path = "../ainu/ainu-{0:s}-mono-wavs/".format(train_test_str)
    mono_8k_wav_path = "../ainu/{0:s}-wavs/".format(train_test_str)
    sox = config['base']["sox"]

    # sox needs its target directories to exist before it can write to them.
    for out_dir in (mono_wav_path, mono_8k_wav_path):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    # Pass 1: source wavs -> mono directory.
    for wav_file in [f for f in os.listdir(dual_wav_path) if f.endswith(".wav")]:
        mono_wav_fname = os.path.join(mono_wav_path, wav_file)
        subprocess.call([sox, os.path.join(dual_wav_path, wav_file),
                         mono_wav_fname, "remix", "1-2"])

    # Pass 2: mono directory -> 8000-rate, 16-bit, single-channel wavs.
    for wav_file in [f for f in os.listdir(mono_wav_path) if f.endswith(".wav")]:
        in_wav_fname = os.path.join(mono_wav_path, wav_file)
        mono_8k_wav_fname = os.path.join(mono_8k_wav_path, wav_file)

        subprocess.call([sox, "-t", "wav",
                         in_wav_fname, "-t", "wav", "-e", "signed-integer",
                         "-b", "16", "-c", "1", "-r", "8000", "--no-dither",
                         mono_8k_wav_fname])

In [None]:
# wavs = []
# for f in os.listdir(config["es"]["train"]["wavs"]):
#     wavs.append(os.path.join(config["es"]["train"]["wavs"], f))

In [None]:
# len(wavs)

In [None]:
# Audio(wavs[234])