In [None]:
# `display` and `HTML` are public names of IPython.display. Importing them
# from IPython.core.display has been deprecated since IPython 7.14 and fails
# on newer IPython releases, so use the supported module.
from IPython.display import display, HTML

# Inject CSS so elements with the `.container` class span the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))
import json, os, torch, statistics, glob, librosa, pickle, torchaudio
from tqdm import tqdm
import numpy as np
import torchaudio.functional as F
import torchaudio.transforms as T

# Shared MFCC front-end used by every extraction call in this notebook:
# 39 coefficients, configured for 22050 Hz input, on top of a 256-band
# HTK-scale mel spectrogram (FFT size 2048, hop length 512).
mfcc_transform = T.MFCC(
    sample_rate=22050,
    n_mfcc=39,
    melkwargs=dict(n_fft=2048, n_mels=256, hop_length=512, mel_scale="htk"),
)

# Prefix that each dataset directory name is appended to when building paths.
base_dir = "./"


In [None]:
# Dataset directories are named "<language>_<speaker gender>_english".
# The order below is the order in which the datasets are processed.
dirs = [
    f"{language}_{gender}_english"
    for language, gender in (
        ("assamese", "female"),
        ("hindi", "male"),
        ("kannada", "male"),
        ("manipuri", "female"),
        ("tamil", "male"),
        ("gujarati", "female"),
        ("malayalam", "male"),
        ("rajasthani", "male"),
    )
]


def extract_features(file_list, file_dir):
    """Write one time-averaged MFCC array per audio file to a pickle stream.

    The output file is ``<manifest directory>/39/<manifest name>_39.file``,
    where the manifest name is ``file_dir``'s base name with ".json" removed.
    Arrays are appended with consecutive ``pickle.dump`` calls, in the order
    of ``file_list``, so a reader must call ``pickle.load`` repeatedly on the
    same handle until EOF.

    Args:
        file_list: Iterable of manifest entries; each entry must provide an
            "audio_filepath" key pointing at a file ``torchaudio.load`` can read.
        file_dir: Path of the manifest the entries came from. It is only used
            to derive the output location and for the completion message.
    """
    # os.path keeps a bare manifest name (no directory part) relative to the
    # working directory instead of resolving to "/39/" at the filesystem root.
    manifest_dir, manifest_name = os.path.split(file_dir)
    file_type = manifest_name.replace(".json", "")
    feature_dir = os.path.join(manifest_dir, "39")
    os.makedirs(feature_dir, exist_ok=True)
    feature_file = os.path.join(feature_dir, file_type + "_39.file")

    # The transform's mel filterbank is built for one fixed sample rate.
    target_rate = mfcc_transform.sample_rate

    with open(feature_file, "wb") as f:
        for entry in tqdm(file_list):
            waveform, sample_rate = torchaudio.load(entry["audio_filepath"])
            if sample_rate != target_rate:
                # Feeding audio at a different rate would silently produce
                # features on the wrong frequency axis; resample first.
                waveform = F.resample(waveform, sample_rate, target_rate)
            # Average over dim 2 (the frame axis of the MFCC output per the
            # torchaudio docs), leaving one vector per channel.
            mfcc_features = mfcc_transform(waveform).mean(2).detach().numpy()
            pickle.dump(mfcc_features, f)
    print("completed", file_dir)


# Extract MFCC features for the seed / selection / test manifests located
# directly inside each dataset directory.
# NOTE(review): another cell in this notebook reads the same manifest names
# from "<dir>/manifests/" instead — confirm which layout is current.
for _dir in dirs:
    manifests_path = base_dir + _dir + "/"
    print("_" * 20)
    print(_dir)

    # Manifests hold one JSON object per line. Read them inside `with` blocks
    # so the handles are closed, and skip blank lines, which json.loads rejects.
    seed_file_dir = manifests_path + "seed.json"
    with open(seed_file_dir) as seed_file:
        seed_list = [json.loads(line) for line in seed_file if line.strip()]

    selection_file_dir = manifests_path + "selection.json"
    with open(selection_file_dir) as selection_file:
        selection_list = [json.loads(line) for line in selection_file if line.strip()]

    test_file_dir = manifests_path + "test.json"
    with open(test_file_dir) as test_file:
        test_list = [json.loads(line) for line in test_file if line.strip()]

    print("seed_file_starting")
    print(seed_file_dir)
    extract_features(seed_list, seed_file_dir)
    print(len(seed_list))
    print("seed_file_ending ...\n")

    print("selection_file_starting")
    extract_features(selection_list, selection_file_dir)
    print(len(selection_list))
    print("selection_file_ending ...\n\n")

    print("test_file_starting")
    extract_features(test_list, test_file_dir)
    print(len(test_list))
    print("test_file_ending ...\n\n")


In [None]:
# A bare last expression is rendered as the cell output, so an extra
# print() of the same list is redundant.
dirs


In [None]:
# Extract MFCC features for the seed / selection / test manifests stored in
# the "manifests" sub-directory of each dataset directory.
# NOTE(review): another cell in this notebook reads the same manifest names
# directly from "<dir>/" instead — confirm which layout is current.
for _dir in tqdm(dirs):
    manifests_path = base_dir + _dir + "/manifests/"
    print("_" * 20)
    print(_dir)

    # Manifests hold one JSON object per line. Read them inside `with` blocks
    # so the handles are closed, and skip blank lines, which json.loads rejects.
    seed_file_dir = manifests_path + "seed.json"
    with open(seed_file_dir) as seed_file:
        seed_list = [json.loads(line) for line in seed_file if line.strip()]

    selection_file_dir = manifests_path + "selection.json"
    with open(selection_file_dir) as selection_file:
        selection_list = [json.loads(line) for line in selection_file if line.strip()]

    test_file_dir = manifests_path + "test.json"
    with open(test_file_dir) as test_file:
        test_list = [json.loads(line) for line in test_file if line.strip()]

    print("seed_file_starting")
    print(seed_file_dir)
    extract_features(seed_list, seed_file_dir)
    print(len(seed_list))
    print("seed_file_ending ...\n")

    print("selection_file_starting")
    extract_features(selection_list, selection_file_dir)
    print(len(selection_list))
    print("selection_file_ending ...\n\n")

    print("test_file_starting")
    extract_features(test_list, test_file_dir)
    print(len(test_list))
    print("test_file_ending ...\n\n")
