In [1]:
import numpy as np
import json
import pickle
import os

# Notebook-wide random generator with a fixed seed, so the rows picked by
# downsample() are the same on every fresh Restart & Run All.
rng = np.random.default_rng(0)

In [2]:
def load_old_features(accent, file_name, old_feature_type):
    """Load all pickled feature chunks from one file and stack them row-wise.

    The file ``./<accent>/<old_feature_type>/<file_name>_<old_feature_type>.file``
    is read as a sequence of back-to-back pickle records; every record is
    loaded and the records are concatenated along axis 0.

    Args:
        accent: Directory name (under the current directory) for the accent.
        file_name: Split name used as the file-name prefix.
        old_feature_type: Feature-type tag used for both the sub-directory
            and the file-name suffix.

    Returns:
        A single array made by concatenating every record along axis 0.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If a record is corrupt (previously such
            errors were swallowed and the data silently truncated).
        ValueError: If the file holds no records (raised by
            ``np.concatenate`` on an empty list).
    """
    file_path = os.path.join(
        ".", accent, old_feature_type, f"{file_name}_{old_feature_type}.file"
    )

    features = []
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at feature files you produced yourself.
    with open(file_path, "rb") as file:
        while True:
            try:
                features.append(pickle.load(file))
            except EOFError:
                # End of the pickle stream: the normal way out of the loop.
                # Any other exception is a real problem and must propagate.
                break
    return np.concatenate(features, axis=0)


def load_old_json(accent, file_name):
    """Read ``./<accent>/<file_name>.json`` as JSON Lines.

    Each non-blank line is parsed as one JSON document. Blank lines
    (for example a trailing empty line) are skipped instead of crashing
    ``json.loads``.

    Args:
        accent: Directory name (under the current directory) for the accent.
        file_name: File name without the ``.json`` extension.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    file_path = os.path.join(".", accent, f"{file_name}.json")

    with open(file_path, "r") as file:
        # Iterate the handle directly so the file is streamed line by line;
        # json.loads tolerates the surrounding whitespace / newline itself.
        return [json.loads(line) for line in file if line.strip()]


def duplicate_json(old_json_list, duplication_ratio: int):
    """Return a new list holding ``old_json_list`` repeated back-to-back.

    The whole list is repeated ``duplication_ratio`` times (first copy,
    then second copy, ...). The entries are not copied: every repetition
    refers to the same underlying objects.
    """
    return [
        record
        for _ in range(duplication_ratio)
        for record in old_json_list
    ]


def duplicate_features(old_features, duplication_ratio: int):
    """Stack ``duplication_ratio`` copies of ``old_features`` along axis 0.

    The copies are laid out block after block (all rows of copy one, then
    all rows of copy two, ...), which is the same order duplicate_json uses
    for the records.
    """
    repeated_blocks = [old_features] * duplication_ratio
    return np.concatenate(repeated_blocks, axis=0)


def downsample(features, json, downsample_ratio, generator=None):
    """Randomly keep ``len(json) // downsample_ratio`` aligned examples.

    Row ``i`` of ``features`` and entry ``i`` of ``json`` are treated as one
    example; the same randomly drawn indices (without replacement) are
    applied to both, so the pairing is preserved in the outputs.

    Args:
        features: Array with at least two dimensions whose first axis
            indexes examples. Trailing axes are flattened per row.
        json: Sequence of records, one per feature row. (The parameter name
            shadows the ``json`` module inside this function only.)
        downsample_ratio: Positive integer; ``len(json) // downsample_ratio``
            examples are kept.
        generator: Optional ``numpy.random.Generator`` used to draw the
            indices. Defaults to the notebook-level ``rng``.

    Returns:
        ``(downsampled_features, downsampled_json)``: a 2-D array with one
        row per kept example, and the list of matching records in the same
        order.

    Raises:
        AssertionError: If the ratio is not a positive integer or the two
            inputs differ in length.
    """
    assert isinstance(downsample_ratio, (int, np.integer)) and downsample_ratio >= 1, (
        "downsample_ratio must be a positive integer"
    )
    assert len(features) == len(json), "features and json must have the same length"

    # Fall back to the shared notebook generator so default behaviour (and
    # the random stream consumed) matches the original implementation.
    source = rng if generator is None else generator

    n_total = len(json)
    n_keep = n_total // downsample_ratio
    # replace=False already guarantees the drawn indices are unique.
    selected_inds = source.choice(n_total, replace=False, size=n_keep)

    # One fancy-index instead of a per-row reshape + concatenate; this also
    # yields an empty (0, width) array when nothing is selected, where the
    # old concatenate of an empty list raised ValueError.
    row_width = int(np.prod(features.shape[1:]))
    downsampled_features = features[selected_inds, :].reshape(n_keep, row_width)
    downsampled_json = [json[ind] for ind in selected_inds]

    return downsampled_features, downsampled_json




In [3]:
def write_new_json(data, accent, file_name, duplication_ratio):
    """Write ``data`` as JSON Lines to ``./<accent>/<file_name>_<ratio>rep.json``.

    Each element of ``data`` is serialised with ``json.dumps`` onto its own
    newline-terminated line. An existing file at that path is overwritten.
    """
    out_name = f"{file_name}_{duplication_ratio}rep.json"
    out_path = os.path.join(".", accent, out_name)
    with open(out_path, "w") as handle:
        handle.writelines(json.dumps(record) + "\n" for record in data)


def write_new_features(data, accent, file_name, duplication_ratio, base_feature_type="39"):
    """Pickle ``data`` into the per-accent directory for duplicated features.

    The output goes to
    ``./<accent>/<base>_<ratio>rep/<file_name>_<base>_<ratio>rep.file``;
    the directory is created if it does not exist and an existing file is
    overwritten. ``data`` is written as a single pickle record.

    Args:
        data: Object to pickle (the duplicated feature array).
        accent: Directory name (under the current directory) for the accent.
        file_name: Split name used as the file-name prefix.
        duplication_ratio: Repetition count, embedded in the directory and
            file names.
        base_feature_type: Tag of the source feature set. Defaults to
            ``"39"``, the value that used to be hard-coded here.
    """
    feature_type = f"{base_feature_type}_{duplication_ratio}rep"
    dir_name = os.path.join(".", accent, feature_type)
    os.makedirs(dir_name, exist_ok=True)
    file_path = os.path.join(dir_name, f"{file_name}_{feature_type}.file")
    with open(file_path, "wb") as file:
        pickle.dump(data, file)


In [4]:
if __name__ == "__main__":
    # Driver: for every accent and split, load the original features and
    # records, randomly downsample them together, repeat the kept examples
    # `duplication_ratio` times, and write the results next to the originals.

    # Each accent is a directory name under the current working directory.
    accents = [
        "assamese_female_english",
        "manipuri_female_english",
        "gujarati_female_english",
        "kannada_male_english",
        "rajasthani_male_english",
        "tamil_male_english",
        "malayalam_male_english",
        "hindi_male_english",
    ]

    # Number of times each kept example is repeated in the output files.
    duplication_ratio = 3
    # len(data) // downsample_ratio examples are kept before duplication.
    downsample_ratio = 2

    for accent in accents:
        for file_name in ["seed", "selection", "test"]:
            print(f"\n\n Start ******** {accent} ------ {file_name} ********* \n\n")
            old_features = load_old_features(accent, file_name, old_feature_type="39")
            old_json = load_old_json(accent, file_name)
            print("\nold_features shape:= ", old_features.shape, "\n old_json length:= ", len(old_json))

            # Use the configured ratio rather than a second hard-coded literal,
            # so changing `downsample_ratio` above actually takes effect.
            downsampled_old_features, downsampled_old_json = downsample(
                old_features, old_json, downsample_ratio=downsample_ratio
            )
            print("\ndownsampled_old_features shape:= ", downsampled_old_features.shape, "\n downsampled_old_json length:= ", len(downsampled_old_json))

            new_json = duplicate_json(downsampled_old_json, duplication_ratio=duplication_ratio)
            new_features = duplicate_features(downsampled_old_features, duplication_ratio=duplication_ratio)
            print("\nnew_json length:= ", len(new_json), "\n new features shape:= ", new_features.shape)

            write_new_json(new_json, accent, file_name, duplication_ratio=duplication_ratio)

            write_new_features(new_features, accent, file_name, duplication_ratio=duplication_ratio)

            print(f"\n\n End ******** {accent} ------ {file_name} ********* \n\n")




 Start ******** assamese_female_english ------ seed ********* 



old_features shape:=  (50, 39) 
 old_json length:=  50

downsampled_old_features shape:=  (25, 39) 
 downsampled_old_json length:=  25

new_json length:=  75 
 new features shape:=  (75, 39)


 End ******** assamese_female_english ------ seed ********* 




 Start ******** assamese_female_english ------ selection ********* 



old_features shape:=  (5765, 39) 
 old_json length:=  5765

downsampled_old_features shape:=  (2882, 39) 
 downsampled_old_json length:=  2882

new_json length:=  8646 
 new features shape:=  (8646, 39)


 End ******** assamese_female_english ------ selection ********* 




 Start ******** assamese_female_english ------ test ********* 



old_features shape:=  (2513, 39) 
 old_json length:=  2513

downsampled_old_features shape:=  (1256, 39) 
 downsampled_old_json length:=  1256

new_json length:=  3768 
 new features shape:=  (3768, 39)


 End ******** assamese_female_english ------ test *******