In [6]:
import subprocess as sp
import os
import pandas as pd
import math
import random


In [7]:
def clips_size_kB(path):
    """Return the disk usage of `path` in kilobytes, as reported by ``du``.

    Parameters
    ----------
    path : str
        File or directory to measure.

    Returns
    -------
    int
        The first tab-separated field of ``du -sk`` output, or 0 when ``du``
        cannot be started or prints nothing parseable (for example because
        `path` does not exist).
    """
    try:
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact; "-k" pins the unit to 1024-byte blocks and
        # "--" stops a path that starts with "-" being read as an option.
        completed = sp.run(["du", "-sk", "--", path], stdout=sp.PIPE)
    except OSError:
        # du is not installed / not executable: same "unknown size" result
        # the shell-based call produced in that situation.
        return 0
    try:
        # int() accepts ASCII digits as bytes, so no decoding is needed.
        return int(completed.stdout.split(b"\t", 1)[0])
    except ValueError:
        return 0

def sort_age_gen_occ(df):
    """Count rows per (age, gender) pair and rank the pairs by that count.

    Every age value present in ``df.age`` is combined with the genders
    "male" and "female"; rows with any other gender value are not counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``age`` and ``gender`` columns.

    Returns
    -------
    tuple
        ``(occurrences, ranking)`` where ``occurrences`` maps
        ``(age, gender)`` to its row count and ``ranking`` lists the same
        keys ordered from the smallest count to the largest.
    """
    occurrences = {}
    for age_label in df.age.value_counts().index:
        has_age = df.age == age_label
        for gender_label in ("male", "female"):
            matching = has_age & (df.gender == gender_label)
            occurrences[(age_label, gender_label)] = int(matching.sum())
    # sorted() is stable, so pairs with equal counts keep insertion order.
    ranking = sorted(occurrences, key=lambda pair: occurrences[pair])
    return occurrences, ranking

def print(p=True, *args, **kwargs):
    """Drop-in replacement for the built-in ``print`` with an on/off switch.

    When the first positional argument is a ``bool`` it is treated as a flag
    and is not printed: ``True`` prints the remaining arguments, ``False``
    prints nothing. Any other first argument is printed like a normal value.

    Parameters
    ----------
    p : bool or object
        Verbosity flag, or the first value to print.
    *args, **kwargs
        Forwarded unchanged to the built-in ``print``.

    Returns
    -------
    None
    """
    # ``__builtins__`` is only guaranteed to be the module object inside
    # ``__main__``; in an imported module it is a plain dict, so attribute
    # access on it fails. The ``builtins`` module works everywhere.
    import builtins

    if isinstance(p, bool):
        if p:
            return builtins.print(*args, **kwargs)
        return None
    return builtins.print(p, *args, **kwargs)


In [8]:
def _swap_extension(clips, extension):
    """Return the clip file names with their extension replaced by `extension`."""
    suffix = extension.split('.')[-1]
    # splitext only strips the final extension, so names that contain
    # additional dots are not truncated at the first one.
    return [os.path.splitext(clip)[0] + '.' + suffix for clip in clips]


def downsample_language(path, new_size_kB, extension=".wav", p=False, random_state=None):
    """Pick a subset of validated clips that roughly fits a size budget.

    The size of ``<path>/clips`` is measured with ``clips_size_kB`` and the
    rows of ``<path>/validated.tsv`` are scaled by ``new_size_kB / old size``
    to get the number of clips to keep. That number is divided evenly over
    the (age, gender) groups from ``sort_age_gen_occ``, visited from the
    smallest group to the largest; a group that is too small contributes all
    of its clips and the shortfall is redistributed over the groups still to
    come. If the groups together cannot supply enough clips, the remainder
    is drawn at random from the rows that are left.

    Parameters
    ----------
    path : str
        Language folder containing ``validated.tsv`` and ``clips``.
    new_size_kB : int or float
        Target size, in the unit returned by ``clips_size_kB``.
    extension : str
        Extension given to every returned file name (leading dot optional).
    p : bool
        When True, progress messages are printed.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for reproducible selection.

    Returns
    -------
    list of str
        Selected clip file names carrying `extension`. When the clips folder
        is not larger than `new_size_kB`, every validated clip is returned.
    """
    path = path.rstrip('/') + '/'
    old_size_kB = clips_size_kB(path + "clips")
    df = pd.read_csv(path + "validated.tsv", sep="\t", dtype=object)
    if old_size_kB <= new_size_kB:
        print(p, "\nthe dataset cannot be downsampled to this size because it is not large enough")
        return _swap_extension(df.path, extension)

    clips_needed = int((new_size_kB / old_size_kB) * len(df))
    age_gen_occ_dict, age_gen_occ_list = sort_age_gen_occ(df)
    n_groups = len(age_gen_occ_list)
    # Without any age label there are no groups; everything then comes from
    # the random top-up at the end instead of dividing by zero here.
    clips_per_group = math.ceil(clips_needed / n_groups) if n_groups else 0

    selected_clips = []
    print(p, "data length:", len(df), "\t\tclips needed:", clips_needed)
    for i, (age, gender) in enumerate(age_gen_occ_list, start=1):
        current = df[(df.age == age) & (df.gender == gender)]
        if len(current) < clips_per_group:
            print(p, "\tclips_per_group was", clips_per_group)
            clips_needed -= len(current)
            groups_left = n_groups - i
            # Spread the shortfall over the groups that are still to come.
            if groups_left:
                clips_per_group = math.ceil(clips_needed / groups_left)
            else:
                clips_per_group = clips_needed
            now_selected = current
            print(p, "\tnot enough clips for", age, gender, "so appending", len(now_selected))
            print(p, "\tclips_per_group now is", clips_per_group)
        else:
            now_selected = current.sample(clips_per_group, random_state=random_state)
            # The per-group quota is rounded up, so this may end slightly
            # below zero; the result can exceed the target by a few clips.
            clips_needed -= clips_per_group
            print(p, "\tenough clips for", age, gender,
                  "(" + str(age_gen_occ_dict[(age, gender)]), "total), so appending", len(now_selected))
        selected_clips += list(now_selected.path)
        df = df.drop(now_selected.index)
        print(p, "\ndata length:", len(df), "\t\tclips needed:", clips_needed)

    if clips_needed > 0:
        # Only sample when the remainder can actually supply the shortfall;
        # asking DataFrame.sample for more rows than exist raises ValueError.
        if len(df) >= clips_needed:
            print(p, "\nnot enough labeled clips found, so appending", clips_needed, "unlabeled ones")
            selected_clips += list(df.sample(clips_needed, random_state=random_state).path)
        else:
            print(p, "\nthe dataset cannot be downsampled to this size because it is not large enough")
            selected_clips += list(df.path)

    return _swap_extension(selected_clips, extension)
        
    
#len(downsample_language("speech_data/Dutch", 36000))


In [9]:
def split_train_test_validate(data, weights):
    """Shuffle `data` and cut it into train / test / validate parts.

    Parameters
    ----------
    data : sequence
        Items to distribute. The input itself is left untouched; a shuffled
        copy is sliced.
    weights : dict or sequence
        Either a dict with the keys "train", "test" and "validate", or a
        sequence whose first three items are the weights in that order.
        Weights are relative; they are normalised by the sum of all values.

    Returns
    -------
    dict or None
        ``{"train": [...], "test": [...], "validate": [...]}``. Boundaries
        are rounded up, so rounding favours the earlier parts. ``None`` is
        returned (after printing a message) when `weights` is neither a dict
        nor indexable at positions 0, 1 and 2.

    Raises
    ------
    ZeroDivisionError
        If the weights sum to zero.
    """
    from fractions import Fraction

    if not isinstance(weights, dict):
        try:
            weights = {"train": weights[0], "test": weights[1], "validate": weights[2]}
        except (TypeError, IndexError, KeyError):
            print("Geen iterable")
            return None

    # Exact rational arithmetic: with floats, a boundary such as
    # 10 * (0.1 + 0.2) evaluates to 3.0000000000000004 and ceil() then moves
    # it one item too far. limit_denominator() recovers e.g. 7/10 from 0.7.
    exact = {key: Fraction(weight).limit_denominator() for key, weight in weights.items()}
    total = sum(exact.values())

    shuffled = list(data)
    random.shuffle(shuffled)

    count = len(shuffled)
    train_end = math.ceil(count * exact["train"] / total)
    test_end = math.ceil(count * (exact["train"] + exact["test"]) / total)

    return {
        "train": shuffled[:train_end],
        "test": shuffled[train_end:test_end],
        "validate": shuffled[test_end:],
    }
    
#len(split_train_test_validate(downsample_language("Data/nl", 3000), [7, 2, 1])["train"])


In [10]:
def downsample_wrapped(path, new_size_kB, extension=".wav", p=False, destination_name="/general"):
    """Downsample one language and copy the selected clips into one folder.

    The clips chosen by ``downsample_language`` are copied from
    ``<path>/clips`` to ``Downsampled/<language>/<destination_name>``, where
    ``<language>`` is the last component of `path`. An existing destination
    folder is deleted first.

    Parameters
    ----------
    path : str
        Language folder, using "/" as separator (trailing slash optional).
    new_size_kB : int or float
        Target size passed on to ``downsample_language``.
    extension : str
        Extension of the clip files to copy.
    p : bool
        When True, progress and copy failures are printed.
    destination_name : str
        Sub-folder (may be nested) below ``Downsampled/<language>``.
    """
    import shutil

    # rstrip keeps "<dir>/nl/" and "<dir>/nl" equivalent, so the language
    # name is always the component right before the final slash.
    path = path.rstrip('/') + '/'
    destination = "Downsampled/" + path.split('/')[-2] + '/' + \
                '/'.join([c for c in destination_name.split('/') if c])

    # Pure-Python file operations: portable, and safe for names containing
    # spaces or shell metacharacters. makedirs also creates nested parents.
    shutil.rmtree(destination, ignore_errors=True)
    os.makedirs(destination, exist_ok=True)

    samples = downsample_language(path, new_size_kB, extension, p)
    for sample in samples:
        try:
            shutil.copy(path + "clips/" + sample, destination)
        except OSError as error:
            # Keep going on a failed copy, as the shell-based loop did.
            print(p, "could not copy", sample, "to", destination + ":", error)
    
#downsample_wrapped("Data/nl", 3000, extension=".mp3")


In [11]:
def downsample_split_wrapped(path, new_size_kB, split_weights, extension=".wav", p=False):
    """Downsample one language and copy the clips into one folder per split.

    The clips chosen by ``downsample_language`` are divided with
    ``split_train_test_validate`` and each part is copied from
    ``<path>/clips`` to ``Downsampled/<language>/<split name>``, where
    ``<language>`` is the last component of `path`. An existing split folder
    is deleted before it is refilled.

    Parameters
    ----------
    path : str
        Language folder, using "/" as separator (trailing slash optional).
    new_size_kB : int or float
        Target size passed on to ``downsample_language``.
    split_weights : dict or sequence
        Weights passed on to ``split_train_test_validate``.
    extension : str
        Extension of the clip files to copy.
    p : bool
        When True, one line is printed per copied clip.

    Raises
    ------
    TypeError
        If `split_weights` is rejected by ``split_train_test_validate``.
    """
    import shutil

    # rstrip keeps "<dir>/nl/" and "<dir>/nl" equivalent, so the language
    # name is always the component right before the final slash.
    path = path.rstrip('/') + '/'
    language_dir = "Downsampled/" + path.split('/')[-2]
    os.makedirs(language_dir, exist_ok=True)

    samples = downsample_language(path, new_size_kB, extension, p)
    splits = split_train_test_validate(samples, split_weights)
    if splits is None:
        # The splitter signals unusable weights with None; fail with a clear
        # message instead of "'NoneType' object is not iterable".
        raise TypeError("split_weights must be a dict or a sequence of three weights")

    for split, split_samples in splits.items():
        destination = language_dir + "/" + split
        shutil.rmtree(destination, ignore_errors=True)
        os.makedirs(destination, exist_ok=True)
        for sample in split_samples:
            # 0 / 1 mirrors the success / failure code the cp call reported.
            try:
                shutil.copy(path + "clips/" + sample, destination)
                return_code = 0
            except OSError:
                return_code = 1
            print(p, "completed copying", sample, "to", destination, "with return code", return_code)
            
#downsample_split_wrapped("Data/nl", 3000, [7, 2, 1], extension=".mp3", p=True)


# Downsampling and splitting

In [12]:
# Run the same downsample-and-split job for each language folder in turn:
# a 200000 kB target, 70/15/15 train/test/validate weights, .mp3 clips.
for language_code in ("en", "es", "it", "nl"):
    downsample_split_wrapped(
        "E:/Users/Remco/Documents/UvA/Scriptie/Data/" + language_code,
        200000,
        [70, 15, 15],
        extension=".mp3",
    )

KeyboardInterrupt: 