In [1]:
import builtins
import math
import os
import random
import shutil
import subprocess as sp

import pandas as pd


In [65]:
def clips_size_kB(path):
    """Return the disk usage of ``path`` in kB as reported by ``du``.

    The command is run as an argument list (no shell), so paths containing
    spaces or shell metacharacters are passed through unchanged. ``-k`` pins
    the unit to 1024-byte blocks and ``--`` stops option parsing so a path
    starting with ``-`` is not read as a flag.

    Parameters
    ----------
    path : str
        File or directory to measure.

    Returns
    -------
    int
        The first tab-separated field of ``du``'s output, or 0 when ``du``
        cannot be started or its output does not begin with an integer
        (for example because ``path`` does not exist).
    """
    try:
        completed = sp.run(["du", "-sk", "--", path], stdout=sp.PIPE)
    except OSError:
        # `du` is not installed / not executable on this system.
        return 0
    try:
        return int(completed.stdout.decode("utf-8").split("\t")[0])
    except ValueError:
        # Empty or unexpected output: keep the original best-effort fallback.
        return 0

def get_size(path):
    """Return the combined size of all files below ``path`` in kB.

    Walks the directory tree with ``os.walk``, adds up ``os.path.getsize`` of
    every file found, and returns the byte total divided by 1024, rounded to
    an integer with the built-in ``round``.
    """
    byte_count = sum(
        os.path.getsize(os.path.join(folder, name))
        for folder, _subfolders, names in os.walk(path)
        for name in names
    )
    return round(byte_count / 1024)

def sort_age_gen_occ(df):
    """Count rows per (age, gender) group and rank the groups by size.

    Every age label that occurs in ``df.age`` is combined with the genders
    "male" and "female".

    Returns
    -------
    tuple
        ``(counts, ordered)`` where ``counts`` maps ``(age, gender)`` to the
        number of matching rows and ``ordered`` lists the same keys from the
        smallest to the largest group.
    """
    counts = {}
    for age_label in df.age.value_counts().index:
        age_mask = df.age == age_label
        for gender_label in ("male", "female"):
            group_mask = age_mask & (df.gender == gender_label)
            counts[(age_label, gender_label)] = int(group_mask.sum())
    ordered = sorted(counts.keys(), key=lambda group: counts[group])
    return counts, ordered

def print(p=True, *args, **kwargs):
    """Verbosity-aware replacement for the built-in ``print``.

    When the first argument is a ``bool`` it acts as an on/off switch: the
    remaining arguments are printed only if it is ``True``. Any other first
    argument is treated as ordinary output and printed together with the
    rest, so plain ``print("text")`` calls keep working.

    The real printing is delegated to ``builtins.print``. The ``__builtins__``
    name is deliberately avoided: it is a module only in ``__main__`` and a
    plain dict in imported modules, where ``__builtins__.print`` would fail.

    Returns
    -------
    None
    """
    if isinstance(p, bool):
        if p:
            return builtins.print(*args, **kwargs)
        return None
    return builtins.print(p, *args, **kwargs)


In [56]:
def _swap_extension(clips, extension):
    """Return clip names with everything after the first '.' replaced by ``extension``."""
    suffix = extension.split('.')[-1]
    return [clip.split('.')[0] + '.' + suffix for clip in clips]


def downsample_language(path, new_size_kB, extension=".wav", p=False):
    """Pick a subset of clips so the dataset shrinks to about ``new_size_kB``.

    The number of clips to keep is the row count of ``validated.tsv`` scaled by
    ``new_size_kB / <current size of the clips folder>``. Clips are drawn per
    (age, gender) group, smallest group first; a group that is too small is
    taken entirely and its shortfall is redistributed over the groups still to
    come. Whatever is still missing afterwards is drawn from the rows that do
    not belong to any (age, male/female) group.

    Parameters
    ----------
    path : str
        Language folder containing ``validated.tsv`` and a ``clips`` directory.
    new_size_kB : int or float
        Target size in kB (same unit as ``get_size``).
    extension : str
        Extension put on the returned clip names.
    p : bool
        When True, progress information is printed.

    Returns
    -------
    list of str
        Selected clip file names with ``extension`` applied. If the clips
        folder is already no larger than ``new_size_kB`` every clip listed in
        ``validated.tsv`` is returned.
    """
    # A trailing slash in the argument must not turn into '//' in derived paths.
    path = path.rstrip('/') + '/'
    old_size_kB = get_size(path + "clips")
    print(p, old_size_kB)
    df = pd.read_csv(path + "validated.tsv", sep="\t", usecols=["path", "age", "gender"],
                     dtype={"path": str, "age": str, "gender": str}, na_values=['NA'])

    if old_size_kB <= new_size_kB:
        print(p, "\nthe dataset cannot be downsampled to this size because it is not large enough")
        return _swap_extension(df.path, extension)

    clips_needed = int((new_size_kB / old_size_kB) * len(df))
    # Two genders ("male", "female") are considered per age label.
    group_count = len(df.age.value_counts()) * 2
    # Without any age label there are no groups; everything comes from the remainder below.
    clips_per_group = math.ceil(clips_needed / group_count) if group_count else 0

    age_gen_occ_dict, age_gen_occ_list = sort_age_gen_occ(df)

    selected_clips = []
    print(p, "data length:", len(df), "\t\tclips needed:", clips_needed)
    for groups_done, (age, gender) in enumerate(age_gen_occ_list, start=1):
        current = df[(df.age == age) & (df.gender == gender)]
        if len(current) < clips_per_group:
            print(p, "\tclips_per_group was", clips_per_group)
            clips_needed -= len(current)
            groups_left = group_count - groups_done
            # Spread what is still missing over the groups not yet visited.
            if groups_left:
                clips_per_group = math.ceil(clips_needed / groups_left)
            else:
                clips_per_group = clips_needed
            now_selected = current
            selected_clips += list(now_selected.path)
            print(p, "\tnot enough clips for", age, gender, "so appending", len(now_selected))
            print(p, "\tclips_per_group now is", clips_per_group)
        else:
            now_selected = current.sample(clips_per_group)
            selected_clips += list(now_selected.path)
            clips_needed -= clips_per_group
            print(p, "\tenough clips for", age, gender,
                  "(" + str(age_gen_occ_dict[(age, gender)]), "total), so appending", len(now_selected))
        # Remove what was taken so the remainder step cannot pick it again.
        df = df.drop(now_selected.index)
        print(p, "\ndata length:", len(df), "\t\tclips needed:", clips_needed)

    if clips_needed > 0:
        if len(df) >= clips_needed:
            # Enough rows are left over to fill the gap by random sampling.
            print(p, "\nnot enough labeled clips found, so appending", clips_needed, "unlabeled ones")
            selected_clips += list(df.sample(clips_needed).path)
        else:
            # Sampling more rows than exist would raise; take all that is left.
            print(p, "\nthe dataset cannot be downsampled to this size because it is not large enough")
            selected_clips += list(df.path)

    return _swap_extension(selected_clips, extension)
        
    
#len(downsample_language("speech_data/Dutch", 36000))


In [14]:
def split_train_test_validate(data, weights):
    """Randomly split ``data`` into train / test / validate parts.

    Parameters
    ----------
    data : sequence
        Items to distribute. The argument itself is left untouched; a shuffled
        copy is split.
    weights : dict or sequence
        Either a dict with the keys "train", "test" and "validate", or a
        sequence whose first three items are the weights in that order.
        Weights are relative and do not have to sum to 1.

    Returns
    -------
    dict or None
        ``{"train": [...], "test": [...], "validate": [...]}``, or ``None``
        (after printing a message) when ``weights`` is neither a dict nor
        indexable with 0, 1 and 2.
    """
    if not isinstance(weights, dict):
        try:
            weights = {"train": weights[0], "test": weights[1], "validate": weights[2]}
        except (TypeError, IndexError, KeyError):
            print("Geen iterable")
            return None

    total_weight = sum(weights.values())

    shuffled = list(data)
    random.shuffle(shuffled)

    # Multiply before dividing: for integer weights the products are exact, so a
    # cut that lands exactly on an integer is not pushed up by float rounding
    # (e.g. 100 * 0.07 evaluates to slightly more than 7).
    size = len(shuffled)
    train_end = math.ceil(size * weights["train"] / total_weight)
    test_end = math.ceil(size * (weights["train"] + weights["test"]) / total_weight)

    return {
        "train": shuffled[:train_end],
        "test": shuffled[train_end:test_end],
        "validate": shuffled[test_end:],
    }
    
#len(split_train_test_validate(downsample_language("Data/nl", 3000), [7, 2, 1])["train"])


In [62]:
def downsample_wrapped(path, new_size_kB, extension=".wav", p=False, destination_name="/general"):
    """Downsample a language folder and copy the selected clips into one directory.

    The target directory is ``Downsampled/<language>/<destination_name>``, where
    ``<language>`` is the last component of ``path``. An existing target is
    removed first and then recreated, including any intermediate directories.
    File operations use ``shutil``/``os`` instead of shell commands so paths
    with spaces work and no external ``rm``/``mkdir``/``cp`` is required.

    Parameters
    ----------
    path : str
        Language folder handed on to ``downsample_language``.
    new_size_kB : int or float
        Target size handed on to ``downsample_language``.
    extension : str
        Extension of the clip files to copy.
    p : bool
        When True, progress and copy failures are printed.
    destination_name : str
        Sub-path below ``Downsampled/<language>``; empty components are dropped.
    """
    path = path.rstrip('/') + '/'
    language = path.split('/')[-2]
    destination = '/'.join(["Downsampled", language] +
                           [c for c in destination_name.split('/') if c])

    # Start from an empty target directory.
    shutil.rmtree(destination, ignore_errors=True)
    os.makedirs(destination, exist_ok=True)

    samples = downsample_language(path, new_size_kB, extension, p)
    for sample in samples:
        source = path + "clips/" + sample
        try:
            shutil.copy(source, destination)
        except OSError as error:
            # Best effort, as before: a clip that cannot be copied is skipped.
            print(p, "could not copy", source, "to", destination, "-", error)
    
#downsample_wrapped("Data/nl", 3000, extension=".mp3")


In [68]:
def downsample_split_wrapped(path, new_size_kB, split_weights, extension=".wav", p=False):
    """Downsample a language folder and copy the clips into train/test/validate folders.

    Clips chosen by ``downsample_language`` are divided by
    ``split_train_test_validate`` and copied to
    ``Downsampled/<language>/<split>``, where ``<language>`` is the last
    component of ``path``. Each split directory is emptied before copying.
    File operations use ``shutil``/``os`` instead of shell commands so paths
    with spaces work and no external ``rm``/``mkdir``/``cp`` is required.

    Parameters
    ----------
    path : str
        Language folder handed on to ``downsample_language``.
    new_size_kB : int or float
        Target size handed on to ``downsample_language``.
    split_weights : dict or sequence
        Weights handed on to ``split_train_test_validate``.
    extension : str
        Extension of the clip files to copy.
    p : bool
        When True, a line is printed for every copied clip.
    """
    path = path.rstrip('/') + '/'
    language_dir = "Downsampled/" + path.split('/')[-2]
    os.makedirs(language_dir, exist_ok=True)

    samples = downsample_language(path, new_size_kB, extension, p)
    splits = split_train_test_validate(samples, split_weights)
    if splits is None:
        # Invalid weights were already reported by split_train_test_validate.
        return

    for split, split_samples in splits.items():
        destination = language_dir + "/" + split
        shutil.rmtree(destination, ignore_errors=True)
        os.makedirs(destination, exist_ok=True)
        for sample in split_samples:
            # Keep the log format of the former `cp` call: 0 = copied, 1 = failed.
            try:
                shutil.copy(path + "clips/" + sample, destination)
                returncode = 0
            except OSError:
                returncode = 1
            print(p, "completed copying", sample, "to", destination, "with return code", returncode)
            
#downsample_split_wrapped("Data/nl", 3000, [7, 2, 1], extension=".mp3", p=True)


In [19]:
# Read the complete validated-clips table of the Dutch dataset and render it for inspection.
validated_tsv = "E:/Users/Remco/Documents/UvA/Scriptie/Data/nl/" + "validated.tsv"
test = pd.read_csv(validated_tsv, sep="\t")
display(test)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent
0,0d709133bf209da7f0164653b6e5f9aee9d059ffaf7686...,common_voice_nl_17699535.mp3,De Aboriginals zijn de oorspronkelijke bewoner...,2,0,fifties,male,netherlands
1,0fca93407be6d482019f2463e60fbafdf598a82517e63c...,common_voice_nl_17694848.mp3,Mijn toetsenbord zit vol stof.,2,1,,,
2,175d4117110538cc68a8a0157a7f0a681f3e74fbe37e62...,common_voice_nl_18441136.mp3,Ze had de bank beschadigd met haar skateboard.,2,0,,,
3,2bdc4ac33c994aad2f21339eb4b972e1bf1847a67a86c3...,common_voice_nl_19841421.mp3,Waarom belde je me niet even?,2,0,fourties,male,netherlands
4,3657f0eda48d14c9ae3f324124983c69eb1c973f6fdd34...,common_voice_nl_19573544.mp3,De kinderen moesten zuchten; ze moesten nog ee...,2,0,,,
...,...,...,...,...,...,...,...,...
22949,fd5bc33e80bea96075a41af4e09f4555105bc7dd282236...,common_voice_nl_18323903.mp3,Ze droeg een cocktailjurk om naar het gala te ...,2,0,fifties,male,netherlands
22950,fd5bc33e80bea96075a41af4e09f4555105bc7dd282236...,common_voice_nl_18323904.mp3,Ik vertrek stipt om twee uur.,2,1,fifties,male,netherlands
22951,fd5bc33e80bea96075a41af4e09f4555105bc7dd282236...,common_voice_nl_18323905.mp3,De leerkracht gaf iedere week een toets.,2,0,fifties,male,netherlands
22952,fd5bc33e80bea96075a41af4e09f4555105bc7dd282236...,common_voice_nl_18323907.mp3,Is vijftig meter de Olympische afstand?,2,0,fifties,male,netherlands


# Downsampling and splitting

In [69]:
# Downsample the Dutch clips to 200000 kB and split them 70/15/15 into train/test/validate.
downsample_split_wrapped(
    "E:/Users/Remco/Documents/UvA/Scriptie/Data/nl",
    200000,
    [70, 15, 15],
    extension=".mp3",
    p=True,
)

#downsample_split_wrapped("E:/Users/Remco/Documents/UvA/Scriptie/Data/en", 200000, [70,15,15], extension=".mp3")

#downsample_split_wrapped("E:/Users/Remco/Documents/UvA/Scriptie/Data/es", 200000, [70,15,15], extension=".mp3")

#downsample_split_wrapped("E:/Users/Remco/Documents/UvA/Scriptie/Data/it", 200000, [70,15,15], extension=".mp3")

942814
data length: 22954 		clips needed: 4869
	clips_per_group was 406
	not enough clips for teens female so appending 0
	clips_per_group now is 443

data length: 22954 		clips needed: 4869
	clips_per_group was 443
	not enough clips for fifties female so appending 14
	clips_per_group now is 486

data length: 22940 		clips needed: 4855
	clips_per_group was 486
	not enough clips for thirties female so appending 15
	clips_per_group now is 538

data length: 22925 		clips needed: 4840
	clips_per_group was 538
	not enough clips for sixties female so appending 49
	clips_per_group now is 599

data length: 22876 		clips needed: 4791
	clips_per_group was 599
	not enough clips for fourties female so appending 50
	clips_per_group now is 678

data length: 22826 		clips needed: 4741
	clips_per_group was 678
	not enough clips for sixties male so appending 245
	clips_per_group now is 750

data length: 22581 		clips needed: 4496
	clips_per_group was 750
	not enough clips for twenties female so appendi

KeyboardInterrupt: 