In [162]:
import torch as torch
import os
import librosa
from matplotlib import pyplot as plt
import numpy as np
import python_speech_features as psf

## 1. Processing Data 

### Input block

In [173]:
disk_root_dir = "/Volumes/EVAN_DISK"
dataset_path = "MASC/VocalSet/FULL"
output_path_train = "MASC/VocalSet/processed/train"
output_path_test = "MASC/VocalSet/processed/test"
segment_length = 2 # seconds
silence_threshold = 0.1
n_mfcc = 13

### Constant block

In [174]:
trainset_f = ["female1","female3","female4","female5","female6","female7","female9"]
trainset_m = ["male1", "male2", "male4","male6","male7","male8","male9","male11"]
train_set = trainset_f + trainset_m
test_set = ["female2", "female8", "male3", "male5", "male10"]
arpeggio_styles = ["belt", "breathy", "fast_forte", "fast_piano", "slow_forte", "slow_piano", "straight", "vibrato", "vocal_fry"]
tasks = ["arpeggios", "long_tones", "scales"]
long_tone_styles = ["forte", "inhaled", "messa", "pp", "straight", "trill", "trillo"]
task_styles = {0:arpeggio_styles, 1:long_tone_styles, 2:arpeggio_styles,
         "arpeggios":arpeggio_styles, "long_tones":long_tone_styles, "scales":arpeggio_styles}
vowels = ["a", "e", "i", "o", "u"]
vowels_dict = {"a":0, "e":1, "i":2, "o":3, "u":4}

output_path_train = os.path.join(disk_root_dir, output_path_train)
output_path_test = os.path.join(disk_root_dir, output_path_test)

### 1.1 walk the directory structure

In [175]:
# do a walk and keep all the good examples
training_data_folders = []
dataset_path = os.path.join(disk_root_dir, dataset_path)
# lets do a walk across all the folders for the training set
for singer in train_set:
    for task in tasks:
        styles = task_styles[task]
        for style in styles:
            path = os.path.join(dataset_path, singer, task, style)
            try:
                os.listdir(path)
                training_data_folders.append(path)
            except:
                print(path, "is missing frrom the dataset")
                
# do the same for testing set
testing_data_folders = []
for singer in test_set:
    for task in tasks:
        styles = task_styles[task]
        for style in styles:
            path = os.path.join(dataset_path, singer, task, style)
            try:
                os.listdir(path)
                testing_data_folders.append(path)
            except:
                print(path, "is missing frrom the dataset")
                

/Volumes/EVAN_DISK/MASC/VocalSet/FULL/female4/long_tones/trill is missing frrom the dataset
/Volumes/EVAN_DISK/MASC/VocalSet/FULL/female2/arpeggios/vibrato is missing frrom the dataset


### 1.2 Seperate audio to segments used to train

In [176]:
import warnings
warnings.filterwarnings(action='once')
counter = 0
training_annotation_file = []
for folder in training_data_folders:
    files = os.listdir(folder)
    for file in files:
        if file[-3:] != "wav":
            continue
        # OBTAIN THE AUDIO DATA AND THE MFCC COEFFICIENTS
        try:
            label = vowels_dict[file[-5]]
            sound_arr, sr= librosa.load(os.path.join(folder, file), sr = 44100)
            sound_arr = (sound_arr-sound_arr.mean())/sound_arr.std()
        except:
            print("fails at " + os.path.join(folder, file))
            continue
        winstep = 441
        mfcc_feat = psf.mfcc(sound_arr, samplerate=sr, winlen=0.02, nfft = 2*441, numcep=13)
        logfbank_feat = psf.logfbank(sound_arr, samplerate=sr, winlen=0.02, nfft = 2*441, nfilt=26)
        ssc_feat = psf.ssc(sound_arr, samplerate=sr, winlen=0.02, nfft = 2*441, nfilt=26)
        full_feat = np.concatenate([mfcc_feat, logfbank_feat, ssc_feat], axis=1)
        # GENERATE A SET OF LABELS, IT CAN EITHER BE A SILENCE, OR A VOWEL
        labels = []
        for i in range(0, full_feat.shape[0]):
            seg = sound_arr[i*441:(i+1)*441]
            if np.abs(seg).mean() <= silence_threshold:
                
                labels.append(5)
            else:
                labels.append(label)
        labels = np.array(labels)
        split = np.floor(len(labels) / segment_length / 100)
        for i in range(0, int(split)):
            np.save(os.path.join(output_path_train, "{}_data.npy".format(counter)), full_feat[i*200:(i+1)*200])
            np.save(os.path.join(output_path_train, "{}_label.npy".format(counter)), labels[i*200:(i+1)*200])
            training_annotation_file.append([os.path.join(output_path_train, "{}_data.npy".format(counter)), 
                                            os.path.join(output_path_train, "{}_label.npy".format(counter))])
            counter = counter + 1
        np.save(os.path.join(output_path_train, "{}_data.npy".format(counter)), full_feat[-200:])
        np.save(os.path.join(output_path_train, "{}_label.npy".format(counter)), labels[-200:])
        training_annotation_file.append([os.path.join(output_path_train, "{}_data.npy".format(counter)), 
                                            os.path.join(output_path_train, "{}_label.npy".format(counter))])
        counter = counter + 1

        import csv
with open(os.path.join(output_path_train, 'annotations.csv'), 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',')
    for item in range(0, len(training_annotation_file)):
        spamwriter.writerow(training_annotation_file[item])
# M.shape = [time in 0.01 sec, number of mel features]

fails at /Volumes/EVAN_DISK/MASC/VocalSet/FULL/female3/scales/fast_piano/scales_fast_piano_f.wav
fails at /Volumes/EVAN_DISK/MASC/VocalSet/FULL/female9/arpeggios/fast_forte/arps_fast_piano_c.wav


In [177]:
counter = 0
testing_annotation_file = []
for folder in testing_data_folders:
    files = os.listdir(folder)
    for file in files:
        if file[-3:] != "wav":
            continue
        # OBTAIN THE AUDIO DATA AND THE MFCC COEFFICIENTS
        try:
            label = vowels_dict[file[-5]]
            sound_arr, sr= librosa.load(os.path.join(folder, file), sr = 44100)
            sound_arr = (sound_arr-sound_arr.mean())/sound_arr.std()
        except:
            print("fails at " + os.path.join(folder, file))
            continue
        winstep = 441
        mfcc_feat = psf.mfcc(sound_arr, samplerate=sr, winlen=0.02, nfft = 2*441, numcep=13)
        logfbank_feat = psf.logfbank(sound_arr, samplerate=sr, winlen=0.02, nfft = 2*441, nfilt=26)
        ssc_feat = psf.ssc(sound_arr, samplerate=sr, winlen=0.02, nfft = 2*441, nfilt=26)
        full_feat = np.concatenate([mfcc_feat, logfbank_feat, ssc_feat], axis=1)
        # GENERATE A SET OF LABELS, IT CAN EITHER BE A SILENCE, OR A VOWEL
        labels = []
        for i in range(0, full_feat.shape[0]):
            seg = sound_arr[i*441:(i+1)*441]
            if np.abs(seg).mean() <= silence_threshold:
                
                labels.append(5)
            else:
                labels.append(label)
        labels = np.array(labels)
        split = np.floor(len(labels) / segment_length / 100)
        for i in range(0, int(split)):
            np.save(os.path.join(output_path_test, "{}_data.npy".format(counter)), full_feat[i*200:(i+1)*200])
            np.save(os.path.join(output_path_test, "{}_label.npy".format(counter)), labels[i*200:(i+1)*200])
            testing_annotation_file.append([os.path.join(output_path_test, "{}_data.npy".format(counter)), 
                                            os.path.join(output_path_test, "{}_label.npy".format(counter))])
            counter = counter + 1
        np.save(os.path.join(output_path_test, "{}_data.npy".format(counter)), full_feat[-200:])
        np.save(os.path.join(output_path_test, "{}_label.npy".format(counter)), labels[-200:])
        testing_annotation_file.append([os.path.join(output_path_test, "{}_data.npy".format(counter)), 
                                            os.path.join(output_path_test, "{}_label.npy".format(counter))])
        counter = counter + 1

        import csv
with open(os.path.join(output_path_test, 'annotations.csv'), 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',')
    for item in range(0, len(testing_annotation_file)):
        spamwriter.writerow(testing_annotation_file[item])
# M.shape = [time in 0.01 sec, number of mel features]


  and should_run_async(code)


fails at /Volumes/EVAN_DISK/MASC/VocalSet/FULL/female2/arpeggios/fast_piano/fast_piano_arps_f.wav
fails at /Volumes/EVAN_DISK/MASC/VocalSet/FULL/female2/scales/slow_piano/f2_scales_f_slow_piano_u(1).wav
fails at /Volumes/EVAN_DISK/MASC/VocalSet/FULL/female2/scales/straight/f2_scales_straight_u(1).wav
fails at /Volumes/EVAN_DISK/MASC/VocalSet/FULL/female2/scales/vibrato/f2_scales_vibrato_a(1).wav
fails at /Volumes/EVAN_DISK/MASC/VocalSet/FULL/female2/scales/vocal_fry/scales_vocal_fry.wav


  if ffdec.available():
  if ffdec.available():


fails at /Volumes/EVAN_DISK/MASC/VocalSet/FULL/female8/arpeggios/belt/._f8_arpeggios_belt_e.wav


  if ffdec.available():
  if ffdec.available():
  if ffdec.available():
  if ffdec.available():


fails at /Volumes/EVAN_DISK/MASC/VocalSet/FULL/female8/arpeggios/vocal_fry/._f8_arpeggios_vocal_fry_a.wav
fails at /Volumes/EVAN_DISK/MASC/VocalSet/FULL/female8/arpeggios/vocal_fry/._f8_arpeggios_vocal_fry_e.wav
fails at /Volumes/EVAN_DISK/MASC/VocalSet/FULL/female8/arpeggios/vocal_fry/._f8_arpeggios_vocal_fry_i.wav


  if ffdec.available():
  if ffdec.available():
  if ffdec.available():
  if ffdec.available():


fails at /Volumes/EVAN_DISK/MASC/VocalSet/FULL/female8/long_tones/inhaled/._f8_long_inhaled_i.wav


  if ffdec.available():
  if ffdec.available():


fails at /Volumes/EVAN_DISK/MASC/VocalSet/FULL/male3/arpeggios/belt/._m3_arpeggios_belt_e.wav


In [178]:
test_csv = os.path.join(output_path_test, 'annotations.csv')
with open(test_csv) as file:
    str_content = file.read()
print(str_content)

/Volumes/EVAN_DISK/MASC/VocalSet/processed/test/0_data.npy,/Volumes/EVAN_DISK/MASC/VocalSet/processed/test/0_label.npy
/Volumes/EVAN_DISK/MASC/VocalSet/processed/test/1_data.npy,/Volumes/EVAN_DISK/MASC/VocalSet/processed/test/1_label.npy
/Volumes/EVAN_DISK/MASC/VocalSet/processed/test/2_data.npy,/Volumes/EVAN_DISK/MASC/VocalSet/processed/test/2_label.npy
/Volumes/EVAN_DISK/MASC/VocalSet/processed/test/3_data.npy,/Volumes/EVAN_DISK/MASC/VocalSet/processed/test/3_label.npy
/Volumes/EVAN_DISK/MASC/VocalSet/processed/test/4_data.npy,/Volumes/EVAN_DISK/MASC/VocalSet/processed/test/4_label.npy
/Volumes/EVAN_DISK/MASC/VocalSet/processed/test/5_data.npy,/Volumes/EVAN_DISK/MASC/VocalSet/processed/test/5_label.npy
/Volumes/EVAN_DISK/MASC/VocalSet/processed/test/6_data.npy,/Volumes/EVAN_DISK/MASC/VocalSet/processed/test/6_label.npy
/Volumes/EVAN_DISK/MASC/VocalSet/processed/test/7_data.npy,/Volumes/EVAN_DISK/MASC/VocalSet/processed/test/7_label.npy
/Volumes/EVAN_DISK/MASC/VocalSet/processed/test/

  and should_run_async(code)
