In [1]:
import pandas as pd
import shutil
import os
import torchaudio
import IPython.display as ipd
import time
import torch
import scipy
import scipy, matplotlib.pyplot as plt
import IPython.display as ipd
import librosa
import librosa, librosa.display
import pandas as pd
import numpy as np
import math

In [2]:
# Location of the TSV file holding the valid (filtered) records, as produced by
# the notebook "Create Valid Records From Voice Common DataFrame.ipynb".
input_tsv_filename = "<path>/<to>/train_valid.tsv"

In [3]:
# Sample rate every clip is resampled to before further processing.
SAMPLE_RATE = 16000
# Duration (in seconds) that each record is cropped or zero-padded to.
MAX_DURATION = 1.6

In [9]:
# Folder that contains the audio clips referenced by the `path` column.
# The location can be overridden through the COMMON_VOICE_CLIPS_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original hard-coded location is used.
basefolder = os.environ.get(
    'COMMON_VOICE_CLIPS_DIR',
    '/media/thalles/81abf123-2564-42b2-acbc-86f4bb0b0ff6/home/thalles/Documents/datasets/en/clips',
)

In [4]:
# Read the tab-separated table of valid records and preview its first rows.
valid_df = pd.read_csv(input_tsv_filename, delimiter="\t")
valid_df.head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent
0,0,common_voice_en_18747125.mp3,"At this time, the settlement was known as Kenn...",2,0,twenties,male,us
1,0,common_voice_en_18747126.mp3,The store now also holds many evening literary...,2,0,twenties,male,us
2,0,common_voice_en_18747127.mp3,The area included was east of the Kennebec River.,2,0,twenties,male,us
3,0,common_voice_en_18747128.mp3,The town was named after the biblical Land of ...,2,0,twenties,male,us
4,0,common_voice_en_18747129.mp3,The two books end exactly the same way.,2,0,twenties,male,us


In [5]:
# Report the table dimensions and how many distinct speakers (classes) it holds.
print("Dataset shape:", valid_df.shape)
num_speakers = np.unique(valid_df["client_id"]).size
print("Number of unique speakers/classes:", num_speakers)

Dataset shape: (3009, 8)
Number of unique speakers/classes: 257


In [6]:
def random_pitch_shift(audio, sample_rate):
    """Shift the pitch of a waveform by a random amount.

    The shift is drawn uniformly from [-1, 1) steps (the ``n_steps`` argument
    of ``librosa.effects.pitch_shift``).

    Args:
        audio: torch tensor holding the waveform; its first dimension is
            treated as the sample axis.
        sample_rate: sample rate of ``audio``, forwarded to librosa as ``sr``.

    Returns:
        A torch tensor with the pitch-shifted waveform, truncated to at most
        the original number of samples.
    """
    n_steps = np.random.uniform(low=-1, high=1)
    original_size = audio.shape[0]
    # `sr` and `n_steps` are passed by keyword: recent librosa releases made
    # them keyword-only, while older releases accept keywords as well.
    shifted = librosa.effects.pitch_shift(audio.numpy(), sr=sample_rate, n_steps=n_steps)
    return torch.from_numpy(shifted[:original_size])

In [7]:
def random_time_stretch(audio):
    """Time-stretch a waveform by a random rate while keeping its length.

    The rate is drawn uniformly from [0.8, 1.2). A rate below 1 yields more
    samples than the input (the excess is cut off); a rate above 1 yields
    fewer samples, in which case the result is right-padded with zeros so the
    returned tensor always has as many samples as the input.

    Args:
        audio: torch tensor holding the waveform; its first dimension is
            treated as the sample axis.

    Returns:
        A torch tensor with exactly ``audio.shape[0]`` samples.
    """
    speed_factor = np.random.uniform(low=0.8, high=1.2)
    original_size = audio.shape[0]
    # `rate` is passed by keyword: recent librosa releases made it
    # keyword-only, while older releases accept the keyword as well.
    stretched = librosa.effects.time_stretch(audio.numpy(), rate=speed_factor)
    stretched = stretched[:original_size]

    # Speeding up shortens the signal; pad so callers get a fixed length back.
    missing = original_size - stretched.shape[0]
    if missing > 0:
        stretched = np.pad(stretched, (0, missing), mode="constant")

    return torch.from_numpy(stretched)

In [8]:
def _remove_silent_frames(audio, top_db):
    indices = librosa.effects.split(audio, top_db=top_db)

    trimmed_audio = []
    for index in indices:
        trimmed_audio.append(audio[index[0]: index[1]])

    return torch.cat(trimmed_audio, dim=0)

In [10]:
def timeit(function):
    """Decorator that prints how long each call to ``function`` takes.

    Args:
        function: the callable to time.

    Returns:
        A wrapper with the same name/docstring as ``function`` that forwards
        all arguments, prints the elapsed time in seconds and returns the
        wrapped call's result unchanged.
    """
    # Local import so the cell does not depend on an extra top-level import.
    import functools

    @functools.wraps(function)  # keep __name__/__doc__ of the wrapped callable
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is not affected by
        # system clock adjustments.
        start = time.perf_counter()
        result = function(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print("Elapsed time:", elapsed)
        return result

    return wrapper

In [11]:
def _prepare_input(audio, duration, sample_rate):
    # get audio duration
    audio_duration_ms = audio.shape[0]

    # get the desired duration in ms
    duration_ms = math.floor(duration * sample_rate)

    # duration: length of the cropped audio in seconds
    if audio_duration_ms <= duration_ms:
        # print("Passed duration greater than audio duration of:", audio_duration_secs)
        audio = torch.nn.functional.pad(audio, (0, duration_ms - audio_duration_ms), "constant", 0)
        return audio
    else:
        idx = np.random.randint(0, audio_duration_ms - duration_ms)
        audio = audio[idx: idx + duration_ms]

    # data augmentation
    p1 = np.random.rand()
    p2 = np.random.rand()
    if p1 < 0.5:
        audio = random_time_stretch(audio)
    elif p2 < 0.5:
        audio = random_pitch_shift(audio, sample_rate)

    return audio

In [12]:
# Transform that converts 48 kHz input audio to the working SAMPLE_RATE.
resample = torchaudio.transforms.Resample(orig_freq=48000, new_freq=SAMPLE_RATE)

In [13]:
dataset = []
labels = []

# Resamplers keyed by source sample rate. The shared 48 kHz transform is
# reused; other rates get their own transform on first use, so a clip recorded
# at a different rate is not resampled as if it were 48 kHz.
resamplers = {48000: resample}

# create dataset
# iterrows() yields (row index, row); the speaker id is read from the row itself.
for row_index, record in valid_df.iterrows():

    filename = record['path']
    target = record['client_id']

    try:
        # Rely on the loader's default normalization (enabled by default), which
        # avoids the keyword whose name differs between torchaudio releases.
        audio, sample_rate = torchaudio.load(os.path.join(basefolder, filename))

        if sample_rate not in resamplers:
            resamplers[sample_rate] = torchaudio.transforms.Resample(sample_rate, new_freq=SAMPLE_RATE)
        audio = resamplers[sample_rate](audio)

        # Mix the channels down to a 1-D signal. For single-channel clips this
        # equals flattening; for multi-channel clips it avoids concatenating
        # the channels one after another.
        audio = audio.mean(dim=0)
        audio = _remove_silent_frames(audio, top_db=40)
        output = _prepare_input(audio, duration=MAX_DURATION, sample_rate=SAMPLE_RATE)

    except Exception:
        # Name the offending file before propagating the error (the original
        # message sat after the raise and could never be printed).
        print("Failed to process audio file:", filename)
        raise

    dataset.append(output)
    labels.append(target)

In [14]:
# Sanity check: every processed record must have exactly one target label.
num_records = len(dataset)
num_targets = len(labels)
print("Dataset size (number of records):", num_records)
print("Targets dataset size:", num_targets)
assert num_records == num_targets, "Dataset and labels lists must match in length"

Dataset size (number of records): 3009
Targets dataset size: 3009


In [15]:
# save the dataset
# save the dataset
output_dir = 'pytorch_dataset'
# Keep only the file name of the input TSV: joining its full path would nest
# (or, for an absolute path, completely replace) the output directory.
output_name = os.path.splitext(os.path.basename(input_tsv_filename))[0] + '.pt'
# torch.save does not create missing folders.
os.makedirs(output_dir, exist_ok=True)
torch.save({"data": dataset, 'targets': labels},
           os.path.join(output_dir, output_name))

In [24]:
# Listen to one processed record.
# When the notebook runs top to bottom, `dataset` is still the list built by
# the processing loop; it only becomes a {"data": ..., "targets": ...} dict
# after the saved file is reloaded in a later cell. Support both so this cell
# works regardless of which of the two is currently bound.
samples = dataset['data'] if isinstance(dataset, dict) else dataset
ipd.Audio(samples[3], rate=SAMPLE_RATE)

In [17]:
# Reload the serialized dataset and measure how long deserialization takes.
import time
import torch

saved_dataset_path = '/home/thalles/daitan-workspace/voice_rocognizer/train_valid.pt'

start = time.time()
# NOTE: torch.load unpickles the file, so only point it at trusted files.
dataset = torch.load(saved_dataset_path)
end = time.time()
print("transforms timer:", end - start)

transforms timer: 0.9649059772491455
