In [1]:
import sys, os
import torch
import librosa
import numpy as np
import pandas as pd
from torch import Tensor
from scipy.io import wavfile
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
data_root = "./dataset"
train = pd.read_csv(os.path.join(data_root, "train.csv"))
test = pd.read_csv(os.path.join(data_root, "sample_submission.csv"))


In [31]:
LABELS = list(train.label.unique())
label_idx = {label: i for i, label in enumerate(LABELS)}
train.set_index("fname", inplace=True)
test.set_index("fname", inplace=True)
train["label_idx"] = train.label.apply(lambda x: label_idx[x])

In [34]:
class Config(object):
    def __init__(self,
                 sampling_rate=16000, audio_duration=2, n_classes=41,
                 use_mfcc=False, n_folds=10, learning_rate=0.0001,
                 max_epochs=50, n_mfcc=20):
        self.sampling_rate = sampling_rate
        self.audio_duration = audio_duration
        self.n_classes = n_classes
        self.use_mfcc = use_mfcc
        self.n_mfcc = n_mfcc
        self.n_folds = n_folds
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs

        self.audio_length = self.sampling_rate * self.audio_duration
        if self.use_mfcc:
            self.dim = (self.n_mfcc, 1 + int(np.floor(self.audio_length/512)), 1)
        else:
            self.dim = (self.audio_length, 1)
def prepare_data(df, config, data_dir):
    X = np.empty(shape=(df.shape[0], config.dim[0], config.dim[1], 1))
    input_length = config.audio_length
    for i, fname in enumerate(df.index):
        #print(fname)
        file_path = data_dir + fname
        data, _ = librosa.core.load(file_path, sr=config.sampling_rate, res_type="kaiser_fast")

        # Random offset / Padding
        if len(data) > input_length:
            max_offset = len(data) - input_length
            offset = np.random.randint(max_offset)
            data = data[offset:(input_length+offset)]
        else:
            if input_length > len(data):
                max_offset = input_length - len(data)
                offset = np.random.randint(max_offset)
            else:
                offset = 0
            data = np.pad(data, (offset, input_length - len(data) - offset), "constant")

        data = librosa.feature.mfcc(data, sr=config.sampling_rate, n_mfcc=config.n_mfcc)
        data = np.expand_dims(data, axis=-1)
        X[i,] = data
        if i%100==0: print(fname)
    return X

In [36]:
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)

In [53]:
print(mean[:,0,0])
print(np.mean(mean,axis=1)[:,0])

[-4.71195017e+02  8.67218923e+01 -8.68484048e+00  1.10992938e+01
 -1.13603528e+00  6.67262492e+00 -4.58815642e-01  2.70545461e+00
 -6.28837325e-01  1.13116347e+00 -2.75643224e-01  7.98913112e-01
  9.24784837e-02  6.11031499e-01  9.09000137e-02  6.07228257e-01
  1.40003216e-02  6.77890730e-01  1.93926683e-01  3.94245631e-01
  8.23013944e-02  1.73263664e-01 -9.86073703e-02  2.38401622e-01
  9.51442925e-02  3.17948966e-01  1.08027023e-01  4.29151189e-01
 -6.16326547e-02  3.10527673e-01 -1.57109732e-01  1.34655422e-01
 -1.05907219e-01  2.53557759e-01 -2.18417807e-01  2.03774135e-01
 -4.94232983e-02  1.73134965e-01 -2.83320867e-02  1.83026096e-01]
[-4.66033445e+02  8.23644621e+01 -1.38170029e+01  1.16376241e+01
 -2.57346918e+00  6.90062386e+00 -1.49834213e+00  2.88186548e+00
 -1.59544513e+00  6.68437534e-01 -1.10905199e+00  4.43983283e-01
 -2.79526323e-01  4.82756468e-01 -2.17242062e-01  6.52351035e-01
 -1.07604621e-01  8.49665019e-01  4.35129813e-01  6.16955592e-01
  1.01423591e-01  3.6062

In [55]:
print(std[:,0,0])
print(np.mean(std,axis=1)[:,0])

[221.14725199  87.56803837  39.99952489  24.405279    19.13094649
  14.78184575  13.38119501  11.81655646  10.88142122  10.48501795
   9.70191829   8.91325998   8.91491658   8.31850645   8.0877303
   7.79807173   7.45621931   7.19477697   6.97909748   6.78732797
   6.73065104   6.63933903   6.65777394   6.68400011   6.55432491
   6.29839365   6.04186207   5.6929627    5.52577324   5.46193352
   5.41468163   5.34608553   5.26354739   5.17490999   5.20304415
   5.38430372   5.50170278   5.35272587   4.97838486   4.74544063]
[203.28762364  75.79474719  47.82216709  27.83725255  22.54973322
  17.48948832  16.75118865  14.35648827  13.64101721  13.23492617
  12.62152191  12.06261458  11.9915152   11.92912788  11.6167816
  11.59190331  11.251212    10.82318369  10.44781297  10.16843015
  10.21739503  10.42982771  10.87505503  11.0720104   11.24129758
  10.84473982  10.22632294   9.5676512    9.06891272   8.54451143
   8.51562843   8.54343824   8.15479196   8.00608341   8.40899866
   9.298686

In [72]:
std = std[:,:,0]
std.shape

(40, 173)

In [73]:
np.savez('mean_std.npz', mean=mean,std=std)

In [2]:
import numpy as np
read = np.load('mean_std.npz')

In [3]:
read['mean'].shape

(40, 173)

In [4]:
a = read['mean']

In [15]:
b=np.concatenate((a,a),1)[:,0:187]

In [22]:
a=np.zeros([1,40,40])



In [34]:
a.reshape(-1,40,40).shape

(1, 40, 40)

In [7]:
import numpy as np
import pandas as pd
import librosa as lr
import os

def trim_silence(audio, threshold=0.005, frame_length=512):
    '''Removes silence at the beginning and end of a sample.'''
    if audio.size < frame_length:
        frame_length = audio.size
    energy = lr.feature.rmse(audio, frame_length=frame_length)
    frames = np.nonzero(energy > threshold)
    indices = lr.core.frames_to_samples(frames)[1]
    # Note: indices can be an empty array, if the whole audio was silence.
    return audio[indices[0]:indices[-1]] if indices.size else audio
def quantize_data(data, classes):
    mu_x = mu_law_encoding(data, classes)
    bins = np.linspace(-1, 1, classes)
    quantized = np.digitize(mu_x, bins) - 1
    return quantized
def mu_law_encoding(data, mu):
    mu_x = np.sign(data) * np.log(1 + mu * np.abs(data)) / np.log(mu + 1)
    return mu_x

def process_data1():
    sampling_rate = 16000
    mono=True
    classes = 256
    dtype = torch.FloatTensor
    ltype = torch.LongTensor
    data_root = "dataset"
    data_dir = os.path.join(data_root, "audio_train")
    csv_file = pd.read_csv(os.path.join(data_root, "train.csv"))
    LABELS = list(csv_file.label.unique())
    label_idx = {label: i for i, label in enumerate(LABELS)}
    print("create dataset from audio files at", data_dir)
    dataset_file = os.path.join(data_root, "train.npy")
    processed_files = []
    for i, filename in enumerate(csv_file["fname"]):
        #print("  processed " + str(i) + " of " + str(csv_file.shape[0]) + " files")
        file_data, _ = lr.load(path=os.path.join(data_dir, filename),
                               sr=sampling_rate,
                               mono=mono)
        file_data = trim_silence(file_data)
        # 把音频离散化为256类
        quantized_data = quantize_data(file_data, classes).astype(dtype)
        label = label_idx[csv_file["label"][i]].astype(ltype)
        processed_files.append((quantized_data, label))
                               
    np.savez(dataset_file, *processed_files)
    print('complete')

In [8]:
process_data1()

create dataset from audio files at dataset/audio_train


complete


In [None]:
item_length = model.receptive_field + model.output_length -1    #11138
step_length = 5000
target_length = model.output_length #5000
data_root = "dataset"
data_dir = os.path.join(data_root, "audio_train")
dataset_file = os.path.join(data_root, "train.npy.npz")
dataset_clip_file = os.path.join(data_root, "train_clip.npy")
data = np.load(dataset_file, mmap_mode='r')
processed_files =[]
for i, (file_data, label) in enumerate(data):
    file_len = len(file_data)
    while(file_len>0):
        if file_len < item_length and file_len>=512:
            s = file_data
            for i in range(int(item_length/file_len)):
                s=np.concatenate((s,file_data))
            file_data= s[:item_length]
            processed_files.append((file_data, label))
            file_data = file_data[step_length:]
            file_len = -1
        elif file_len >= item_length:
            processed_files.append((file_data[:item_length], label))
            file_data = file_data[step_length:]
            file_len = len(file_data)
        else : file_len = -1
        


In [29]:
data_root = "dataset"
data_dir = os.path.join(data_root, "audio_train")
dataset_file = os.path.join(data_root, "train.npy.npz")
data = np.load(dataset_file, mmap_mode='r')

In [38]:
minn = 1e10
maxx = -1
for i, key in enumerate(data.files):

    a, l = data[key]
    c=len(a)
    #print(c)
    if c < 6000: print(c,i)
    #if maxx < c: maxx = c
    #if i==10: break
print( maxx)

2048 2


4096 159


4096 172


3072 224


2560 363


2048 375


4608 806


2048 1050


1024 1141


5120 1304


0 1784


0 1913


5120 2387


5632 2727


5632 2991


0 3860


5632 4446


4096 4463


5632 4857


5120 5700


4096 6346


5632 6531


3072 6585


0 6659


0 7358


5120 7447


5120 8305


0 8374


5632 8451


5120 8467


5632 8994


512 9264


-1


In [35]:
len(data.files)

9473

max is 