In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import glob
import librosa
import librosa.display
from tqdm import tqdm_notebook as tqdm


import warnings
warnings.filterwarnings('ignore')

In [2]:
# Root folder of the training audio. parse_audio_files() is called with this
# path and looks for "*.mp3" files inside one sub-folder per species code.
train_dir = '../input/birdsong-recognition/train_audio'

In [3]:
# Species code -> integer class label. The label of a code is simply its
# position in the ordered tuple below (0 through 49).
BIRD_CODE = {
    code: label
    for label, code in enumerate((
        'aldfly', 'ameavo', 'amebit', 'amecro', 'amegfi',
        'amekes', 'amepip', 'amered', 'amerob', 'amewig',
        'amewoo', 'amtspa', 'annhum', 'astfly', 'baisan',
        'baleag', 'balori', 'banswa', 'barswa', 'bawwar',
        'belkin1', 'belspa2', 'bewwre', 'bkbcuc', 'bkbmag1',
        'bkbwar', 'bkcchi', 'bkchum', 'bkhgro', 'bkpwar',
        'bktspa', 'blkpho', 'blugrb1', 'blujay', 'bnhcow',
        'boboli', 'bongul', 'brdowl', 'brebla', 'brespa',
        'brncre', 'brnthr', 'brthum', 'brwhaw', 'btbwar',
        'btnwar', 'btywar', 'buffle', 'buggna', 'buhvir',
    ))
}

# Reverse lookup: integer class label -> species code.
INV_BIRD_CODE = dict(zip(BIRD_CODE.values(), BIRD_CODE.keys()))

In [4]:
 def split_sound(row_number):
     """Load a clip and locate the intervals that are louder than a threshold.

     The ``top_db`` threshold handed to ``librosa.effects.split`` is computed
     from the clip itself: the mean of the absolute dB values minus the
     standard deviation of the dB values.

     Args:
         row_number: Identifier forwarded to ``get_audio_path`` (defined
             elsewhere in the notebook) to resolve the audio file path.

     Returns:
         Tuple ``(x, sr, x_split)``: the samples and sample rate returned by
         ``librosa.load`` (called without an explicit ``sr``) and the
         intervals returned by ``librosa.effects.split``.
     """
     # get_audio_path yields a (species, path) pair; only the path is used.
     _species, audio_path = get_audio_path(row_number)
     x, sr = librosa.load(audio_path)
     # Every step below works on the same buffer `x` that is returned, so the
     # intervals index into exactly the samples the caller receives.
     db = librosa.core.amplitude_to_db(x)
     mean_db = np.abs(db).mean()
     std_db = db.std()
     x_split = librosa.effects.split(y=x, top_db=mean_db - std_db)
     return x, sr, x_split

In [5]:
def remove_silence(clip):
    """Return a clip with everything outside its loud intervals cut out.

    Args:
        clip: Identifier passed unchanged to ``split_sound``.

    Returns:
        Tuple ``(silence_removed, sr)``: an array made of the intervals
        reported by ``split_sound`` joined back to back (empty when no
        interval was reported) and the sample rate reported by
        ``split_sound``.
    """
    sound, sr, intervals = split_sound(clip)
    sound = np.asarray(sound)
    if len(intervals) == 0:
        # np.concatenate rejects an empty sequence; an empty copy keeps the
        # sample dtype while still signalling "nothing left".
        return sound[:0].copy(), sr
    silence_removed = np.concatenate([sound[start:end] for start, end in intervals])
    return silence_removed, sr

# Extract features using MFCCs

In [6]:
def mfcc_extract(filename):
    """Compute one time-averaged MFCC vector for an audio file.

    The file is loaded at 44100 Hz, the intervals reported by
    ``librosa.effects.split`` are joined back to back, and 39 MFCCs are
    computed on the result with a 20 ms window and a 10 ms hop. The
    coefficients are then averaged over time.

    Args:
        filename: Path of the audio file to load.

    Returns:
        Array with 39 values (one mean per coefficient), or ``None`` when the
        file could not be processed or no interval survived the split.
    """
    try:
        y, sr = librosa.load(filename, sr=44100)
        db = librosa.core.amplitude_to_db(y)
        mean_db = np.abs(db).mean()
        std_db = db.std()
        # Threshold is clip-relative: mean absolute dB minus the dB spread.
        y_split = librosa.effects.split(y=y, top_db=mean_db - std_db)
        if len(y_split) == 0:
            # Nothing to analyse; report it the same way as any other failure.
            return None
        silence_removed = np.concatenate([y[start:end] for start, end in y_split])
        mfcc = librosa.feature.mfcc(
            y=silence_removed,
            sr=sr,
            n_mfcc=39,
            n_fft=int(0.02 * sr),
            hop_length=int(0.01 * sr),
        )
        return np.mean(mfcc.T, axis=0)
    except Exception as exc:
        # Best effort: one unreadable clip must not abort a multi-hour run.
        # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt through.
        print('mfcc_extract: skipping', filename, '-', repr(exc))
        return None

In [7]:
def parse_audio_files(parent_dir, sub_dirs):
    """Extract an MFCC feature vector for every clip of every known species.

    The species folders that are scanned come from the module-level
    ``INV_BIRD_CODE`` mapping (label -> folder name), not from ``sub_dirs``.

    Args:
        parent_dir: Directory containing one sub-folder of ``*.mp3`` files
            per species code.
        sub_dirs: Not used by this function; kept so existing calls that pass
            it keep working.

    Returns:
        Tuple ``(features, labels)`` of equal-length lists: the value returned
        by ``mfcc_extract`` for each clip and the integer label of the folder
        the clip came from. Clips for which ``mfcc_extract`` returns ``None``
        are left out of both lists.
    """
    labels = []
    features = []
    for label, sub_dir in tqdm(INV_BIRD_CODE.items()):
        pattern = os.path.join(parent_dir, sub_dir, "*.mp3")
        # Sorted so the output order does not depend on directory listing order.
        for fn in sorted(glob.glob(pattern)):
            feature = mfcc_extract(fn)
            if feature is None:
                # Keeping a None here would turn np.asarray(features) into an
                # object array; drop the clip and its label together instead.
                continue
            features.append(feature)
            labels.append(label)
    return features, labels

In [8]:
%%time

# Every entry directly under the training folder is treated as one species;
# its name is the last '/'-separated component of the path.
train_cat_dirs = glob.glob(train_dir + '/*')
train_cat = [species_dir.rsplit('/', 1)[-1] for species_dir in train_cat_dirs]
print('the number of kinds:', len(train_cat))

class_num = len(train_cat)
# Run the (slow) MFCC extraction over the training clips.
features, labels = parse_audio_files(train_dir, train_cat)


the number of kinds: 264


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


CPU times: user 2h 53min 18s, sys: 11min 29s, total: 3h 4min 47s
Wall time: 2h 51min 41s


In [9]:
# Report how many feature entries were collected, then convert both
# sequences to numpy arrays.
print(len(features))
features, labels = np.asarray(features), np.asarray(labels)

4174


In [10]:
# Write the feature matrix and the matching label vector to .npy files in
# the current working directory.
np.save(file='feature_0_49.npy', arr=features)
np.save(file='label_0_49.npy', arr=labels)