In [1]:
import numpy as np
import pandas as pd

import os
import librosa
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from sklearn.metrics import accuracy_score
from scipy.stats import skew
SAMPLE_RATE = 44100

#from sklearn.model_selection import KFold, RepeatedKFold
from tqdm import tqdm, tqdm_pandas

tqdm.pandas()
import scipy

In [2]:
data_path = "../freesound-audio-tagging"

In [3]:
ss = pd.read_csv(os.path.join(data_path, 'sample_submission.csv'))

In [4]:
audio_train_files = os.listdir(os.path.join(data_path, "audio_train"))
audio_test_files = os.listdir(os.path.join(data_path, "audio_test"))

train = pd.read_csv(os.path.join(data_path, "train.csv"))
submission = pd.read_csv(os.path.join(data_path, "sample_submission.csv"))

In [5]:
#function from EDA kernel: https://www.kaggle.com/codename007/a-very-extensive-freesound-exploratory-analysis
def clean_filename(fname, string):   
    file_name = fname.split('/')[1]
    if file_name[:2] == '__':        
        file_name = string + file_name
    return file_name

#returns mfcc features with mean and standard deviation along time
def get_mfcc(name, path):
    b, _ = librosa.core.load(path + name, sr = SAMPLE_RATE)
    assert _ == SAMPLE_RATE
    try:
        ft1 = librosa.feature.mfcc(b, sr = SAMPLE_RATE, n_mfcc=20)
        ft2 = librosa.feature.zero_crossing_rate(b)[0]
        ft3 = librosa.feature.spectral_rolloff(b)[0]
        ft4 = librosa.feature.spectral_centroid(b)[0]
        ft5 = librosa.feature.spectral_contrast(b)[0]
        ft6 = librosa.feature.spectral_bandwidth(b)[0]
        ft1_trunc = np.hstack((np.mean(ft1, axis=1), np.std(ft1, axis=1), skew(ft1, axis = 1), np.max(ft1, axis = 1), np.min(ft1, axis = 1)))
        ft2_trunc = np.hstack((np.mean(ft2), np.std(ft2), skew(ft2), np.max(ft2), np.min(ft2)))
        ft3_trunc = np.hstack((np.mean(ft3), np.std(ft3), skew(ft3), np.max(ft3), np.min(ft3)))
        ft4_trunc = np.hstack((np.mean(ft4), np.std(ft4), skew(ft4), np.max(ft4), np.min(ft4)))
        ft5_trunc = np.hstack((np.mean(ft5), np.std(ft5), skew(ft5), np.max(ft5), np.min(ft5)))
        ft6_trunc = np.hstack((np.mean(ft6), np.std(ft6), skew(ft6), np.max(ft6), np.min(ft6)))
        return pd.Series(np.hstack((ft1_trunc, ft2_trunc, ft3_trunc, ft4_trunc, ft5_trunc, ft6_trunc)))
    except:
        raise ValueError
        pass
    #    print('bad file')
    #    return pd.Series([0]*125)

In [6]:
#preparing data
train_data = pd.DataFrame()
train_data['fname'] = train['fname']
test_data = pd.DataFrame()
test_data['fname'] = audio_test_files

train_data = train_data['fname'].progress_apply(get_mfcc, path=os.path.join(data_path, "audio_train/"))
print('done loading train mfcc')
test_data = test_data['fname'].progress_apply(get_mfcc, path=os.path.join(data_path, "audio_test/"))
print('done loading test mfcc')

train_data['fname'] = train['fname']
test_data['fname'] = audio_test_files
train_data['label'] = train['label']
test_data['label'] = np.zeros((len(audio_test_files)))

100%|██████████| 9473/9473 [42:34<00:00,  2.68it/s]  
  0%|          | 2/9400 [00:00<11:28, 13.64it/s]

done loading train mfcc


  2%|▏         | 221/9400 [00:49<43:21,  3.53it/s]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [7]:
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,115,116,117,118,119,120,121,122,123,124
0,-446.838467,52.3723,-53.377823,-3.740264,-8.318184,-11.119042,-14.467405,-10.285843,-10.966904,-2.670278,...,16.096142,3.326795,0.937812,33.705153,7.055138,1989.168387,775.044969,0.714498,3543.959962,1122.194058
1,-375.754678,151.572502,-41.933635,-0.165071,5.955898,-12.535086,-13.192531,-13.657687,-9.174413,-19.668802,...,33.531276,5.912794,-1.262496,60.328617,6.676251,906.662232,506.447373,3.305549,3388.692162,506.16207
2,-658.236035,102.368917,20.626247,9.108918,-4.891736,7.259463,-1.86253,1.661778,5.499974,5.338336,...,18.181105,5.856589,0.628735,35.780019,8.280023,2840.134501,611.530727,-0.202585,3631.676275,1516.874775
3,-770.90545,16.077204,-3.556132,-10.957762,-19.071291,-9.632298,4.279781,14.633798,19.459229,12.089039,...,21.089348,3.05948,0.818964,36.509611,13.943471,2758.526365,772.74194,-0.909257,3553.92427,1216.427458
4,-343.603965,137.873119,-28.701815,8.59318,2.751699,-0.301433,-9.502459,-3.774746,13.056789,7.197487,...,22.7689,4.313505,-0.279604,35.938348,8.232091,1639.31225,657.051222,1.086833,3113.026574,1088.557747


In [None]:
#Features from LightGBM baseline kernel: https://www.kaggle.com/opanichev/lightgbm-baseline
# MAPk from https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])


def extract_features(files, path):
    features = {}

    cnt = 0
    for f in tqdm(files):
        features[f] = {}

        fs, data = scipy.io.wavfile.read(os.path.join(path, f))

        abs_data = np.abs(data)
        diff_data = np.diff(data)

        def calc_part_features(data, n=2, prefix=''):
            f_i = 1
            for i in range(0, len(data), len(data)//n):
                features[f]['{}mean_{}_{}'.format(prefix, f_i, n)] = np.mean(data[i:i + len(data)//n])
                features[f]['{}std_{}_{}'.format(prefix, f_i, n)] = np.std(data[i:i + len(data)//n])
                features[f]['{}min_{}_{}'.format(prefix, f_i, n)] = np.min(data[i:i + len(data)//n])
                features[f]['{}max_{}_{}'.format(prefix, f_i, n)] = np.max(data[i:i + len(data)//n])

        features[f]['len'] = len(data)
        if features[f]['len'] > 0:
            n = 1
            calc_part_features(data, n=n)
            calc_part_features(abs_data, n=n, prefix='abs_')
            calc_part_features(diff_data, n=n, prefix='diff_')

            n = 2
            calc_part_features(data, n=n)
            calc_part_features(abs_data, n=n, prefix='abs_')
            calc_part_features(diff_data, n=n, prefix='diff_')

            n = 3
            calc_part_features(data, n=n)
            calc_part_features(abs_data, n=n, prefix='abs_')
            calc_part_features(diff_data, n=n, prefix='diff_')


        cnt += 1

        # if cnt >= 1000:
        #     break

    features = pd.DataFrame(features).T.reset_index()
    features.rename(columns={'index': 'fname'}, inplace=True)
    
    return features

path = os.path.join(data_path, 'audio_train')
train_files = train.fname.values
train_features = extract_features(train_files, path)

path = os.path.join(data_path, 'audio_test')
test_files = ss.fname.values
test_features = extract_features(test_files, path)


  0%|          | 0/9473 [00:00<?, ?it/s][A
  0%|          | 2/9473 [00:00<10:45, 14.67it/s][A
  0%|          | 6/9473 [00:00<06:16, 25.16it/s][A
  0%|          | 10/9473 [00:00<06:34, 23.98it/s][A
  0%|          | 13/9473 [00:00<06:18, 25.02it/s][A
  0%|          | 18/9473 [00:00<05:48, 27.14it/s][A
  0%|          | 24/9473 [00:00<05:01, 31.39it/s][A
  0%|          | 28/9473 [00:00<05:00, 31.40it/s][A
  0%|          | 32/9473 [00:01<04:56, 31.87it/s][A
  0%|          | 36/9473 [00:01<04:53, 32.14it/s][A
  0%|          | 40/9473 [00:01<05:07, 30.69it/s][A
  0%|          | 47/9473 [00:01<04:51, 32.28it/s][A
  1%|          | 54/9473 [00:01<04:40, 33.62it/s][A
  1%|          | 61/9473 [00:01<04:24, 35.65it/s][A
  1%|          | 66/9473 [00:01<04:26, 35.25it/s][A
  1%|          | 71/9473 [00:02<04:24, 35.49it/s][A
  1%|          | 76/9473 [00:02<04:31, 34.64it/s][A
  1%|          | 80/9473 [00:02<04:34, 34.25it/s][A
  1%|          | 86/9473 [00:02<04:26, 35.25it/s][A
  1

  9%|▉         | 874/9473 [00:20<03:24, 42.09it/s][A
  9%|▉         | 882/9473 [00:20<03:23, 42.24it/s][A
  9%|▉         | 889/9473 [00:20<03:22, 42.38it/s][A
  9%|▉         | 895/9473 [00:21<03:22, 42.31it/s][A
 10%|▉         | 903/9473 [00:21<03:21, 42.45it/s][A
 10%|▉         | 909/9473 [00:21<03:21, 42.53it/s][A
 10%|▉         | 915/9473 [00:21<03:21, 42.44it/s][A
 10%|▉         | 920/9473 [00:21<03:21, 42.37it/s][A
 10%|▉         | 927/9473 [00:21<03:21, 42.36it/s][A
 10%|▉         | 933/9473 [00:21<03:21, 42.44it/s][A
 10%|▉         | 941/9473 [00:22<03:20, 42.60it/s][A
 10%|█         | 949/9473 [00:22<03:19, 42.76it/s][A
 10%|█         | 956/9473 [00:22<03:19, 42.69it/s][A
 10%|█         | 963/9473 [00:22<03:18, 42.81it/s][A
 10%|█         | 969/9473 [00:22<03:18, 42.78it/s][A
 10%|█         | 975/9473 [00:22<03:18, 42.80it/s][A
 10%|█         | 981/9473 [00:22<03:18, 42.78it/s][A
 10%|█         | 991/9473 [00:23<03:17, 43.02it/s][A
 11%|█         | 998/9473 [0

 19%|█▊        | 1761/9473 [00:41<03:00, 42.81it/s][A
 19%|█▊        | 1766/9473 [00:41<03:00, 42.75it/s][A
 19%|█▊        | 1773/9473 [00:41<03:00, 42.78it/s][A
 19%|█▉        | 1778/9473 [00:41<02:59, 42.79it/s][A
 19%|█▉        | 1783/9473 [00:41<02:59, 42.76it/s][A
 19%|█▉        | 1789/9473 [00:41<02:59, 42.77it/s][A
 19%|█▉        | 1795/9473 [00:41<02:59, 42.81it/s][A
 19%|█▉        | 1801/9473 [00:42<02:59, 42.85it/s][A
 19%|█▉        | 1807/9473 [00:42<02:59, 42.81it/s][A
 19%|█▉        | 1815/9473 [00:42<02:58, 42.82it/s][A
 19%|█▉        | 1825/9473 [00:42<02:58, 42.95it/s][A
 19%|█▉        | 1831/9473 [00:42<02:57, 42.94it/s][A
 19%|█▉        | 1837/9473 [00:42<02:57, 42.94it/s][A
 19%|█▉        | 1845/9473 [00:42<02:57, 43.01it/s][A
 20%|█▉        | 1851/9473 [00:43<02:57, 43.01it/s][A
 20%|█▉        | 1857/9473 [00:43<02:57, 42.99it/s][A
 20%|█▉        | 1862/9473 [00:43<02:57, 42.93it/s][A
 20%|█▉        | 1867/9473 [00:43<02:57, 42.92it/s][A
 20%|█▉   

 28%|██▊       | 2644/9473 [01:01<02:39, 42.89it/s][A
 28%|██▊       | 2653/9473 [01:01<02:38, 42.95it/s][A
 28%|██▊       | 2660/9473 [01:01<02:38, 42.95it/s][A
 28%|██▊       | 2666/9473 [01:02<02:38, 42.96it/s][A
 28%|██▊       | 2672/9473 [01:02<02:38, 42.99it/s][A
 28%|██▊       | 2678/9473 [01:02<02:38, 42.93it/s][A
 28%|██▊       | 2684/9473 [01:02<02:38, 42.95it/s][A
 28%|██▊       | 2689/9473 [01:02<02:38, 42.90it/s][A
 28%|██▊       | 2696/9473 [01:02<02:37, 42.94it/s][A
 29%|██▊       | 2701/9473 [01:02<02:37, 42.95it/s][A
 29%|██▊       | 2709/9473 [01:03<02:37, 42.99it/s][A
 29%|██▊       | 2715/9473 [01:03<02:37, 42.99it/s][A
 29%|██▊       | 2721/9473 [01:03<02:36, 43.01it/s][A
 29%|██▉       | 2727/9473 [01:03<02:37, 42.95it/s][A
 29%|██▉       | 2734/9473 [01:03<02:36, 42.98it/s][A
 29%|██▉       | 2742/9473 [01:03<02:36, 43.04it/s][A
 29%|██▉       | 2749/9473 [01:03<02:36, 43.07it/s][A
 29%|██▉       | 2756/9473 [01:04<02:36, 43.01it/s][A
 29%|██▉  

 38%|███▊      | 3558/9473 [01:21<02:16, 43.44it/s][A
 38%|███▊      | 3563/9473 [01:22<02:16, 43.44it/s][A
 38%|███▊      | 3569/9473 [01:22<02:15, 43.45it/s][A
 38%|███▊      | 3576/9473 [01:22<02:15, 43.47it/s][A
 38%|███▊      | 3583/9473 [01:22<02:15, 43.50it/s][A
 38%|███▊      | 3592/9473 [01:22<02:15, 43.55it/s][A
 38%|███▊      | 3599/9473 [01:22<02:14, 43.56it/s][A
 38%|███▊      | 3608/9473 [01:22<02:14, 43.61it/s][A
 38%|███▊      | 3615/9473 [01:22<02:14, 43.61it/s][A
 38%|███▊      | 3621/9473 [01:23<02:14, 43.60it/s][A
 38%|███▊      | 3627/9473 [01:23<02:14, 43.58it/s][A
 38%|███▊      | 3632/9473 [01:23<02:14, 43.59it/s][A
 38%|███▊      | 3638/9473 [01:23<02:13, 43.60it/s][A
 38%|███▊      | 3643/9473 [01:23<02:13, 43.59it/s][A
 39%|███▊      | 3649/9473 [01:23<02:13, 43.61it/s][A
 39%|███▊      | 3658/9473 [01:23<02:13, 43.66it/s][A
 39%|███▊      | 3665/9473 [01:23<02:13, 43.64it/s][A
 39%|███▉      | 3672/9473 [01:24<02:12, 43.64it/s][A
 39%|███▉ 

In [None]:
train_data = train_data.merge(train_features, on='fname', how='left')
test_data = test_data.merge(test_features, on='fname', how='left')
train_data.head()

In [None]:
#Functions from LightGBM baseline: https://www.kaggle.com/opanichev/lightgbm-baseline
# Construct features set
X = train_data.drop(['label', 'fname'], axis=1)
feature_names = list(X.columns)
X = X.values
labels = np.sort(np.unique(train_data.label.values))
num_class = len(labels)
c2i = {}
i2c = {}
for i, c in enumerate(labels):
    c2i[c] = i
    i2c[i] = c
y = np.array([c2i[x] for x in train_data.label.values])

In [None]:
#fitting xgboost on the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10, shuffle = True)
clf = XGBClassifier(max_depth=5, learning_rate=0.05, n_estimators=3000,
                    n_jobs=-1, random_state=0, reg_alpha=0.2, 
                    colsample_bylevel=0.9, colsample_bytree=0.9)
clf.fit(X_train, y_train)
print(accuracy_score(clf.predict(X_val), y_val))
#more functions from LightGBM baseline: https://www.kaggle.com/opanichev/lightgbm-baseline
def proba2labels(preds, i2c, k=3):
    ans = []
    ids = []
    for p in preds:
        idx = np.argsort(p)[::-1]
        ids.append([i for i in idx[:k]])
        ans.append(' '.join([i2c[i] for i in idx[:k]]))

    return ans, ids

In [None]:
#fitting on the entire data

clf.fit(X, y)
str_preds, _ = proba2labels(clf.predict_proba(test_data.drop(['label', 'fname'], axis = 1).values), i2c, k=3)
# Prepare submission
subm = pd.DataFrame()
subm['fname'] = audio_test_files
subm['label'] = str_preds
subm.to_csv('submission.csv', index=False)