In [2]:
import librosa
from tqdm  import tqdm
import os
import glob
import numpy as np
import pickle
from pathlib import Path

# Load the audio file
# AUDIO_FILE = '/home/amir/uni/ml/mbusic/piano/Annelie_-_Lost.mp3'

In [3]:
file_list = glob.glob("./data/*/*.mp3")

In [4]:
def file_seperate(file_list):
    data = dict()
    for x in file_list:
        if os.path.getsize(x) > 100000:  # 10 K!
            key = x.split('/')[2]
            data.setdefault(key, list()).append(x)
    return data

In [5]:
data = file_seperate(file_list)
print([(k, len(v)) for k,v in data.items()])

[('santour', 241), ('violin', 258), ('tar', 232), ('setar', 263), ('piano', 240), ('ney', 256)]


In [6]:
class AudioFeature:
    def __init__(self, src_path, fold, label):
        self.src_path = src_path
        self.fold = fold
        self.label = label
        self.y, self.sr = librosa.load(self.src_path, mono=True)
        self.features = None

    def _concat_features(self, feature):
        """
        Whenever a self._extract_xxx() method is called in this class,
        this function concatenates to the self.features feature vector
        """
        self.features = np.hstack(
            [self.features, feature] if self.features is not None else feature
        )

    def _extract_mfcc(self, n_mfcc=25):
        mfcc = librosa.feature.mfcc(self.y, sr=self.sr, n_mfcc=n_mfcc)

        mfcc_mean = mfcc.mean(axis=1).T
        mfcc_std = mfcc.std(axis=1).T
        mfcc_feature = np.hstack([mfcc_mean, mfcc_std])
        self._concat_features(mfcc_feature)

    def _extract_spectral_contrast(self, n_bands=3):
        spec_con = librosa.feature.spectral_contrast(
            y=self.y, sr=self.sr, n_bands=n_bands
        )

        spec_con_mean = spec_con.mean(axis=1).T
        spec_con_std = spec_con.std(axis=1).T
        spec_con_feature = np.hstack([spec_con_mean, spec_con_std])
        self._concat_features(spec_con_feature)

    def _extract_chroma_stft(self):
        stft = np.abs(librosa.stft(self.y))
        chroma_stft = librosa.feature.chroma_stft(S=stft, sr=self.sr)
        chroma_mean = chroma_stft.mean(axis=1).T
        chroma_std = chroma_stft.std(axis=1).T
        chroma_feature = np.hstack([chroma_mean, chroma_std])
        self._concat_features(chroma_feature)

    def extract_features(self, *feature_list, save_local=True):
        """
        Specify a list of features to extract, and a feature vector will be
        built for you for a given Audio sample.
        By default the extracted feature and class attributes will be saved in
        a local directory. This can be turned off with save_local=False.
        """
        extract_fn = dict(
            mfcc=self._extract_mfcc,
            spectral=self._extract_spectral_contrast,
            chroma=self._extract_chroma_stft,
        )

        for feature in feature_list:
            extract_fn[feature]()

        if save_local:
            self._save_local()

    def _save_local(self, clean_source=True):
        out_name = self.src_path.split("/")[-1]
        out_name = out_name.replace(".mp3", "")

        filename = f"./data/{self.label}/fold{self.fold}/{out_name}.pkl"
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, "wb") as f:
            pickle.dump(self, f)

        if clean_source:
            self.y = None

In [7]:
import warnings
warnings.filterwarnings('ignore')

In [8]:
audio_features = []
feature_matrix = []
labels = []
folds = []
fold=6
for key, value in data.items():
    print(key)
    for audio_file in tqdm(value):
        # print(audio_file)
        fn = audio_file.split("/")[-1]
        fn = fn.replace(".mp3", "")
        filename = f"./data/{key}/fold{fold}/{fn}.pkl"
        if os.path.isfile(filename):
            with open(filename, 'rb') as f:
                audio = pickle.load(f)
                # audio_features.append(audio.features)
                feature_matrix.append(audio.features)
                labels.append(audio.label)
                folds.append(audio.fold)
        else:
            audio = AudioFeature(audio_file, 5, label=key)
            audio.extract_features("mfcc", "spectral", "chroma", save_local=True)
            # audio_features.append(audio)

santour


100%|██████████| 241/241 [00:07<00:00, 32.28it/s]


violin


100%|██████████| 258/258 [00:06<00:00, 42.37it/s]


tar


100%|██████████| 232/232 [00:06<00:00, 35.97it/s]


setar


100%|██████████| 263/263 [00:08<00:00, 30.99it/s]


piano


100%|██████████| 240/240 [00:05<00:00, 40.58it/s]


ney


100%|██████████| 256/256 [00:06<00:00, 42.21it/s]


In [9]:
X = np.vstack(feature_matrix)
y = np.array(labels)

folds = np.array(folds)

In [29]:
folds.shape

(1490,)

In [39]:
sam = [6 for i in range(folds.shape[0])] 

In [43]:
sam = np.array(sam)

In [14]:
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [23]:
print(X.shape)

(1490, 82)


In [17]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
model_cfg = dict(
    model=RandomForestClassifier(
        random_state=42,
        n_jobs=10,
        class_weight="balanced",
        n_estimators=500,
        bootstrap=True,
    ),
)

In [13]:
from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
import random


In [15]:
logo = LeaveOneGroupOut()
val_fold_scores = []
for train_index, test_index in logo.split(X, y, sam):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    ss = StandardScaler(copy=True)
    X_train = ss.fit_transform(X_train)
    X_test = ss.transform(X_test)

    clf = RandomForestClassifier(
        random_state=42,
        n_jobs=10,
        class_weight="balanced",
        n_estimators=500,
        bootstrap=True,
    )
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    fold_acc = accuracy_score(y_test, y_pred)
    val_fold_scores.append(fold_acc)

NameError: name 'sam' is not defined

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

ss = StandardScaler(copy=True)
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

clf = RandomForestClassifier(
    random_state=42,
    n_jobs=10,
    class_weight="balanced",
    n_estimators=500,
    bootstrap=True,
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

fold_acc = accuracy_score(y_test, y_pred)

In [19]:
encoder.classes_

array(['ney', 'piano', 'santour', 'setar', 'tar', 'violin'], dtype='<U7')

In [47]:
y_test

array([2, 0, 5, 3, 1, 2, 1, 3, 0, 3, 3, 2, 4, 1, 3, 1, 5, 3, 1, 0, 1, 3,
       1, 3, 3, 5, 5, 0, 3, 2, 2, 1, 1, 4, 2, 5, 4, 2, 3, 3, 1, 4, 1, 0,
       3, 2, 3, 3, 5, 4, 5, 2, 1, 2, 3, 1, 5, 4, 3, 1, 3, 1, 0, 0, 4, 0,
       4, 1, 5, 3, 0, 1, 3, 4, 4, 4, 0, 1, 2, 3, 5, 4, 0, 3, 0, 0, 2, 0,
       2, 2, 4, 3, 5, 1, 5, 4, 5, 0, 3, 5, 1, 3, 0, 5, 5, 3, 4, 1, 2, 0,
       2, 4, 4, 5, 4, 5, 0, 2, 1, 3, 0, 4, 3, 5, 1, 4, 1, 3, 1, 3, 5, 5,
       0, 1, 1, 5, 3, 2, 2, 1, 0, 2, 2, 3, 1, 3, 4, 2, 5, 4, 5, 1, 4, 2,
       5, 2, 5, 4, 4, 3, 5, 5, 3, 0, 5, 1, 1, 2, 0, 2, 4, 4, 0, 4, 2, 0,
       5, 1, 4, 5, 4, 3, 4, 5, 3, 1, 3, 5, 0, 5, 5, 5, 0, 0, 3, 0, 4, 4,
       4, 4, 3, 4, 2, 0, 4, 2, 1, 0, 0, 5, 1, 3, 5, 1, 4, 0, 3, 4, 0, 4,
       4, 1, 1, 5, 2, 1, 2, 1, 3, 0, 5, 1, 2, 2, 2, 1, 2, 5, 5, 4, 2, 2,
       1, 5, 4, 5, 3, 3, 5, 1, 5, 0, 1, 0, 3, 2, 3, 0, 2, 0, 1, 3, 4, 5,
       2, 1, 5, 5, 2, 1, 1, 4, 5, 4, 5, 0, 0, 4, 4, 5, 0, 0, 1, 4, 2, 2,
       1, 1, 0, 0, 0, 5, 0, 0, 0, 0, 4, 1])

In [20]:
encoder.inverse_transform(y_test)

array(['santour', 'ney', 'violin', 'setar', 'piano', 'santour', 'piano',
       'setar', 'ney', 'setar', 'setar', 'santour', 'tar', 'piano',
       'setar', 'piano', 'violin', 'setar', 'piano', 'ney', 'piano',
       'setar', 'piano', 'setar', 'setar', 'violin', 'violin', 'ney',
       'setar', 'santour', 'santour', 'piano', 'piano', 'tar', 'santour',
       'violin', 'tar', 'santour', 'setar', 'setar', 'piano', 'tar',
       'piano', 'ney', 'setar', 'santour', 'setar', 'setar', 'violin',
       'tar', 'violin', 'santour', 'piano', 'santour', 'setar', 'piano',
       'violin', 'tar', 'setar', 'piano', 'setar', 'piano', 'ney', 'ney',
       'tar', 'ney', 'tar', 'piano', 'violin', 'setar', 'ney', 'piano',
       'setar', 'tar', 'tar', 'tar', 'ney', 'piano', 'santour', 'setar',
       'violin', 'tar', 'ney', 'setar', 'ney', 'ney', 'santour', 'ney',
       'santour', 'santour', 'tar', 'setar', 'violin', 'piano', 'violin',
       'tar', 'violin', 'ney', 'setar', 'violin', 'piano', 'setar', 

In [48]:
fold_acc

0.9060402684563759

In [28]:
sample = X_test[0]
pred = clf.predict([sample])
pred_label = encoder.inverse_transform([pred])
real_label = encoder.inverse_transform([y_test[0]])
print(f"real lable is {real_label} and prediction is {pred_label}")

real lable is ['santour'] and prediction is ['santour']


In [25]:
y_test[0]

2