# Preprocessing

Imports

In [1]:
import os
import math
import json
import librosa
import numpy as np
from sklearn.model_selection import train_test_split

Constants declared

In [2]:
# Folder that is walked by `save_mfcc`; each sub-directory name is used as a genre label.
DATASET_PATH = "../raw_data/genre/genres_original/"
# JSON file the extracted MFCC data is written to and later read back from.
# NOTE(review): the literal ends with a space inside the quotes, which looks
# accidental - confirm the name of the file on disk before removing it.
JSON_PATH = "../raw_data/genre/data.json "
SAMPLE_RATE = 22050 # samples per second
DURATION = 30 # seconds per track
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION # samples in one full-length track

Test a file

In [None]:
# Path of a single track, read by the next cell for a quick listening check.
# NOTE(review): `_` is also the name IPython uses for the last cell output;
# a descriptive name would be safer, but the next cell reads `_`, so rename both together.
_ = '../raw_data/genre/genres_original/hiphop/hiphop.00007.wav'

In [None]:
# Render an inline audio player for the sample track selected in the previous cell.
from IPython.display import Audio
Audio(filename=_)

## Retrieve MFCCs

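The function below walks through the dataset path, takes each genre folder name as a label, splits every audio file into segments, computes the MFCCs (via STFT transforms) of each segment, and stores them together with their labels.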

In [3]:
def save_mfcc(dataset_path, json_path, n_mfcc= 13, n_fft= 2048, hop_length= 512, num_segments= 5):
    """Extract MFCCs from the audio files under ``dataset_path`` and save them as JSON.

    Every directory below ``dataset_path`` is treated as one genre: its base
    name is appended to ``"mapping"`` and its position in that list is the
    integer label stored for all segments taken from its files. Each track is
    cut into ``num_segments`` equal slices and one MFCC matrix (frames x
    coefficients) is stored per slice.

    Parameters
    ----------
    dataset_path : str or os.PathLike
        Root folder that is walked; the root itself is not a genre.
    json_path : str or os.PathLike
        File the result is written to (overwritten if it exists).
    n_mfcc : int
        Number of MFCC coefficients per frame.
    n_fft : int
        FFT window size forwarded to librosa.
    hop_length : int
        Hop between successive frames, in samples.
    num_segments : int
        Number of slices each 30-second track is divided into.

    Returns
    -------
    None
        The dictionary with keys ``"mapping"``, ``"mfcc"`` and ``"labels"`` is
        written to ``json_path`` once, after the whole tree has been processed.
    """
    SAMPLE_RATE = 22050
    DURATION = 30 # seconds per track
    SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION

    # dictionary to store data
    data = {
        "mapping": [],
        "mfcc": [],
        "labels": []
    }

    num_samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    # librosa centres its frames by default, so a slice of n samples yields
    # 1 + n // hop_length frames. This equals ceil(n / hop_length) except when
    # n is an exact multiple of hop_length, where ceil() would be one too few
    # and every segment would be rejected.
    expected_nmfcc_vectors_per_segment = 1 + num_samples_per_segment // hop_length

    # os.walk yields plain strings, so normalise the root for the comparison below
    root = os.fspath(dataset_path)

    # loop through all genres
    for dirpath, dirnames, filenames in os.walk(dataset_path):
        # sorting in place fixes the traversal order, which makes the
        # genre -> label assignment reproducible between runs and machines
        dirnames.sort()

        # ensure we are not at root level (compare by value, not identity)
        if dirpath == root:
            continue

        # save the semantic label
        semantic_label = os.path.basename(dirpath) # genre/blues => 'blues'
        data['mapping'].append(semantic_label)
        label = len(data['mapping']) - 1 # index of this genre in the mapping
        print(f"\nProcessing {semantic_label}")

        # process files for a specific genre
        for f in sorted(filenames):
            # load audio file
            file_path = os.path.join(dirpath, f)
            signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)

            # process by segment, extracting mfcc and storing data
            for s in range(num_segments):
                start_sample = num_samples_per_segment * s
                finish_sample = start_sample + num_samples_per_segment
                segment = signal[start_sample: finish_sample]

                # track ends before this slice starts: nothing to analyse
                if len(segment) == 0:
                    continue

                # `y` is passed by keyword: recent librosa releases no longer
                # accept the signal as a positional argument
                mfcc = librosa.feature.mfcc(y=segment,
                                            sr=sr,
                                            n_fft= n_fft,
                                            n_mfcc= n_mfcc,
                                            hop_length = hop_length)
                mfcc = mfcc.T

                # store mfcc for segment if it has the expected length
                if len(mfcc) == expected_nmfcc_vectors_per_segment:
                    data["mfcc"].append(mfcc.tolist())
                    data["labels"].append(label)
                    print(f"{file_path}, segment: {s+1}")

    # write the file once, after the walk, instead of once per directory
    with open(json_path, 'w') as fp:
        json.dump(data, fp, indent=4)

Result stored as `.json` file below.

In [None]:
#save_mfcc(DATASET_PATH, JSON_PATH, num_segments=10)

## Load data

In [9]:
def load_data(dataset_path):
    """Read the MFCC JSON file and return its features and labels as arrays.

    Parameters
    ----------
    dataset_path : str
        Path of the JSON file holding the ``"mfcc"`` and ``"labels"`` lists.

    Returns
    -------
    tuple of np.ndarray
        ``(inputs, targets)`` built from the ``"mfcc"`` and ``"labels"`` entries.
    """
    with open(dataset_path, "r") as json_file:
        payload = json.load(json_file)

    # the JSON stores nested lists; turn both entries into numpy arrays
    features, labels = (np.array(payload[key]) for key in ("mfcc", "labels"))
    return features, labels

Test loading

In [10]:
# Load the saved MFCC arrays and their integer labels from the JSON file.
inputs, targets = load_data(JSON_PATH)

In [11]:
# Report the dimensions of the loaded arrays, inputs first, one per line.
for loaded in (inputs, targets):
    print(loaded.shape)

(8996, 130, 13)
(8996,)


Data labels

In [16]:
# `load_data` returns only the inputs and targets, so read the genre names
# (the "mapping" entry) straight from the JSON file; without this the name
# `semantic` is undefined on a fresh kernel.
with open(JSON_PATH, "r") as fp:
    semantic = json.load(fp)["mapping"]

# Index the mapping by label rather than zipping the two sequences, so a genre
# with no stored segments cannot shift the names that follow it.
for target_label in np.unique(targets):
    print(f"Target label for {semantic[target_label]}: {target_label}")

Target label for blues: 0
Target label for classical: 1
Target label for country: 2
Target label for disco: 3
Target label for hiphop: 4
Target label for metal: 5
Target label for pop: 6
Target label for reggae: 7
Target label for rock: 8


### Train / Test Split

A simple `train_test_split` can be performed on the data for the baseline multilayer perceptron model.

The following function would be used to split data into `train | test | validation` for the CNN model:

In [17]:
def split_data(test_size=0.25, validation_size= 0.2, random_state=None):
    """Load the MFCC data and split it into train, validation and test sets.

    Parameters
    ----------
    test_size : float
        Fraction of all samples held out as the test set.
    validation_size : float
        Fraction of the remaining (non-test) samples held out for validation.
    random_state : int or None
        Seed forwarded to both ``train_test_split`` calls; ``None`` keeps the
        previous unseeded behaviour, an integer makes the split reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``; each ``X`` array
        has one trailing axis of length 1 added.
    """
    # load_data returns exactly two arrays (inputs, targets)
    X, y = load_data(JSON_PATH)

    # train / test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # train / val split, taken from the training portion only
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=validation_size, random_state=random_state)

    # append a trailing axis to every feature array,
    # e.g. (num_samples, 130, 13) --> (num_samples, 130, 13, 1)
    X_train, X_val, X_test = (
        features[..., np.newaxis] for features in (X_train, X_val, X_test)
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

In [20]:
# Build the train / validation / test arrays with the default split sizes.
X_train, X_val, X_test, y_train, y_val, y_test = split_data()