# Data Preprocessing

In [1]:
from audiosep.data import split_data
import mutagen
import librosa
from scipy.io import wavfile
import librosa
import os
import math
import numpy as np
import tempfile
import soundfile as sf
from audiosep.data import load_data, split_data

%load_ext autoreload
%autoreload 2

## sub sample 10

# Test Model

In [56]:
# Unpack the six arrays returned by split_data(): train / validation / test inputs (X_*) and targets (y_*)
X_train, X_val, X_test, y_train, y_val, y_test = split_data()

In [57]:
# Inspect the shape of the training inputs
X_train.shape

(5397, 130, 13, 1)

In [None]:
# Show the shape of each split, one per line
print(X_train.shape, X_val.shape, X_test.shape, sep="\n")

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization, Conv2D, MaxPool2D, Dense, Flatten, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers

In [None]:
def initialize_conv(input_shape, num_classes=9, learning_rate=0.0003):
    """Build and compile the convolutional classifier.

    Architecture: three Conv2D -> MaxPool2D -> BatchNormalization blocks
    (32 filters each), followed by Flatten, Dense(64, relu), Dropout(0.3)
    and a softmax output layer.

    Args:
        input_shape: Shape of a single input sample; passed to the first
            Conv2D layer.
        num_classes: Number of units in the softmax output layer.
            Defaults to 9, the value previously hard-coded here.
        learning_rate: Learning rate handed to the Adam optimizer.
            Defaults to 0.0003, the value previously hard-coded here.

    Returns:
        The compiled Sequential model (sparse categorical cross-entropy
        loss, accuracy metric).
    """
    model = Sequential()

    # (convolution kernel size, pooling window) for the three conv blocks
    conv_blocks = [
        ((3, 3), (3, 3)),  # 1st conv layer
        ((3, 3), (3, 3)),  # 2nd conv layer
        ((2, 2), (2, 2)),  # 3rd conv layer
    ]
    for block_index, (kernel_size, pool_size) in enumerate(conv_blocks):
        # only the very first layer declares the input shape
        conv_kwargs = {'input_shape': input_shape} if block_index == 0 else {}
        model.add(Conv2D(32, kernel_size, activation='relu', **conv_kwargs))
        model.add(MaxPool2D(pool_size, strides=(2, 2), padding='same'))
        model.add(BatchNormalization())

    # flatten to 1D array and feed to dense
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))

    # output layer: one softmax unit per class
    model.add(Dense(num_classes, activation='softmax'))

    # compile model
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=Adam(learning_rate=learning_rate),
                  metrics=['accuracy'])

    return model

In [None]:
# Per-sample input shape for the CNN: every axis of X_train except the leading (sample) axis
input_shape = (X_train.shape[1], X_train.shape[2], X_train.shape[3])

In [None]:
# Display the per-sample input shape
input_shape

In [None]:
# Build the compiled CNN for this input shape and print its layer summary
cnn_model = initialize_conv(input_shape)
cnn_model.summary()

In [None]:
# Early stopping with a patience of 10 epochs; the best weights seen are restored afterwards
es = EarlyStopping(restore_best_weights=True, patience=10)

# Train for at most 30 epochs, evaluating on the validation split after each one
history = cnn_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=30,
    callbacks=[es],
    verbose=2,
)

## Predicting on a new audio file

In [22]:
def split_audio(test_data, segment_seconds=30):
    """Load an audio file and cut it into consecutive fixed-length segments.

    Args:
        test_data: Audio source handed straight to `librosa.load`
            (e.g. a file path).
        segment_seconds: Length of each segment in whole seconds (int).
            Defaults to 30, the clip length the original code notes the
            model was trained with.

    Returns:
        dict mapping 'batch_1', 'batch_2', ... (in playback order) to
        slices of the loaded signal, each `sr * segment_seconds` samples
        long. Trailing audio that does not fill a whole segment is dropped.
        Returns None, after printing a message, when the audio is shorter
        than one segment.

    Raises:
        ValueError: If `segment_seconds` is not positive.
    """
    if segment_seconds <= 0:
        raise ValueError("segment_seconds must be a positive number of seconds")

    signal, sr = librosa.load(test_data)

    # Pass the signal by keyword: recent librosa releases make the arguments
    # of get_duration keyword-only. Passing sr keeps the duration consistent
    # with the rate the signal was actually loaded at.
    duration = int(librosa.get_duration(y=signal, sr=sr))

    if duration < segment_seconds:
        print(f"Please upload a song of at least {segment_seconds} seconds")
        return None

    samples_per_segment = sr * segment_seconds  # samples in one segment
    num_segments = duration // segment_seconds  # whole segments that fit

    return {
        'batch_' + str(index + 1): signal[index * samples_per_segment:
                                          (index + 1) * samples_per_segment]
        for index in range(num_segments)
    }

# TODO: `split_audio` should also return the number of splits.

# TODO: consider averaging the predictions of the individual splits into a single result.

In [23]:
# Split the sample track into 30-second segments.
# NOTE(review): `mine_2` is only assigned in a cell further down this notebook,
# so a fresh top-to-bottom run fails here with NameError - define the path before this cell.
split = split_audio(mine_2)



In [24]:
# List the segment labels produced by split_audio
split.keys()

dict_keys(['batch_1', 'batch_2', 'batch_3', 'batch_4', 'batch_5', 'batch_6', 'batch_7', 'batch_8'])

In [28]:
# Print the sample array of every segment (the keys are not needed here)
for v in split.values():
    print(v)

[3.4217774e-06 5.7808688e-06 4.2841502e-06 ... 2.0577137e-03 7.7987816e-03
 8.9051444e-03]
[ 0.00615656 -0.00110329 -0.00772048 ...  0.02088888  0.00844402
  0.018674  ]
[ 0.02040437  0.02304992  0.02929443 ... -0.06571693 -0.06310181
 -0.05849043]
[-0.05704912 -0.05775296 -0.05538409 ... -0.05545298 -0.0526839
 -0.06940664]
[-0.06411405 -0.11004042 -0.14783971 ... -0.01402255 -0.01217717
 -0.01025919]
[-0.00735044 -0.0028128   0.00171317 ...  0.03606275  0.04865115
  0.06385478]
[0.08133204 0.09838188 0.11201944 ... 0.01314554 0.01085493 0.00230194]
[-0.00295769 -0.01142887 -0.0168938  ...  0.06745723  0.05885477
  0.05284265]


In [None]:
# Create a temporary directory and save each split segment as a WAV file (currently commented out)
#path = os.getcwd()
#f = tempfile.TemporaryDirectory(dir= path)
#for key, val in test.items():
#    sf.write(f'{f.name}/{key}.wav', val, 22050)

In [None]:
def predict(model, X):
    """Return the index of the most probable class for a single sample.

    Args:
        model: Object exposing `predict(batch)` that returns a 2D array of
            class probabilities (one row per sample).
        X: Feature array for one sample. A 3-D array only gets a leading
            batch axis; any other rank gets a leading batch axis and a
            trailing axis added to match the model input shape.

    Returns:
        Index of the highest-probability class for the sample.
    """
    if X.ndim == 3:
        batch = X[np.newaxis, ...]
    else:
        batch = X[np.newaxis, ..., np.newaxis]

    class_probs = model.predict(batch)

    # the batch holds one sample, so take the first (only) argmax
    return np.argmax(class_probs, axis=1)[0]

In [None]:
def predict_new(model, X):
    """Predict the genre of an audio file by majority vote over its segments.

    The audio is split with `split_audio`, each segment is converted with
    `get_mfcc` and classified with `predict`, and the class predicted most
    often wins (ties go to the lowest class index).

    Args:
        model: Trained classifier forwarded to `predict`.
        X: Audio source forwarded to `split_audio` (e.g. a file path).

    Returns:
        The predicted genre name, or None when `split_audio` yields no
        segments (audio too short).
    """
    # split_audio returns a dict of segments, or None for too-short audio
    segments = split_audio(X)
    if not segments:
        return None

    # NOTE(review): get_mfcc is not defined in the visible part of this
    # notebook; presumably it turns a raw segment into the MFCC features the
    # model was trained on - confirm it is defined before this is called.
    predictions = np.array(
        [predict(model, get_mfcc(segment)) for segment in segments.values()],
        dtype=int,
    )

    # Majority vote. np.argmax(predictions) would return the *position* of
    # the largest class index, not the most frequently predicted class.
    pred = int(np.bincount(predictions).argmax())

    # labels gotten from data.json['mapping']
    genres = {0: "Blues",
              1: "Classical",
              2: "Country",
              3: "Disco",
              4: "Hiphop",
              5: "Metal",
              6: "pop",
              7: "Reggae",
              8: "Rock"
              }

    genre = genres.get(pred)
    print(f"Predicted genre: {genre}")

    return genre
    
    

In [18]:
# Path of the sample track used to try out the model.
# NOTE(review): absolute, machine-specific path - this cell only works where that file exists;
# consider a path relative to a configurable data directory.
mine_2 = 'C:/Users/cezea/Desktop/MUSIC/exports/Kulture.mp3'

In [None]:
# Run the prediction pipeline on the sample track with the trained CNN; prints and returns the genre
predict_new(cnn_model, mine_2)