In [1]:
import os
import typing as tp
import math

import pickle
import numpy as np
import ffmpeg
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

from essentia.standard import *
import essentia

[   INFO   ] MusicExtractorSVM: no classifier models were configured by default


In [2]:
def load_audio(filename, sampleRate=12000, segment_duration=None):
    """Load an audio file with ``MonoLoader`` and optionally keep a centered excerpt.

    Args:
        filename: path of the audio file, forwarded to ``MonoLoader``.
        sampleRate: sample rate requested from the loader.
        segment_duration: length, in seconds, of the centered excerpt to keep.
            Any falsy value (``None``, ``0``) keeps the whole signal.

    Returns:
        The loaded samples, sliced to the selected range.

    Raises:
        ValueError: if the requested excerpt is longer than the loaded audio.
    """
    samples = MonoLoader(filename=filename, sampleRate=sampleRate)()
    total = len(samples)

    if segment_duration:
        # Convert seconds to a sample count, then center that span in the signal.
        n_keep = round(segment_duration*sampleRate)
        first = (total - n_keep) // 2
        last = first + n_keep
    else:
        first, last = 0, total

    if first < 0 or last > total:
        raise ValueError('Segment duration is larger than the input audio duration')

    return samples[first:last]


def melspectrogram(audio,
                   sampleRate=12000, frameSize=512, hopSize=256,
                   window='hann', zeroPadding=0, center=True,
                   numberBands=96, lowFrequencyBound=0, highFrequencyBound=None,
                   weighting='linear', warpingFormula='slaneyMel',
                   normalize='unit_tri'):
    """Compute a mel-band spectrogram of ``audio`` frame by frame.

    Each frame goes through ``Windowing`` -> ``Spectrum`` -> ``MelBands``
    (``type='power'``) -> ``UnaryOperator(type='lin2db', scale=2)``.

    Args:
        audio: input signal handed to ``FrameGenerator``.
        sampleRate: sample rate passed to ``MelBands``; also used to derive the
            default upper frequency bound.
        frameSize, hopSize: framing parameters for ``FrameGenerator``.
        window, zeroPadding: forwarded to ``Windowing``.
        center: when true, ``FrameGenerator`` is created with
            ``startFromZero=False`` (and vice versa).
        numberBands, lowFrequencyBound, highFrequencyBound, weighting,
        warpingFormula, normalize: forwarded to ``MelBands``.
            ``highFrequencyBound=None`` means ``sampleRate/2``.

    Returns:
        The transposed ``'mel'`` entry of the pool, i.e. presumably an array
        laid out as (bands, frames) -- confirm against essentia's Pool behaviour.
    """
    upper_bound = sampleRate/2 if highFrequencyBound is None else highFrequencyBound

    apply_window = Windowing(type=window, normalized=False, zeroPadding=zeroPadding)
    to_spectrum = Spectrum()
    to_mel = MelBands(numberBands=numberBands,
                      sampleRate=sampleRate,
                      lowFrequencyBound=lowFrequencyBound,
                      highFrequencyBound=upper_bound,
                      # Number of spectrum bins produced for a zero-padded frame.
                      inputSize=(frameSize+zeroPadding)//2+1,
                      weighting=weighting,
                      normalize=normalize,
                      warpingFormula=warpingFormula,
                      type='power')
    to_db = UnaryOperator(type='lin2db', scale=2)

    frames = FrameGenerator(audio,
                            frameSize=frameSize, hopSize=hopSize,
                            startFromZero=not center)

    collected = essentia.Pool()
    for frame in frames:
        mel_frame = to_mel(to_spectrum(apply_window(frame)))
        collected.add('mel', to_db(mel_frame))

    return collected['mel'].T

def get_mel(in_audio_file, is_full_audio):
    """Load an audio file and return its mel-spectrogram.

    Args:
        in_audio_file: input audio file.
        is_full_audio: when truthy, analyze the full audio; otherwise analyze
            only a centered 29.1s segment.

    Returns:
        The result of ``melspectrogram`` (called with its default parameters)
        on the loaded audio.
    """
    # 29.1s is the duration for the Choi's VGG model; None selects the full audio.
    segment_duration = None if is_full_audio else 29.1

    return melspectrogram(load_audio(in_audio_file, segment_duration=segment_duration))

In [3]:
class CNN(nn.Module):
    """Five-stage 2D CNN tagger with per-class sigmoid outputs.

    Every stage is ``Conv2d(3x3, padding=1) -> BatchNorm2d -> ELU -> MaxPool2d``.
    The pooling kernels reduce the first spatial axis by 2*2*2*3*4 = 96 and the
    second by 4*4*4*5*4 = 1280, so the flattened features only match the
    64-unit input of the dense layer when the input shrinks to 1x1 there.

    Attribute names are part of the checkpoint format (``state_dict`` keys) and
    must not be renamed.

    Args:
        num_class: number of output units (one sigmoid score per class).
    """

    def __init__(self, num_class=15):
        super().__init__()

        # init bn
        self.bn_init = nn.BatchNorm2d(1)

        # layer 1
        self.conv_1 = nn.Conv2d(1, 64, 3, padding=1)
        self.bn_1 = nn.BatchNorm2d(64)
        self.mp_1 = nn.MaxPool2d((2, 4))

        # layer 2
        self.conv_2 = nn.Conv2d(64, 128, 3, padding=1)
        self.bn_2 = nn.BatchNorm2d(128)
        self.mp_2 = nn.MaxPool2d((2, 4))

        # layer 3
        self.conv_3 = nn.Conv2d(128, 128, 3, padding=1)
        self.bn_3 = nn.BatchNorm2d(128)
        self.mp_3 = nn.MaxPool2d((2, 4))

        # layer 4
        self.conv_4 = nn.Conv2d(128, 128, 3, padding=1)
        self.bn_4 = nn.BatchNorm2d(128)
        self.mp_4 = nn.MaxPool2d((3, 5))

        # layer 5
        self.conv_5 = nn.Conv2d(128, 64, 3, padding=1)
        self.bn_5 = nn.BatchNorm2d(64)
        self.mp_5 = nn.MaxPool2d((4, 4))

        # classifier
        self.dense = nn.Linear(64, num_class)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return per-class sigmoid scores in [0, 1].

        Args:
            x: 3-D tensor ``(batch, H, W)``; a channel axis is inserted here.
               In this notebook it is a mel-spectrogram batch.

        Returns:
            Tensor of shape ``(batch, num_class)``. These are probabilities
            (post-sigmoid), not raw logits.
        """
        # Add the single input channel expected by Conv2d: (B, H, W) -> (B, 1, H, W).
        x = x.unsqueeze(1)

        # init bn
        x = self.bn_init(x)

        # layers 1-5: conv -> batch norm -> ELU -> max pool.
        # The activations are stateless, so the functional forms are used
        # instead of building a new nn.ELU module on every call.
        stages = (
            (self.conv_1, self.bn_1, self.mp_1),
            (self.conv_2, self.bn_2, self.mp_2),
            (self.conv_3, self.bn_3, self.mp_3),
            (self.conv_4, self.bn_4, self.mp_4),
            (self.conv_5, self.bn_5, self.mp_5),
        )
        for conv, bn, pool in stages:
            x = pool(nn.functional.elu(bn(conv(x))))

        # classifier
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        probs = torch.sigmoid(self.dense(x))

        return probs

In [4]:
def mel_iter(audio_path:str, window_size) -> tp.Generator[torch.Tensor, None, None]:
    """Yield consecutive, non-overlapping windows of a file's full mel-spectrogram.

    The spectrogram of the whole file is computed with ``get_mel``, moved to the
    GPU, and cut along dimension 1 in steps of ``window_size``.

    Args:
        audio_path: path of the audio file.
        window_size: number of columns (dimension 1) per yielded window.

    Yields:
        ``spectrogram[:, start:end]`` slices; the last one is shorter when the
        length of dimension 1 is not a multiple of ``window_size``.
    """
    spectrogram = torch.tensor(get_mel(audio_path, True)).cuda()
    n_frames = spectrogram.shape[1]

    for left in range(0, n_frames, window_size):
        # Clamp the right edge so the final window stops at the last column.
        right = min(left + window_size, n_frames)
        yield spectrogram[:, left:right]

In [5]:
# Location of the pretrained checkpoint (hard-coded, machine-specific path).
MODEL_PATH = '/home/felipe/Documents/Github/Pt-Brdo/src/pt_brdo/mtg_jamendo_baseline/models/best_model.pth'

# 56 output units -- presumably one per mood/theme tag; verify against the tag list.
model = CNN(num_class=56)

# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location='cpu' makes loading independent of the device the checkpoint was
# saved from; the model is moved to the GPU afterwards.
S = torch.load(MODEL_PATH, map_location='cpu')
model.load_state_dict(S)

# inference
model.eval()
model = model.cuda()

In [None]:
def get_tag_list(option):
    """Return the tag names for a tag subset as a Python list.

    ``'top50tags'`` reads ``tag_list_50.npy``. Every other option reads
    ``tag_list.npy`` and slices it:

    * ``'genre'``      -> entries ``[:87]``
    * ``'instrument'`` -> entries ``[87:127]``
    * ``'moodtheme'``  -> entries ``[127:]``
    * anything else    -> the whole list

    Both files are read from the current working directory.
    """
    if option == 'top50tags':
        return list(np.load('tag_list_50.npy'))

    all_tags = np.load('tag_list.npy')

    category_ranges = (
        ('genre', slice(None, 87)),
        ('instrument', slice(87, 127)),
        ('moodtheme', slice(127, None)),
    )
    for category, span in category_ranges:
        if option == category:
            return list(all_tags[span])

    return list(all_tags)

# Keep only the part of each mood/theme tag that follows the last '---' separator.
tags = [full_tag.split('---')[-1] for full_tag in get_tag_list('moodtheme')]

tags

In [7]:
def plot_model_out(model_out:torch.Tensor, song:str, tag_names=None):
    """Draw the model scores of one song as a bar chart of percentages.

    Args:
        model_out: model output tensor; only its first row (``model_out[0]``)
            is plotted. Assumed to be shaped (batch, num_tags) with values in
            [0, 1] -- TODO confirm against the model.
        song: text used as the figure title.
        tag_names: x tick labels, one per plotted score. Defaults to the
            notebook-level ``tags`` list.

    Raises:
        ValueError: if the number of labels differs from the number of scores,
            which would otherwise produce a silently mislabelled chart.
    """
    labels = tags if tag_names is None else tag_names

    # Only the first batch item is drawn, so the scaling and the y-axis range
    # are both derived from that row alone.
    scores = model_out.detach().cpu().numpy()[0] * 100
    if len(labels) != len(scores):
        raise ValueError(
            'Got %d tag labels for %d model outputs' % (len(labels), len(scores)))
    max_value = math.ceil(float(scores.max()))

    bar_width = 0.8
    # Bars are centered on their x position, so ticks go at the same positions.
    positions = np.arange(len(scores))

    fig, ax = plt.subplots(figsize=(12, 6))

    # Add x, y gridlines
    ax.grid(color='grey', linestyle='-.', linewidth=0.5, alpha=0.4)

    # Make the plot
    ax.bar(positions, scores, color='r', width=bar_width, edgecolor='grey')

    # Axis labels and ticks
    ax.set_xlabel('Tags', fontweight='bold', fontsize=15)
    ax.set_ylabel('Percentage', fontweight='bold', fontsize=15)
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation='vertical')
    # One tick per integer percent, including the rounded-up maximum.
    ax.set_yticks(np.arange(max_value + 1))
    ax.set_title(song)

    # No legend: the chart has a single, unlabelled bar series.
    plt.show()

# Moonlight Sonata

In [8]:
# Hard-coded, machine-specific location of the Moonlight Sonata audio file.
moonlight_path = '/home/felipe/Desktop/moonlight_sonata.mp3'

### Central 29.1s window

In [None]:
# Mel-spectrogram of the centered 29.1s segment, on the GPU, with a leading batch axis.
moonlight_segment = torch.tensor(get_mel(moonlight_path, False)).cuda().unsqueeze(0)
print(moonlight_segment.shape)

In [None]:
# Inference only: disable autograd so no computation graph is kept for the output.
with torch.no_grad():
    model_out = model(moonlight_segment)
model_out

In [None]:
plot_model_out(model_out, 'Moonlight Sonata - Centered 29.1s Segment')

# Prison Song

In [12]:
# Hard-coded, machine-specific location of the Prison Song audio file.
prision_path = '/home/felipe/Desktop/prison_song.mp3'

### Central 29.1s window

In [None]:
# Mel-spectrogram of the centered 29.1s segment, on the GPU, with a leading batch axis.
prision_segment = torch.tensor(get_mel(prision_path, False)).cuda().unsqueeze(0)
print(prision_segment.shape)

In [None]:
# Inference only: disable autograd so no computation graph is kept for the output.
with torch.no_grad():
    model_out = model(prision_segment)
model_out

In [None]:
plot_model_out(model_out, 'Prison Song - Centered 29.1s Segment')

# 93 Million Miles

In [16]:
# Hard-coded, machine-specific location of the 93 Million Miles audio file.
milion_path = '/home/felipe/Desktop/93_million_miles.mp3'

### Central 29.1s window

In [None]:
# Mel-spectrogram of the centered 29.1s segment, on the GPU, with a leading batch axis.
milion_segment = torch.tensor(get_mel(milion_path, False)).cuda().unsqueeze(0)
print(milion_segment.shape)

In [None]:
# Inference only: disable autograd so no computation graph is kept for the output.
with torch.no_grad():
    model_out = model(milion_segment)
model_out

In [None]:
plot_model_out(model_out, '93 Million Miles - Centered 29.1s Segment')

### Full audio

In [None]:
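# TODO: full-audio inference is unfinished. As written, the call below omits
# the window_size argument that mel_iter requires, and my_mel is not defined
# in the cells above.
# moonlight_iter = mel_iter(moonlight_path)
# my_mel = torch.tensor(my_mel).cuda()
# print(my_mel.shape)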