In [1]:
import os

import pickle
import numpy as np
import ffmpeg

import torch
import torch.nn as nn

from essentia.standard import *
import essentia

[   INFO   ] MusicExtractorSVM: no classifier models were configured by default


In [2]:
def get_dictionary(fn):
    """Unpickle and return the object stored in the file at ``fn``.

    The file is opened in binary mode and closed again before returning.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files from a trusted source.
    """
    with open(fn, mode='rb') as pickled_file:
        return pickle.load(pickled_file)

In [3]:
# Unpickle the test-split metadata. Later cells index the result by integer
# and read each entry's 'path' and 'tags' fields.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
fn = '/home/felipe/Documents/Github/Pt-Brdo/playground/notebooks/moodtheme_test_dict.pickle'
mel_dict = get_dictionary(fn)

In [4]:
# Entry of ``mel_dict`` to inspect.
index = 0

# Relative audio path and tag vector of the selected entry.
path = mel_dict[index]['path']
tags = mel_dict[index]['tags'].astype('float32')

# The precomputed mel-spectrogram lives under the same relative path with a
# '.npy' extension. splitext swaps the extension whatever its length, instead
# of assuming a three-character suffix.
# NOTE(review): absolute, machine-specific directory -- adjust when running elsewhere.
fn = os.path.join('/home/felipe/Desktop/mtg-mood-theme-mel',
                  os.path.splitext(path)[0] + '.npy')
print(fn)

# Despite its name, ``audio`` holds the stored mel-spectrogram (float32, on the GPU).
audio = torch.tensor(np.load(fn).astype('float32')).cuda()

print(path)
print(audio.shape)
audio

/home/felipe/Desktop/mtg-mood-theme-mel/00/13400.npy
00/13400.mp3
torch.Size([96, 9602])


tensor([[-69.5358, -48.5393, -56.4544,  ..., -90.0000, -90.0000, -90.0000],
        [-64.7463, -49.1767, -52.0730,  ..., -90.0000, -90.0000, -90.0000],
        [-61.8604, -57.5374, -57.4104,  ..., -90.0000, -90.0000, -90.0000],
        ...,
        [-90.0000, -90.0000, -88.5863,  ..., -90.0000, -90.0000, -90.0000],
        [-90.0000, -90.0000, -90.0000,  ..., -90.0000, -90.0000, -90.0000],
        [-90.0000, -90.0000, -90.0000,  ..., -90.0000, -90.0000, -90.0000]],
       device='cuda:0')

In [5]:
def load_audio(filename, sampleRate=12000, segment_duration=None):
    """Load ``filename`` with essentia's MonoLoader and optionally crop it.

    Args:
        filename: path of the audio file handed to ``MonoLoader``.
        sampleRate: sample rate passed to ``MonoLoader``; also used to turn
            ``segment_duration`` into a number of samples.
        segment_duration: length in seconds of the centred segment to keep.
            Any falsy value (``None``, ``0``) keeps the whole signal.

    Returns:
        The loaded samples, sliced to the centred segment when one is requested.

    Raises:
        ValueError: if the requested segment does not fit inside the signal.
    """
    samples = MonoLoader(filename=filename, sampleRate=sampleRate)()
    total = len(samples)

    # No segment requested: hand back the full signal.
    if not segment_duration:
        return samples[0:total]

    # Centre a window of the requested length inside the signal.
    n_samples = round(segment_duration * sampleRate)
    start = (total - n_samples) // 2
    stop = start + n_samples

    if start < 0 or stop > total:
        raise ValueError('Segment duration is larger than the input audio duration')

    return samples[start:stop]


def melspectrogram(audio, 
                   sampleRate=12000, frameSize=512, hopSize=256,
                   window='hann', zeroPadding=0, center=True,
                   numberBands=96, lowFrequencyBound=0, highFrequencyBound=None,
                   weighting='linear', warpingFormula='slaneyMel',
                   normalize='unit_tri'):
    """Compute a mel-spectrogram of ``audio`` frame by frame with essentia.

    Each frame from ``FrameGenerator`` goes through ``Windowing``, ``Spectrum``,
    ``MelBands`` (``type='power'``) and a ``UnaryOperator`` configured with
    ``type='lin2db', scale=2``. The per-frame results are collected in an
    ``essentia.Pool`` under the key ``'mel'``.

    Args:
        audio: input signal passed to ``FrameGenerator``.
        sampleRate: sample rate given to ``MelBands``.
        frameSize, hopSize: framing parameters for ``FrameGenerator``.
        window: window type for ``Windowing``.
        zeroPadding: zero padding for ``Windowing``; also enlarges the
            ``MelBands`` input size.
        center: when True frames do not start from zero
            (``startFromZero=not center``).
        numberBands, lowFrequencyBound, highFrequencyBound, weighting,
        warpingFormula, normalize: forwarded to ``MelBands``.
            ``highFrequencyBound`` defaults to ``sampleRate/2`` when None.

    Returns:
        The pooled ``'mel'`` values, transposed (the notebook output shows
        shape ``(96, 1366)`` for 96 bands).
    """
    upper_bound = sampleRate/2 if highFrequencyBound is None else highFrequencyBound
    # Size of the spectrum fed to MelBands for a (possibly zero-padded) frame.
    spectrum_size = (frameSize+zeroPadding)//2+1

    # Build the per-frame processing chain.
    apply_window = Windowing(type=window, normalized=False, zeroPadding=zeroPadding)
    to_spectrum = Spectrum()
    to_mel = MelBands(numberBands=numberBands,
                      sampleRate=sampleRate,
                      lowFrequencyBound=lowFrequencyBound,
                      highFrequencyBound=upper_bound,
                      inputSize=spectrum_size,
                      weighting=weighting,
                      normalize=normalize,
                      warpingFormula=warpingFormula,
                      type='power')
    to_db = UnaryOperator(type='lin2db', scale=2)

    results = essentia.Pool()
    for frame in FrameGenerator(audio,
                                frameSize=frameSize, hopSize=hopSize,
                                startFromZero=not center):
        windowed = apply_window(frame)
        spec = to_spectrum(windowed)
        bands = to_mel(spec)
        results.add('mel', to_db(bands))

    # One pool entry per frame; transpose the collected matrix.
    return results['mel'].T

def get_mel(in_audio_file, is_full_audio):
    """
        Load an audio file and return its mel-spectrogram.

        in_audio_file: input audio file
        is_full_audio: analyze full audio instead of a centered 29.1s segment

        Returns the result of melspectrogram() with its default settings.
    """
    # 29.1 s is the duration for the Choi's VGG model; None means full audio.
    segment_duration = None if is_full_audio else 29.1

    samples = load_audio(in_audio_file, segment_duration=segment_duration)
    return melspectrogram(samples)

In [6]:
class CNN(nn.Module):
    """Five-block 2D CNN with a sigmoid output per class.

    Layout: an initial BatchNorm2d on the single input channel, then five
    blocks of Conv2d(3x3, padding=1) -> BatchNorm2d -> ELU -> MaxPool2d, and
    finally dropout and a linear layer followed by a sigmoid.

    Submodules are registered as ``bn_init``, ``conv_k`` / ``bn_k`` / ``mp_k``
    for k = 1..5, ``dense`` and ``dropout``.
    """

    def __init__(self, num_class=15):
        super().__init__()

        # Normalisation applied to the input before any convolution.
        self.bn_init = nn.BatchNorm2d(1)

        # (in_channels, out_channels, max-pool kernel) for blocks 1..5.
        block_specs = (
            (1, 64, (2, 4)),
            (64, 128, (2, 4)),
            (128, 128, (2, 4)),
            (128, 128, (3, 5)),
            (128, 64, (4, 4)),
        )
        for idx, (n_in, n_out, pool_kernel) in enumerate(block_specs, start=1):
            setattr(self, 'conv_%d' % idx, nn.Conv2d(n_in, n_out, 3, padding=1))
            setattr(self, 'bn_%d' % idx, nn.BatchNorm2d(n_out))
            setattr(self, 'mp_%d' % idx, nn.MaxPool2d(pool_kernel))

        # Classifier head: 64 flattened features -> num_class outputs.
        self.dense = nn.Linear(64, num_class)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Run the network on ``x`` and return sigmoid activations.

        A channel axis is inserted at position 1 before the first layer. The
        flattened output of the last pooling stage must have 64 elements per
        sample for ``dense`` to accept it (e.g. a 96 x 1366 input pools down
        to 1 x 1 with the kernels used here).

        Returns a tensor with one row per sample and ``num_class`` columns.
        """
        activation = nn.ELU()

        x = self.bn_init(x.unsqueeze(1))

        # conv -> batch norm -> ELU -> max pool, five times.
        for idx in range(1, 6):
            conv = getattr(self, 'conv_%d' % idx)
            norm = getattr(self, 'bn_%d' % idx)
            pool = getattr(self, 'mp_%d' % idx)
            x = pool(activation(norm(conv(x))))

        # Flatten everything except the batch axis, then classify.
        features = self.dropout(x.view(x.size(0), -1))
        return nn.Sigmoid()(self.dense(features))

In [7]:
# Sample rate that ffprobe reports for the first stream of the source MP3.
# NOTE(review): assumes stream 0 is the audio stream -- confirm for files with
# embedded cover art. load_audio() asks MonoLoader for sampleRate=12000, so the
# native rate shown here is presumably resampled on load.
ffmpeg.probe('/home/felipe/Desktop/13400.mp3')['streams'][0]['sample_rate']

'44100'

In [8]:
# Recompute the mel-spectrogram from the raw MP3 (centred 29.1 s segment,
# since is_full_audio is False) and move it to the GPU as a tensor.
my_mel = torch.tensor(
    get_mel('/home/felipe/Desktop/13400.mp3', is_full_audio=False)
).cuda()
print(my_mel.shape)

torch.Size([96, 1366])


In [9]:
# Build the network with 56 output classes, matching the checkpoint below.
model = CNN(num_class=56)

# Load the trained weights onto the CPU first so the checkpoint restores
# regardless of the device it was saved from; the model is moved to the GPU
# afterwards.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
S = torch.load('/home/felipe/Documents/Github/Pt-Brdo/src/pt_brdo/mtg_jamendo_baseline/models/best_model.pth',
               map_location='cpu')
model.load_state_dict(S)

# inference: eval() switches dropout and batch norm to evaluation behaviour
model.eval()
model = model.cuda()

In [10]:
# Compare the freshly computed spectrogram with the stored one: both have 96
# rows, but the stored array is much longer (9602 vs 1366 columns).
# NOTE(review): presumably the stored file covers the full track while my_mel
# is the centred 29.1 s segment -- confirm.
print(my_mel.shape, audio.shape)

torch.Size([96, 1366]) torch.Size([96, 9602])


In [11]:
# First column (all 96 rows) of the recomputed mel-spectrogram.
my_mel[:, 0]

tensor([ -68.7867,  -66.1242,  -64.2244,  -60.4956,  -64.7527,  -45.0095,
         -33.6146,  -30.6942,  -34.6825,  -45.1362,  -58.1798,  -61.0160,
         -53.6519,  -52.8619,  -60.4019,  -63.8609,  -49.7360,  -45.2003,
         -48.1572,  -53.9476,  -59.2078,  -65.1857,  -71.0518,  -67.2471,
         -59.2665,  -60.8922,  -67.2135,  -67.3332,  -76.5795,  -87.5193,
         -77.2087,  -74.1364,  -71.9692,  -61.4227,  -59.3152,  -65.9897,
         -80.8679,  -95.9277,  -86.9317,  -86.6582,  -90.3115,  -87.4826,
         -82.6069,  -86.3694,  -97.4053,  -82.9761,  -85.6829,  -90.4414,
         -88.1095, -100.4650,  -98.0056,  -99.5761,  -92.6636,  -90.8258,
         -95.0165,  -99.5775,  -99.0912,  -95.4591, -101.1901, -102.0378,
         -91.7656,  -89.2219,  -90.7442,  -98.7360, -110.1632,  -99.4069,
         -96.4395, -110.8693, -105.2903, -102.0307, -108.4406, -109.5085,
         -96.4408, -101.0703, -106.6503, -105.7251, -115.9824, -101.9469,
         -97.7877, -101.8304, -103.030

In [12]:
# First column of the stored mel-spectrogram, for comparison with my_mel[:, 0].
# NOTE(review): the stored values bottom out at -90.0 while the recomputed ones
# go lower, and the two columns need not cover the same instant (my_mel is a
# centred segment) -- the preprocessing presumably differs; confirm.
audio[:, 0]

tensor([-69.5358, -64.7463, -61.8604, -59.8808, -58.1119, -58.2752, -58.9025,
        -60.2660, -62.0527, -64.3706, -68.4771, -72.2208, -75.7047, -79.4953,
        -85.4376, -85.6893, -81.9504, -80.0834, -79.7122, -82.1272, -89.4751,
        -90.0000, -90.0000, -90.0000, -90.0000, -88.8482, -86.1220, -84.0110,
        -81.6328, -81.6245, -82.9754, -83.6547, -85.0630, -88.5137, -90.0000,
        -87.7471, -85.0853, -82.7995, -84.5712, -88.1776, -88.0879, -86.8838,
        -89.5533, -90.0000, -84.0632, -81.3411, -83.6548, -87.9001, -90.0000,
        -90.0000, -88.2064, -84.8365, -85.5288, -87.3742, -88.8410, -90.0000,
        -90.0000, -85.1121, -83.0755, -86.6247, -90.0000, -89.6840, -87.7929,
        -84.6036, -86.9026, -90.0000, -90.0000, -87.8175, -83.3707, -84.7766,
        -90.0000, -90.0000, -90.0000, -90.0000, -90.0000, -88.1323, -90.0000,
        -88.8589, -90.0000, -90.0000, -90.0000, -88.7473, -90.0000, -89.0149,
        -90.0000, -90.0000, -90.0000, -90.0000, -90.0000, -90.00

In [13]:
# Inference only: run the forward pass without autograd so no graph is built
# and the result carries no grad_fn. unsqueeze(0) adds the batch axis.
# Note: the model ends in a sigmoid, so ``logits`` actually holds per-class
# scores in [0, 1].
with torch.no_grad():
    logits = model(my_mel.unsqueeze(0))
print(logits.shape)
logits

torch.Size([1, 56])


tensor([[0.0048, 0.0232, 0.0116, 0.0129, 0.0027, 0.0041, 0.0053, 0.0162, 0.0073,
         0.0163, 0.0115, 0.0197, 0.0021, 0.0122, 0.0055, 0.0043, 0.0399, 0.0115,
         0.1539, 0.0109, 0.0106, 0.0691, 0.0314, 0.0194, 0.1319, 0.0082, 0.1034,
         0.0013, 0.0033, 0.0023, 0.0136, 0.0295, 0.0034, 0.0080, 0.1580, 0.0050,
         0.0112, 0.0023, 0.0160, 0.0090, 0.0024, 0.0174, 0.0212, 0.0095, 0.0074,
         0.0071, 0.0156, 0.0020, 0.0046, 0.0143, 0.0073, 0.0054, 0.0013, 0.0084,
         0.0272, 0.0206]], device='cuda:0', grad_fn=<SigmoidBackward0>)