In [1]:
%matplotlib inline

import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# The keras imports below are wrapped in a bare string literal, so they are
# NOT executed; they only become active if the surrounding quotes are removed.
'''
from keras.models import Sequential
from keras.layers import LSTM, TimeDistributed, Dense
from keras.optimizers import Adam
'''
# Directory prefix of the training set; the training loop appends
# 'train<N>.wav' (audio) and 'train<N>.txt' (beat annotations) to it.
TRAINING_PATH = 'dataset/mirex_beat_tracking_2016/train/'

#### MIREX Audio Formats
- CD-quality (PCM, 16-bit, 44100 Hz)
- single channel (mono)
- file length between 2 and 36 seconds (total time: 14 minutes)

From experience,
- ```n_fft: # samples of a frame = 2048 samples@22050Hz, 4096 samples@44100Hz = 92.879818594104ms = frame duration```
- ```hop_length = n_fft * 3 // 4, which means adjacent frames overlap by n_fft / 4 = 512 samples@22050Hz, 1024 samples@44100Hz = 23.219954648526ms = overlap duration```

In [2]:
# Analysis parameters shared by the cells below.
sr = 44100                   # sample rate (Hz) requested when loading audio
n_fft = 4096                 # samples per STFT frame
hop_length = n_fft * 3 // 4  # samples between the starts of consecutive frames
# Frame length in SECONDS, rounded up to a whole millisecond:
# 4096 / 44100 = 92.879818594104 ms -> 93 ms = 0.093 s.
# It is later compared against frame times that are expressed in seconds.
frame_duration = np.ceil(n_fft * 1000 / sr) / 1000
print("frame_duration = %f sec" % (frame_duration))

frame_duration = 0.093000 ms


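- The number of frames (n_of_frames)<br>
 ```if 0 < hop_length <= n_fft,
 (n_of_frames - 1) * hop_length + n_fft = sr * the total time in seconds
 n_of_frames * hop_length - hop_length + n_fft = sr * the total time in seconds```

 - Formula : **```n_of_frames = (sr * the total time in seconds + hop_length - n_fft) / hop_length```**
 - Note: this formula assumes the frames are cut from the unpadded signal. `librosa.stft` defaults to `center=True`, which pads the signal by `n_fft // 2` samples on each side, so it returns `1 + len(y) // hop_length` frames (for even `n_fft`).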

In [3]:
# Verify the number of frames
def verify_number_of_frames(n_of_frames, y, sr, n_fft, hop_length, center=True):
    """Check that ``n_of_frames`` is the frame count expected for signal ``y``.

    The expected count is computed from the exact number of samples in ``y``
    (not from its duration rounded up to whole seconds), using the usual
    framing rule ``1 + (signal_length - n_fft) // hop_length``. With
    ``center=True`` the signal is assumed to be padded by ``n_fft // 2``
    samples on each side before framing, which is the default behaviour of
    ``librosa.stft``.

    Parameters
    ----------
    n_of_frames : int
        Frame count to verify (e.g. the number of STFT columns).
    y : sequence
        The audio samples; only ``len(y)`` is used.
    sr : int
        Sample rate. Not needed for the computation; kept so existing
        callers that pass it keep working.
    n_fft : int
        Samples per frame.
    hop_length : int
        Samples between the starts of consecutive frames; must be positive.
    center : bool, optional
        Whether the framing pads the signal by ``n_fft // 2`` on both sides.

    Raises
    ------
    ValueError
        If ``hop_length`` is not positive or the counts do not match.
    """
    if hop_length <= 0:
        raise ValueError("hop_length must be positive, got %r" % (hop_length,))

    n_samples = len(y)
    # Centered framing sees n_fft // 2 extra samples at each end of the signal.
    framed_length = n_samples + 2 * (n_fft // 2) if center else n_samples
    # A signal shorter than one frame yields no frames at all.
    expected = max(0, 1 + (framed_length - n_fft) // hop_length)

    if expected != n_of_frames:
        raise ValueError("%d != %d" % (n_of_frames, expected))
    print('the number of frames is %d' % (n_of_frames))

#### LSTM
- The input vector Xin can be one of the kinds of data listed below (perhaps a combination is possible; need to try)
 - raw music signal. dim = [n_fft, 1]
 - linear-frequency spectrum. dim = [# of freq. bins, 1]
 - mel-frequency spectrum. dim = [??? (1/2 * # of freq. bins?), 1]
 - onset strength envelope. dim = [??? (should be the same as linear or mel), 1]
- The Y vector is frames_beat, which indicates which frames contain a beat.

In [4]:
# Draft of the LSTM model. It is wrapped in a bare string literal, so none of
# it runs; the cell merely evaluates to (and echoes) the string itself.
# BATCH_SIZE, TIME_STEPS, INPUT_SIZE, CELL_SIZE and OUTPUT_SIZE are not defined
# in any cell visible here and must be set before this draft is enabled.
# NOTE(review): `output_dim` looks like the Keras 1 argument name (Keras 2 uses
# `units`) -- confirm against the installed Keras version before enabling.
'''
model = Sequential()
# build a LSTM RNN
model.add(LSTM(
    batch_input_shape=(BATCH_SIZE, TIME_STEPS, INPUT_SIZE), # Or: input_dim=INPUT_SIZE, input_length=TIME_STEPS,
    output_dim=CELL_SIZE,
    return_sequences=True,      # True: output at all steps. False: output as last step.
    stateful=True,              # True: the final state of batch1 is feed into the initial state of batch2
))
# add output layer
model.add(TimeDistributed(Dense(OUTPUT_SIZE)))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
'''

"\nmodel = Sequential()\n# build a LSTM RNN\nmodel.add(LSTM(\n    batch_input_shape=(BATCH_SIZE, TIME_STEPS, INPUT_SIZE), # Or: input_dim=INPUT_SIZE, input_length=TIME_STEPS,\n    output_dim=CELL_SIZE,\n    return_sequences=True,      # True: output at all steps. False: output as last step.\n    stateful=True,              # True: the final state of batch1 is feed into the initial state of batch2\n))\n# add output layer\nmodel.add(TimeDistributed(Dense(OUTPUT_SIZE)))\nmodel.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])\n"

In [5]:
# Build frame-level beat labels (1 = a beat falls in this frame, 0 = no beat)
# for every listener annotation of every training clip.
# Only clip 1 is processed for now; the full run is `for i in range(1, 21):`.
for i in range(1, 2):
    train_beat_file = TRAINING_PATH + 'train%d.txt' % (i)
    train_audio_file = TRAINING_PATH + 'train%d.wav' % (i)

    audio_raw_data, sr = librosa.load(train_audio_file, sr=sr, mono=True) # default mono=True
    linear_freq = librosa.stft(y=audio_raw_data, n_fft=n_fft, hop_length=hop_length)
    n_of_frames = linear_freq.shape[1] # one STFT column per frame
    verify_number_of_frames(n_of_frames, y=audio_raw_data, sr=sr, n_fft=n_fft, hop_length=hop_length)

    # Frame indices and frame times depend only on the clip, so they are
    # computed once here instead of once per listener.
    frames_idx = range(n_of_frames) # frames_idx = [0, 1, 2, ..., n_of_frames-1]
    frames_time = librosa.frames_to_time(frames_idx, sr=sr, hop_length=hop_length) # frames_time = [0, t1, t2, ...]

    # One row per listener; sep='\r' keeps each tab-separated row in a single column.
    beat_sequences = pd.read_table(train_beat_file, header=None, sep='\r', engine='python')
    # beat_sequences.shape = (40, 1) # each element is one beat sequence in seconds

    for ith_listener in range(len(beat_sequences)): # shape[0] is len() that means the total number of beat sequences
        beat_seq = beat_sequences[0][ith_listener] # the ith listener's beat sequence

        # step1. split into beats list. each element is a beat in seconds.
        beats_sec = [float(b) for b in beat_seq.split('\t')] # beats_sec = [0.625, 1.235, 1.740, ...]

        # step2. use beats_sec to indicate which frame has a beat.
        frames_beat = [0] * n_of_frames # frames_beat = [0, 0, 0,...]
        start_fi = 0 # the search for the next beat resumes where the previous one stopped
        for beat_sec in beats_sec:
            for fi in frames_idx[start_fi:]:
                if beat_sec < frames_time[fi]:
                    # The beat belongs to the frame just before the first frame
                    # whose time is later than the beat. Clamp at 0 so that a
                    # beat earlier than frame 0 cannot wrap around (index -1)
                    # and mark the last frame.
                    frames_beat[max(fi - 1, 0)] = 1
                    start_fi = fi
                    break # found, to do next beat
            else:
                # No later frame exists: accept the beat only if it still lies
                # within the last frame.
                # (An earlier variant marked frame fi whenever
                # ti < beat_sec < ti + frame_duration; it was dropped because the
                # label could fall outside the 70 ms evaluation window.)
                if beat_sec < (frames_time[-1] + frame_duration):
                    frames_beat[-1] = 1
                else:
                    # not found, there must be something wrong
                    raise ValueError(
                        "beat at %f s (listener %d of %s) lies beyond the last frame"
                        % (beat_sec, ith_listener, train_beat_file)
                    )

        # step3. Training
        # x_train = librosa.amplitude_to_db(linear_freq, ref=np.max)
        # y_train = frames_beat
        print(frames_beat)

        # then next listener's beat_sequence

    # then next training file

# start to predict

the number of frames is 431
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,