In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.utils.data
import torch.optim as optim
import torch.nn.functional as F
import os
import math


from torch import nn
from pydub import AudioSegment
from scipy.signal import detrend

In [2]:
sampling_rate = 4096

chunk = 20
chunk_samples_count = sampling_rate // chunk

In [3]:
# magnitude extractor
def magnitude_extractor(x, N):
    return 20 * np.log10(np.sqrt(np.real(x) ** 2 + np.imag(x) ** 2) / chunk_samples_count + 1)

def read_audio_source(file_name):
    raw = AudioSegment.from_mp3("dataset\\xx\\" + file_name)
    raw = raw.set_channels(1)
    raw = raw.set_frame_rate(sampling_rate)

    data = raw.get_array_of_samples()
    data = np.array(data)

    chunk_count = len(data) // chunk_samples_count

    data = data[:chunk_count * chunk_samples_count].reshape(-1, chunk_samples_count)

    # kaiser window for reducing spectral leakage
    window = np.kaiser(chunk_samples_count, beta = 14)

    data = detrend(data, axis = 1) * window
    
    N = chunk_samples_count // 2

    data = magnitude_extractor(np.fft.fft(data)[:, :N], 1)
    bins = np.fft.fftfreq(chunk_samples_count, 1 / sampling_rate)[:N]
    
    return (data, bins)
    
def chunk_plot(x_input, y_input):
    fig = plt.figure(figsize = (25, 4))
    
    for index, input in enumerate(y_input):
        ax = fig.add_subplot(1, 6, index + 1)
        ax.plot(x_input, input)
        ax.set_xlabel('Frequency (kHz)')
        ax.set_ylabel('Power (dB)')

In [4]:
class BandBuilder:
    message = "Band %d - from %.3f hz to %.3f hz"
    
    def __init__(self, nyquist_freq = 4096, offset_freq = 0, offset_log = 14, base = 1.415):
        # offset frequency because logarithm derivative is low at the start
        self.offset_freq = offset_freq
        
        # offset logarithm
        self.offset_log = offset_log
        
        # logarithm base
        self.base = base
        
        # band count
        self.size = self.get_count(nyquist_freq)
        
        # offset value
        self.offset_value = self.offset_freq + self.base ** self.offset_log
        
        # store band' frequency coverage
        self.bands = [ self.offset_freq + self.base ** (x + self.offset_log) for x in range(self.size + 1) ]
        
    def get_count(self, nyquist_freq):
        remaining_freq = nyquist_freq - self.offset_freq
        
        return math.ceil(math.log(remaining_freq, self.base)) - self.offset_log
        
        
    def show(self):
        prov = 0
        
        for index, band in enumerate(self.bands):
            print(self.message % (index, prov, band))
            
            prov = band
        
    def get_band(self, value):
        if value > self.offset_value:
            return math.ceil(math.log(value, self.base) - self.offset_log)
        else: return 0
        
    def get_band_value(self, value):
        return self.base ** (self.offset_log + value) + self.offset_freq
        
# Band class with index within the logarithmic bands and its maximum amplitude
class Band:
    def __init__(self, index, mag):
        self.index = index
        self.mag = mag
                
# band_builder = BandBuilder(2048, 0, offset_log = 16, base = 1.285)
# band_builder.show()

In [5]:
class MaxBands:
    def __init__(self, data, band_builder):
        # dft data 
        self.data = data
        
        # band builder
        self.band_builder = band_builder
        
    # we compute the strongest bins for the bands
    def get_bands(self):
        result = np.zeros(self.data.shape[0] * self.band_builder.size).reshape(self.data.shape[0], -1)

        for band, sample in enumerate(self.data):
            for index, value in enumerate(sample):
                current_band = self.band_builder.get_band(int(index * 2049 / 102))

                if(current_band < self.band_builder.size):
                    result[band][current_band] = max(result[band][current_band], value)
                    
        
        return result

    def show_band(self, index):
        max_bands = self.get_bands()
        
        values = max_bands[index]
        for index, mag in enumerate(values):
            print("Band %d with val: %d" % (self.band_builder.get_band_value(index), mag))

In [6]:
for file in os.listdir("dataset\\xx"):
    if(file.startswith("104 Islands")):
        data, bins = read_audio_source(file)
        
        band_builder = BandBuilder(2048, 0, offset_log = 16, base = 1.285)
        max_bands = MaxBands(data, band_builder)
        mag_bands = max_bands.show_band(2000)

Band 55 with val: 52
Band 71 with val: 56
Band 91 with val: 56
Band 117 with val: 54
Band 150 with val: 63
Band 193 with val: 66
Band 248 with val: 63
Band 319 with val: 52
Band 410 with val: 51
Band 527 with val: 50
Band 678 with val: 39
Band 871 with val: 39
Band 1120 with val: 31
Band 1439 with val: 30
Band 1849 with val: 28


In [7]:
# roughly approximation for a letter in time domain
spacing_letter = 20

# same for blank
spacing_blank = 40

time = [60, 1]
def timestr_to_ms(input):
    values = input.split(":")
    
    return sum([ int(value) * time[index] for index, value in enumerate(values) ])

def interval_to_timestamp(input):
    start = timestr_to_ms(input['s'])
    end = timestr_to_ms(input['e'])
    return (start, end, (end - start) * 1000)

def find_occurrences(s, ch):
    return len([i for i, letter in enumerate(s) if letter == ch])

def get_spacing(is_space, spacing_time):
    if(is_space):
        return spacing_time * 2
    
    return spacing_time

def string_to_letters(input, data):
    lyrics = input["d"]
    
    start, end, time = interval_to_timestamp(input)

    blanks = find_occurrences(lyrics, " ")
    spacing_lyrics = blanks * spacing_blank + (len(lyrics) - blanks) * spacing_letter
    spacing_time = time // spacing_lyrics

    it = int(start * spacing_letter)
    for letter in lyrics:
        is_space = letter == " "
        for i in range(get_spacing(is_space, spacing_time)):
            if(is_space):
                data[it] = 0
            else: data[it] = ord(letter) - 96
            
            it += 1

In [55]:
lyrics = pd.read_json("dataset\\lyrics.json", typ = "series", orient = "records")

def get_train_data():
    band_builder = BandBuilder(2048, 0, offset_log = 16, base = 1.285)
    
    input_x = []
    input_y = []
    
    for track in lyrics:
        track_name = track["track"]
        
        data, bins = read_audio_source(track_name + ".mp3")
        
        input_x.append(np.array(data, dtype = np.float32))
        
        data = np.zeros(data.shape[0])
        
        track_data = track["data"]
        for data_stamp in track_data:
            string_to_letters(data_stamp, data)
            
        if(max(data) > 27 or min(data) < 0):
            raise ValueError("Inconsistent data, contains characters exceeding the alphabet range and space - {}".format(track_name))
            
        input_y.append(np.array(data, dtype = np.float32))
        
    return (np.concatenate(input_x), np.concatenate(input_y))
            
features, targets = get_train_data()

In [62]:
size = len(features)

batch_size_t = 250
batch_count = size // batch_size_t 

remaining_samples = batch_count * batch_size_t

test_threshold = batch_count // 9
train_threshold = batch_count

test_threshold *= batch_size_t
train_threshold *= batch_size_t

train_features, train_targets = features[test_threshold:train_threshold], targets[test_threshold:train_threshold]
test_features, test_targets = features[:test_threshold], targets[:test_threshold]

train_dataset = torch.utils.data.TensorDataset(torch.from_numpy(train_features), torch.from_numpy(train_targets))
test_dataset = torch.utils.data.TensorDataset(torch.from_numpy(test_features), torch.from_numpy(test_targets))

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = batch_size_t, shuffle = False, num_workers = 4)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = batch_size_t, shuffle = False, num_workers = 4)

In [83]:
class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        
        self.rnn = nn.RNN(102, 102, 10, batch_first = True)
        
        self.fc1 = nn.Linear(102, 64)
        self.fc2 = nn.Linear(64, 27)
        
        self.dropout = nn.Dropout(p = 0.2)
        
    def forward(self, x, hidden):
        r_out, hidden = self.rnn(x.view(250, 1, -1), hidden)
        
        # sequence of fully connected layers
        x = self.dropout(F.relu(self.fc1(r_out)))
        x = self.dropout(F.relu(self.fc2(x)))
        
        x = F.log_softmax(x, dim = 1)
        
        return x, hidden

# our model
model = Network()

# loss function
criterion = nn.NLLLoss()

# optimizer
optimizer = optim.ASGD(model.parameters(), lr = 0.01)

In [84]:
features, targets = next(iter(test_loader))

output, hidden = model(features, None)

loss = criterion(output.view(250, -1), targets.type(dtype = torch.LongTensor))
# we have to get 27 class probabilities
print(targets)
print(features)
print(output)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 

In [66]:
# number of epochs for model' training
epochs_count = 10

test_loss_min = np.Inf

for epoch in range(1, epochs_count + 1):
    
    train_loss = 0.0
    test_loss = 0.0
    
    hidden_train = None
    hidden_test = None
    
    model.train()
    for features, targets in train_loader:
        optimizer.zero_grad()
    
        # forward pass
        output, hidden_train = model(features, hidden_train)
        
        hidden_train = hidden_train.data
        
        loss = criterion(output.view(250, -1), targets.type(dtype = torch.LongTensor))
        
        # backwards pass
        loss.backward()
        
        optimizer.step()
        
        # update training loss
        train_loss += loss.item() * features.size(0)
        
    hidden_test = None

    model.eval()
    for features, targets in test_loader:
        # forward pass
        output, hidden_test = model(features, hidden_test)
        
        hidden_test = hidden_test.data
        
        # calculate batch loss
        loss = criterion(output.view(250, -1), targets.type(dtype = torch.LongTensor))
        
        # update validation loss 
        test_loss += loss.item() * features.size(0)
    
    # calculate average losses
    train_loss = train_loss / len(train_loader.dataset)
    test_loss = test_loss / len(test_loader.dataset)
        
    # print training/validation statistics 
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch, train_loss, test_loss))
    
    # save model if validation loss has decreased
    if test_loss <= test_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        test_loss_min,
        test_loss))
        
        torch.save(model.state_dict(), 'lyrics.pt')
        test_loss_min = test_loss

Epoch: 1 	Training Loss: 0.000000 	Validation Loss: 0.000000
Validation loss decreased (inf --> 0.000000).  Saving model ...
Epoch: 2 	Training Loss: 0.000000 	Validation Loss: 0.000000
Validation loss decreased (0.000000 --> 0.000000).  Saving model ...
Epoch: 3 	Training Loss: 0.000000 	Validation Loss: 0.000000
Validation loss decreased (0.000000 --> 0.000000).  Saving model ...
Epoch: 4 	Training Loss: 0.000000 	Validation Loss: 0.000000
Validation loss decreased (0.000000 --> 0.000000).  Saving model ...
Epoch: 5 	Training Loss: 0.000000 	Validation Loss: 0.000000
Validation loss decreased (0.000000 --> 0.000000).  Saving model ...
Epoch: 6 	Training Loss: 0.000000 	Validation Loss: 0.000000
Validation loss decreased (0.000000 --> 0.000000).  Saving model ...
Epoch: 7 	Training Loss: 0.000000 	Validation Loss: 0.000000
Validation loss decreased (0.000000 --> 0.000000).  Saving model ...
Epoch: 8 	Training Loss: 0.000000 	Validation Loss: 0.000000
Validation loss decreased (0.00000

TypeError: a bytes-like object is required, not 'Tensor'