In [1]:
import glob
import os
import re

import librosa
import torch
import numpy as np

SEC_PER_MIN = 60

# Generating Lyric Label
    
    :format: name.label

In [2]:
def generate_lyrics_label():
    for filename in glob.glob(os.path.join(os.getcwd() + '\Model_Data\lyrics', '*.txt')):
        with open(filename, 'r') as r:
            lines = r.readlines()
            label_list = [0] * 2 * SEC_PER_MIN 
            for line in lines:
                timestamp = re.split(r'\[|\]', line)[1]
                time = re.split(r'[.:]', timestamp)
                # time format: [min, sec, minisec]
                if int(time[0]) < 2:
                    sec = int(time[1]) + int(time[0]) * SEC_PER_MIN 
                    label_list[sec] = 1
            with open(filename.split('.')[0] + ".label", 'w') as w:
                w.write(' '.join(str(s) for s in label_list))
                w.close()
        r.close()

generate_lyrics_label()

# Zip .wav file with labels
    
    :output: 
        label_torch (tensor of lyric label)
        reg_wav_list (wav, sr) from librosa load
        iso_wav_list (wav, sr) from librosa load         

In [3]:
def zip_label_wav():
    reg_dir = os.getcwd() + '/Model_Data/songs/vocal_reg/'
    iso_dir = os.getcwd() + '/Model_Data/songs/vocal_iso/'
    label_dir = os.getcwd() + '/Model_Data/lyrics/'
    reg_wav_list = []
    iso_wav_list = []
    label_list = []
    # .label file
    for filename in os.scandir(reg_dir):
        with open(label_dir + filename.name.split('.')[0] + '.label', 'r') as r:
            line = r.readline()
            label_list.append(str(line).split(' '))
        r.close()
    label_torch = torch.from_numpy(np.array(label_list, dtype=np.float32))
    print(label_torch.size())
    # .wav file
    for filename in os.scandir(reg_dir):
        print("Loading file:", filename.name)
        reg_wav, reg_sr = librosa.load(filename.path, duration=120)
        iso_wav, iso_sr = librosa.load(iso_dir + filename.name, duration=120)
        reg_wav_list.append((reg_wav, reg_sr))
        iso_wav_list.append((iso_wav, iso_sr))
    
    return label_torch, reg_wav_list, iso_wav_list

label_torch, reg_wav_list, iso_wav_list = zip_label_wav()

torch.Size([38, 120])
Loading file: ABoyNamedSue.wav
Loading file: AintNoGrave.wav
Loading file: AlohaOe.wav
Loading file: ASatisfiedMind.wav
Loading file: BlueSuedeShoes.wav
Loading file: DannyBoy.wav
Loading file: Desperado.wav
Loading file: FatherAndSon.wav
Loading file: FolsomPrisonBlues.wav
Loading file: FurtherOnUpTheRoad.wav
Loading file: GhostRidersInTheSky.wav
Loading file: GirlFromTheNorthCountry.wav
Loading file: GiveMyLoveToRose.wav
Loading file: GodsGonnaCutYouDown.wav
Loading file: HeartOfGold.wav
Loading file: Highwayman..wav
Loading file: Hurt.wav
Loading file: IDontHurtAnymore.wav
Loading file: IHeardThatLonesomeWhistle.wav
Loading file: InMyLife.wav
Loading file: IWontBackDown.wav
Loading file: ManInBlack.wav
Loading file: OhLonesomeMe.wav
Loading file: One.wav
Loading file: OutAmongTheStars.wav
Loading file: PersonalJesus.wav
Loading file: RedemptionSong.wav
Loading file: RingOfFire.wav
Loading file: SheUsedToLoveMeALot.wav
Loading file: SolitaryMan.wav
Loading file:

# Note for Andy

for label_torch, the size is (38, 120)
if you need [1, 120] tensor each time, just do
label_torch[i].unsqueeze(0)

In [12]:
# import standard PyTorch modules
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import librosa
import numpy as np


# Build the neural network for classifying pauses against nonpauses
class PauseNet1(nn.Module):
  def __init__(self, num_mfccs, hid1, hid2, out):
    super().__init__()
    self.sigmoid = nn.Sigmoid()
    self.relu = nn.ReLU()
    self.fc1 = nn.Linear(num_mfccs, hid1)
    torch.nn.init.xavier_uniform_(self.fc1.weight)
    self.fc2 = nn.Linear(hid1, hid2)
    torch.nn.init.xavier_uniform_(self.fc2.weight)
    self.fc3 = nn.Linear(hid2, out)
    torch.nn.init.xavier_uniform_(self.fc3.weight)


  # pass forward for nn
  def forward(self, x):
    x=self.relu(self.fc1(x))
    x=self.relu(self.fc2(x))
    x=self.sigmoid(self.fc3(x))

    return x


if __name__ == '__main__':
    #set parameters
    num_epochs = 30
    num_mfccs = 20
    hidden1_size = 100
    hidden2_size = 25
    out_size = 1

    # set other constants
    sr = 22050
    hop_length = 512
    song_dur = 120

    # create neural network
    pause_net = PauseNet1(num_mfccs, hidden1_size, hidden2_size, out_size)
    optimizer = optim.Adam(pause_net.parameters(), lr=0.0005)
    loss_func = nn.BCELoss()

    # # get data
    # generate_lyrics_label()
    # # Pack the label and .wav file
    # iso_labels, iso_wavs, reg_wavs = zip_label_wav()
    # reg_wav = np.array(reg_wav_list)
    # iso_wav = np.array(iso_wav_list)
    for epoch in range(num_epochs):
        total_loss = 0
        for song_sr, labels in zip(reg_wav_list, label_torch):
            song = song_sr[0]
            # sample window of 2048 and hop size of 512 samples
            mfccs = librosa.feature.mfcc(y=song, n_mfcc=num_mfccs) #(num_mfccs, 5168)

            #breaks down mfccs into their time intervals
            audio_length = len(song) / sr # in seconds
            step = hop_length / sr # in seconds
            intervals_s = np.arange(0, audio_length, step)

            # get each second and sample
            for label in labels:
                #allows me to easily find the cutoff
                intervals_s -= 1
                #isolate a single second
                sec_interval = np.where(intervals_s < 0)[0]
                # indexes a single second of song from MFCCs
                song_sec = np.take(mfccs, sec_interval, axis=1)

                #average the values over a second
                inp = torch.from_numpy(np.mean(song_sec, axis=1))

                label_tensor = torch.tensor([label])

                optimizer.zero_grad()
                pred = pause_net.forward(inp)

                # nonpause proper label is 0, pause proper label is 1
                loss = loss_func(pred, label_tensor)
                total_loss += loss

                loss.backward()
                optimizer.step()

        print("Epoch %i Total Loss: %.3f" % (epoch, total_loss))

Epoch 0 Total Loss: 301321.531
Epoch 1 Total Loss: 2357.749
Epoch 2 Total Loss: 2219.437
Epoch 3 Total Loss: 2130.725
Epoch 4 Total Loss: 2115.577
Epoch 5 Total Loss: 2111.111
Epoch 6 Total Loss: 2104.028
Epoch 7 Total Loss: 2103.855
