In [1]:
import glob
import os
import re

import librosa
import torch
import numpy as np

SEC_PER_MIN = 60

# Generating Lyric Label
Author: Jason Xu

    :format: name.label

In [2]:
def generate_lyrics_label(dev=False):
    if dev:
        path = os.getcwd() + '\\Model_Data\\dev\\lyrics'
    else:
        path = os.getcwd() + '\\Model_Data\\train\\lyrics'
    # train data
    for filename in glob.glob(os.path.join(path, '*.txt')):
        with open(filename, 'r') as r:
            lines = r.readlines()
            label_list = [0] * 2 * SEC_PER_MIN 
            for line in lines:
                timestamp = re.split(r'\[|\]', line)[1]
                time = re.split(r'[.:]', timestamp)
                # time format: [min, sec, minisec]
                if int(time[0]) < 2:
                    sec = int(time[1]) + int(time[0]) * SEC_PER_MIN 
                    label_list[sec] = 1
            with open(filename.split('.')[0] + ".label", 'w') as w:
                w.write(' '.join(str(s) for s in label_list))
                w.close()
        r.close()


# Train
generate_lyrics_label()
# Dev
generate_lyrics_label(dev=True)

# Zip .wav file with labels
Author: Jason Xu
    
    :output: 
        label_torch (tensor of lyric label)
        reg_wav_list (wav, sr) from librosa load
        iso_wav_list (wav, sr) from librosa load         

In [18]:
def zip_label_wav(dev=False):
    if dev:
        reg_dir = os.getcwd() + '\\Model_Data\\dev\\songs\\vocal_reg\\'
        iso_dir = os.getcwd() + '\\Model_Data\\dev\\songs\\vocal_iso\\'
        label_dir = os.getcwd() + '\\Model_Data\\dev\\lyrics\\'
    else:
        reg_dir = os.getcwd() + '\\Model_Data\\train\\songs\\vocal_reg\\'
        iso_dir = os.getcwd() + '\\Model_Data\\train\\songs\\vocal_iso\\'
        label_dir = os.getcwd() + '\\Model_Data\\train\\lyrics\\'
    reg_wav_list = []
    iso_wav_list = []
    label_list = []
    # .label file
    for filename in os.scandir(reg_dir):
        with open(label_dir + filename.name.split('.')[0] + '.label', 'r') as r:
            line = r.readline()
            label_list.append(str(line).split(' '))
        r.close()
    label_torch = np.array(label_list, dtype=np.float32)
    print(np.shape(label_torch))
    # .wav file
    for filename in os.scandir(reg_dir):
        print("Loading file:", filename.name)
        reg_wav, reg_sr = librosa.load(filename.path, duration=120)
        iso_wav, iso_sr = librosa.load(iso_dir + filename.name, duration=120)
        reg_wav_list.append((reg_wav, reg_sr))
        iso_wav_list.append((iso_wav, iso_sr))
    reg_wav_list = np.array(reg_wav_list)
    iso_wav_list = np.array(iso_wav_list)
    return label_torch, reg_wav_list, iso_wav_list

# train
print("===== Train Data =====")
label_torch, reg_wav_list, iso_wav_list = zip_label_wav()
# dev
print("===== Dev Data =====")
dev_label_torch, dev_reg_wav_list, dev_iso_wav_list = zip_label_wav(dev=True)

===== Train Data =====
(31, 120)
Loading file: ABoyNamedSue.wav
Loading file: AintNoGrave.wav
Loading file: AlohaOe.wav
Loading file: ASatisfiedMind.wav
Loading file: DannyBoy.wav
Loading file: Desperado.wav


# Model (FCNN)

Author: Andy Barbaro (main part), Jason Xu(prediction and eval)


In [114]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import librosa
import numpy as np


# Build the neural network for classifying pauses against nonpauses
class PauseNet1(nn.Module):
  def __init__(self, num_mfccs, hid1, hid2, out, dp=0.1):
    super().__init__()
    self.sigmoid = nn.Sigmoid()
    self.relu = nn.ReLU()
    self.model = nn.Sequential(
        nn.Linear(num_mfccs, hid1),
        nn.ReLU(),
        nn.Linear(hid1, hid2),
        nn.ReLU(),
        nn.Linear(hid2, out),
        nn.Sigmoid()
    )
    # self.fc1 = nn.Linear(num_mfccs, hid1)
    # torch.nn.init.xavier_uniform_(self.fc1.weight)
    # self.fc2 = nn.Linear(hid1, hid2)
    # torch.nn.init.xavier_uniform_(self.fc2.weight)
    # self.fc3 = nn.Linear(hid2, hid3)
    # torch.nn.init.xavier_uniform_(self.fc3.weight)
    # self.fc4 = nn.Linear(hid3, out)
    # torch.nn.init.xavier_uniform_(self.fc4.weight)

  # pass forward for nn
  def forward(self, x):
    return self.model(x)
    

#set parameters
num_epochs = 20
num_mfccs = 20
hidden1_size = 100
hidden2_size = 25
out_size = 1

# set other constants
sr = 22050
hop_length = 512
song_dur = 120

# create neural network
pause_net = PauseNet1(num_mfccs, hidden1_size, hidden2_size, out_size)
optimizer = optim.Adam(pause_net.parameters(), lr=0.001)
loss_func = nn.BCELoss()

print('===== Training Model =====')
for epoch in range(num_epochs):
    # Shuffle
    indices = np.arange(reg_wav_list.shape[0])
    np.random.shuffle(indices)
    reg_wav_list = reg_wav_list[indices]
    label_torch = label_torch[indices]
    total_loss = 0
    # Train data: label_torch, reg_wav_list, iso_wav_list
    for song_sr, labels in zip(reg_wav_list, label_torch):
        song = song_sr[0]
        # sample window of 2048 and hop size of 512 samples
        mfccs = librosa.feature.mfcc(y=song, n_mfcc=num_mfccs) #(num_mfccs, 5168)

        #breaks down mfccs into their time intervals
        audio_length = len(song) / sr # in seconds
        step = hop_length / sr # in seconds
        intervals_s = np.arange(0, audio_length, step)

        # get each second and sample
        for i, label in enumerate(labels):
            #isolate a single second
            sec_interval = np.where(intervals_s.astype(int) == i)[0]
            # indexes a single second of song from MFCCs
            song_sec = np.take(mfccs, sec_interval, axis=1)

            #average the values over a second
            inp = torch.from_numpy(np.mean(song_sec, axis=1))

            label_tensor = torch.tensor([label])
            
            optimizer.zero_grad()
            pred = pause_net.forward(inp)
            #print(pred)

            # nonpause proper label is 0, pause proper label is 1
            loss = loss_func(pred, label_tensor)
            total_loss += loss

            loss.backward()
            optimizer.step()

    print("Epoch %i Total Loss: %.3f" % (epoch, total_loss))

===== Training Model =====
Epoch 0 Total Loss: 1831.873
Epoch 1 Total Loss: 1735.390
Epoch 2 Total Loss: 1731.908
Epoch 3 Total Loss: 1725.466
Epoch 4 Total Loss: 1723.628
Epoch 5 Total Loss: 1718.118
Epoch 6 Total Loss: 1712.236
Epoch 7 Total Loss: 1711.441
Epoch 8 Total Loss: 1698.585
Epoch 9 Total Loss: 1703.173
Epoch 10 Total Loss: 1696.295
Epoch 11 Total Loss: 1692.317
Epoch 12 Total Loss: 1681.115
Epoch 13 Total Loss: 1684.543
Epoch 14 Total Loss: 1673.011
Epoch 15 Total Loss: 1668.343
Epoch 16 Total Loss: 1670.783
Epoch 17 Total Loss: 1657.503
Epoch 18 Total Loss: 1647.034
Epoch 19 Total Loss: 1659.635


In [132]:
print('===== Evaluating =====')
# Dev Data: dev_label_torch, dev_reg_wav_list, dev_iso_wav_list 
pred_seq = []
for songs, labels in zip(dev_reg_wav_list, dev_label_torch):
    song = songs[0]
    # sample window of 2048 and hop size of 512 samples
    mfccs = librosa.feature.mfcc(y=song, n_mfcc=num_mfccs) #(num_mfccs, 5168)

    #breaks down mfccs into their time intervals
    audio_length = len(song) / sr # in seconds
    step = hop_length / sr # in seconds
    intervals_s = np.arange(0, audio_length, step)
    pred_list = []
    # get each second and sample
    for i, __ in enumerate(labels):
        #isolate a single second
        sec_interval = np.where(intervals_s.astype(int) == i)[0]
        # indexes a single second of song from MFCCs
        song_sec = np.take(mfccs, sec_interval, axis=1)

        #average the values over a second
        inp = torch.from_numpy(np.mean(song_sec, axis=1))
        
        pred = pause_net.forward(inp)
        #print(pred)
        pred = 1 if pred > 0.22 else 0
        pred_list.append(pred)
    pred_seq.append(pred_list)
labels = np.array(pred_seq)
# Eval
for predictions, golds in zip(labels, dev_label_torch):
    num_correct = 0
    num_pos_correct = 0
    num_pred = 0
    num_gold = 0
    num_total = 0
    if len(golds) != len(predictions):
        raise Exception("Mismatched gold/pred lengths: %i / %i" % (len(golds), len(predictions)))
    for idx in range(0, len(golds)):
        gold = golds[idx]
        prediction = predictions[idx]
        if prediction == gold:
            num_correct += 1
        if prediction == 1:
            num_pred += 1
        if gold == 1:
            num_gold += 1
        if prediction == 1 and gold == 1:
            num_pos_correct += 1
        num_total += 1
    acc = float(num_correct) / num_total
    output_str = "Accuracy: %i / %i = %f" % (num_correct, num_total, acc)
    prec = float(num_pos_correct) / num_pred if num_pred > 0 else 0.0
    rec = float(num_pos_correct) / num_gold if num_gold > 0 else 0.0
    f1 = 2 * prec * rec / (prec + rec) if prec > 0 and rec > 0 else 0.0
    output_str += ";\nPrecision (fraction of predicted positives that are correct): %i / %i = %f" % (num_pos_correct, num_pred, prec)
    output_str += ";\nRecall (fraction of true positives predicted correctly): %i / %i = %f" % (num_pos_correct, num_gold, rec)
    output_str += ";\nF1 (harmonic mean of precision and recall): %f;\n" % f1
    print(output_str)

===== Evaluating =====
Accuracy: 65 / 120 = 0.541667;
Precision (fraction of predicted positives that are correct): 13 / 56 = 0.232143;
Recall (fraction of true positives predicted correctly): 13 / 25 = 0.520000;
F1 (harmonic mean of precision and recall): 0.320988;

Accuracy: 88 / 120 = 0.733333;
Precision (fraction of predicted positives that are correct): 4 / 16 = 0.250000;
Recall (fraction of true positives predicted correctly): 4 / 24 = 0.166667;
F1 (harmonic mean of precision and recall): 0.200000;

Accuracy: 59 / 120 = 0.491667;
Precision (fraction of predicted positives that are correct): 30 / 88 = 0.340909;
Recall (fraction of true positives predicted correctly): 30 / 33 = 0.909091;
F1 (harmonic mean of precision and recall): 0.495868;

Accuracy: 94 / 120 = 0.783333;
Precision (fraction of predicted positives that are correct): 4 / 17 = 0.235294;
Recall (fraction of true positives predicted correctly): 4 / 17 = 0.235294;
F1 (harmonic mean of precision and recall): 0.235294;

A

# FCNN model

Author: Andy Barbaro

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import librosa
import numpy as np
import random

# Build the neural network for classifying pauses against nonpauses
class PauseNet1(nn.Module):
  def __init__(self, num_mfccs, hid, out):
    super().__init__()
    self.sigmoid = nn.Sigmoid()
    self.relu = nn.ReLU()
    self.fc1 = nn.Linear(num_mfccs, hid)
    torch.nn.init.xavier_uniform_(self.fc1.weight)
    self.fc2 = nn.Linear(hid, hid)
    torch.nn.init.xavier_uniform_(self.fc2.weight)
    self.fc3 = nn.Linear(hid, hid)
    torch.nn.init.xavier_uniform_(self.fc3.weight)
    self.fc4 = nn.Linear(hid, hid)
    torch.nn.init.xavier_uniform_(self.fc4.weight)
    self.fc5 = nn.Linear(hid, out)
    torch.nn.init.xavier_uniform_(self.fc5.weight)


  # pass forward for nn
  def forward(self, data):
    x=self.relu(self.fc1(data))
    x=self.relu(self.fc2(x))
    x=self.relu(self.fc3(x))
    x=self.relu(self.fc4(x))
    x=self.sigmoid(self.fc5(x))

    return x


if __name__ == '__main__':
    #set parameters
    num_epochs = 5
    num_mfccs = 13
    hidden_size = 30
    out_size = 1

    # set other constants
    sr = 22050
    hop_length = 512
    song_dur = 120

    # create neural network
    pause_net = PauseNet1(num_mfccs, hidden_size, out_size)
    optimizer = optim.Adam(pause_net.parameters(), lr=0.001)
    loss_func = nn.BCELoss()


    # Pack the label and .wav file
    # iso_labels, iso_wavs, reg_wavs = zip_label_wav()
    iso_labels = label_torch
    iso_wavs = iso_wav_list
    song_idxs = list(range(len(iso_labels)))

    for epoch in range(num_epochs):
        #shuffle songs for training each epoch to prevent overfitting
        random.shuffle(song_idxs)

        total_loss = 0
        for i in song_idxs:
            song = iso_wavs[i][0]
            label = iso_labels[i]

            # sample window of 2048 and hop size of 512 samples
            mfccs = librosa.feature.mfcc(y=song, n_mfcc=num_mfccs) #(num_mfccs, 5168)
            mfccs = torch.FloatTensor(mfccs.T)

            #breaks down mfccs into their time intervals
            audio_length = len(song) / sr # in seconds
            step = hop_length / sr # in seconds
            intervals_s = np.arange(0, audio_length, step)


            optimizer.zero_grad()
            pred = pause_net.forward(mfccs)
            pred = torch.squeeze(pred, -1)

            # nonpause proper label is 0, pause proper label is 1
            loss = loss_func(pred, label)
            total_loss += loss

            loss.backward()
            optimizer.step()

        print("Epoch %i Total Loss: %.3f" % (epoch, total_loss))

(31, 120)
Loading file: ABoyNamedSue.wav
Loading file: AintNoGrave.wav
Loading file: AlohaOe.wav
Loading file: ASatisfiedMind.wav
Loading file: DannyBoy.wav
Loading file: Desperado.wav
Loading file: FatherAndSon.wav
Loading file: FurtherOnUpTheRoad.wav
Loading file: GhostRidersInTheSky.wav
Loading file: GirlFromTheNorthCountry.wav
Loading file: GiveMyLoveToRose.wav
Loading file: Hurt.wav
Loading file: IDontHurtAnymore.wav
Loading file: IHeardThatLonesomeWhistle.wav
Loading file: InMyLife.wav
Loading file: IWontBackDown.wav
Loading file: ManInBlack.wav
Loading file: One.wav
Loading file: OutAmongTheStars.wav
Loading file: PersonalJesus.wav
Loading file: RingOfFire.wav
Loading file: SheUsedToLoveMeALot.wav
Loading file: SolitaryMan.wav
Loading file: TheGeneralLee.wav
Loading file: TheGettysburgAddress.wav
Loading file: TheLegendOfJohnHenrysHammer.wav
Loading file: TheManComesAround.wav
Loading file: TheWanderer.wav
Loading file: WayfaringStranger.wav
Loading file: WellMeetAgain.wav
Loadi

  reg_wav_list = np.array(reg_wav_list)
  iso_wav_list = np.array(iso_wav_list)


ParameterError: Audio data must be floating-point