In [1]:
from Preprocessing.dataloader import *
from torch.utils.data import DataLoader

speed_volume_perturb=False
spec_augment=False
sr = 16000
audio_config = dict(sample_rate=sr,
                          window_size=.02,
                          window_stride=0.01,
                          window='hamming',
                          noise_dir=None,
                          noise_prob=0.4,
                          noise_levels=(0.0, 0.5))
directory = './Data/Hip-Hop/'
out_dir = './Output'
pos_dir = './Data/spectrogram_pos'
neg_dir = None

# save npy files 
if not path.exists(pos_dir):
    os.mkdir(pos_dir)
    if path.exists(out_dir):
        ! rm -r $out_dir
    os.mkdir(out_dir)
    sp = SpectrogramParser(audio_config)
    index = 0
    for file in listdir(directory):
        filepath = path.join(directory, file)
        for i in range(5):
            spect,time = sp.parse_audio(filepath)
            name, wav = file.split('.')
            outfile = path.join(out_dir,name+'_{:.2f}'.format(time)+'.'+wav)
            sp.spectrum2wav(spect, sr, outfile)
            specfile = path.join(pos_dir, str(index)+'_'+name+'_{:.2f}'.format(time))
            np.save(specfile,spect)
            index +=1
    # 16000 465984

train_dataset = SpectrogramDataset(audio_conf=audio_config,
                                   pos_dir=pos_dir,
                                   neg_dir=neg_dir,
                                   normalize=False,
                                   speed_volume_perturb=speed_volume_perturb,
                                   spec_augment=spec_augment)
dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True, sampler=None,
           batch_sampler=None, num_workers=0, collate_fn=None,
           pin_memory=False, drop_last=False, timeout=0,
           worker_init_fn=None)



In [2]:
def contextTraining(out_target, input_variable, input_sizes, rEncoder, epochs, criterion,optimizer):
    rEncoder.eval()
    for epoch in range(epochs):
        optimizer.zero_grad()
        out_variable = rEncoder(input_variable, input_sizes)
        #print(out_variable[0][0][0][:5])
        loss = criterion(out_variable, out_target)
        loss = loss / input_target.size(0)  # average the loss by minibatch
        loss_value = loss.item()
        print("epoch:{}".format(epoch),loss_value)
        loss.backward()
        print(out_variable.grad, out_target.grad)
        optimizer.step()
            
def load_model(path):
    print("Loading state from model %s" % path)
    package = torch.load(path, map_location=lambda storage, loc: storage)
    model = DeepSpeech(audio_conf=package['audio_conf'])
    model.load_state_dict(package['state_dict'], strict=False)
    return model

In [6]:
from RecognitionEncoder import DeepSpeech
from torch.autograd import Variable
import torch
from torch.nn import MSELoss
from torch.optim import Adam


# load model parameters
device = torch.device("cuda")
rEncoder = load_model('../deepspeech/librispeech_pretrained_v2.pth')
rEncoder = rEncoder.to(device)

rEncoder_target = load_model('../deepspeech/librispeech_pretrained_v2.pth')
rEncoder_target = rEncoder_target.to(device)
print(rEncoder.audio_conf)

# TODO hyperparameters
minibatch_size = 1
freq_size = train_dataset.sample_size[0]
max_seqlength = train_dataset.sample_size[1]
lengths = max_seqlength
input_sizes = torch.LongTensor([lengths]*minibatch_size)

criterion = MSELoss()

for i, (data) in enumerate(dataloader):
    pos_sample, neg_sample = data
    # 1, 161, 1001
    input_target = torch.Tensor(pos_sample).unsqueeze(0).to(device)
    input_variable = Variable(torch.randn(input_target.shape).cuda(),requires_grad=True)
    optimizer = Adam([input_variable],lr=0.0001)
    
    out_target = Variable(rEncoder_target(input_target, input_sizes))
    contextTraining(out_target, input_variable, input_sizes, rEncoder, 5, criterion, optimizer)
    break

Loading state from model ../deepspeech/librispeech_pretrained_v2.pth
Loading state from model ../deepspeech/librispeech_pretrained_v2.pth
{'sample_rate': 16000, 'window_size': 0.02, 'window_stride': 0.01, 'window': 'hamming', 'noise_dir': None, 'noise_prob': 0.4, 'noise_levels': (0.0, 0.5)}
epoch:0 0.36983489990234375
None None
epoch:1 0.36969903111457825
None None
epoch:2 0.3695633113384247
None None
epoch:3 0.3694276511669159
None None
epoch:4 0.36929211020469666
None None
