In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torchaudio
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.nn.functional as F
from STSSN.synthesizer import Synthesizer
from STSSN.contentEncoder import ContentEncoder
from STSSN.LibriStyleDataset import LibriStyle

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


In [2]:
# Shared notebook state. load_models() fills in the model, hyper-parameters
# and device information; load_transforms() fills in the two audio pipelines.
_train_audio_transforms = _valid_audio_transforms = None
_use_cuda = None    # result of torch.cuda.is_available()
_device = None      # torch.device the model and batches are moved to
_hparams = None     # dict of hyper-parameters
_model = None  # type: VoiceImpersonator

In [3]:
def load_models(model_path=None, save_model_path=None, batch_size=8, epochs=1):
    """Build the VoiceImpersonator network and populate the shared module state.

    Sets the module-level ``_use_cuda``, ``_device``, ``_hparams`` and
    ``_model`` variables, prints the model and its parameter count, and
    finishes by calling ``load_transforms()``.

    Args:
        model_path: Optional path to a saved state dict. When given, the
            weights are loaded non-strictly (``strict=False``), so keys that
            do not match the current architecture are ignored.
        save_model_path: Stored in ``_hparams['save_model_path']``.
        batch_size: Stored in ``_hparams['batch_size']``.
        epochs: Stored in ``_hparams['epochs']``.
    """
    global _model, _device, _hparams, _use_cuda
    _use_cuda = torch.cuda.is_available()
    _device = torch.device("cuda" if _use_cuda else "cpu")

    # Assigned before the model is built so that any code reading the
    # module-level hyper-parameters during construction sees the final values.
    _hparams = {
        "n_cnn_layers": 2,
        "n_rnn_layers": 4,
        "rnn_dim": 256,
        "n_feats": 80,
        "stride": 2,
        "dropout": 0.1,
        "learning_rate": 5e-4,
        "batch_size": batch_size,
        "epochs": epochs,
        "save_model_path": save_model_path
    }

    _model = VoiceImpersonator(
        _hparams['n_cnn_layers'], _hparams['n_rnn_layers'], _hparams['rnn_dim'],
        _hparams['stride'], _hparams['dropout']
    ).to(_device)

    if model_path is not None:
        # map_location lets a checkpoint saved on a GPU be restored on a
        # CPU-only machine (and vice versa) instead of failing inside torch.load.
        state_dict = torch.load(model_path, map_location=_device)
        _model.load_state_dict(state_dict, strict=False)

    print(_model)
    print('Num Model Parameters', sum(param.nelement() for param in _model.parameters()))
    load_transforms()

In [4]:
# Merge encoder and synthesizer into full network
class VoiceImpersonator(nn.Module):
    """Content encoder followed by a synthesizer, conditioned on a style embedding.

    The encoder turns a batch of spectrograms into a per-timestep content
    encoding; each utterance's style embedding is repeated along the time axis,
    concatenated onto the encoding's feature axis, and the result is passed to
    the synthesizer.

    Args:
        n_cnn_layers: Forwarded to ``ContentEncoder``.
        n_rnn_layers: Forwarded to ``ContentEncoder``.
        rnn_dim: Forwarded to ``ContentEncoder``.
        stride: Forwarded to ``ContentEncoder``.
        dropout: Forwarded to ``ContentEncoder``.
        n_feats: Forwarded to ``ContentEncoder``. Defaults to 80, the
            ``n_feats`` value used elsewhere in this notebook.
    """

    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, stride=2, dropout=0.1, n_feats=80):
        super().__init__()

        # Use the constructor arguments rather than module-level state so the
        # class can be instantiated without load_models() having run first.
        self.encoder = ContentEncoder(
            n_cnn_layers, n_rnn_layers, rnn_dim,
            stride, n_feats, dropout
        )

        self.synthesizer = Synthesizer()

    def forward(self, spectrograms, styles):
        """Synthesize an output from spectrograms and per-utterance style embeddings.

        Args:
            spectrograms: Batch passed unchanged to the content encoder.
            styles: Tensor of shape (batch, embed_dim), one style embedding
                per utterance.

        Returns:
            Whatever ``self.synthesizer`` returns for the combined
            (batch, timesteps, content_dim + embed_dim) tensor.
        """
        # Content encoding; indexed below as (batch, timesteps, features).
        x = self.encoder(spectrograms)

        # Repeat every utterance's embedding across all timesteps. The batch
        # size and embedding width come from the tensors themselves, so a
        # short final batch works and every row is filled in.
        style_embeds = (
            styles.to(device=x.device, dtype=x.dtype)
            .unsqueeze(1)
            .expand(-1, x.shape[1], -1)
        )

        # Concatenate content and style along the feature axis.
        x = torch.cat((x, style_embeds), 2)

        # NOTE(review): the recorded run of this notebook ended in a
        # "size mismatch" RuntimeError; presumably the feature width produced
        # here does not match what the synthesizer expects - confirm against
        # the Synthesizer/ContentEncoder definitions.
        return self.synthesizer(x)

In [5]:
# Manage data preprocessing (creating spectrograms, transforms, etc)
def load_transforms():
    """Create the training and validation audio pipelines.

    Both pipelines start from an 80-band mel spectrogram at a 16 kHz sample
    rate. The training pipeline additionally applies frequency masking and
    time masking; the validation pipeline is the bare mel spectrogram.
    The results are stored in the module-level ``_train_audio_transforms``
    and ``_valid_audio_transforms``.
    """
    global _train_audio_transforms, _valid_audio_transforms
    transforms = torchaudio.transforms

    _valid_audio_transforms = transforms.MelSpectrogram(sample_rate=16000, n_mels=80)

    augmentation_stages = [
        transforms.MelSpectrogram(sample_rate=16000, n_mels=80),
        transforms.FrequencyMasking(freq_mask_param=30),
        transforms.TimeMasking(time_mask_param=100),
    ]
    _train_audio_transforms = nn.Sequential(*augmentation_stages)
    
def data_processing(data, data_type="train"):
    """Collate (waveform, style embedding) pairs into padded batch tensors.

    Args:
        data: Iterable of ``(wav, embed)`` pairs. ``wav`` is fed to the mel
            spectrogram pipeline; ``embed`` is a 1-D NumPy array.
            # assumes wav is (1, samples) so squeeze(0) drops the channel axis - TODO confirm
        data_type: ``'train'`` selects the augmenting pipeline, ``'valid'``
            the plain mel spectrogram.

    Returns:
        ``(spectrograms, styles)`` where spectrograms is zero-padded along
        time and laid out as (batch, 1, n_mels, time), and styles is
        (batch, embed_dim).

    Raises:
        ValueError: If ``data_type`` is neither ``'train'`` nor ``'valid'``.
            (ValueError is a subclass of Exception, so existing handlers
            still catch it.)
    """
    # Validate once, up front, so a bad data_type is reported even when the
    # batch is empty and before any transform is touched.
    if data_type not in ('train', 'valid'):
        raise ValueError('data_type should be train or valid')
    transform = _train_audio_transforms if data_type == 'train' else _valid_audio_transforms

    spectrograms = []
    styles = []

    for wav, embed in data:
        # (1, n_mels, time) -> (time, n_mels) so pad_sequence pads along time.
        spectrograms.append(transform(wav).squeeze(0).transpose(0, 1))
        # (embed_dim,) -> (embed_dim, 1) so embeddings can be stacked column-wise.
        styles.append(np.expand_dims(embed, 1))

    # (batch, time, n_mels) -> (batch, 1, time, n_mels) -> (batch, 1, n_mels, time)
    spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True).unsqueeze(1).transpose(2, 3)
    # (embed_dim, batch) -> (batch, embed_dim)
    styles = torch.from_numpy(np.concatenate(styles, 1)).transpose(0, 1)

    return spectrograms, styles

In [6]:
# Define Training Behavior       
def train():
    """Run the (still incomplete) training loop over LibriStyle train-clean-100.

    Builds the train and test data loaders, the AdamW optimizer, an MSE loss
    and a OneCycleLR scheduler, then iterates over the training batches and
    runs a forward pass on each one, printing the input and output shapes.

    No loss is computed and no weights are updated yet; see the TODO at the
    end of the batch loop.

    Raises:
        Exception: If ``load_models()`` has not been called first.
    """
    if _model is None:
        raise Exception("Model was not loaded, call load_models() before training")

    dataset_dir = os.path.expanduser("~/dev/datasets/Libri")
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(dataset_dir, exist_ok=True)

    train_url = "train-clean-100"
    test_url = "test-clean"

    kwargs = {'num_workers': 1, 'pin_memory': True} if _use_cuda else {}

    train_dataset = LibriStyle(dataset_dir, url=train_url, download=True, preprocess=False)
    train_loader = data.DataLoader(dataset=train_dataset, batch_size=_hparams['batch_size'], shuffle=True,
                                   collate_fn=lambda x: data_processing(x, 'train'),
                                   **kwargs)

    # Built now so it is ready once per-epoch validation is enabled (see TODO).
    test_dataset = LibriStyle(dataset_dir, url=test_url, download=True, preprocess=False)
    test_loader = data.DataLoader(dataset=test_dataset, batch_size=_hparams['batch_size'], shuffle=False,
                                  collate_fn=lambda x: data_processing(x, 'valid'),
                                  **kwargs)

    optimizer = optim.AdamW(_model.parameters(), _hparams['learning_rate'])

    # This is a pretty poor loss function for now
    lossFunction = nn.MSELoss()

    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=_hparams['learning_rate'],
                                              steps_per_epoch=int(len(train_loader)),
                                              epochs=_hparams['epochs'],
                                              anneal_strategy='linear')

    _model.train()
    for epoch in range(1, _hparams['epochs'] + 1):
        for batch_idx, _data in enumerate(train_loader):
            spectrograms, styles = _data

            spectrograms = spectrograms.to(_device)
            styles = styles.to(_device)

            print("Input Spectrograms Shape")
            print(spectrograms.shape)

            optimizer.zero_grad()

            output = _model(spectrograms, styles)  # (batch, timesteps, frequencies)

            print("Output Spectrograms Shape")
            print(output.shape)

            # TODO: implement the new loss and gradient propagation:
            #   loss = lossFunction(output, <target>)
            #   loss.backward()
            #   optimizer.step()
            #   scheduler.step()
            # and log progress every 100 batches.

        # TODO: after each epoch, compute validation statistics and save:
        #   test(_model, _device, test_loader, lossFunction, epoch)
        #   torch.save(_model.state_dict(), _hparams['save_model_path'])

In [7]:
# Define Validation Metrics and Behavior (Currently Broken)
def test(model, device, test_loader, criterion, epoch):
    print('\nevaluating...')
    model.eval()
    test_loss = 0
    test_cer, test_wer = [], []
    with torch.no_grad():
        for i, _data in enumerate(test_loader):
            spectrograms = _data 
            spectrograms = spectrograms.to(device)

            output = model(spectrograms)  # (batch, time, n_class)
#             output = F.log_softmax(output, dim=2)
#             output = output.transpose(0, 1) # (time, batch, n_class)

            # Print loss information
            
#             loss = criterion(output, labels, input_lengths, label_lengths)
#             test_loss += loss.item() / len(test_loader)

    print('Test set: Average loss: {:.4f}, Average CER: {:4f} Average WER: {:.4f}\n'.format(test_loss, avg_cer, avg_wer))

In [8]:
# Deprecated, leaving in for possibility of code reuse in development             
def encode(input_file_path):
    """Run the loaded model on one audio file using a fixed style utterance.

    Args:
        input_file_path: Path of the content audio file, read with
            ``torchaudio.load``.

    Returns:
        The model output for the single-utterance batch.

    Raises:
        Exception: If ``load_models()`` has not been called first.
    """
    if _model is None:
        raise Exception("Model was not loaded, call load_models() before encoding")

    # Currently only one style encoding, generating all speech in same voice.
    # NOTE(review): `styleEncoder` is not imported anywhere in the visible
    # notebook; it must be brought into scope before this function can run.
    preprocessed_wav = styleEncoder.preprocess_wav(Path("data/styleAudio/rand1.flac"))
    style_embed = styleEncoder.embed_utterance(preprocessed_wav)

    waveform, _ = torchaudio.load(input_file_path, normalization=True)

    # data_processing expects (waveform, embedding) pairs and returns a
    # (spectrograms, styles) tuple, so build a one-item batch and unpack it.
    spectrograms, styles = data_processing([(waveform, style_embed)], 'valid')

    _model.eval()
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output = _model(spectrograms.to(_device), styles.to(_device))

    return output

In [9]:
# Alternative: warm-start from a saved checkpoint (path to load, path to save, batch size)
# load_models("./contentEncoder/saved_models/deepspeech5.pt", "./contentEncoder/saved_models/deepspeech6.pt", 8)

# Build a fresh, untrained model with batch size 1 for a single epoch
load_models(batch_size=1, epochs=1)
# Run the training loop on the freshly built model
train()
# Alternative: run a single file through the model instead of training
# output = encode("./data/contentAudio/40-222-0030.flac")



VoiceImpersonator(
  (encoder): ContentEncoder(
    (Convolutional_Feature_Extraction): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (Residual_CNN_Blocks): Sequential(
      (0): ResidualCNN(
        (cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
        (layer_norm1): CNNLayerNorm(
          (layer_norm): LayerNorm((80,), eps=1e-05, elementwise_affine=True)
        )
        (layer_norm2): CNNLayerNorm(
          (layer_norm): LayerNorm((80,), eps=1e-05, elementwise_affine=True)
        )
      )
      (1): ResidualCNN(
        (cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropou

RuntimeError: size mismatch, m1: [1 x 1792], m2: [1024 x 80] at /pytorch/aten/src/THC/generic/THCTensorMathBlas.cu:290