In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torchaudio
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.nn.functional as F
from encoder import inference as styleEncoder
from vocoder import inference as vocoder

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


In [2]:
_model = None # type: VoiceImpersonator
_hparams = None
_device = None
_use_cuda = None
_train_audio_transforms = None
_valid_audio_transforms = None

In [3]:
def load_models(model_path=None, save_model_path=None, batch_size=3, epochs=1):
    global _model, _device, _hparams, _use_cuda
    _use_cuda = torch.cuda.is_available()
    _device = torch.device("cuda" if _use_cuda else "cpu")
    warm_start=True

    _hparams = {
        "n_cnn_layers": 2,
        "n_rnn_layers": 4,
        "rnn_dim": 256,
#         "n_class": 29,
#         "n_feats": 64,
        "stride":2,
        "dropout": 0.1,
        "learning_rate": 5e-4,
        "batch_size": batch_size,
        "epochs": epochs,
        "save_model_path": save_model_path
    }

    _model = VoiceImpersonator(
    _hparams['n_cnn_layers'], _hparams['n_rnn_layers'], _hparams['rnn_dim'],
    _hparams['stride'], _hparams['dropout']
    ).to(_device)

    if model_path is not None:
        _model.load_state_dict(torch.load(model_path), strict=False)

    print(_model)
    print('Num Model Parameters', sum([param.nelement() for param in _model.parameters()]))
    load_transforms()
    
#     vocoder.load_model(Path("vocoder/saved_models/pretrained/pretrained.pt"))
    styleEncoder.load_model(Path("encoder/saved_models/pretrained.pt"))

In [4]:
# Define all components of encoder network
class ContentEncoder(nn.Module):
    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, stride=2, dropout=0.1):
        super(ContentEncoder, self).__init__()
        
        # Single convolutional layer for basic heirarchal feature extraction
        # Conv2D(in_channels, out_channels, kernel_size, stride, padding, dilation....)
        self.Convolutional_Feature_Extraction = nn.Conv2d(1, 32, 3, stride=stride, padding=1)

        # n_cnn Residual Convolutional Layers for deeper feature extraction
        self.Residual_CNN_Blocks = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=64)
            for _ in range(n_cnn_layers)
        ])
                             
        # Single fully connected layer, 2048 inputs (64 features * 32 filters), rnn_dim outputs
        # Somewhat misleading, I believe this outputs a batch of matrices
        self.Feature_Downsampling = nn.Linear(2048, rnn_dim)
        
        self.Recurrent_Block = nn.Sequential(*[
            BidirectionalGRU(rnn_dim=rnn_dim if i==0 else rnn_dim*2,
                             hidden_size=rnn_dim, dropout=dropout, batch_first=i==0)
            for i in range(n_rnn_layers)
        ])

    def forward(self, x):
        # Input is a mel spectrogram
        # Input is of shape (batch, channels=1, mel_features=128, timesteps)
        x = self.Convolutional_Feature_Extraction(x)
        x = self.Residual_CNN_Blocks(x)
        
        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # (batch, feature, time)
        x = x.transpose(1, 2) # (batch, time, feature)
        
        x = self.Feature_Downsampling(x)
        x = self.Recurrent_Block(x)
        
        return x


class CNNLayerNorm(nn.Module):
    """Layer normalization built for cnns input"""
    def __init__(self, n_feats):
        super(CNNLayerNorm, self).__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        # x (batch, channel, feature, time)
        x = x.transpose(2, 3).contiguous() # (batch, channel, time, feature)
        x = self.layer_norm(x)
        return x.transpose(2, 3).contiguous() # (batch, channel, feature, time) 


class ResidualCNN(nn.Module):
    """Residual CNN inspired by https://arxiv.org/pdf/1603.05027.pdf
        except with layer norm instead of batch norm
    """
    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super(ResidualCNN, self).__init__()

        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=kernel//2)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=kernel//2)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layer_norm1 = CNNLayerNorm(n_feats)
        self.layer_norm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        residual = x  # (batch, channel, feature, time)
        x = self.layer_norm1(x)
        x = F.gelu(x)
        x = self.dropout1(x)
        x = self.cnn1(x)
        x = self.layer_norm2(x)
        x = F.gelu(x)
        x = self.dropout2(x)
        x = self.cnn2(x)
        x += residual
        return x # (batch, channel, feature, time)


class BidirectionalGRU(nn.Module):
    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super(BidirectionalGRU, self).__init__()

        self.BiGRU = nn.GRU(
            input_size=rnn_dim, hidden_size=hidden_size,
            num_layers=1, batch_first=batch_first, bidirectional=True)
        self.layer_norm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.layer_norm(x)
        x = F.gelu(x)
        x, _ = self.BiGRU(x)   
        x = self.dropout(x)
        return x

In [5]:
# Define componenets of Synthesizer network
class Synthesizer(nn.Module):
    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, stride=2, dropout=0.1):
        super(Synthesizer, self).__init__()
        
        self.LSTM = nn.LSTM(input_size=768, hidden_size=1024, num_layers=1)
        
        self.Linear_Projection = nn.Linear(in_features=1024, out_features=80)
        
        self.PostNet = PostNet(in_channels=1, out_channels=256, kernel=(1,5),
                               stride=1, padding=(0,2), dropout=0.1)

    def forward(self, x):
        # Input is a batch of feature matrices from encoder network concatenated w/ style embeddings
        # Input is of shape (batch, "timesteps", features=768)
        x, _ = self.LSTM(x)
        x = self.Linear_Projection(x)
        x = x.unsqueeze(1)
        
        x = self.PostNet(x)
        print("After PostNet")
        print(x.size())
        
        return x

class PreNet(nn.Module):
    def __init__(self, input_dim, output_dim, dropout):
        super(PreNet, self).__init__()

        self.fully_connected1 = nn.Linear(input_dim, 256)
        self.fully_connected2 = nn.Linear(256, 256)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fully_connected1(x)
        x = F.relu(x)
        x = self.dropout1(x)
        x = self.fully_connected2(x)
        x = F.relu(x)
        x = self.dropout2(x)
        return x
    
class PostNet(nn.Module):
    def __init__(self, in_channels, out_channels, kernel, stride, padding, dropout):
        super(PostNet, self).__init__()

        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding)
        self.cnn3 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding)
        self.cnn4 = nn.Conv2d(out_channels, 1, kernel, stride, padding)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
        
    def forward(self, x):
        residual = x  # (batch, channel, feature, time)
        x = self.cnn1(x)
        x = F.tanh(x)
        x = self.dropout1(x)
        x = self.cnn2(x)
        x = F.tanh(x)
        x = self.dropout2(x)
        x = self.cnn3(x)
        x = F.tanh(x)
        x = self.dropout3(x)
        x = self.cnn4(x)

        x += residual
        return x # (batch, channel, feature, time)

In [6]:
# Merge encoder and synthesizer into full network
class VoiceImpersonator(nn.Module):
    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, stride=2, dropout=0.1):
        super().__init__()
        
        self.encoder=ContentEncoder(
            _hparams['n_cnn_layers'], _hparams['n_rnn_layers'], _hparams['rnn_dim'],
            _hparams['stride'], _hparams['dropout']
            )
        
        self.synthesizer=Synthesizer(
            _hparams['n_cnn_layers'], _hparams['n_rnn_layers'], _hparams['rnn_dim'],
            _hparams['stride'], _hparams['dropout']
        )
        
    def forward(self, x):
        # Input is a 2 element list, containing a batch of spectrograms and a batch of style embeddings
        # Extract style embeddings from input tensor, remove from list
        embeds = x[1]
        x = x[0]
        
        print("Input Spectrograms Shape")
        print(x.shape)
        
        print("Style Embeddings Shape")
        print(embeds.shape)
        
        # Compute content encoding
        x = self.encoder(x)
        print("Content Encodings Shape")
        print(x.shape)
        
        styleEmbeds = torch.empty((_hparams['batch_size'], x.shape[1], 256)).to(_device)
                                            
        for i in range(0, _hparams['batch_size']-1):
            styleEmbeds[i,:]=embeds[i,:].repeat(1,x.shape[1],1)
                                            
        print(styleEmbeds.shape)

        # Concatenate content with style embedding, now size (batch, timesteps, 768)
        x = torch.cat((x, styleEmbeds), 2)
        print("Concatenated embedding shape")
        print(x.shape)
        
        # Synthesize a spectrogram from the combined embeddings
        x = self.synthesizer(x)
        
        return x

In [7]:
# Manage data preprocessing (creating spectrograms, transforms, etc)
def load_transforms():
    global _train_audio_transforms, _valid_audio_transforms
    _train_audio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=30),
    torchaudio.transforms.TimeMasking(time_mask_param=100)
    )

    _valid_audio_transforms = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128)
    
def data_processing(data, data_type="train"):
    global _train_audio_transforms, _valid_audio_transforms
    spectrograms = []
#     styles = []

    # waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id
    for (waveform, sample_rate, _, _, _, _) in data:
        if data_type == 'train':
            spec = _train_audio_transforms(waveform).squeeze(0).transpose(0, 1)
        elif data_type == 'valid':
            spec = _valid_audio_transforms(waveform).squeeze(0).transpose(0, 1)
        else:
            raise Exception('data_type should be train or valid')
        spectrograms.append(spec)
#         preprocessed_wav = styleEncoder.preprocess_wav(waveform, sample_rate)
#         styles.append(torch.from_numpy(styleEncoder.embed_utterance(preprocessed_wav)))

    spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True).unsqueeze(1).transpose(2, 3)

    
    return spectrograms#, styles

In [8]:
# Define Training Behavior       
def train():
    global _model, _device, _hparams, _use_cuda

    if _model is None:
        raise Exception("Model was not loaded, call load_model() before training")

    dataset_dir = os.path.expanduser("~/dev/datasets/Libri")
    if not os.path.isdir(dataset_dir):
        print("Running")
        os.makedirs(dataset_dir)

    train_url="train-clean-100"
    test_url="test-clean"


    train_dataset = torchaudio.datasets.LIBRISPEECH(dataset_dir, url=train_url, download=True)
    test_dataset = torchaudio.datasets.LIBRISPEECH(dataset_dir, url=test_url, download=True)

    kwargs = {'num_workers': 1, 'pin_memory': True} if _use_cuda else {}
    train_loader = data.DataLoader(dataset=train_dataset,
                                batch_size=_hparams['batch_size'],
                                shuffle=True,
                                collate_fn=lambda x: data_processing(x, 'train'),
                                **kwargs)
    test_loader = data.DataLoader(dataset=test_dataset,
                                batch_size=_hparams['batch_size'],
                                shuffle=False,
                                collate_fn=lambda x: data_processing(x, 'valid'),
                                **kwargs)


    optimizer = optim.AdamW(_model.parameters(), _hparams['learning_rate'])
#     criterion = nn.CTCLoss(blank=28).to(_device)
            
                             
                             
    # NEED A NEW LOSS
                             
                             
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=_hparams['learning_rate'], 
                                            steps_per_epoch=int(len(train_loader)),
                                            epochs=_hparams['epochs'],
                                            anneal_strategy='linear')
          
        
        
    # Currently only one style encoding, generating all speech in same voice
    preprocessed_wav = styleEncoder.preprocess_wav(Path("data/styleAudio/rand1.flac"))
    styleEmbed = torch.from_numpy(styleEncoder.embed_utterance(preprocessed_wav))
    styleEmbed = styleEmbed.repeat(_hparams['batch_size'],1).to(_device)
    
    _model.train()
    data_len = len(train_loader.dataset)
    for epoch in range(1, _hparams['epochs'] + 1):
        for batch_idx, _data in enumerate(train_loader):
            spectrograms = _data
            spectrograms = spectrograms.to(_device)
            x = [spectrograms, styleEmbed]
            optimizer.zero_grad()

            output = _model(x)  # (batch, timesteps, frequencies)
#             print(output.shape)
#             output = F.log_softmax(output, dim=2)
#             output = output.transpose(0, 1) # (time, batch, n_class)

                             
#             # NEED TO IMPLEMENT NEW LOSS AND GRADIENT PROPOGATION
                             
# #             loss = criterion(output, labels, input_lengths, label_lengths)
#             loss.backward()

#             optimizer.step()
#             scheduler.step()
#             if batch_idx % 100 == 0 or batch_idx == data_len:
#                 print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
#                     epoch, batch_idx * len(spectrograms), data_len,
#                     100. * batch_idx / len(train_loader), loss.item()))
                             
#         # Calculate validation statistics and save after each epoch                 
#         test(_model, _device, test_loader, criterion, epoch)
#         torch.save(_model.state_dict(), _hparams['save_model_path'])

In [9]:
# Define Validation Metrics and Behavior
def test(model, device, test_loader, criterion, epoch):
    print('\nevaluating...')
    model.eval()
    test_loss = 0
    test_cer, test_wer = [], []
    with torch.no_grad():
        for i, _data in enumerate(test_loader):
            spectrograms = _data 
            spectrograms = spectrograms.to(device)

            output = model(spectrograms)  # (batch, time, n_class)
#             output = F.log_softmax(output, dim=2)
#             output = output.transpose(0, 1) # (time, batch, n_class)

            # Print loss information
            
#             loss = criterion(output, labels, input_lengths, label_lengths)
#             test_loss += loss.item() / len(test_loader)

    print('Test set: Average loss: {:.4f}, Average CER: {:4f} Average WER: {:.4f}\n'.format(test_loss, avg_cer, avg_wer))

In [10]:
# Deprecated, leaving in for possibility of code reuse in development             
def encode(input_file_path):
    global _model, _device, _hparams, _use_cuda

    if _model is None:
        raise Exception("Model was not loaded, call load_model() and truncate_model() before encoding")

    # Currently only one style encoding, generating all speech in same voice
    preprocessed_wav = styleEncoder.preprocess_wav(Path("data/styleAudio/rand1.flac"))
    styleEmbeds = torch.from_numpy(styleEncoder.embed_utterance(preprocessed_wav)).unsqueeze(0).to(_device)
    
    waveform, sample_rate = torchaudio.load(input_file_path, normalization=True)
    input_data = [[waveform, None, None, None, None, None]]
    spectrograms = data_processing(input_data, 'valid').to(_device)
    x = [spectrograms, styleEmbeds]
    
    _model.eval()
    output=_model(x)
    
    return output

In [11]:
# Load in the model from a file
# load_models("./contentEncoder/saved_models/deepspeech5.pt", "./contentEncoder/saved_models/deepspeech6.pt", 8)

# Load a fresh model, untrained
load_models()
train()
# output = encode("./data/contentAudio/40-222-0030.flac")

# Train the model
# train()

VoiceImpersonator(
  (encoder): ContentEncoder(
    (Convolutional_Feature_Extraction): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (Residual_CNN_Blocks): Sequential(
      (0): ResidualCNN(
        (cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
        (layer_norm1): CNNLayerNorm(
          (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        )
        (layer_norm2): CNNLayerNorm(
          (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        )
      )
      (1): ResidualCNN(
        (cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropou



Input Spectrograms Shape
torch.Size([3, 1, 128, 1213])
Style Embeddings Shape
torch.Size([3, 256])
Content Encodings Shape
torch.Size([3, 607, 512])
torch.Size([3, 607, 256])
Concatenated embedding shape
torch.Size([3, 607, 768])


RuntimeError: CUDA out of memory. Tried to allocate 144.00 MiB (GPU 0; 3.95 GiB total capacity; 2.09 GiB already allocated; 89.06 MiB free; 2.14 GiB reserved in total by PyTorch)

In [38]:
# Currently only one style encoding, generating all speech in same voice
preprocessed_wav = styleEncoder.preprocess_wav(Path("data/styleAudio/rand1.flac"))
styleEmbed = torch.from_numpy(styleEncoder.embed_utterance(preprocessed_wav))
styleEmbed = styleEmbed.repeat(_hparams['batch_size'],1).to(_device)
print(styleEmbed.shape)

torch.Size([8, 256])


In [41]:
tens = torch.empty((8, 600, 256))
tens[1,:]=styleEmbed[1,:].repeat(1,600,1)
print(tens.shape)

torch.Size([8, 600, 256])


In [26]:
print(torch.tensor((8,600,256)).shape)

torch.Size([3])
