In [1]:
import sys
sys.path.append("../")

In [2]:
import matplotlib.pyplot as plt
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import IPython.display as ipd

from mel2wav.dataset import AudioDataset
from mel2wav.modules import Generator, Discriminator, Audio2Mel

In [3]:
# Hyper-parameters handed to the Generator / Discriminator / Audio2Mel
# constructors below.
n_mel_channels = 80
ngf = 32
n_residual_layers = 3
num_D = 3
ndf = 16
n_layers_D = 4
downsamp_factor = 4

# Build the networks and the mel-spectrogram front-end, then move each one to
# the GPU (a CUDA device is required by the `.cuda()` calls).
netG = Generator(n_mel_channels, ngf, n_residual_layers).cuda()
netD = Discriminator(num_D, ndf, n_layers_D, downsamp_factor).cuda()
fft = Audio2Mel(n_mel_channels=n_mel_channels).cuda()

# One Adam optimizer per network, sharing the same learning rate and betas.
optG = optim.Adam(netG.parameters(), betas=(0.5, 0.9), lr=1e-4)
optD = optim.Adam(netD.parameters(), betas=(0.5, 0.9), lr=1e-4)

In [4]:
# Number of audio samples requested per clip.
seq_len = 8192 * 12
# Forward slashes work on every platform once wrapped in `Path`; the previous
# backslash literal relied on the invalid escapes `\d` / `\j` and only resolved
# on Windows.
data_path = '../data/jazz_classical'

# Both datasets are read at 22,050 Hz from file lists stored under `data_path`.
train_set = AudioDataset(
        Path(data_path) / "train_files.txt", seq_len, sampling_rate=22050)
style_set = AudioDataset(
        Path(data_path) / "style_files.txt", seq_len, sampling_rate=22050)

In [5]:
# Directory holding the saved network / optimizer state dicts.
# Set to None to skip loading.
load_root = Path('../data/test_script_2')

if load_root and load_root.exists():
    netG.load_state_dict(torch.load(load_root / "netG.pt"))
    optG.load_state_dict(torch.load(load_root / "optG.pt"))
    netD.load_state_dict(torch.load(load_root / "netD.pt"))
    optD.load_state_dict(torch.load(load_root / "optD.pt"))
    print('weights successfully loaded ...')
else:
    # Make the fallback visible: without a checkpoint the cells below run on
    # randomly initialised networks.
    print('WARNING: no checkpoint found at {!r}; using randomly initialised '
          'weights'.format(str(load_root)))

weights successfully loaded ...


In [6]:
class ContentLoss(nn.Module):
    """Pass-through layer that records a margin-based content loss.

    ``forward`` returns its input unchanged, so the layer can be inserted
    into an ``nn.Sequential`` without affecting downstream activations. The
    loss of the most recent forward pass is stored on ``self.loss``.

    Args:
        target: Reference activations; must broadcast against the tensors
            later passed to ``forward``.
        margin: Hinge margin. Elements where ``target - input`` is at least
            ``margin`` contribute nothing to the loss.
    """

    def __init__(self, target, margin=2.0):
        super(ContentLoss, self).__init__()
        # Detach the target from the graph that produced it: it is a fixed
        # reference value, not a variable to optimise. Keeping the graph
        # attached would make every backward pass walk (and need to retain)
        # the graph that computed the target.
        self.target = target.detach()
        self.margin = margin

    def forward(self, input):
        # Signed element-wise difference (not an absolute distance).
        diff = self.target - input
        # Squared hinge: only elements with diff < margin are penalised.
        hinge = torch.clamp(self.margin - diff, min=0.0)
        self.loss = 0.5 * hinge.pow(2).mean()
        return input

In [7]:
class StyleLoss(nn.Module):
    """Pass-through layer that records a squared-error style loss.

    ``forward`` returns its input unchanged, so the layer can be inserted
    into an ``nn.Sequential``. The loss of the most recent forward pass is
    stored on ``self.loss``.

    Args:
        target: Reference activations; must broadcast against the tensors
            later passed to ``forward`` and have at least two dimensions
            (the loss sums over dimension 1).
        margin: Stored on the instance but not used by ``forward``; kept so
            the constructor signature matches ``ContentLoss``.
    """

    def __init__(self, target, margin=2.0):
        super(StyleLoss, self).__init__()
        # Detach the target from the graph that produced it: it is a fixed
        # reference value, not a variable to optimise. Keeping the graph
        # attached would make every backward pass walk (and need to retain)
        # the graph that computed the target.
        self.target = target.detach()
        self.margin = margin

    def forward(self, input):
        diff = self.target - input
        # Sum of squared differences over dimension 1, averaged over the rest.
        per_position = diff.pow(2).sum(1)
        self.loss = 0.5 * per_position.mean()
        return input

In [8]:
# Indices (positions in the discriminator's layer mapping) after which a
# content / style loss layer is inserted by default.
content_layers_default = [3,5,6]
style_layers_default = [3,5,6] # list(range(nb_layers))

def get_style_model_and_losses(net_ensamble, style_song, content_song,
                               content_layers=content_layers_default,
                               style_layers=style_layers_default,
                               pref_disc='disc_0'):
    """Assemble a feature extractor with content/style loss layers spliced in.

    The sub-layers of ``net_ensamble.model[pref_disc].model`` are copied into
    a flat ``nn.Sequential``. After each top-level entry whose index is
    listed in ``content_layers`` / ``style_layers``, a ``ContentLoss`` /
    ``StyleLoss`` layer is appended. Its target is the output of the model
    built so far on ``content_song`` / ``style_song``.

    Args:
        net_ensamble: Object exposing ``.model[pref_disc].model`` as a
            name -> layer mapping.
        style_song: Tensor fed through the partial model to obtain the
            style targets.
        content_song: Tensor fed through the partial model to obtain the
            content targets.
        content_layers: Top-level layer indices that get a content loss.
        style_layers: Top-level layer indices that get a style loss.
        pref_disc: Key selecting which sub-network of ``net_ensamble`` to use.

    Returns:
        Tuple ``(model, style_losses, content_losses)`` where the two lists
        hold the inserted loss layers in insertion order.
    """
    # Keep direct handles on the loss layers so callers can read `.loss`.
    content_losses = []
    style_losses = []

    model = nn.Sequential()
    # Use the network that was passed in rather than a notebook-level global.
    source_net = net_ensamble.model[pref_disc]

    for idx, (name, layer) in enumerate(source_net.model.items()):
        if isinstance(layer, nn.Sequential):
            # Flatten the nested Sequential into the output model.
            for i, sub_layer in enumerate(layer):
                model.add_module(name + f'_{i}', sub_layer)
        # NOTE(review): entries that are not nn.Sequential are not copied, so a
        # loss requested at such an index is computed on the previous layer's
        # features. Confirm this is intended.

        if idx in content_layers:
            # Targets are constants: detach them from the graph so backward
            # passes during optimisation never walk back through this forward.
            target = model(content_song).detach()
            content_loss = ContentLoss(target)
            model.add_module("content_loss_{}".format(idx), content_loss)
            content_losses.append(content_loss)

        if idx in style_layers:
            target = model(style_song).detach()
            style_loss = StyleLoss(target)
            model.add_module("style_loss_{}".format(idx), style_loss)
            style_losses.append(style_loss)

    return model, style_losses, content_losses

In [9]:
def get_input_optimizer(input_img):
    """Return an L-BFGS optimizer whose only parameter is ``input_img``.

    ``input_img`` is flagged in place as requiring gradients, because it is
    the quantity being optimised.
    """
    input_img.requires_grad_()
    return optim.LBFGS([input_img])

In [10]:
def run_style_transfer(netD, style_song, content_song, input_song, num_steps=25, pref_disc='disc_2',
                       style_weight=10, content_weight=1):
    """Run the style transfer by optimising ``input_song`` in place.

    Args:
        netD: Discriminator ensemble handed to ``get_style_model_and_losses``.
        style_song: Tensor providing the style targets.
        content_song: Tensor providing the content targets.
        input_song: Tensor that is optimised; it is modified in place and
            flagged as requiring gradients.
        num_steps: Soft bound on the number of loss evaluations. The bound is
            only checked between ``optimizer.step`` calls, and a single step
            may evaluate the closure several times, so the final count can
            exceed ``num_steps``.
        pref_disc: Key of the sub-discriminator used as feature extractor.
        style_weight: Multiplier applied to the summed style losses.
        content_weight: Multiplier applied to the summed content losses.

    Returns:
        ``input_song`` after optimisation (the same tensor object).
    """
    print('Building the style transfer model..')
    model, style_losses, content_losses = get_style_model_and_losses(
        netD, style_song, content_song, pref_disc=pref_disc)
    optimizer = get_input_optimizer(input_song)

    print('Optimizing..')
    # Closure-evaluation counter; a one-element list so the nested closure can
    # mutate it.
    run = [0]
    while run[0] <= num_steps:

        def closure():
            optimizer.zero_grad()
            # The forward pass refreshes `.loss` on every inserted loss layer.
            model(input_song)
            style_score = 0
            content_score = 0

            for sl in style_losses:
                style_score += sl.loss

            for cl in content_losses:
                content_score += cl.loss

            style_score *= style_weight
            content_score *= content_weight

            loss = style_score + content_score
            # NOTE(review): retain_graph=True is presumably only needed while
            # the loss targets are still attached to an autograd graph; confirm
            # before removing it.
            loss.backward(retain_graph=True)

            run[0] += 1
            if run[0] % 10 == 0:
                # Print the counter value, not the list wrapping it.
                print("run {}:".format(run[0]))
                print('Style Loss : {:4f} Content Loss: {:4f}'.format(
                    style_score.item(), content_score.item()))
                print()

            return loss

        optimizer.step(closure)

    return input_song

In [11]:
import copy

# Pick one random clip from each dataset.
cont_idx = np.random.choice(len(train_set)) 
styl_idx = np.random.choice(len(style_set)) 

content = train_set[cont_idx]
# The style clip must come from style_set: styl_idx was sampled over its
# length, so indexing train_set with it picked the wrong data (and could go
# out of range).
style   = style_set[styl_idx]
# Start the optimisation from an independent copy of the content clip.
input_s = copy.deepcopy(content)

# Add a leading batch dimension and move everything to the GPU.
content_song = content.unsqueeze(dim=0).cuda()
style_song = style.unsqueeze(dim=0).cuda()
input_song = input_s.unsqueeze(dim=0).cuda()

output = run_style_transfer(netD, style_song, content_song, input_song, num_steps=100)

Building the style transfer model..
Optimizing..
run [10]:
Style Loss : 9026.218750 Content Loss: 7.714941

run [20]:
Style Loss : 4433.874023 Content Loss: 8.070089

run [30]:
Style Loss : 2629.255371 Content Loss: 8.254086

run [40]:
Style Loss : 1743.012573 Content Loss: 8.340436

run [50]:
Style Loss : 1300.758057 Content Loss: 8.378820

run [60]:
Style Loss : 1023.976379 Content Loss: 8.418961

run [70]:
Style Loss : 837.364807 Content Loss: 8.440334

run [80]:
Style Loss : 696.370361 Content Loss: 8.445821

run [90]:
Style Loss : 601.944824 Content Loss: 8.454582

run [100]:
Style Loss : 530.985962 Content Loss: 8.463808

run [110]:
Style Loss : 476.934662 Content Loss: 8.470301

run [120]:
Style Loss : 433.657959 Content Loss: 8.480482



In [12]:
# Convert the clips to NumPy and drop their first dimension for playback.
# NOTE: `style` and `content` are rebound from tensors to arrays here, so this
# cell cannot be re-run without re-running the cell that created them.
content = content.numpy()[0]
style = style.numpy()[0]
# `output` lives on the GPU and tracks gradients: detach and copy it to the
# CPU first, then strip the two leading dimensions.
combined = output.detach().cpu().numpy()[0, 0]

In [13]:
# Audio player for the content clip at 22,050 Hz.
ipd.Audio(content, rate=22050)

In [14]:
# Audio player for the style clip at 22,050 Hz.
ipd.Audio(style, rate=22050)

In [15]:
# Audio player for the optimised (style-transferred) result at 22,050 Hz.
ipd.Audio(combined, rate=22050)