In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
class VoiceEmbedNet(nn.Module):
    """1-D convolutional encoder that reduces a feature sequence to one vector.

    The network is four ``Conv1d -> BatchNorm1d -> ReLU`` stages (kernel 3,
    stride 2, padding 1, convolution without bias) followed by one biased
    ``Conv1d`` with the same geometry that projects to ``output_channel``.

    Args:
        input_channel: Number of channels of the input sequence.
        channels: Indexable collection; entries 0..3 are the output widths of
            the four hidden stages.
        output_channel: Number of channels produced by the last convolution.
    """

    def __init__(self, input_channel, channels, output_channel):
        super().__init__()
        # Channel counts at the boundaries of the four hidden stages.
        widths = [input_channel, channels[0], channels[1], channels[2], channels[3]]

        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Conv1d(n_in, n_out, kernel_size=3, stride=2, padding=1, bias=False))
            layers.append(nn.BatchNorm1d(n_out, affine=True))
            layers.append(nn.ReLU(inplace=True))
        # Final projection: the only convolution with a bias and no activation.
        layers.append(
            nn.Conv1d(widths[-1], output_channel, kernel_size=3, stride=2, padding=1, bias=True)
        )
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Encode ``x`` of shape (batch, input_channel, length).

        Returns:
            Tensor of shape (batch, output_channel, 1, 1): the convolutional
            features averaged over the whole remaining time axis.
        """
        features = self.model(x)
        # A pooling window as wide as the sequence leaves exactly one time step.
        pooled = nn.functional.avg_pool1d(features, features.size(2), stride=1)
        return pooled.view(pooled.size(0), -1, 1, 1)

In [3]:
class Generator(nn.Module):
    """Transposed-convolution decoder ending in ``Tanh``.

    Layout: one 4x4 stride-1 ``ConvTranspose2d``, four 4x4 stride-2 padding-1
    ``ConvTranspose2d`` layers (each of these five followed by ``ReLU``), then
    a 1x1 ``ConvTranspose2d`` and ``Tanh``. All layers carry a bias.

    Args:
        input_channel: Number of channels of the input tensor.
        channels: Indexable collection; entries 0..4 are the output widths of
            the five ReLU-activated layers.
        output_channel: Number of channels of the generated output.
    """

    def __init__(self, input_channel, channels, output_channel):
        super().__init__()
        widths = [channels[0], channels[1], channels[2], channels[3], channels[4]]

        # Stem: for a 1x1 spatial input this yields a 4x4 map.
        stages = [
            nn.ConvTranspose2d(input_channel, widths[0], kernel_size=4, stride=1, padding=0, bias=True),
            nn.ReLU(inplace=True),
        ]
        # Four stages that each double the spatial resolution.
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            stages.append(nn.ConvTranspose2d(n_in, n_out, kernel_size=4, stride=2, padding=1, bias=True))
            stages.append(nn.ReLU(inplace=True))
        # 1x1 projection to the output channels, squashed into [-1, 1].
        stages.append(nn.ConvTranspose2d(widths[-1], output_channel, kernel_size=1, stride=1, padding=0, bias=True))
        stages.append(nn.Tanh())

        self.model = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the decoder stack and return the result.

        For an input of shape (batch, input_channel, 1, 1) the output has
        shape (batch, output_channel, 64, 64).
        """
        return self.model(x)

In [4]:
class FaceEmbedNet(nn.Module):
    """2-D convolutional encoder with ``LeakyReLU(0.2)`` activations.

    Layout: a 1x1 ``Conv2d``, four 4x4 stride-2 padding-1 ``Conv2d`` layers
    (each of these five followed by ``LeakyReLU``), then a 4x4 stride-1
    unpadded ``Conv2d`` with no activation. All layers carry a bias.

    Args:
        input_channel: Number of channels of the input image.
        channels: Indexable collection; entries 0..4 are the output widths of
            the five activated layers.
        output_channel: Number of channels of the final convolution.
    """

    def __init__(self, input_channel, channels, output_channel):
        super().__init__()
        widths = [channels[0], channels[1], channels[2], channels[3], channels[4]]

        # Point-wise stem: changes the channel count, keeps the resolution.
        blocks = [
            nn.Conv2d(input_channel, widths[0], kernel_size=1, stride=1, padding=0, bias=True),
            nn.LeakyReLU(0.2, inplace=True),
        ]
        # Four stages that each halve the spatial resolution.
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            blocks.append(nn.Conv2d(n_in, n_out, kernel_size=4, stride=2, padding=1, bias=True))
            blocks.append(nn.LeakyReLU(0.2, inplace=True))
        # Unpadded 4x4 head: collapses a 4x4 map to 1x1.
        blocks.append(nn.Conv2d(widths[-1], output_channel, kernel_size=4, stride=1, padding=0, bias=True))

        self.model = nn.Sequential(*blocks)

    def forward(self, x):
        """Run ``x`` through the encoder stack and return the result.

        For an input of shape (batch, input_channel, 64, 64) the output has
        shape (batch, output_channel, 1, 1).
        """
        return self.model(x)

In [5]:
class Classifier(nn.Module):
    """Bias-free linear head applied to flattened per-sample features.

    Args:
        input_channel: Number of features per sample after flattening.
        channels: Unused; accepted so the constructor signature matches the
            other networks built by ``get_network``.
        output_channel: Number of output scores per sample.
    """

    def __init__(self, input_channel, channels, output_channel):
        super(Classifier, self).__init__()
        self.model = nn.Linear(input_channel, output_channel, bias=False)

    def forward(self, x):
        """Flatten everything after the batch dimension and apply the linear layer.

        Returns:
            Tensor of shape (batch, output_channel).
        """
        # reshape() returns a view whenever view() would have succeeded, but
        # also accepts non-contiguous inputs (e.g. transposed/permuted tensors)
        # on which view() raises a RuntimeError.
        x = x.reshape(x.size(0), -1)
        return self.model(x)

In [6]:
def get_network(net_type, params, train=True):
    """Instantiate one network from the parameter registry.

    Args:
        net_type: Key into ``params`` selecting the network's entry, which
            must provide ``'network'`` (a callable taking input channels,
            channel list and output channels), ``'input_channel'``,
            ``'channels'`` and ``'output_channel'``.
        params: Registry mapping; besides the per-network entries it supplies
            the top-level Adam settings ``'lr'``, ``'beta1'`` and ``'beta2'``.
        train: When true, put the network in training mode and create an Adam
            optimizer for it; otherwise switch it to eval mode.

    Returns:
        ``(net, optimizer)``; ``optimizer`` is ``None`` when ``train`` is false.
    """
    spec = params[net_type]
    build = spec['network']
    net = build(spec['input_channel'], spec['channels'], spec['output_channel'])

    # Inference-only networks get no optimizer.
    if not train:
        net.eval()
        return net, None

    net.train()
    optimizer = optim.Adam(
        net.parameters(),
        lr=params['lr'],
        betas=(params['beta1'], params['beta2']),
    )
    return net, optimizer

In [7]:
# Registry read by get_network(): one entry per sub-network (class plus its
# constructor arguments), followed by the Adam settings used for every
# trainable network.
NETWORKS_PARAMETERS = dict(
    # e: voice embedding network
    e=dict(
        network=VoiceEmbedNet,
        input_channel=64,
        channels=[256, 384, 576, 864],
        output_channel=64,  # dimension of the embedding
    ),
    # g: generator
    g=dict(
        network=Generator,
        input_channel=64,
        channels=[1024, 512, 256, 128, 64],  # widths of the transposed-convolution layers
        output_channel=3,  # RGB images
    ),
    # f: face embedding network
    f=dict(
        network=FaceEmbedNet,
        input_channel=3,
        channels=[32, 64, 128, 256, 512],
        output_channel=64,
    ),
    # d: discriminator (a Classifier with a single output)
    d=dict(
        network=Classifier,
        input_channel=64,
        channels=[],
        output_channel=1,
    ),
    # c: identity classifier
    c=dict(
        network=Classifier,
        input_channel=64,
        channels=[],
        output_channel=2,  # depends on the dataset in use
    ),
    # Optimizer (Adam) settings
    lr=0.0002,
    beta1=0.5,
    beta2=0.999,
)

In [8]:
# Instantiate the five sub-networks. The voice embedder is created in eval mode,
# so its optimizer is None; the other four are created in training mode, each
# with its own Adam optimizer.
e_net, e_optimizer = get_network(net_type='e', params=NETWORKS_PARAMETERS, train=False)
g_net, g_optimizer = get_network(net_type='g', params=NETWORKS_PARAMETERS, train=True)
f_net, f_optimizer = get_network(net_type='f', params=NETWORKS_PARAMETERS, train=True)
d_net, d_optimizer = get_network(net_type='d', params=NETWORKS_PARAMETERS, train=True)
c_net, c_optimizer = get_network(net_type='c', params=NETWORKS_PARAMETERS, train=True)

In [9]:
# TODO: define the target labels for real and fake faces before running the
# training loop below, keeping exactly these names:
#   real_label, fake_label

In [10]:
# Training-loop template kept inside a string literal, so running this cell
# executes nothing and only echoes the text. Before turning it into real code,
# these names (not defined in the visible cells) have to be provided:
#   - voice, voice_label, face, face_label: one batch from the dataset
#   - real_label, fake_label: targets for the binary cross-entropy terms
#   - D_real, D_fake, C_real, GD_fake, GC_fake: objects exposing .update(value)
# Each iteration first updates f_net, d_net and c_net on real faces and on
# detached generated faces, then updates g_net so that its output is scored as
# real by d_net and classified as voice_label by c_net.
# NOTE(review): the noise tensor has a hard-coded batch size of 128, so voice
# batches presumably need 128 items as well; confirm against the data loader.
'''
for i in range(10000):
    # You have to fill first those 4 parameters from your dataset (keep the same names)
    #voice, voice_label , face, face_label
    noise = 0.05*torch.randn(128, 64, 1, 1)

    embeddings = e_net(voice)
    embeddings = nn.functional.normalize(embeddings)
    embeddings = embeddings + noise
    embeddings = nn.functional.normalize(embeddings)
    fake = g_net(embeddings)

    # Discriminator
    f_optimizer.zero_grad()
    d_optimizer.zero_grad()
    c_optimizer.zero_grad()
    real_score_out = d_net(f_net(face))
    fake_score_out = d_net(f_net(fake.detach()))
    real_label_out = c_net(f_net(face))
    D_real_loss = nn.functional.binary_cross_entropy(torch.sigmoid(real_score_out), real_label)
    D_fake_loss = nn.functional.binary_cross_entropy(torch.sigmoid(fake_score_out), fake_label)
    C_real_loss = nn.functional.nll_loss(nn.functional.log_softmax(real_label_out, 1), face_label)
    D_real.update(D_real_loss.item())
    D_fake.update(D_fake_loss.item())
    C_real.update(C_real_loss.item())
    (D_real_loss + D_fake_loss + C_real_loss).backward()
    f_optimizer.step()
    d_optimizer.step()
    c_optimizer.step()

    # Generator
    g_optimizer.zero_grad()
    fake_score_out = d_net(f_net(fake))
    fake_label_out = c_net(f_net(fake))
    GD_fake_loss = nn.functional.binary_cross_entropy(torch.sigmoid(fake_score_out), real_label)
    GC_fake_loss = nn.functional.nll_loss(nn.functional.log_softmax(fake_label_out, 1), voice_label)
    (GD_fake_loss + GC_fake_loss).backward()
    GD_fake.update(GD_fake_loss.item())
    GC_fake.update(GC_fake_loss.item())
    g_optimizer.step()
    '''

'\nfor i in range(10000):\n    # You have to fill first those 4 parameters from your dataset (keep the same names)\n    #voice, voice_label , face, face_label\n    noise = 0.05*torch.randn(128, 64, 1, 1)\n\n    embeddings = e_net(voice)\n    embeddings = nn.functional.normalize(embeddings)\n    embeddings = embeddings + noise\n    embeddings = nn.functional.normalize(embeddings)\n    fake = g_net(embeddings)\n\n    # Discriminator\n    f_optimizer.zero_grad()\n    d_optimizer.zero_grad()\n    c_optimizer.zero_grad()\n    real_score_out = d_net(f_net(face))\n    fake_score_out = d_net(f_net(fake.detach()))\n    real_label_out = c_net(f_net(face))\n    D_real_loss = nn.functional.binary_cross_entropy(torch.sigmoid(real_score_out), real_label)\n    D_fake_loss = nn.functional.binary_cross_entropy(torch.sigmoid(fake_score_out), fake_label)\n    C_real_loss = nn.functional.nll_loss(nn.functional.log_softmax(real_label_out, 1), face_label)\n    D_real.update(D_real_loss.item())\n    D_fake.u