In [15]:
import torch
import torch.nn.functional as F

In [16]:
import string
from dataset import VoiceDataset, FaceDataset
from network import VoiceEmbedNet, Generator, FaceEmbedNet, Classifier,get_network
from utils import get_collate_fn

DATASET_PARAMETERS = {
    # meta data provided by voxceleb1 dataset
    'meta_file': 'voxCeleb-Dataset\\vox1_meta.csv',

    # voice dataset
    'voice_dir': 'voxCeleb-Dataset\\fbank\\fbank',
    'voice_ext': 'npy',

    # face dataset
    'face_dir': 'voxCeleb-Dataset\\VGG_ALL_FRONTAL\\VGG_ALL_FRONTAL',
    'face_ext': '.jpg',

    # train data includes the identities
    # whose names start with the characters of 'FGH...XYZ'
    'split': string.ascii_uppercase[5:],

    # dataloader
    'voice_dataset': VoiceDataset,
    'face_dataset': FaceDataset,
    'batch_size': 128,
    'nframe_range': [300, 800],
    'workers_num': 1,
    'collate_fn': get_collate_fn,

    # test data
    'test_data': 'data\\example_data'
}


NETWORKS_PARAMETERS = {
    # VOICE EMBEDDING NETWORK (e)
    'e': {
        'network': VoiceEmbedNet,
        'input_channel': 64,
        'channels': [256, 384, 576, 864],
        'output_channel': 64,  # the embedding dimension
        'model_path': 'pretrained_models/voice_embedding.pth',
    },
    # GENERATOR (g)
    'g': {
        'network': Generator,
        'input_channel': 64,
        # channels for deconvolutional layers
        'channels': [1024, 512, 256, 128, 64],
        'output_channel': 3,  # images with RGB channels
        'model_path': 'models/generator.pth',
    },
    # FACE EMBEDDING NETWORK (f)
    'f': {
        'network': FaceEmbedNet,
        'input_channel': 3,
        'channels': [32, 64, 128, 256, 512],
        'output_channel': 64,
        'model_path': 'models/face_embedding.pth',
    },
    # DISCRIMINATOR (d)
    'd': {
        'network': Classifier,  # Discrminator is a special Classifier with 1 subject
        'input_channel': 64,
        'channels': [],
        'output_channel': 1,
        'model_path': 'models/discriminator.pth',
    },
    # CLASSIFIER (c)
    'c': {
        'network': Classifier,
        'input_channel': 64,
        'channels': [],
        'output_channel': -1,  # This parameter is depended on the dataset we used
        'model_path': 'models/classifier.pth',
    },
    # OPTIMIZER PARAMETERS
    'lr': 0.0002,
    'beta1': 0.5,
    'beta2': 0.999,

    # MODE, use GPU or not
    'GPU': True


}


In [17]:
f_net, _ = get_network('f', NETWORKS_PARAMETERS, train=False) #face embed

In [18]:
from PIL import Image
import torch
from torchvision import transforms
import numpy as np

def imread(filename):
    """
    Loads an image file into a (height, width, 3) uint8 ndarray.
    """
    image = Image.open(filename)
    img = image.resize((64, 64), Image.ANTIALIAS)
    return np.asarray(img, dtype=np.uint8)[..., :3]

In [19]:
import glob
fake_images = glob.glob("data\\fake_images\\*")
real_images = glob.glob("data\\real_images\\*")
paper_images = glob.glob("data\\paper_images\\*")

our_model_list = []
paper_model_list = []
for i in range(10):
    our_model_list.append([fake_images[i],real_images[i]])
    paper_model_list.append([paper_images[i],real_images[i]])

# Cosine similarity 

In [22]:
cos_sim_list_ours = []
convert_tensor = transforms.ToTensor()
for image in our_model_list:
    #print(image)
    imgf = imread(image[0])
    imgf = convert_tensor(imgf)
    imgf = imgf.cuda()
    f_net_res_f = f_net.forward(imgf)
    
    imgr = imread(image[1])
    imgr = convert_tensor(imgr)
    imgr = imgr.cuda()
    f_net_res_r = f_net.forward(imgr)
    
    cosine_similarity_value = F.cosine_similarity(f_net_res_f, f_net_res_r, dim=0)
    cos_sim_list_ours.append(cosine_similarity_value.item())
    print(cosine_similarity_value)

  img = image.resize((64, 64), Image.ANTIALIAS)


tensor([[0.9985]], device='cuda:0', grad_fn=<SumBackward1>)
tensor([[0.9527]], device='cuda:0', grad_fn=<SumBackward1>)
tensor([[0.9760]], device='cuda:0', grad_fn=<SumBackward1>)
tensor([[0.9735]], device='cuda:0', grad_fn=<SumBackward1>)
tensor([[0.9949]], device='cuda:0', grad_fn=<SumBackward1>)
tensor([[0.9952]], device='cuda:0', grad_fn=<SumBackward1>)
tensor([[0.9967]], device='cuda:0', grad_fn=<SumBackward1>)
tensor([[0.9069]], device='cuda:0', grad_fn=<SumBackward1>)
tensor([[0.9818]], device='cuda:0', grad_fn=<SumBackward1>)
tensor([[0.9849]], device='cuda:0', grad_fn=<SumBackward1>)


In [27]:
cos_sim_score_ours = sum(cos_sim_list_ours)/len(cos_sim_list_ours)
cos_sim_score_ours

0.9761002659797668

# L1

In [29]:
import torch.nn as nn

In [30]:
l1 = []
convert_tensor = transforms.ToTensor()
for image in our_model_list:
    #print(image)
    imgf = imread(image[0])
    imgf = convert_tensor(imgf)
    imgf = imgf.cuda()
    f_net_res_f = f_net.forward(imgf)
    
    imgr = imread(image[1])
    imgr = convert_tensor(imgr)
    imgr = imgr.cuda()
    f_net_res_r = f_net.forward(imgr)
    
    loss = nn.L1Loss()
    output = loss(f_net_res_f, f_net_res_r)
    l1.append(output.item())
    print(l1)

  img = image.resize((64, 64), Image.ANTIALIAS)


[0.1645336151123047]
[0.1645336151123047, 0.9962781667709351]
[0.1645336151123047, 0.9962781667709351, 0.6004239320755005]
[0.1645336151123047, 0.9962781667709351, 0.6004239320755005, 0.5801464915275574]
[0.1645336151123047, 0.9962781667709351, 0.6004239320755005, 0.5801464915275574, 0.25248992443084717]
[0.1645336151123047, 0.9962781667709351, 0.6004239320755005, 0.5801464915275574, 0.25248992443084717, 0.3938966989517212]
[0.1645336151123047, 0.9962781667709351, 0.6004239320755005, 0.5801464915275574, 0.25248992443084717, 0.3938966989517212, 0.4451752007007599]
[0.1645336151123047, 0.9962781667709351, 0.6004239320755005, 0.5801464915275574, 0.25248992443084717, 0.3938966989517212, 0.4451752007007599, 1.1131722927093506]
[0.1645336151123047, 0.9962781667709351, 0.6004239320755005, 0.5801464915275574, 0.25248992443084717, 0.3938966989517212, 0.4451752007007599, 1.1131722927093506, 0.457733690738678]
[0.1645336151123047, 0.9962781667709351, 0.6004239320755005, 0.5801464915275574, 0.2524

In [31]:
l1_avg = sum(l1)/len(l1)
l1_avg

0.5632690817117691