In [1]:
import argparse
import functools
from copy import deepcopy
from typing import Any, Tuple, Optional, List, Callable
from pathlib import Path
import numpy as np
from PIL import Image
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import os
import re
import warnings
from torchvision import transforms
from tqdm.auto import tqdm
import tensorflow as tf
import tensorflow_hub as hub
import torch.utils.data as data

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def preprocess(videos: tf.Tensor, target_resolution: Tuple[int, int]) -> Any:
    """
    Run some preprocessing on the videos for I3D model.

    :param videos: <T>[batch_size, num_frames, height, width, depth] The videos to be
      preprocessed. We don't care about the specific dtype of the videos, it can
      be anything that tf.image.resize_bilinear accepts. Values are expected to
      be in the range 0-255.
    :param target_resolution: (width, height): target video resolution
    :return: videos: <float32>[batch_size, num_frames, height, width, depth]
    """

    videos_shape = videos.shape.as_list()
    all_frames = tf.reshape(videos, [-1] + videos_shape[-3:])
    resized_videos = tf.image.resize_bilinear(all_frames, size=target_resolution)
    target_shape = [videos_shape[0], -1] + list(target_resolution) + [3]
    output_videos = tf.reshape(resized_videos, target_shape)
    scaled_videos = 2.0 * tf.cast(output_videos, tf.float32) / 255.0 - 1
    return scaled_videos

In [3]:
def _is_in_graph(tensor_name: tf.Tensor) -> bool:
    """
    Check whether a given tensor does exists in the graph.
    """
    try:
        tf.get_default_graph().get_tensor_by_name(tensor_name)
    except KeyError:
        return False
    return True


In [4]:
def to_numpy(tensor: torch.Tensor) -> np.ndarray:
    """
    Convert video to numpy.
    """
    generated = tensor.data.cpu().numpy()
    generated[generated < -1] = -1
    generated[generated > 1] = 1
    generated = (generated + 1) / 2 * 255
    return generated.astype("uint8")

In [5]:
def create_id3_embedding(videos: tf.Tensor, batch_size: int) -> tf.Tensor:
    """
    Embed the given videos using the Inflated 3D Convolution network.

    Downloads the graph of the I3D from tf.hub and adds it to the graph on the
    first call.

    :param videos: <float32>[batch_size, num_frames, height=224, width=224, depth=3]. Expected range is [-1, 1].
    :param batch_size: batch size
    :return: <float32>[batch_size, embedding_size]. embedding_size depends on the model used.
    :raises ValueError: when a provided embedding_layer is not supported.
    """

    module_spec = "https://tfhub.dev/deepmind/i3d-kinetics-400/1"

    # Making sure that we import the graph separately for
    # each different input video tensor.
    module_name = "fvd_kinetics-400_id3_module_" + videos.name.replace(":", "_")

    assert_ops = [
        tf.Assert(tf.reduce_max(videos) <= 1.001, ["max value in frame is > 1", videos]),
        tf.Assert(tf.reduce_min(videos) >= -1.001, ["min value in frame is < -1", videos]),
        tf.assert_equal(tf.shape(videos)[0], batch_size, ["invalid frame batch size: ", tf.shape(videos)], summarize=6),
    ]
    with tf.control_dependencies(assert_ops):
        videos = tf.identity(videos)

    module_scope = "%s_apply_default/" % module_name

    # To check whether the module has already been loaded into the graph, we look
    # for a given tensor name. If this tensor name exists, we assume the function
    # has been called before and the graph was imported. Otherwise we import it.
    # Note: in theory, the tensor could exist, but have wrong shapes.
    # This will happen if create_id3_embedding is called with a frames_placehoder
    # of wrong size/batch size, because even though that will throw a tf.Assert
    # on graph-execution time, it will insert the tensor (with wrong shape) into
    # the graph. This is why we need the following assert.
    video_batch_size = int(videos.shape[0])
    assert video_batch_size in [batch_size, -1, None], "Invalid batch size"
    tensor_name = module_scope + "RGB/inception_i3d/Mean:0"
    if not _is_in_graph(tensor_name):
        i3d_model = hub.Module(module_spec, name=module_name)
        i3d_model(videos)

    # gets the kinetics-i3d-400-logits layer
    tensor_name = module_scope + "RGB/inception_i3d/Mean:0"
    tensor = tf.get_default_graph().get_tensor_by_name(tensor_name)
    return tensor


In [6]:
def calculate_fvd(real_activations: tf.Tensor, generated_activations: tf.Tensor) -> tf.Tensor:
    """
    Return a list of ops that compute metrics as funcs of activations.

    :param real_activations: <float32>[num_samples, embedding_size]
    :param generated_activations: <float32>[num_samples, embedding_size]
    :return: FVD score
    """
    return tf.contrib.gan.eval.frechet_classifier_distance_from_activations(real_activations, generated_activations)

In [7]:
class StackDataset(data.Dataset):
    def __init__(self, root_path, img_size, nframes):
        self.root_path = root_path
        self.image_size = img_size
        self.nframes = nframes
        
        self._transform = transforms.Compose(
            [
                transforms.Resize(self.image_size, interpolation=Image.CUBIC),
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
            ]
        )

        self.data = list(root_path.iterdir())

    def __getitem__(self, idx):
        path = self.data[idx]
        film_img = Image.open(path)
        film = []
        for i in range(self.nframes):
            img = film_img.crop((img_size * i, 0, img_size * (i + 1), img_size))
            film.append(img)

        film = [self._transform(img) for img in film]
        film = torch.stack(film).permute(1, 0, 2, 3)
        return film

    def __len__(self):
        return len(self.data)

In [8]:
def split_data(root_path: Path, output_path: Path, rows, columns, padding, img_size) -> Tuple:
    for path in root_path.iterdir():
        image = Image.open(path)
        for r in range(rows):
            images = []
            for c in range(columns):
                p = padding
                size_col = p + (img_size + p) * c
                size_row = p + (img_size + p) * r
                img = image.crop((size_col, size_row, size_col + img_size, size_row + img_size))
                images.append(np.asanyarray(img))
            film = Image.fromarray(np.hstack(images))

            film_output_path = output_path / f"{path.stem}_{r}{path.suffix}"
            film_output_path.parent.mkdir(parents=True, exist_ok=True)
            film.save(film_output_path)

In [9]:
fakes_path = Path("results/clipping_frame24_iterD2_alphaSimil0_seed7879_220608-085346/fakes")
reals_path = Path("results/clipping_frame24_iterD2_alphaSimil0_seed7879_220608-085346/reals")

In [10]:
padding = 2
rows = 3
columns = 24
img_size = 256
frame = 5
bs = 1
bins = 200
video_length = 24

In [90]:
split_data(fakes_path, fakes_path.parent / "fakes_split", rows, columns, padding, img_size)
split_data(reals_path, reals_path.parent / "reals_split", rows, columns, padding, img_size)

In [11]:
fakesds = StackDataset(fakes_path, img_size, video_length)
realsds = StackDataset(reals_path, img_size, video_length)

In [12]:
class RealBatchSampler:
    """
    Wrapper for endless batch sampling.
    """

    def __init__(self, sampler: Any) -> None:
        self._batch_size: int = sampler.batch_size
        self._sampler = sampler
        self._enumerator: Optional[Any] = None

    def __iter__(self) -> Any:
        return self

    def __next__(self) -> Tuple:
        """
        Sample provider call.
        """
        if self._enumerator is None:
            self._enumerator = enumerate(self._sampler)

        batch_idx, batch = next(self._enumerator)

        if batch_idx == len(self._sampler) - 1:
            self._enumerator = enumerate(self._sampler)

        return batch

    @property
    def batch_size(self) -> int:
        """
        Get batch size.
        """
        return self._batch_size

    def __len__(self) -> int:
        """
        Get length.
        """
        return len(self._sampler)

In [13]:
class Loader:
    def __init__(self, dataset, bs, num_workers):
        self.dataset = dataset
        self.bs = bs
        self.num_workers = num_workers

    def __call__(self):
        sample_provider = torch.utils.data.DataLoader(
            self.dataset,
            batch_size=self.bs,
            shuffle=True,
            num_workers=self.num_workers,
            drop_last=True,
        )

        return RealBatchSampler(sample_provider)

In [14]:
x = tf.placeholder(tf.float16, shape=(1, video_length, img_size, img_size, 3))

In [15]:
item1 = create_id3_embedding(preprocess(x, (224, 224)), batch_size=1)

Instructions for updating:
Colocations handled automatically by placer.
INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [16]:
fakes_loader = Loader(fakesds, bs, 4)()
reals_loader = Loader(realsds, bs, 4)()

In [22]:
real_embeds, fake_embeds = [], []

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.5

sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

for _ in tqdm(range(len(fakesds)), total=len(fakesds)):
    fake_tuple, real_tuple = next(iter(fakes_loader)), next(iter(reals_loader))

    fake_videos = to_numpy(fake_tuple[0])[None, :].transpose(0, 2, 3, 4, 1)
    real_videos = to_numpy(real_tuple[0])[None, :].transpose(0, 2, 3, 4, 1)

    lol1 = sess.run(item1, feed_dict={x: fake_videos})
    lol2 = sess.run(item1, feed_dict={x: real_videos})

    fake_embeds.append(lol1)
    real_embeds.append(lol2)

fake_videos = np.concatenate(fake_embeds)
real_videos = np.concatenate(real_embeds)
print(fake_videos.shape)
fake_videos = tf.convert_to_tensor(fake_videos, np.float32)
real_videos = tf.convert_to_tensor(real_videos, np.float32)
result = calculate_fvd(fake_videos, real_videos).eval(session=sess)
print(result)

  0%|          | 0/30 [00:00<?, ?it/s]

(30, 400)

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use tf.cast instead.
2106.1926
