This notebook will run a variety of different models on the MSCOCO dataset and test for FID score and Inception score. 

The models evaluated will be miniGlide-sbucaptions, miniGLIDE-dogs, and miniGLIDE-sbucaptions-simple

In [1]:
!pip install pytorch-fid



Let's get the mscoco dataset and save the images

In [2]:
import os
import json
import torch as th
import torchvision.transforms as transforms
from PIL import Image, ImageEnhance
from torchvision.transforms.functional import to_pil_image
from torch.utils.data import Dataset, DataLoader

from glide_text2im.clip.model_creation import create_clip_model
from glide_text2im.download import load_checkpoint
from glide_text2im.model_creation import (
    create_model_and_diffusion,
    model_and_diffusion_defaults,
    model_and_diffusion_defaults_upsampler,
)
from glide_text2im.tokenizer.simple_tokenizer import SimpleTokenizer

from diffusionHelp import *
from modelParameters import *

from pycocotools.coco import COCO
import matplotlib.pyplot as plt
import pylab
pylab.rcParams['figure.figsize'] = (10.0, 8.0)

import json
from json import encoder
encoder.FLOAT_REPR = lambda o: format(o, '.3f')

  warn(f"Failed to load image Python extension: {e}")


In [3]:
annFile = 'D:/Rutgers/Grad Courses/Natural Language Processing/Final Project - MiniGLIDE/GLIDE Local Recreation/GLIDE-Recreation/Evaluation/MSCOCO/annotations/captions_val2017.json'
#Load the coco object
coco = COCO(annFile)
annIds = coco.getAnnIds([139])
anns = coco.loadAnns(annIds)
print(anns)

loading annotations into memory...
Done (t=0.02s)
creating index...
index created!
[{'image_id': 139, 'id': 372891, 'caption': 'A woman stands in the dining area at the table.'}, {'image_id': 139, 'id': 376968, 'caption': 'A room with chairs, a table, and a woman in it.'}, {'image_id': 139, 'id': 379917, 'caption': 'A woman standing in a kitchen by a window'}, {'image_id': 139, 'id': 382074, 'caption': 'A person standing at a table in a room.'}, {'image_id': 139, 'id': 384831, 'caption': 'A living area with a television and a table'}]


In [4]:
# Let's create a pytorch dataset class for the MSCOCO dataset, we should return the caption based on the image id

class MSCOCODataset(Dataset):
    def __init__(self, image_dir, annotation_file, transform=None):
        self.transform = transform
        # Of the format {image_id: [image_path, caption]}
        self.image_ids_to_path = {}
        self.image_ids = []

        coco = COCO(annotation_file)

        for file in os.listdir(image_dir):
            if file.endswith('.jpg'):
                image_id = int(file.split('.')[0])
                self.image_ids.append(image_id)
                # Load the annotations and use the first given caption
                annIds = coco.getAnnIds([image_id])
                anns = coco.loadAnns(annIds)
                ann = anns[0]['caption']
                self.image_ids_to_path[image_id] = [os.path.join(image_dir, file), ann]

    def __getitem__(self, index):
        image_id = self.image_ids[index]
        _, caption = self.image_ids_to_path[image_id]

        return image_id, caption

    def __len__(self):
        return len(self.image_ids)
    
    def show_images(batch: th.Tensor, brightness: float = 1.0):
        """ Display a batch of images inline with adjustable brightness. """
        
        # Check if the batch has 3 or 4 dimensions
        if batch.ndim == 3:
            batch = batch.unsqueeze(0)
        
        # Apply brightness adjustment
        batch = batch * brightness

        # Ensure the values are within the valid range for image display
        scaled = ((batch)*(127.5)).round().clamp(0,255).to(th.uint8).cpu()

        # Rearrange dimensions for image display
        reshaped = scaled.permute(2, 0, 3, 1).reshape([batch.shape[2], -1, 3])

        # Display the image
        display(Image.fromarray(reshaped.numpy()))
        
    def saveImages(self, dir, image_ids, images, brightness_factor=1.8):
        def pad_with_zeros(value, total_length):
            return "{:0>{}}".format(value, total_length)
        
        images = images * brightness_factor
        scaled = ((images)*(127.5)).round().clamp(0,255).to(th.uint8).cpu()

        # Iterate through each image and its corresponding ID in the batch
        for image_id, image in zip(image_ids, scaled):
            # Convert the PyTorch tensor to a PIL image
            image_pil = to_pil_image(image.cpu())

            # Save the image
            out = pad_with_zeros(image_id.item(), 12)
            image_pil.save(os.path.join(dir, str(out) + '.jpg'))


In [5]:
models = ['D:/Rutgers/Grad Courses/Natural Language Processing/Final Project - MiniGLIDE/GLIDE Local Recreation/GLIDE-Recreation/Evaluation/Evaluation Models/dhariwal_sbucaptions_100_epoch18_ILAB.pt',
          'D:/Rutgers/Grad Courses/Natural Language Processing/Final Project - MiniGLIDE/GLIDE Local Recreation/GLIDE-Recreation/Evaluation/Evaluation Models/model_Lsimple_sbucaptions_v3.pt',
          ]

In [6]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

has_cuda = th.cuda.is_available()
device = th.device('cpu' if not has_cuda else 'cuda')
print(device)

cuda


In [7]:
time_steps = 100
guidance_scale = 3 
mscoco = MSCOCODataset('D:/Rutgers/Grad Courses/Natural Language Processing/Final Project - MiniGLIDE/GLIDE Local Recreation/GLIDE-Recreation/Evaluation/MSCOCO/images/val2017', annFile)

loading annotations into memory...
Done (t=0.02s)
creating index...
index created!


In [9]:
# Load the model

options = model_and_diffusion_defaults()
options['use_fp16'] = False
options['diffusion_steps'] = time_steps # use 100 diffusion steps for fast sampling
options['num_channels'] = 96
options['num_head_channels'] = 32
options['num_res_blocks'] = 3
options['xf_width'] = 512
options['xf_layers'] = 16
options['xf_heads'] = 8
model, diffusion = create_model_and_diffusion(**options)
model.load_state_dict(th.load(models[1]))
model.eval()
model.to(device)
print('total base parameters', sum(x.numel() for x in model.parameters()))

total base parameters 156947494


In [10]:
# Load upsampling model
options_up = model_and_diffusion_defaults_upsampler()
options_up['use_fp16'] = has_cuda
options_up['timestep_respacing'] = 'fast27' # use 27 diffusion steps for very fast sampling
model_up, diffusion_up = create_model_and_diffusion(**options_up)
model_up.eval()
if has_cuda:
    model_up.convert_to_fp16()
model_up.to(device)
model_up.load_state_dict(th.load('D:/Rutgers/Grad Courses/Natural Language Processing/Final Project - MiniGLIDE/Git/MiniGLIDE/glide_model_cache/upsample.pt'))
print('total upsampler parameters', sum(x.numel() for x in model_up.parameters()))

total upsampler parameters 398361286


In [11]:
# Model function for classifier free sampling
def model_fn(x_t, ts, guidance_scale=guidance_scale, **kwargs):
        half = x_t[: len(x_t) // 2]
        combined = th.cat([half, half], dim=0)
        model_out = model(combined, ts, **kwargs)
        eps, rest = model_out[:, :3], model_out[:, 3:]
        cond_eps, uncond_eps = th.split(eps, len(eps) // 2, dim=0)
        half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
        eps = th.cat([half_eps, half_eps], dim=0)
        return th.cat([eps, rest], dim=1)

In [12]:
from tqdm import tqdm

In [14]:
batch_size = 16
save_dir = 'D:/Rutgers/Grad Courses/Natural Language Processing/Final Project - MiniGLIDE/GLIDE Local Recreation/GLIDE-Recreation/Evaluation/MSCOCO/images/model_out/miniGLIDEsimple'

dataloader = DataLoader(mscoco, batch_size=batch_size, shuffle=False)

for batch_idx, (image_ids, captions) in enumerate(tqdm(dataloader)):
    # print(image_ids)
    # print(captions)
    prompts = list(captions)

    # Create the text tokens to feed to the model.
    model_kwargs = get_model_kwargs_classifier_free(prompts, model, options, device)

    # Sample with classifier Free Guidance
    samples_CF = returnSample_CF(model, model_fn, model_kwargs, device, batch_size, options, diffusion, progress=False)[:batch_size]
    model_kwargs = get_model_kwargs_upsample(prompts, samples_CF, model, options, device)
    up_samples_CF = returnUpSample(model_up, diffusion_up, batch_size, device, model_kwargs, options_up, upsample_temp=1.0, cond_fn=None, progress=False)
    mscoco.saveImages(save_dir, image_ids, up_samples_CF)



100%|█████████▉| 312/313 [6:54:06<01:19, 79.64s/it]  


RuntimeError: The size of tensor a (32) must match the size of tensor b (16) at non-singleton dimension 0

In [15]:
import os
from pytorch_fid import fid_score
from PIL import Image

def resize_images(input_dir, output_dir, size=(256, 256)):
    os.makedirs(output_dir, exist_ok=True)
    for img_name in os.listdir(input_dir):
        img_path = os.path.join(input_dir, img_name)
        img = Image.open(img_path)
        img_resized = img.resize(size)
        img_resized.save(os.path.join(output_dir, img_name))

coco_dir = 'D:/Rutgers/Grad Courses/Natural Language Processing/Final Project - MiniGLIDE/GLIDE Local Recreation/GLIDE-Recreation/Evaluation/MSCOCO/images/val2017'
resized_coco_dir = 'D:/Rutgers/Grad Courses/Natural Language Processing/Final Project - MiniGLIDE/GLIDE Local Recreation/GLIDE-Recreation/Evaluation/MSCOCO/images/val2017_256'

# Resize the real images to the same size as generated images (256x256)
resize_images(coco_dir, resized_coco_dir)

In [16]:
def calculate_fid(directory1, directory2):
    device = 'cuda' if th.cuda.is_available() else 'cpu'
    fid_value = fid_score.calculate_fid_given_paths([directory1, directory2], 50, device, 2048)
    return fid_value


mini_GLIDE_simple_dir = 'D:/Rutgers/Grad Courses/Natural Language Processing/Final Project - MiniGLIDE/GLIDE Local Recreation/GLIDE-Recreation/Evaluation/MSCOCO/images/model_out/miniGLIDEsimple'

In [17]:
# Let's first calculate the fid score for miniGLIDEsimple
fid_value = calculate_fid(mini_GLIDE_simple_dir, resized_coco_dir)
print(f"FID score: {fid_value}")

Downloading: "https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth" to C:\Users\Aaron/.cache\torch\hub\checkpoints\pt_inception-2015-12-05-6726825d.pth


  0%|          | 0.00/91.2M [00:00<?, ?B/s]

100%|██████████| 100/100 [00:27<00:00,  3.64it/s]
100%|██████████| 100/100 [00:27<00:00,  3.62it/s]


FID score: 202.16732688719262


In [18]:
mini_GLIDE_dir = 'D:/Rutgers/Grad Courses/Natural Language Processing/Final Project - MiniGLIDE/GLIDE Local Recreation/GLIDE-Recreation/Evaluation/MSCOCO/images/model_out/iLab_epoch18'

# Now calculate the fid score for miniGLIDE
fid_value = calculate_fid(mini_GLIDE_dir, resized_coco_dir)
print(f"FID score: {fid_value}")

100%|██████████| 100/100 [00:28<00:00,  3.49it/s]
100%|██████████| 100/100 [00:13<00:00,  7.65it/s]


FID score: 145.76802162377993
