In [1]:
from PIL import Image
from torchvision.transforms.functional import pil_to_tensor
import torch

size = (512, 512) # - InceptionScore, FRECHET INCEPTION DISTANCE;
img_real = Image.open("/home/jovyan/DFBench/data/IMAGEs/MSCOCO/selected/000000377132.jpg")
img_real = img_real.resize(size, resample=Image.BILINEAR)
img_real_tensor = pil_to_tensor(img_real).type(torch.uint8)
img_real_tensor = torch.unsqueeze(img_real_tensor, 0)
img_real_1 = Image.open("/home/jovyan/DFBench/data/IMAGEs/MSCOCO/selected/000000274035.jpg")
img_real_1 = img_real_1.resize(size, resample=Image.BILINEAR)
img_real_1_tensor = pil_to_tensor(img_real_1).type(torch.uint8)
img_real_1_tensor = torch.unsqueeze(img_real_1_tensor, 0)
imgs_real = torch.cat((img_real_tensor, img_real_1_tensor), 0)
img_real_1 = Image.open("/home/jovyan/DFBench/data/IMAGEs/MSCOCO/selected/000000037552.jpg")
img_real_1 = img_real_1.resize(size, resample=Image.BILINEAR)
img_real_1_tensor = pil_to_tensor(img_real_1).type(torch.uint8)
img_real_1_tensor = torch.unsqueeze(img_real_1_tensor, 0)
imgs_real = torch.cat((imgs_real, img_real_1_tensor), 0)

img_fake = Image.open("/home/jovyan/DFBench/data/IMAGEs/MSCOCO/selected/000000472617.jpg")
img_fake = img_real.resize(size, resample=Image.BILINEAR)
img_fake_tensor = pil_to_tensor(img_fake).type(torch.uint8)
img_fake_tensor = torch.unsqueeze(img_fake_tensor, 0)
img_fake_1 = Image.open("/home/jovyan/DFBench/data/IMAGEs/MSCOCO/selected/000000346759.jpg")
img_fake_1 = img_fake_1.resize(size, resample=Image.BILINEAR)
img_fake_1_tensor = pil_to_tensor(img_fake_1).type(torch.uint8)
img_fake_1_tensor = torch.unsqueeze(img_fake_1_tensor, 0)
imgs_fake = torch.cat((img_fake_tensor, img_fake_1_tensor), 0)
img_fake_1 = Image.open("/home/jovyan/DFBench/data/IMAGEs/MSCOCO/selected/000000158028.jpg")
img_fake_1 = img_fake_1.resize(size, resample=Image.BILINEAR)
img_fake_1_tensor = pil_to_tensor(img_fake_1).type(torch.uint8)
img_fake_1_tensor = torch.unsqueeze(img_fake_1_tensor, 0)
imgs_fake = torch.cat((imgs_fake, img_fake_1_tensor), 0)

In [2]:
imgs_real.size(), imgs_fake.size()

(torch.Size([3, 3, 512, 512]), torch.Size([3, 3, 512, 512]))

In [9]:
# compute Inception Score
"""
Higher Inception Score indicates more varied and recognizable generated samples. 
Lower FID score means the distribution of generated data is closer to real data based on feature embeddings.
The inception score (IS) is a mathematical algorithm used to measure or determine the quality of images created by 
generative AI through a generative adversarial network (GAN). The word "inception" refers to the spark of creativity or 
initial beginning of a thought or action traditionally experienced by humans.
The score produced by the IS algorithm can range from zero (worst) to infinity (best).
"""

from torchmetrics.image.inception import InceptionScore
import torch

imgs = imgs.to(torch.device("cuda", 0))
inception = InceptionScore(compute_with_cache=False).to(torch.device("cuda", 0))
# generate some images
#imgs = torch.randint(0, 255, (100, 3, 299, 299), dtype=torch.uint8)

inception.update(imgs)
inception.compute()

(tensor(1., device='cuda:0'), tensor(0., device='cuda:0'))

In [5]:
# compute Frechet Inception Distance
"""
Fréchet inception distance (FID) is a metric for quantifying the realism and diversity of images generated by 
generative adversarial networks (GANs). Realistic could mean that generated images of people look like real images of people.
Unlike the earlier inception score (IS), which evaluates only the distribution of generated images, 
the FID compares the distribution of generated images with the distribution of a set of real images ("ground truth"). 
The FID metric does not completely replace the IS metric. Classifiers that achieve the best (lowest) FID score tend to have 
greater sample variety while classifiers achieving the best (highest) IS score tend to have better quality within individual 
images.
A lower FID indicates a better match between the generated images and the real images in terms of their visual quality 
and diversity.
"""

from torchmetrics.image.fid import FrechetInceptionDistance
import torch

fid = FrechetInceptionDistance(reset_real_features=False, compute_with_cache=False).to(torch.device("cuda", 0))

# generate two slightly overlapping image intensity distributions
# imgs_dist1 = torch.randint(0, 200, (100, 3, 299, 299), dtype=torch.uint8)
# imgs_dist2 = torch.randint(100, 255, (100, 3, 299, 299), dtype=torch.uint8)

imgs_real = imgs_real.to(torch.device("cuda", 0))
imgs_fake = imgs_fake.to(torch.device("cuda", 0))

fid.update(imgs_real, real=True)
fid.update(imgs_fake, real=False)
fid.compute()

tensor(377.5616, device='cuda:0')

In [None]:
# compute LEARNED PERCEPTUAL IMAGE PATCH SIMILARITY
"""
The Learned Perceptual Image Patch Similarity (LPIPS_) calculates perceptual similarity between two images.
LPIPS essentially computes the similarity between the activations of two image patches for some pre-defined network. 
This measure has been shown to match human perception well. A low LPIPS score means that image patches are perceptual similar.
Both input image patches are expected to have shape (N, 3, H, W). The minimum size of H, W depends on the chosen backbone 
(see net_type arg).
"""
from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
import torch

# net_type: 'vgg(16)', 'alex(net-owt)', 'squeeze'
lpips = LearnedPerceptualImagePatchSimilarity(net_type='squeeze', compute_with_cache=False).to(torch.device("cuda", 0)) 

# LPIPS needs the images to be in the [-1, 1] range.
img_real_1_tensor = img_real_1_tensor.to(torch.device("cuda", 0))
img_real_1_tensor = img_real_1_tensor.type(torch.float32) * 2 / 255 - 1
img_fake_1_tensor = img_fake_1_tensor.to(torch.device("cuda", 0))
img_fake_1_tensor = img_fake_1_tensor.type(torch.float32) * 2 / 255 - 1
# batch images imgs_real, imgs_fake also possible

lpips(img_real_1_tensor, img_fake_1_tensor)

In [2]:
# compute PEAK SIGNAL-TO-NOISE RATIO
"""
Peak signal-to-noise ratio (PSNR) is an engineering term for the ratio between the maximum possible power of a 
signal and the power of corrupting noise that affects the fidelity of its representation. Because many signals have a 
very wide dynamic range, PSNR is usually expressed as a logarithmic quantity using the decibel scale.
PSNR is commonly used to quantify reconstruction quality for images and video subject to lossy compression.
For color images with three RGB values per pixel, the definition of PSNR is the same except that the MSE is the sum over all 
squared value differences (now for each color, i.e. three times as many differences as in a monochrome image) divided by 
image size and by three. Alternately, for color images the image is converted to a different color space and PSNR is reported 
against each channel of that color space, e.g., YCbCr or HSL.
PSNR is most commonly used to measure the quality of reconstruction of lossy compression codecs (e.g., for image compression). 
The signal in this case is the original data, and the noise is the error introduced by compression. When comparing compression 
codecs, PSNR is an approximation to human perception of reconstruction quality.
Typical values for the PSNR in lossy image and video compression are between 30 and 50 dB, provided the bit depth is 8 bits, 
where higher is better. The processing quality of 12-bit images is considered high when the PSNR value is 60 dB or higher. 
For 16-bit data typical values for the PSNR are between 60 and 80 dB. Acceptable values for wireless transmission quality loss 
are considered to be about 20 dB to 25 dB.
In the absence of noise, the two images I and K are identical, and thus the MSE is zero. In this case the PSNR is infinite.
"""

from torchmetrics.image import PeakSignalNoiseRatio
import torch
# , reduction=None, dim=[1,2,3], data_range=255
psnr = PeakSignalNoiseRatio(compute_with_cache=False).to(torch.device("cuda", 0))

imgs_real = imgs_real.to(torch.device("cuda", 0))
imgs_fake = imgs_fake.to(torch.device("cuda", 0))
# img_real_1_tensor = img_real_1_tensor.to(torch.device("cuda", 0))
# img_fake_1_tensor = img_fake_1_tensor.to(torch.device("cuda", 0))

psnr(imgs_fake, imgs_real)

tensor(29.5404, device='cuda:0')

In [7]:
# compute STRUCTURAL SIMILARITY INDEX MEASURE
"""
The structural similarity index measure (SSIM) is a method for predicting the perceived quality of digital television and 
cinematic pictures, as well as other kinds of digital images and videos. It is also used for measuring the similarity between 
two images. The SSIM index is a full reference metric; in other words, the measurement or prediction of image quality is based 
on an initial uncompressed or distortion-free image as reference.
SSIM is a perception-based model that considers image degradation as perceived change in structural information, while also 
incorporating important perceptual phenomena, including both luminance masking and contrast masking terms. The difference with 
other techniques such as MSE or PSNR is that these approaches estimate absolute errors. Structural information is the idea 
that the pixels have strong inter-dependencies especially when they are spatially close. These dependencies carry important 
information about the structure of the objects in the visual scene. Luminance masking is a phenomenon whereby image distortions
(in this context) tend to be less visible in bright regions, while contrast masking is a phenomenon whereby distortions become 
less visible where there is significant activity or "texture" in the image.
This system calculates the Structural Similarity Index between 2 given images which is a value between -1 and +1. 
A value of +1 indicates that the 2 given images are very similar or the same while a value of -1 indicates the 2 given images 
are very different. Often these values are adjusted to be in the range [0, 1], e.g., in torchmetrics, where the extremes 
hold the same meaning.
"""

from torchmetrics.image import StructuralSimilarityIndexMeasure
import torch 

ssim = StructuralSimilarityIndexMeasure(compute_with_cache=False).to(torch.device("cuda", 0))

# imgs_real = imgs_real.to(torch.device("cuda", 0))
# imgs_real = imgs_real.type(torch.float32)
# imgs_fake = imgs_fake.to(torch.device("cuda", 0))
# imgs_fake = imgs_fake.type(torch.float32)
img_real_1_tensor = img_real_1_tensor.to(torch.device("cuda", 0))
img_real_1_tensor = img_real_1_tensor.type(torch.float32)
img_fake_1_tensor = img_fake_1_tensor.to(torch.device("cuda", 0))
img_fake_1_tensor = img_fake_1_tensor.type(torch.float32)

ssim(img_fake_1_tensor, img_real_1_tensor)

tensor(0.1832, device='cuda:0')

In [7]:
# compute CLIP score  hardly to use
"""
CLIP Score is a reference-free metric that can be used to evaluate the correlation between a generated caption for an 
image and the actual content of the image. It corresponds to the cosine similarity between visual CLIP embedding for an image 
and textual CLIP embedding for a caption. The score is bound between 0 and 100 and the closer to 100 the better.
"""

from PIL import Image
import torch
from torchmetrics.multimodal.clip_score import CLIPScore
from torchvision.transforms.functional import pil_to_tensor

prompt = "A bathroom, a white bathtub takes center stage, positioned directly "\
"beneath a mirror that stretches across the wall. The room is clad in gleaming white tiles, which reflect the soft light and "\
"create a sense of calm. A glass shower door adorns the bathtub. A few plush towels "\
"are carefully laid out nearby, with one draped casually over the side of the tub. "
# To complete the tranquil atmosphere, a mirror stands adjacent to the tub, its reflective surface bouncing light and creating the illusion of a more spacious area

img = Image.open("/home/jovyan/DFBench/data/IMAGEs/MSCOCO/selected/000000377132.jpg")
img_tensor = pil_to_tensor(img).type(torch.uint8)
img_tensor = torch.unsqueeze(img_tensor, 0).to(torch.device("cuda", 0))

metric = CLIPScore(model_name_or_path="openai/clip-vit-large-patch14", 
                   compute_with_cache=False).to(torch.device("cuda", 0))

score = metric(img_tensor, prompt)
score.detach()

In [3]:
# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,3"  # specify which GPU(s) to be used
# inside the code:
# device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
# device = torch.device("cuda:1,3" if torch.cuda.is_available() else "cpu")

import image_similarity_metrics as ism

device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
ism.compute_SSIM(imgs_real, imgs_fake, device)

tensor(0.4596, device='cuda:0')

In [None]:
from PIL import Image
import torch
import image_similarity_metrics as ism
from torchvision.transforms.functional import pil_to_tensor

prompt = "A bathroom, a white bathtub takes center stage, positioned directly "\
"beneath a mirror that stretches across the wall. The room is clad in gleaming white tiles, which reflect the soft light and "\
"create a sense of calm. A glass shower door adorns the bathtub. A few plush towels "\
"are carefully laid out nearby, with one draped casually over the side of the tub. "
# To complete the tranquil atmosphere, a mirror stands adjacent to the tub, its reflective surface bouncing light and creating the illusion of a more spacious area

img = Image.open("/home/jovyan/DFBench/data/IMAGEs/MSCOCO/selected/000000377132.jpg")
img_tensor = pil_to_tensor(img).type(torch.uint8)
img_tensor = torch.unsqueeze(img_tensor, 0)
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

ism.compute_CLIP_SCORE(prompt, img_tensor, device)