## Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
import time
import numpy as np
from torchvision.transforms import Compose, Resize, ToTensor
from torch.utils.data import Dataset, DataLoader
import os
from PIL import Image
import matplotlib.pyplot as plt
import sys
from torchvision.transforms import functional as TF
import torchvision.transforms as transforms
import random
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
import scipy.io as sio
import cv2
import cudacanvas
import torch.nn.functional as F
from skimage.metrics import peak_signal_noise_ratio as compare_psnr
from skimage.metrics import structural_similarity as compare_ssim
import tarfile


## Swift SRGAN model  

In [13]:
class SeperableConv2d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=1, bias=True):
        super(SeperableConv2d, self).__init__()
        self.depthwise = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size=kernel_size,
            stride = stride,
            groups=in_channels,
            bias=bias,
            padding=padding
        )
        self.pointwise = nn.Conv2d(
            in_channels,
            out_channels, 
            kernel_size=1,
            bias=bias
        )
    def forward(self, x):
        return self.pointwise(self.depthwise(x))
    

    
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, use_act=True, use_bn=True, discriminator=False, **kwargs):
        super(ConvBlock, self).__init__()
        
        self.use_act = use_act
        self.cnn = SeperableConv2d(in_channels, out_channels, **kwargs, bias=not use_bn)
        self.bn = nn.BatchNorm2d(out_channels) if use_bn else nn.Identity()
        self.act = nn.LeakyReLU(0.2, inplace=True) if discriminator else nn.PReLU(num_parameters=out_channels)
        
    def forward(self, x):
        return self.act(self.bn(self.cnn(x))) if self.use_act else self.bn(self.cnn(x))


class UpsampleBlock(nn.Module):
    def __init__(self, in_channels, scale_factor):
        super(UpsampleBlock, self).__init__()
        
        self.conv = SeperableConv2d(in_channels, in_channels * scale_factor**2, kernel_size=3, stride=1, padding=1)
        self.ps = nn.PixelShuffle(scale_factor) # (in_channels * 4, H, W) -> (in_channels, H*2, W*2)
        self.act = nn.PReLU(num_parameters=in_channels)
    
    def forward(self, x):
        return self.act(self.ps(self.conv(x)))
        

class ResidualBlock(nn.Module):
    def __init__(self, in_channels):
        super(ResidualBlock, self).__init__()
        
        self.block1 = ConvBlock(
            in_channels,
            in_channels,
            kernel_size=3,
            stride=1,
            padding=1
        )
        self.block2 = ConvBlock(
            in_channels,
            in_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            use_act=False
        )
        
    def forward(self, x):
        out = self.block1(x)
        out = self.block2(out)
        return out + x
    
    
class Generator(nn.Module):
    """Swift-SRGAN Generator
    Args:
        in_channels (int): number of input image channels.
        num_channels (int): number of hidden channels.
        num_blocks (int): number of residual blocks.
        upscale_factor (int): factor to upscale the image [2x, 4x, 8x].
    Returns:
        torch.Tensor: super resolution image
    """

    def __init__(self, in_channels: int = 3, num_channels: int = 64, num_blocks: int = 16, upscale_factor: int = 4):
        super(Generator, self).__init__()
        
        self.initial = ConvBlock(in_channels, num_channels, kernel_size=9, stride=1, padding=4, use_bn=False)
        self.residual = nn.Sequential(
            *[ResidualBlock(num_channels) for _ in range(num_blocks)]
        )
        self.convblock = ConvBlock(num_channels, num_channels, kernel_size=3, stride=1, padding=1, use_act=False)
        self.upsampler = nn.Sequential(
            *[UpsampleBlock(num_channels, scale_factor=2) for _ in range(upscale_factor//2)]
        )
        self.final_conv = SeperableConv2d(num_channels, in_channels, kernel_size=9, stride=1, padding=4)
        
    def forward(self, x):
        initial = self.initial(x)
        x = self.residual(initial)
        x = self.convblock(x) + initial
        x = self.upsampler(x)
        return (torch.tanh(self.final_conv(x)) + 1) / 2


class Discriminator(nn.Module):
    """Swift-SRGAN Discriminator
    Args:
        in_channels (int): number of input image channels.
        features (tuple): sequence of hidden channels.
    Returns:
        torch.Tensor
    """

    def __init__(
        self,
        in_channels: int = 3,
        features: tuple = (64, 64, 128, 128, 256, 256, 512, 512),
    ) -> None:
        super(Discriminator, self).__init__()

        blocks = []
        for idx, feature in enumerate(features):
            blocks.append(
                ConvBlock(
                    in_channels,
                    feature,
                    kernel_size=3,
                    stride=1 + idx % 2,
                    padding=1,
                    discriminator=True,
                    use_act=True,
                    use_bn=False if idx == 0 else True,
                )
            )
            in_channels = feature

        self.blocks = nn.Sequential(*blocks)
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((6, 6)),
            nn.Flatten(),
            nn.Linear(512 * 6 * 6, 1024),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(1024, 1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.blocks(x)
        return torch.sigmoid(self.classifier(x))

upscale_factor = 2
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Generator(upscale_factor=upscale_factor).to(device)

# Load model weigths
# torch.load("swift_srgan_4x")
    
model.load_state_dict(torch.load("swift_srgan_2x.pth.tar")['model'])
model.eval()
# Print number of parameters

Generator(
  (initial): ConvBlock(
    (cnn): SeperableConv2d(
      (depthwise): Conv2d(3, 3, kernel_size=(9, 9), stride=(1, 1), padding=(4, 4), groups=3)
      (pointwise): Conv2d(3, 64, kernel_size=(1, 1), stride=(1, 1))
    )
    (bn): Identity()
    (act): PReLU(num_parameters=64)
  )
  (residual): Sequential(
    (0): ResidualBlock(
      (block1): ConvBlock(
        (cnn): SeperableConv2d(
          (depthwise): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
          (pointwise): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        )
        (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): PReLU(num_parameters=64)
      )
      (block2): ConvBlock(
        (cnn): SeperableConv2d(
          (depthwise): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
          (pointwise): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=Fa

## Handle Data

### Function to get cursor position

In [3]:
# Mouse callback function to update cursor position
def update_cursor_position(event, x, y, flags, param):
    global cursor_x, cursor_y
    if event == cv2.EVENT_MOUSEMOVE:
        cursor_x, cursor_y = x, y

In [16]:
# Define transform to be applied to frames
# Currently only transforms.ToTensor()
transform = transforms.Compose([
    transforms.ToTensor(),
])

def upscale_video(video_path, model, transform = None, out_video_path = None, evaluate_mode = False):
    """
    Upscale a video 3x by using bicubic plus plus model
    - video_path: path to the video
    - model: bicubic plus plus model
    - transform: transform to be applied to frames
    - out_video_path: path to the output video (only if evaluate_mode is True)
    - evaluate_mode: if True, model is used to write the video to storage so that it can be evaluated.
    - evaluate mode is slow due to GPu -> CPU transfer overhead

    - Output: Upscaled Video feed or File storage if in evaluate mode 
    """
    # Set CUDA device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load video file
    cap = cv2.VideoCapture(video_path)

    # Get video properties
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    
    # Define codec and VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    output_video = cv2.VideoWriter(out_video_path, fourcc, fps, (frame_width*upscale_factor, frame_height*upscale_factor))


    # Set up cudacanvas window for renders
    if not evaluate_mode:
        white_screen = torch.ones((3, int(frame_height/6*upscale_factor), int(frame_width/6*upscale_factor))).to(device)
        cudacanvas.set_image(white_screen)
        cudacanvas.create_window()

    # Process each frame
    while cap.isOpened():
        elapsed_time = 0
        torch.cuda.synchronize()
        start_time = time.time()
        # Read frame
        ret, frame = cap.read()
        if not ret:
            break
        
        frame_downsampled = cv2.resize(frame, (int(frame.shape[1] / 6), int(frame.shape[0] / 6))) 
        
        # Convert frame to RGB and apply transform
        frame_rgb = frame_downsampled
        frame_rgb = cv2.cvtColor(frame_rgb, cv2.COLOR_BGR2RGB)
        frame_tensor = transform(frame_rgb).unsqueeze(0).to(device)

        frame_bicubic = cv2.resize(frame_rgb, (frame_downsampled.shape[1] * upscale_factor, frame_downsampled.shape[0] * upscale_factor))
        # frame_bicubic = cv2.cvtColor(frame_bicubic, cv2.COLOR_BGR2RGB)
        frame_bicubic = transform(frame_bicubic).unsqueeze(0).to(device)

        # Perform super-resolution on the frame
        with torch.no_grad():
            upscaled_frame_tensor = model(frame_tensor)
            upscaled_frame_tensor = torch.clamp(upscaled_frame_tensor, 0, 1)
            # print(frame_downsampled.shape, frame_bicubic.shape, frame_tensor.shape, upscaled_frame_tensor.shape)
        
        # print(frame_downsampled.shape, frame_bicubic.shape, frame_tensor.shape, upscaled_frame_tensor.shape)
        
        # Write the upscaled frame to output video file if in evaluate mode
        if evaluate_mode:
            # Convert tensor back to numpy array
            upscaled_frame = (upscaled_frame_tensor.squeeze().cpu().numpy().transpose(1, 2, 0) * 255).astype(np.uint8)
            upscaled_frame = cv2.cvtColor(upscaled_frame, cv2.COLOR_RGB2BGR)
            # cv2.imshow('Upscaled Frame', upscaled_frame)
            # # Write upscaled frame to output video
            output_video.write(upscaled_frame)
        
        # If no evaluation, just output using cudacanvas
        else:
            cudacanvas.render()
            # Concatenate frame and upscaled frame
            combine = torch.concat([frame_bicubic, upscaled_frame_tensor], dim = -1)
            cudacanvas.set_image(upscaled_frame_tensor.squeeze())
            if cudacanvas.should_close():
                break
        
        
        cv2.waitKey(1)
        if evaluate_mode:
            if cv2.waitKey(0) & 0xFF == ord('q'):
                break

    # Release resources
    cap.release()
    output_video.release()
    cv2.destroyAllWindows()

# testing
video_path = 'test_videos/4K ULtra HD ｜ SAMSUNG UHD Demo׃ LED TV [R3GfuzLMPkA].mp4' 
upscale_video(video_path, model, transform = transform, evaluate_mode = False)

## Calculate average FPS for 1000 frames

In [12]:
noise = torch.randn(1, 3, 180, 270).to(device)
times = []
for i in tqdm(range(10)):
  torch.cuda.synchronize()
  start = time.time()
  with torch.no_grad():
    pred = model(noise)
    pred = torch.clamp(pred, 0, 1)
  torch.cuda.synchronize()
  end = time.time() - start
  times.append(end)

# plt.imshow(pred.squeeze().cpu().numpy().transpose(1, 2, 0))
avg_time = np.mean(times)

print("Input frame size =", noise.shape)
print("Output frame size =", pred.shape)
print("Average Time per frame =", 1000*avg_time, "ms")
print("Average FPS =", 1/avg_time, "FPS")

100%|██████████| 10/10 [00:01<00:00,  7.51it/s]

Input frame size = torch.Size([1, 3, 180, 270])
Output frame size = torch.Size([1, 3, 720, 1080])
Average Time per frame = 130.43982982635498 ms
Average FPS = 7.666370013907768 FPS





# Get evaluation metrics

## Performance Timings
- Uncomment each line to test individual transfer latencies

In [6]:
import torch
import time

tot_time = 0

# input tensor (360p)
frame_tensor_cpu = torch.randn(1,3,960,720)
frame_tensor_gpu = frame_tensor_cpu.to('cuda')
frame_tensor_cpu_3x = torch.randn(1,3,2160,3840)
frame_tensor_gpu_3x = frame_tensor_cpu_3x.to('cuda')
frame_tensor_shared = frame_tensor_cpu_3x.pin_memory()
shared_ref = torch.zeros(1,3,2160,3840).pin_memory()

for i in tqdm(range(100)):
    st = time.time()
    # frame_tensor_cpu.to('cuda') # 1.4 ms - normal cpu to gpu / 24.240
    # frame_tensor_shared.to('cuda') # 9 ms - pinned cpu to gpu
    # shared_ref[:] = frame_tensor_cpu_3x # 14.26 ms - normal cpu to pinned cpu
    # shared_ref[:] = frame_tensor_gpu_3x # 8.07 ms - gpu to pinned cpu
    # shared_ref.to('cpu') # 0.001 ms
    frame_tensor_gpu_3x.to('cpu') # 16.12 ms - gpu to normal cpu
    
    tot_time += time.time() - st
print('Average time (ms):', tot_time/i*1000)    
    

100%|██████████| 10/10 [00:00<00:00, 66.76it/s]

Average time (ms): 15.644576814439562





## Quality Metrics
- PSNR
- SSIM
- Inference Time

In [17]:
transform = transforms.Compose([
    transforms.ToTensor(),
])

def evaluate_model(video_path, model, crop_size = [1920, 1080], upscale_factor = 3, n_samples = 10, transform = None):
    """
    - This function samples a random area of 'crop_size' from the video. 
    - This is then downsampled by 3x and then upscaled back to 'crop_size'
    - This is then compared to the original cropped frame
    - The baseline is the regular bicubic upsampling

    Inputs:
    - video_path: path to the test 4k video
    - model: Super res model
    - n_samples: number of samples to evaluate
    - transform: transform to use

    Outputs:
    - PSNR: Average Peak Signal to Noise Ratio
    - SSIM: Average Structural Similarity Index
    - infer_time: Average inference time
    """
    # Get device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load video
    cap = cv2.VideoCapture(video_path)

    # Get video properties
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    print("Video properties:")
    print("Frame width:", frame_width)
    print("Frame height:", frame_height)
    print("FPS:", fps)

    # Read and process n_samples of frames
    n = 0
    psnr_bicubic_list = []
    psnr_sr_list = []
    ssim_bicubic_list = []
    ssim_sr_list = []
    infer_time_bicubic_list = []
    infer_time_sr_list = []

    while True and n < n_samples:
        # Read frame 
        ret, frame = cap.read()
        
        if np.amax(frame) == 0 and np.amin(frame) == 0:
            continue
        n += 1
        # if n == 1:
        #     continue
        if not ret:
            break

        crop_width, crop_height = crop_size
        # Get cropped region of frame
        x_start = random.randint(0, frame_width - crop_width)
        y_start = random.randint(0, frame_height - crop_height)
        x_end = x_start + crop_width
        y_end = y_start + crop_height

        

        # Crop the frame
        cropped_frame = frame[y_start:y_end, x_start:x_end]

        # Downsample frame
        downsampled_frame = cv2.resize(cropped_frame, (int(cropped_frame.shape[1] / upscale_factor), int(cropped_frame.shape[0] / upscale_factor)))

        # Upsample frame
        torch.cuda.synchronize()
        time.time()
        upscaled_frame_bicubic = cv2.resize(downsampled_frame, None, fx=upscale_factor, fy=upscale_factor, interpolation=cv2.INTER_CUBIC)
        torch.cuda.synchronize()
        infer_time_bicubic = time.time() - time.time()
        # print(frame.shape, cropped_frame.shape, downsampled_frame.shape, upscaled_frame_bicubic.shape)

        # Convert frame to tensor
        frame_rgb = cv2.cvtColor(downsampled_frame, cv2.COLOR_BGR2RGB)
        frame_tensor = transform(frame_rgb).unsqueeze(0).to(device)

        # Perform super-resolution on the frame
        torch.cuda.synchronize()
        start = time.time()
        with torch.no_grad():
            upscaled_frame_sr = model(frame_tensor)
            upscaled_frame_sr = torch.clamp(upscaled_frame_sr, 0, 1)
        torch.cuda.synchronize()
        infer_time_sr = time.time() - start

        # Convert tensor back to numpy array
        upscaled_frame_sr = (upscaled_frame_sr.squeeze().cpu().numpy().transpose(1, 2, 0))# * 255).astype(np.uint8)
        upscaled_frame_sr = cv2.cvtColor(upscaled_frame_sr, cv2.COLOR_RGB2BGR)
        
        # Calculate PSNR
        psnr_bicubic = compare_psnr(cropped_frame, upscaled_frame_bicubic)
        psnr_sr = compare_psnr(cropped_frame/255, upscaled_frame_sr)

        # Calculate SSIM
        ssim_bicubic = compare_ssim(cropped_frame, upscaled_frame_bicubic, channel_axis=-1, data_range=1, multichannel=True)
        ssim_sr = compare_ssim(cropped_frame/255, upscaled_frame_sr, channel_axis=-1, data_range=1, multichannel=True)

        print(psnr_bicubic, psnr_sr, ssim_bicubic, ssim_sr, infer_time_bicubic, infer_time_sr)

        psnr_bicubic_list.append(psnr_bicubic)
        psnr_sr_list.append(psnr_sr)
        ssim_bicubic_list.append(ssim_bicubic)
        ssim_sr_list.append(ssim_sr)
        infer_time_bicubic_list.append(infer_time_bicubic)
        infer_time_sr_list.append(infer_time_sr)

        # Display the upscaled frame by concatenating the original and upscaled frames
        # top = np.concatenate((cropped_frame/255, cropped_frame/255), axis=1)
        # bottom = np.concatenate((upscaled_frame_bicubic/255, upscaled_frame_sr), axis=1)
        # combined = np.concatenate((top, bottom), axis=0)
        # cv2.imshow('frame', combined)
        
        # # cv2.imshow('frame', upscaled_frame_sr)
        # if cv2.waitKey(1) & 0xFF == ord('q'):
        #     break
        

    # Calculate average PSNR and SSIM only for values that are not Nan or Inf
    psnr_bicubic_list = [psnr for psnr in psnr_bicubic_list if not np.isnan(psnr) and not np.isinf(psnr)]
    psnr_sr_list = [psnr for psnr in psnr_sr_list if not np.isnan(psnr) and not np.isinf(psnr)]
    ssim_bicubic_list = [ssim for ssim in ssim_bicubic_list if not np.isnan(ssim) and not np.isinf(ssim)]
    ssim_sr_list = [ssim for ssim in ssim_sr_list if not np.isnan(ssim) and not np.isinf(ssim)]
    infer_time_bicubic_list = [infer_time for infer_time in infer_time_bicubic_list if not np.isnan(infer_time) and not np.isinf(infer_time)]
    infer_time_sr_list = [infer_time for infer_time in infer_time_sr_list if not np.isnan(infer_time) and not np.isinf(infer_time)]

    avg_psnr_bicubic = np.mean(psnr_bicubic_list)
    avg_psnr_sr = np.mean(psnr_sr_list)
    avg_ssim_bicubic = np.mean(ssim_bicubic_list)
    avg_ssim_sr = np.mean(ssim_sr_list)
    avg_infer_time_bicubic = np.mean(infer_time_bicubic_list)
    avg_infer_time_sr = np.mean(infer_time_sr_list)


    return avg_psnr_bicubic, avg_psnr_sr, avg_ssim_bicubic, avg_ssim_sr, avg_infer_time_bicubic, avg_infer_time_sr


video_path = "./test_videos/4K ULtra HD ｜ SAMSUNG UHD Demo׃ LED TV [R3GfuzLMPkA].mp4"
avg_psnr_bicubic, avg_psnr_sr, avg_ssim_bicubic, avg_ssim_sr, avg_infer_time_bicubic, avg_infer_time_sr = evaluate_model(video_path, model, transform=transform, n_samples=5
                                                                                                                         , crop_size=[1920,1080], upscale_factor=2)        
print("Average PSNR (bicubic):", avg_psnr_bicubic)
print("Average PSNR (SR):", avg_psnr_sr)
print("Average SSIM (bicubic):", avg_ssim_bicubic)
print("Average SSIM (SR):", avg_ssim_sr)
print("Average inference time (bicubic):", avg_infer_time_bicubic)
print("Average inference time (SR):", avg_infer_time_sr)

Video properties:
Frame width: 3840
Frame height: 2160
FPS: 30.00101354775499


  psnr_sr = compare_psnr(cropped_frame/255, upscaled_frame_sr)


49.06864156491552 45.62586493401043 0.8491064791095337 0.9779791921613245 0.0 0.8649909496307373
47.793950782489816 43.3488291220111 0.8788903816684389 0.9690509909448076 0.0 0.62111496925354
52.567150326890264 48.10033902759245 0.8416854126756786 0.9776448964695726 0.0 0.6278104782104492
46.97751226341669 42.23897639133946 0.8355417269577662 0.9458024114258546 0.0 0.6323878765106201
50.427247804790966 46.697468405610195 0.8378131971088335 0.9831677276174764 0.0 0.6334424018859863
Average PSNR (bicubic): 49.36690054850065
Average PSNR (SR): 45.202295576112725
Average SSIM (bicubic): 0.8486074395040502
Average SSIM (SR): 0.9707290437238072
Average inference time (bicubic): 0.0
Average inference time (SR): 0.6759493350982666


# Get sample image for report

In [8]:
img_path = "./test_videos/2022-Formula1-Aston-Martin-AMR22-006-2160.jpg"
img_path = "test_videos/4K ULtra HD ｜ SAMSUNG UHD Demo׃ LED TV [R3GfuzLMPkA].mp4"
# image = cv2.imread(img_path)
cap = cv2.VideoCapture(img_path)

cursor_x = -1
cursor_y = -1
drawing = False

# Mouse callback function
def mouse_callback(event, x, y, flags, param):
    global cursor_x, cursor_y
    if event == cv2.EVENT_MOUSEMOVE:
        cursor_x = x
        cursor_y = y
    elif event == cv2.EVENT_LBUTTONDOWN:
        drawing = True

cv2.namedWindow('BBoxWindow')
cv2.namedWindow('Cropped frame')
# cv2.namedWindow("SR Window")
cv2.setMouseCallback('BBoxWindow', mouse_callback)

bb_size = [270,180]
cropped_image_bicubic = np.zeros((bb_size[1]*4, bb_size[0]*4, 3))
combined = np.zeros((bb_size[1]*4*2, bb_size[0]*4, 3))
upscaled_frame_sr = np.zeros((bb_size[1]*4, bb_size[0]*4, 3))
while True:
    ret, image = cap.read()
    # image = cv2.resize(image, (1920,1080))
    if not ret:
        break
    # Draw bounding box based on cursor position
    if cursor_x != -1 and cursor_y != -1:
        # top_left = (max(0,cursor_x - bb_size[0]//2), max(0,cursor_y - bb_size[1]//2))
        # bottom_right = (min(image.shape[0],cursor_x + bb_size[0]//2), min(image.shape[1], cursor_y + bb_size[1]//2))
        
        top_left = (cursor_x - bb_size[0]//2, cursor_y - bb_size[1]//2)
        bottom_right = (cursor_x + bb_size[0]//2 , cursor_y + bb_size[1]//2)
        
        # crop image only for the bounding box region
        if top_left[0] >= 0 and top_left[1] >= 0 and bottom_right[0] < image.shape[1] and bottom_right[1] < image.shape[0]:
            cropped_image = image[top_left[1]:bottom_right[1], top_left[0]:bottom_right[0]]
            cropped_image_bicubic = cv2.resize(cropped_image, None, fx=4, fy=4, interpolation=cv2.INTER_CUBIC)

        # Convert frame to tensor
        frame_rgb = cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB)
        frame_tensor = transform(frame_rgb).unsqueeze(0).to(device)

        with torch.no_grad():
            upscaled_frame_sr = model(frame_tensor)
            upscaled_frame_sr = torch.clamp(upscaled_frame_sr, 0, 1)

        # Convert tensor back to numpy array
        upscaled_frame_sr = (upscaled_frame_sr.squeeze().cpu().numpy().transpose(1, 2, 0))# * 255).astype(np.uint8)
        upscaled_frame_sr = cv2.cvtColor(upscaled_frame_sr, cv2.COLOR_RGB2BGR)

        # Stack horizontally
        combined = np.concatenate((cropped_image_bicubic/255, upscaled_frame_sr), axis=1)
        
        # image_copy = image.copy()
        cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 2)
        

        

    # Display the image
    cv2.imshow('BBoxWindow', image)
    cv2.imshow('Cropped frame', combined)
    cv2.imshow("SR Window", upscaled_frame_sr)
    

    # Break the loop if 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
cv2.destroyAllWindows()

In [9]:
img_path = "./test_videos/2022-Formula1-Aston-Martin-AMR22-006-2160.jpg"

image = cv2.imread(img_path)
cursor_x = -1
cursor_y = -1
save = False

# Mouse callback function
def mouse_callback(event, x, y, flags, param):
    global cursor_x, cursor_y
    if event == cv2.EVENT_MOUSEMOVE:
        cursor_x = x
        cursor_y = y
    elif event == cv2.EVENT_LBUTTONDOWN:
        save = True

cv2.namedWindow('BBoxWindow')
cv2.namedWindow('Result frame')
# cv2.namedWindow("SR Window")
cv2.setMouseCallback('BBoxWindow', mouse_callback)

while True:
    image = cv2.imread(img_path)
    
    # Draw bounding box based on cursor position
    if cursor_x != -1 and cursor_y != -1:
        # top_left = (max(0,cursor_x - bb_size[0]//2), max(0,cursor_y - bb_size[1]//2))
        # bottom_right = (min(image.shape[0],cursor_x + bb_size[0]//2), min(image.shape[1], cursor_y + bb_size[1]//2))
        
        top_left = (cursor_x - bb_size[0]//2, cursor_y - bb_size[1]//2)
        bottom_right = (cursor_x + bb_size[0]//2 , cursor_y + bb_size[1]//2)
        
        # crop image only for the bounding box region
        if top_left[0] >= 0 and top_left[1] >= 0 and bottom_right[0] < image.shape[1] and bottom_right[1] < image.shape[0]:
            cropped_image = image[top_left[1]:bottom_right[1], top_left[0]:bottom_right[0]]
            cropped_image_bicubic = cv2.resize(cropped_image, None, fx=4, fy=4, interpolation=cv2.INTER_CUBIC)

        # Convert frame to tensor
        frame_rgb = cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB)
        frame_tensor = transform(frame_rgb).unsqueeze(0).to(device)

        with torch.no_grad():
            upscaled_frame_sr = model(frame_tensor)
            upscaled_frame_sr = torch.clamp(upscaled_frame_sr, 0, 1)

        # Convert tensor back to numpy array
        upscaled_frame_sr = (upscaled_frame_sr.squeeze().cpu().numpy().transpose(1, 2, 0))# * 255).astype(np.uint8)
        upscaled_frame_sr = cv2.cvtColor(upscaled_frame_sr, cv2.COLOR_RGB2BGR)

        # Stack horizontally
        combined = np.concatenate((cropped_image_bicubic/255, upscaled_frame_sr), axis=1)
        
        # image_copy = image.copy()
        cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 2)
        

        

    # Display the image
    cv2.imshow('BBoxWindow', image)
    cv2.imshow('Cropped frame', combined)

    # Break the loop if 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
cv2.destroyAllWindows()
    




In [24]:
# img_path = "test_videos/test_image.jpg"
img_path = "test_videos/input_image.png"
image = cv2.imread(img_path)

# Get ROI
# For basket ball image
# cursor_x = 877
# cursor_y = 202

# bb_size = [540,360]

# For Bird image
cursor_x = 650
cursor_y = 252

bb_size = [135,90]

top_left = (cursor_x - bb_size[0]//2, cursor_y - bb_size[1]//2)
bottom_right = (cursor_x + bb_size[0]//2 , cursor_y + bb_size[1]//2)

cropped_image = image[top_left[1]:bottom_right[1], top_left[0]:bottom_right[0]]
cropped_image_bicubic = cv2.resize(cropped_image, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

# Convert frame to tensor
frame_rgb = cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB)
frame_tensor = transform(frame_rgb).unsqueeze(0).to(device)

with torch.no_grad():
    upscaled_frame_sr = model(frame_tensor)
    upscaled_frame_sr = torch.clamp(upscaled_frame_sr, 0, 1)

# Convert tensor back to numpy array
upscaled_frame_sr = (upscaled_frame_sr.squeeze().cpu().numpy().transpose(1, 2, 0))# * 255).astype(np.uint8)
upscaled_frame_sr = cv2.cvtColor(upscaled_frame_sr, cv2.COLOR_RGB2BGR)

# image_copy = image.copy()
cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 2)

cv2.imwrite("Results/bird_frame.jpg", image)
cv2.imwrite("Results/bird_cropped_frame.jpg", cropped_image)
# cv2.imwrite("Results/bird_bicubic_frame.jpg", (cropped_image_bicubic).astype(np.uint8))
cv2.imwrite("Results/bird_upscaled_frame_sr_swift_sr_gan.jpg", (upscaled_frame_sr*255).astype(np.uint8))

while True:
    # Display the imageq
    cv2.imshow('BBoxWindow', image)
    cv2.imshow('Result frame', upscaled_frame_sr)
    cv2.imshow('Bicubic frame', cropped_image_bicubic)

    

#     # Break the loop if 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
cv2.destroyAllWindows()
