In [None]:
# %pip install numpy matplotlib scipy pandas scikit-learn seaborn statsmodels torch torchvision transformers opencv-python Pillow

In [None]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import math
import time
import scipy
import cv2
from PIL import Image
import random
import os
import warnings
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer
import torchvision
from torchvision import transforms
from torchvision.models import resnet50
from transformers import TransformerEncoder, TransformerDecoder
from transformers import TransformerEncoderLayer, TransformerDecoderLayer

In [None]:
class CrowdDataset(Dataset):
    """
    Dataset for crowd counting that loads an image together with its point annotations
    and applies phase-dependent preprocessing.

    Annotation arrays are handled as (N, 2) arrays of (x, y) coordinates: every helper
    indexes column 0 as x and column 1 as y. Helpers never modify the array passed in;
    they always return a new floating-point array.

    Attributes:
        image_paths (list): A list of paths to the image files.
        annotation_paths (list): A list of paths to the annotation files (loaded with np.load).
        crop_size (tuple): The target size (height, width) for random cropping.
        scale (tuple): The range (min, max) of scaling factors for random scaling.
        flip_prob (float): Probability of horizontally flipping the image.
        max_size (int): Maximum size of the largest image side during testing.
        phase (str): The current phase of the dataset, 'train' or 'test'.
    """
    def __init__(self, image_paths, annotation_paths, crop_size, scale, flip_prob, max_size, phase):
        """
        Stores the configuration and builds the tensor conversion / normalization pipeline.

        Parameters:
            image_paths (list of str): Paths to the image files.
            annotation_paths (list of str): Paths to the annotation files.
            crop_size (tuple of int): Desired output size of the cropped images as (height, width).
            scale (tuple of float): The range (min, max) of scaling factors for image scaling.
            flip_prob (float): Probability of flipping the image horizontally during augmentation.
            max_size (int): Maximum size of the images during testing for resizing.
            phase (str): 'train' enables crop/scale/flip, 'test' enables resizing only.
                         Any other value skips geometric preprocessing entirely.
        """
        # Paths to the images and their annotations (paired by index)
        self.image_paths = image_paths
        self.annotation_paths = annotation_paths

        # Augmentation / preprocessing parameters
        self.crop_size = crop_size
        self.scale = scale
        self.flip_prob = flip_prob
        self.max_size = max_size
        self.phase = phase

        # Convert to tensor, then normalize with the ImageNet channel statistics
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        """
        Returns the total number of images in the dataset.

        Returns:
            int: The number of entries in image_paths.
        """
        return len(self.image_paths)

    def __getitem__(self, idx):
        """
        Loads the image and annotations at `idx` and applies the preprocessing of the current phase.

        Parameters:
            idx (int): The index of the image and annotations to retrieve from the dataset.

        Returns:
            tuple: The normalized image tensor and the points as a float32 tensor of shape (N, 2).
        """
        # Open the image and force three RGB channels
        image = Image.open(self.image_paths[idx]).convert("RGB")
        # Load the annotation points and normalize them to a float (N, 2) array
        points = self._as_points(np.load(self.annotation_paths[idx]))

        if self.phase == 'train':
            # Training: random crop, then random zoom, then random horizontal flip
            image, points = self.random_crop(image, points, self.crop_size)
            image, points = self.random_scale(image, points, self.scale)
            image, points = self.random_flip(image, points, self.flip_prob)
        elif self.phase == 'test':
            # Testing: only bound the image size
            image, points = self.resize_image(image, points, self.max_size)

        # Tensor conversion and normalization
        image = self.transform(image)

        return image, torch.tensor(points, dtype=torch.float32)

    @staticmethod
    def _as_points(points):
        """Return a float64 copy of `points`; an empty input becomes shape (0, 2) so column indexing works."""
        pts = np.array(points, dtype=np.float64)
        if pts.size == 0:
            return pts.reshape(0, 2)
        return pts

    @staticmethod
    def _keep_inside(points, width, height):
        """Keep only the points with 0 <= x < width and 0 <= y < height."""
        inside = (
            (points[:, 0] >= 0) & (points[:, 1] >= 0)
            & (points[:, 0] < width) & (points[:, 1] < height)
        )
        return points[inside]

    def random_crop(self, image, points, crop_size):
        """
        Randomly crops the image to `crop_size` and moves the points into the crop's coordinate system.

        If the image is not larger than the crop along an axis, the offset on that axis is 0
        and the crop box extends past the image edge.

        Parameters:
            image (PIL.Image): The image to be cropped.
            points (numpy.ndarray): Array of annotation points associated with the image.
            crop_size (tuple of int): The desired (height, width) of the crop.

        Returns:
            tuple: The cropped image and the points that fall inside the crop.
        """
        w, h = image.size
        new_h, new_w = crop_size

        # np.random.randint excludes its upper bound, so "+ 1" keeps the last valid
        # offset (crop flush with the bottom/right edge) reachable.
        top = int(np.random.randint(0, h - new_h + 1)) if h > new_h else 0
        left = int(np.random.randint(0, w - new_w + 1)) if w > new_w else 0

        image = image.crop((left, top, left + new_w, top + new_h))
        # Shift points by the crop offset, then drop the ones outside the crop window
        points = self._as_points(points) - [left, top]

        return image, self._keep_inside(points, new_w, new_h)

    def random_scale(self, image, points, scale_range):
        """
        Randomly zooms the image and applies the same transform to the annotation points.

        The image is zoomed with an affine transform, which keeps the canvas size and scales
        about the image centre. The points are therefore scaled about that same centre, and
        points pushed outside the canvas are discarded.

        Parameters:
            image (PIL.Image): The image to be scaled.
            points (numpy.ndarray): Array of annotation points associated with the image.
            scale_range (tuple of float): A tuple containing the minimum and maximum scaling factors.

        Returns:
            tuple: The scaled image (same size as the input) and the transformed, in-bounds points.
        """
        scale_factor = random.uniform(*scale_range)

        # Pure zoom: no rotation, translation or shear
        image = transforms.functional.affine(image, angle=0, translate=(0, 0), scale=scale_factor, shear=0)

        # Mirror the image transform on the points: p' = c + s * (p - c), c = image centre
        w, h = image.size
        center = np.array([w * 0.5, h * 0.5])
        points = (self._as_points(points) - center) * scale_factor + center

        return image, self._keep_inside(points, w, h)

    def random_flip(self, image, points, prob):
        """
        Horizontally flips the image with probability `prob` and mirrors the x coordinates of the points.

        Parameters:
            image (PIL.Image): The image to potentially flip.
            points (numpy.ndarray): Array of annotation points associated with the image.
            prob (float): Probability of flipping the image horizontally.

        Returns:
            tuple: The possibly flipped image and the correspondingly adjusted points.
        """
        # Work on a copy so the caller's array is never modified
        points = self._as_points(points)

        if random.random() < prob:
            image = transforms.functional.hflip(image)
            w, _ = image.size
            # Reflect the x coordinates across the image width
            points[:, 0] = w - points[:, 0]

        return image, points

    def resize_image(self, image, points, max_size):
        """
        Shrinks the image so that its largest side does not exceed `max_size`
        and rescales the annotation points to the resized image.

        Images that already fit are returned unchanged.

        Parameters:
            image (PIL.Image): The image to be resized.
            points (numpy.ndarray): Array of annotation points associated with the image.
            max_size (int): The maximum size that the image's largest dimension should not exceed.

        Returns:
            tuple: The resized image and the adjusted points.
        """
        points = self._as_points(points)
        w, h = image.size

        if max(h, w) > max_size:
            ratio = max_size / max(h, w)
            # Guard against a zero-sized side for extreme aspect ratios
            new_h = max(1, int(h * ratio))
            new_w = max(1, int(w * ratio))
            image = transforms.functional.resize(image, (new_h, new_w))
            # Use the per-axis factors actually applied (the sizes above are truncated to ints)
            points = points * [new_w / w, new_h / h]

        return image, points

In [None]:
def create_resnet_backbone(pretrained=True):
    """
    Builds a ResNet50 feature extractor by dropping the last two child modules
    (the average pooling layer and the fully connected classifier) of the stock model.

    Parameters:
        pretrained (bool): Forwarded to torchvision's resnet50. If True, ImageNet
                           pre-trained weights are loaded; if False, the network
                           is freshly initialized.

    Returns:
        torch.nn.Sequential: The remaining ResNet50 layers, chained in their original order.
    """
    resnet = resnet50(pretrained=pretrained)
    # Everything except the trailing pooling + classification layers
    feature_layers = list(resnet.children())[:-2]
    return torch.nn.Sequential(*feature_layers)

In [None]:
class PositionalEncoding(nn.Module):
    """
    Precomputes a table of sinusoidal positional encodings and serves slices of it.

    The table has shape (1, max_len, d_model): even feature indices hold sines, odd
    feature indices hold cosines. It is registered as a non-persistent buffer, so it
    follows the module across `.to(device)` / dtype casts without being added to the
    state dict.

    Attributes:
        encoding (torch.Tensor): The (1, max_len, d_model) encoding table.

    Methods:
        forward(feature_size): Returns the encodings of the first `feature_size` positions.
    """
    def __init__(self, d_model, max_len=5000):
        """
        Builds the encoding table.

        Parameters:
            d_model (int): The dimensionality of the model's input embeddings (even or odd).
            max_len (int): The number of positions for which encodings are created.
        """
        super(PositionalEncoding, self).__init__()
        # Zero-initialized table, one row per position
        encoding = torch.zeros(max_len, d_model)
        # Column vector of positions 0 .. max_len - 1
        position = torch.arange(0, max_len).unsqueeze(1)
        # Frequencies 10000^(-2i / d_model), one per sine/cosine pair
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        angles = position * div_term
        # Sines on even feature indices
        encoding[:, 0::2] = torch.sin(angles)
        # Cosines on odd feature indices; for odd d_model there is one fewer odd slot
        # than even slots, so the last frequency is not used here
        encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Leading batch dimension; buffer so it moves with the module, non-persistent so
        # checkpoints keep the same keys as before
        self.register_buffer("encoding", encoding.unsqueeze(0), persistent=False)

    def forward(self, feature_size):
        """
        Retrieve the positional encoding for the first 'feature_size' positions.

        Parameters:
            feature_size (int): The number of positions to retrieve encodings for,
                                typically the length of the input sequences. Values
                                larger than max_len yield only max_len positions.

        Returns:
            torch.Tensor: Tensor of shape (1, min(feature_size, max_len), d_model).
        """
        return self.encoding[:, :feature_size]

In [None]:
class CrowdTransformer(nn.Module):
    """
    Transformer encoder-decoder that adds positional encodings to the source sequence,
    encodes it, and decodes a set of queries against the encoded memory.

    The encoder/decoder layers are created with PyTorch's default sequence-first layout,
    so tensors are expected as (sequence, batch, d_model).

    Attributes:
        transformer_encoder (nn.TransformerEncoder): Stack of `num_encoder_layers` encoder layers.
        transformer_decoder (nn.TransformerDecoder): Stack of `num_decoder_layers` decoder layers.
        positional_encoding (PositionalEncoding): Source of the positional encodings.
    """

    def __init__(self, d_model=256, nhead=8, num_encoder_layers=6, num_decoder_layers=6):
        """
        Initializes the CrowdTransformer model with specified configurations for the encoder and decoder.

        Parameters:
            d_model (int): The number of expected features in the input (also the size of embeddings).
            nhead (int): The number of heads in the multihead attention mechanism.
            num_encoder_layers (int): The number of transformer encoder layers.
            num_decoder_layers (int): The number of transformer decoder layers.
        """
        super(CrowdTransformer, self).__init__()
        # Reference the torch.nn classes explicitly so the model does not depend on
        # which same-named symbol a bare import happened to bind last
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)

        decoder_layers = nn.TransformerDecoderLayer(d_model, nhead)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layers, num_decoder_layers)

        # Provides position information, since attention itself is order-agnostic
        self.positional_encoding = PositionalEncoding(d_model)

    def forward(self, src, queries, src_key_padding_mask=None, tgt_key_padding_mask=None):
        """
        Runs the encoder on the position-augmented source and decodes the queries against it.

        Parameters:
            src (Tensor): Source sequence of shape (S, N, d_model).
            queries (Tensor): Decoder input of shape (T, N, d_model).
            src_key_padding_mask (Tensor, optional): (N, S) mask marking padded source positions.
            tgt_key_padding_mask (Tensor, optional): (N, T) mask marking padded query positions.

        Returns:
            Tensor: The output from the transformer decoder, shape (T, N, d_model).

        Raises:
            ValueError: If the source is longer than the positional encoding table.
        """
        seq_len = src.size(0)
        # PositionalEncoding.forward takes a length and returns (1, L, d_model)
        pos = self.positional_encoding(seq_len)
        if pos.size(1) < seq_len:
            raise ValueError(
                f"Source length {seq_len} exceeds the positional encoding table ({pos.size(1)} positions)."
            )
        # Reorder to (L, 1, d_model) so it broadcasts over the batch axis; match src's device/dtype
        pos = pos.transpose(0, 1).to(device=src.device, dtype=src.dtype)
        # Positional information is added to the features, not substituted for them
        src = src + pos

        # Encode the source sequence
        memory = self.transformer_encoder(src, src_key_padding_mask=src_key_padding_mask)
        # Decode the queries; padded source positions are also hidden from cross-attention
        output = self.transformer_decoder(
            queries,
            memory,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=src_key_padding_mask,
        )

        return output

In [None]:
class KMOMatcher(nn.Module):
    """
    Placeholder for the matcher component instantiated by CLTR.

    No logic is implemented yet: the class adds nothing on top of nn.Module
    (no constructor arguments, no forward method).
    """
    # Your implementation here
    pass

In [None]:
class CrowdLocalizationLoss(nn.Module):
    """
    Placeholder for the loss component instantiated by CLTR.

    No logic is implemented yet: the class adds nothing on top of nn.Module
    (no constructor arguments, no forward method).
    """
    # Your implementation here
    pass

In [None]:
class CLTR(nn.Module):
    """
    Skeleton of the complete model: a ResNet backbone, a CrowdTransformer, a KMOMatcher
    and a CrowdLocalizationLoss wired together.

    NOTE(review): this cell is a template and does not run as written:
      * `...` inside the `__init__` parameter list is a syntax error;
      * `CrowdTransformer(...)`, `KMOMatcher(...)` and `CrowdLocalizationLoss(...)` pass the
        literal Ellipsis object as an argument;
      * `num_queries` is accepted but never used;
      * `forward` returns `loss` / `predictions`, neither of which is assigned.
    """
    # NOTE(review): replace `...` with the real hyper-parameters before running this cell.
    def __init__(self, num_queries=500, ...):
        super(CLTR, self).__init__()
        # Feature extractor; called without arguments, so create_resnet_backbone's default applies
        self.backbone = create_resnet_backbone()
        # NOTE(review): the `...` arguments below are placeholders for the real configuration
        self.transformer = CrowdTransformer(...)
        self.matcher = KMOMatcher(...)
        self.loss = CrowdLocalizationLoss(...)

    def forward(self, images, targets=None):
        """
        Intended flow (per the step comments below): extract features, run the transformer,
        then return a loss in training mode or predictions otherwise.

        Only the feature extraction step is implemented.
        NOTE(review): `targets` is not used yet — presumably the ground truth for the
        matcher and loss; confirm when implementing.
        """
        # Feature extraction
        features = self.backbone(images)

        # Flatten feature maps and combine with positional encodings

        # Prepare queries

        # Transformer forward pass

        # If training, use matcher and calculate loss
        if self.training:
            # Match predictions to ground truth
            # Calculate loss
            # NOTE(review): `loss` is never assigned — NameError until the steps above exist
            return loss
        else:
            # Return predictions for evaluation
            # NOTE(review): `predictions` is never assigned — NameError until implemented
            return predictions

    # Implement additional methods if necessary, e.g., fit, predict
