# Improved People Counting with Density Maps
This notebook demonstrates an improved approach for counting people in images by switching from bounding box detection to density map estimation. This method is more effective for counting tasks, especially with a limited dataset.

**Key Improvements:**
1.  **Task Formulation**: Changed from single-object detection to density map regression.
2.  **Data Preprocessing**: A new `Dataset` class generates ground truth density maps from head coordinates provided in JSON files.
3.  **Model Architecture**: Replaced the simple detector with a Fully Convolutional Network (FCN) suitable for producing density maps.
4.  **Loss Function**: Using Mean Squared Error (MSE) between the predicted and ground truth density maps.
5.  **Evaluation**: Implemented Mean Absolute Error (MAE) to measure counting accuracy.

## 1. Setup and Imports

In [None]:
import numpy as np
import cv2
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from scipy.ndimage import gaussian_filter
import warnings

# Keep notebook output readable by silencing library warnings.
warnings.filterwarnings('ignore')

# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## 2. Dataset and Density Map Generation
We define a custom dataset that reads images and their corresponding JSON labels. For each image, it generates a ground truth density map by applying a Gaussian kernel to each annotated head position.

In [None]:
class CrowdCountingDataset(Dataset):
    """Dataset yielding (image, ground-truth density map) pairs for crowd counting.

    Images are the first ``max_samples`` ``.jpg`` files (sorted by name) found in
    ``image_dir``. For each image, a JSON file with the same base name is looked
    up in ``label_dir``; the code reads its ``'human_num'`` and ``'points'`` keys,
    where ``'points'`` is iterated as ``(x, y)`` pairs expressed in the pixel
    coordinates of the original (un-resized) image.

    Args:
        image_dir: Directory containing the ``.jpg`` images.
        label_dir: Directory containing the matching ``.json`` label files.
        max_samples: Maximum number of images to use.
        img_size: Images are resized to ``img_size x img_size``.
        downscale_factor: The density map side is ``img_size // downscale_factor``.
        sigma: Gaussian sigma in input-image pixels; it is divided by
            ``downscale_factor`` before being applied on the density-map grid.
    """

    def __init__(self, image_dir, label_dir, max_samples=50, img_size=224, downscale_factor=16, sigma=4):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.img_size = img_size
        self.downscale_factor = downscale_factor
        self.sigma = sigma

        all_images = sorted(f for f in os.listdir(image_dir) if f.endswith('.jpg'))
        self.image_files = all_images[:max_samples]
        self.labels = self._load_labels()

        print(f"Using {len(self.image_files)} images for training.")

    def _load_labels(self):
        """Return a dict mapping image file name -> parsed JSON label.

        Images without a label file are simply absent from the dict; they
        later produce an all-zero density map.
        """
        labels = {}
        for img_file in self.image_files:
            label_file = img_file.replace('.jpg', '.json')
            label_path = os.path.join(self.label_dir, label_file)
            if os.path.exists(label_path):
                with open(label_path, 'r') as f:
                    labels[img_file] = json.load(f)
        return labels

    def __len__(self):
        return len(self.image_files)

    def _build_density_map(self, points, orig_w, orig_h):
        """Build the smoothed density map for one image.

        Args:
            points: Iterable of ``(x, y)`` head positions in original-image pixels.
            orig_w: Width of the original image.
            orig_h: Height of the original image.

        Returns:
            A float32 array of shape ``(output_size, output_size)`` where
            ``output_size = img_size // downscale_factor``.
        """
        output_size = self.img_size // self.downscale_factor
        density_map = np.zeros((output_size, output_size), dtype=np.float32)

        for x, y in points:
            # Scale coordinates to the output map size
            scaled_x = int((x / orig_w) * output_size)
            scaled_y = int((y / orig_h) * output_size)
            if 0 <= scaled_x < output_size and 0 <= scaled_y < output_size:
                # Accumulate rather than assign: on a coarse grid several heads
                # routinely fall into the same cell, and each one must add to
                # the density so the map's sum matches the head count.
                density_map[scaled_y, scaled_x] += 1.0

        # Smooth the impulses; sigma is rescaled from image pixels to map cells.
        return gaussian_filter(density_map, sigma=self.sigma / self.downscale_factor)

    def __getitem__(self, idx):
        """Return ``(image_tensor, density_map_tensor)`` for sample ``idx``.

        ``image_tensor`` is a float tensor of shape ``(3, img_size, img_size)``
        in RGB order scaled to ``[0, 1]``; ``density_map_tensor`` has shape
        ``(1, output_size, output_size)``.

        Raises:
            FileNotFoundError: If the image file cannot be read by OpenCV.
        """
        img_name = self.image_files[idx]
        img_path = os.path.join(self.image_dir, img_name)

        # Load and resize image. cv2.imread signals failure by returning None
        # instead of raising, so check explicitly to get a clear error.
        image = cv2.imread(img_path)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        orig_h, orig_w = image.shape[:2]
        image = cv2.resize(image, (self.img_size, self.img_size))
        image_tensor = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0

        # Generate density map (all zeros when the image has no label or no people)
        label_data = self.labels.get(img_name)
        if label_data and label_data['human_num'] > 0:
            points = label_data['points']
        else:
            points = []
        density_map = self._build_density_map(points, orig_w, orig_h)
        density_map_tensor = torch.from_numpy(density_map).unsqueeze(0)  # Add channel dimension

        return image_tensor, density_map_tensor

## 3. Fully Convolutional Network (FCN) for Density Estimation
This FCN model uses a simple CNN backbone to extract features and then a `1x1` convolution to produce a single-channel density map. Four 2×2 max-pooling stages downsample the input by a factor of 16 along each spatial dimension, so a 224×224 image yields a 14×14 density map.

In [None]:
class DensityFCN(nn.Module):
    """Small fully convolutional network that regresses a density map.

    The backbone stacks four (3x3 conv -> ReLU -> 2x2 max-pool) stages with
    channel widths 3 -> 16 -> 32 -> 64 -> 128, halving the spatial size at
    each stage (e.g. 224 -> 112 -> 56 -> 28 -> 14). A 1x1 convolution then
    collapses the 128 feature channels into a single-channel map.
    """

    def __init__(self):
        super().__init__()
        widths = (3, 16, 32, 64, 128)

        # Backbone: one conv/ReLU/pool triple per consecutive pair of widths
        stages = []
        for in_ch, out_ch in zip(widths[:-1], widths[1:]):
            stages.append(nn.Conv2d(in_ch, out_ch, 3, padding=1))
            stages.append(nn.ReLU())
            stages.append(nn.MaxPool2d(2))
        self.backbone = nn.Sequential(*stages)

        # Head: 1x1 conv mapping the final feature channels to one density channel
        self.head = nn.Conv2d(widths[-1], 1, 1)

    def forward(self, x):
        features = self.backbone(x)
        density = self.head(features)
        # Clamp at zero so predicted densities are never negative
        return nn.functional.relu(density)

# Build the network and move it to the selected device
model = DensityFCN()
model = model.to(device)

# Report the total parameter count followed by the layer-by-layer structure
print(f"Model parameters: {sum(map(torch.numel, model.parameters())):,}")
print("Model architecture:", model, sep="\n")

## 4. Load Data and Set Up Training

In [None]:
# Dataset locations, relative to this notebook (data lives in a 'data' folder in the project root)
BASE_DIR = '../data/penyisihan-hology-8-0-2025-data-mining'
TRAIN_IMG_DIR, TRAIN_LBL_DIR = (
    os.path.join(BASE_DIR, 'train', subdir) for subdir in ('images', 'labels')
)
TEST_IMG_DIR = os.path.join(BASE_DIR, 'test', 'images')

# Training data: 50 samples at 224x224, with density maps downscaled by 16
train_dataset = CrowdCountingDataset(
    TRAIN_IMG_DIR,
    TRAIN_LBL_DIR,
    max_samples=50,
    img_size=224,
    downscale_factor=16,
)

# Shuffled mini-batches of 4 samples
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)

# Pixel-wise MSE between predicted and target density maps, optimized with Adam
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

## 5. Training Loop

In [None]:
num_epochs = 100  # Many epochs, since the dataset is small
training_losses = []

print("Starting training...")
model.train()
num_batches = len(train_loader)
for epoch in range(num_epochs):
    running_loss = 0.0
    for batch_images, batch_targets in train_loader:
        batch_images = batch_images.to(device)
        batch_targets = batch_targets.to(device)

        # Reset gradients, run the forward pass and score it against the targets
        optimizer.zero_grad()
        loss = criterion(model(batch_images), batch_targets)

        # Backpropagate and update the weights
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = running_loss / num_batches
    training_losses.append(avg_loss)

    # Log the first epoch and every 20th epoch afterwards
    if epoch == 0 or (epoch + 1) % 20 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.6f}')

print("Training completed!")

# Loss curve across epochs
plt.figure(figsize=(10, 5))
plt.plot(training_losses)
plt.title('Training Loss Over Time')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.grid(True)
plt.show()

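## 6. Evaluation on a Training Sample
Because the test set may not come with labels, we evaluate the trained model on a sample of the training images as a sanity check that it has learned. We define a function to predict the count for a single image and then calculate the Mean Absolute Error (MAE) between the predicted and ground-truth counts across that sample.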

In [None]:
def predict_count(model, image_path, device, img_size=224):
    """Predict the number of people in a single image.

    The model is switched to eval mode (and left in eval mode), the image is
    loaded with OpenCV, converted to RGB, resized to ``img_size x img_size``,
    scaled to ``[0, 1]`` and passed through the model without gradient tracking.

    Args:
        model: Network that maps an image batch to a density map.
        image_path: Path of the image file to read.
        device: Torch device the input tensor is moved to.
        img_size: Side length the image is resized to before inference.

    Returns:
        A tuple ``(image, predicted_count, density_map)`` where ``image`` is the
        original-resolution RGB array, ``predicted_count`` is the sum of the
        predicted density map as a Python float, and ``density_map`` is the
        squeezed prediction as a NumPy array.

    Raises:
        FileNotFoundError: If the image cannot be read by OpenCV.
    """
    model.eval()

    # Load and preprocess image. cv2.imread returns None on failure instead of
    # raising, so fail early with a message that names the offending path.
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    resized_image = cv2.resize(image, (img_size, img_size))
    input_tensor = torch.from_numpy(resized_image).permute(2, 0, 1).float().unsqueeze(0) / 255.0
    input_tensor = input_tensor.to(device)

    with torch.no_grad():
        pred_density_map = model(input_tensor)

    # The predicted count is the sum of the density map
    predicted_count = pred_density_map.sum().item()

    return image, predicted_count, pred_density_map.squeeze().cpu().numpy()

def get_ground_truth_count(image_name, label_dir):
    """Look up the annotated head count for an image.

    The label file name is derived from ``image_name`` by swapping ``.jpg``
    for ``.json`` and is looked up inside ``label_dir``.

    Returns:
        The ``'human_num'`` value from the label file, or ``0`` when no label
        file exists (e.g. for unlabeled test images).
    """
    json_name = image_name.replace('.jpg', '.json')
    json_path = os.path.join(label_dir, json_name)

    if os.path.exists(json_path):
        with open(json_path, 'r') as handle:
            annotation = json.load(handle)
        return annotation['human_num']

    # No label available: fall back to a count of zero
    return 0

In [None]:
print("Evaluating model on training images (to check learning)...")
print("=" * 50)

evaluation_results = []
sample_images_for_eval = train_dataset.image_files[:10]  # First 10 training images serve as the eval sample

for position, img_name in enumerate(sample_images_for_eval):
    img_path = os.path.join(TRAIN_IMG_DIR, img_name)

    # Annotated count versus the model's estimate
    true_count = get_ground_truth_count(img_name, TRAIN_LBL_DIR)
    original_image, pred_count, pred_map = predict_count(model, img_path, device)

    evaluation_results.append(
        dict(image_name=img_name, true_count=true_count, pred_count=pred_count)
    )

    # Only the first 3 images are visualized
    if position >= 3:
        continue

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: the input image
    ax1.imshow(original_image)
    ax1.set_title(f'Original Image: {img_name}')
    ax1.axis('off')

    # Right panel: the predicted density map with a colorbar
    im = ax2.imshow(pred_map, cmap='jet')
    ax2.set_title(f'Predicted Density Map (Count: {pred_count:.2f})\nGround Truth: {true_count}')
    ax2.axis('off')
    fig.colorbar(im, ax=ax2)
    plt.show()

# MAE: mean of the absolute differences between predicted and true counts
eval_df = pd.DataFrame(evaluation_results)
mae = eval_df['pred_count'].sub(eval_df['true_count']).abs().mean()

print("\nEvaluation Results on Training Sample:")
print(eval_df)
print(f"\nMean Absolute Error (MAE): {mae:.4f}")

## 7. Final Submission Generation
Finally, we run predictions on the actual test set and generate a `submission.csv` file in the required format.

In [None]:
print("Generating submission file for the test set...")

# Every .jpg in the test directory, in sorted order
test_images = sorted(f for f in os.listdir(TEST_IMG_DIR) if f.endswith('.jpg'))

# One row per test image; the float prediction is rounded to the nearest integer count
submission_data = [
    {
        'id': img_name,
        'human_num': int(round(
            predict_count(model, os.path.join(TEST_IMG_DIR, img_name), device)[1]
        )),
    }
    for img_name in test_images
]

submission_df = pd.DataFrame(submission_data)
submission_df.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully.")
print("\nFirst 5 rows of submission:")
print(submission_df.head())