# Action Recognition From Still Imagen Using Deep Learning Networks

Action recognition, the ability to identify and categorize human actions from visual data, has been
a long-standing challenge in the field of computer vision. Traditionally, this task has been tackled
using video footage, where the temporal information provided by consecutive frames allows for a
more robust understanding of the action's dynamics. Recent advances in deep learning have
enabled action recognition to be achieved with impressive accuracy using still images, even in
challenging conditions.

Indeed, everyday human actions like "climbing," "fishing," or "phoning" can also be effectively
described in still images. Furthermore, certain actions captured in videos, such as "taking photos,"
are inherently static and may require recognition methods solely based on static cues. Driven by
the potential implications of recognizing actions in still images and the comparative neglect of this
problem in computer vision, this assignment delves into the recognition of human actions utilizing
a single photograph.

For this project, the accompanying dataset encompasses a training set and a test set,
encompassing actions across 40 distinct categories. The Stanford 40 Action Dataset comprises
images depicting individuals executing 40 different actions. For each image, we provide a
bounding box surrounding the person performing the action, as indicated by the image's filename.
The dataset comprises 9532 images in total, with 180-300 images per action category. The
dataset is attached to this file for your convenience.

## 1. Data Loader to read the training and testing sets from the Standford 40 dataset

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, models
from torch.utils.data import DataLoader, Dataset
from PIL import Image
from tqdm import tqdm

# Check device availability more efficiently
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
print(f"Using device: {device_name}")


Using device: NVIDIA GeForce RTX 3060 Laptop GPU


In [4]:
import DataLoader as DL

data_loader = DL.DataLoader(base_dir="./Stanford40")
dataset = data_loader.prepare_dataset()

print(f"Number of training images: {len(dataset['train'])}")
print(f"Number of testing images: {len(dataset['test'])}")



Number of training images: 4000
Number of testing images: 5532


## Custom CNN

In [14]:
# Dataset personalizado con soporte para boxes
class Stanford40DatasetWithBoxes(Dataset):
    def __init__(self, dataset, split, transform=None):
        self.data = dataset[split]
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        image = Image.open(item["filename"]).convert("RGB")
        label = class_to_idx[item["action"]]
        
        # Recortar la imagen usando el bounding box
        bndbox = item["bndbox"]
        image = image.crop((bndbox["xmin"], bndbox["ymin"], bndbox["xmax"], bndbox["ymax"]))
        
        # Aplicar transformaciones
        if self.transform:
            image = self.transform(image)

        return image, label

# Crear DataLoaders con el dataset que incluye boxes
train_dataset = Stanford40DatasetWithBoxes(dataset, split="train", transform=transform)
test_dataset = Stanford40DatasetWithBoxes(dataset, split="test", transform=transform)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Dataset con boxes (entrenamiento): {len(train_loader)} batches")
print(f"Dataset con boxes (prueba): {len(test_loader)} batches")

Dataset con boxes (entrenamiento): 125 batches
Dataset con boxes (prueba): 173 batches


In [15]:
# Modelo basado en ResNet
class CustomResNet(nn.Module):
    def __init__(self, num_classes):
        super(CustomResNet, self).__init__()
        self.resnet = models.resnet18(pretrained=True)
        for param in self.resnet.parameters():
            param.requires_grad = False
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, num_classes)

    def forward(self, x):
        return self.resnet(x)

model = CustomResNet(num_classes=len(actions)).to(device)
print(model)


CustomResNet(
  (resnet): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track

In [17]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.resnet.fc.parameters(), lr=0.001)

# Entrenamiento
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for images, labels in tqdm(train_loader):
            images, labels = images.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

train_model(model, train_loader, criterion, optimizer, num_epochs=10)


100%|██████████| 125/125 [00:14<00:00,  8.67it/s]


Epoch [1/10], Loss: 1.7664


100%|██████████| 125/125 [00:14<00:00,  8.76it/s]


Epoch [2/10], Loss: 1.5148


100%|██████████| 125/125 [00:14<00:00,  8.79it/s]


Epoch [3/10], Loss: 1.3502


100%|██████████| 125/125 [00:14<00:00,  8.78it/s]


Epoch [4/10], Loss: 1.2502


100%|██████████| 125/125 [00:14<00:00,  8.80it/s]


Epoch [5/10], Loss: 1.1726


100%|██████████| 125/125 [00:14<00:00,  8.71it/s]


Epoch [6/10], Loss: 1.0895


100%|██████████| 125/125 [00:14<00:00,  8.76it/s]


Epoch [7/10], Loss: 1.0301


100%|██████████| 125/125 [00:14<00:00,  8.63it/s]


Epoch [8/10], Loss: 0.9871


100%|██████████| 125/125 [00:14<00:00,  8.69it/s]


Epoch [9/10], Loss: 0.9517


100%|██████████| 125/125 [00:14<00:00,  8.71it/s]

Epoch [10/10], Loss: 0.9259





In [18]:
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Precisión en el conjunto de prueba: {100 * correct / total:.2f}%")

evaluate_model(model, test_loader)


Precisión en el conjunto de prueba: 56.89%


In [19]:
import torch.nn.functional as F

# Modelo CNN personalizado
class CustomCNN(nn.Module):
    def __init__(self, num_classes):
        super(CustomCNN, self).__init__()
        # Bloque 1
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)  # Entrada RGB
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Bloque 2
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Bloque 3
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Capa completamente conectada
        self.fc1 = nn.Linear(128 * 28 * 28 // (2**3), 512)  # Ajustar según el tamaño final
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))
        x = x.view(x.size(0), -1)  # Aplanar
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# Instanciar el modelo
num_classes = len(actions)  # 40 clases
custom_model = CustomCNN(num_classes=num_classes).to(device)
print(custom_model)


CustomCNN(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=12544, out_features=512, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=512, out_features=40, bias=True)
)


In [21]:
class CustomCNN(nn.Module):
    def __init__(self, num_classes):
        super(CustomCNN, self).__init__()
        # Bloque 1
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Bloque 2
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Bloque 3
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Calcular automáticamente el tamaño para la capa lineal
        self._to_linear = None
        self.calculate_linear_input((3, 224, 224))  # Tamaño de la imagen de entrada

        self.fc1 = nn.Linear(self._to_linear, 512)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, num_classes)

    def calculate_linear_input(self, input_size):
        # Simular una pasada hacia adelante para calcular el tamaño de la salida
        x = torch.zeros(1, *input_size)
        x = self.pool1(self.conv1(x))
        x = self.pool2(self.conv2(x))
        x = self.pool3(self.conv3(x))
        self._to_linear = x.numel()

    def forward(self, x):
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))
        x = x.view(x.size(0), -1)  # Aplanar
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x


In [23]:
# Función para entrenar el modelo personalizado
def train_custom_model(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()  # Modo de entrenamiento
    for epoch in range(num_epochs):
        running_loss = 0.0
        for images, labels in tqdm(train_loader):  # Iterar sobre el DataLoader
            images, labels = images.to(device), labels.to(device)

            # Reiniciar los gradientes
            optimizer.zero_grad()

            # Paso hacia adelante
            outputs = model(images)

            # Calcular la pérdida
            loss = criterion(outputs, labels)

            # Paso hacia atrás y optimización
            loss.backward()
            optimizer.step()

            # Acumular la pérdida
            running_loss += loss.item()

        # Imprimir pérdida por época
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")


In [24]:
# Función para evaluar el modelo
def evaluate_custom_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Precisión del modelo personalizado en el conjunto de prueba: {100 * correct / total:.2f}%")

# Instanciar el modelo
num_classes = len(actions)  # 40 clases
custom_model = CustomCNN(num_classes=num_classes).to(device)

# Reentrenar el modelo
train_custom_model(custom_model, train_loader, criterion, optimizer, num_epochs=10)

# Evaluar el modelo
evaluate_custom_model(custom_model, test_loader)


100%|██████████| 125/125 [00:18<00:00,  6.84it/s]


Epoch [1/10], Loss: 3.7547


100%|██████████| 125/125 [00:17<00:00,  6.98it/s]


Epoch [2/10], Loss: 3.7569


100%|██████████| 125/125 [00:17<00:00,  7.01it/s]


Epoch [3/10], Loss: 3.7583


100%|██████████| 125/125 [00:18<00:00,  6.94it/s]


Epoch [4/10], Loss: 3.7620


100%|██████████| 125/125 [00:17<00:00,  7.00it/s]


Epoch [5/10], Loss: 3.7560


100%|██████████| 125/125 [00:17<00:00,  7.01it/s]


Epoch [6/10], Loss: 3.7607


100%|██████████| 125/125 [00:17<00:00,  6.99it/s]


Epoch [7/10], Loss: 3.7590


100%|██████████| 125/125 [00:17<00:00,  7.00it/s]


Epoch [8/10], Loss: 3.7523


100%|██████████| 125/125 [00:17<00:00,  7.01it/s]


Epoch [9/10], Loss: 3.7506


100%|██████████| 125/125 [00:18<00:00,  6.82it/s]


Epoch [10/10], Loss: 3.7575
Precisión del modelo personalizado en el conjunto de prueba: 2.24%
