# ⚡ ResNet-50 + Attention for Facial Emotion Recognition

This notebook:
- Loads pretrained ResNet-50
- Adds spatial attention (CBAM-style)
- Fine-tunes with FER-2013 dataset

In [1]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms, datasets, models
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [2]:

# Dataset and transforms
train_dir = 'Data/train'
test_dir = 'Data/test'

train_transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(5),
    transforms.ToTensor()
])

test_transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

full_dataset = datasets.ImageFolder(root=train_dir, transform=train_transform)
val_size = int(0.2 * len(full_dataset))
train_size = len(full_dataset) - val_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])
val_dataset.dataset.transform = test_transform
test_dataset = datasets.ImageFolder(root=test_dir, transform=test_transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

class_names = full_dataset.classes


In [3]:

class SpatialAttention(nn.Module):
    def __init__(self):
        super(SpatialAttention, self).__init__()
        self.conv = nn.Conv2d(2, 1, kernel_size=7, padding=3)

    def forward(self, x):
        avg_out = torch.mean(x, dim=1, keepdim=True)
        max_out, _ = torch.max(x, dim=1, keepdim=True)
        x_cat = torch.cat([avg_out, max_out], dim=1)
        attention = torch.sigmoid(self.conv(x_cat))
        return x * attention


In [4]:

# Build ResNet-50 with spatial attention block
class ResNet50WithAttention(nn.Module):
    def __init__(self, num_classes=7):
        super(ResNet50WithAttention, self).__init__()
        self.base_model = models.resnet50(pretrained=True)

        # Remove last FC layer
        self.base_model.fc = nn.Identity()

        # Add attention
        self.attention = SpatialAttention()

        # Final classification layer
        self.classifier = nn.Sequential(
            nn.Dropout(0.4),
            nn.Linear(2048, num_classes)
        )

    def forward(self, x):
        x = self.base_model.conv1(x)
        x = self.base_model.bn1(x)
        x = self.base_model.relu(x)
        x = self.base_model.maxpool(x)

        x = self.base_model.layer1(x)
        x = self.base_model.layer2(x)
        x = self.base_model.layer3(x)
        x = self.base_model.layer4(x)

        x = self.attention(x)
        x = self.base_model.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

model = ResNet50WithAttention().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)




In [5]:

# Train model
num_epochs = 20
train_losses = []
val_accuracies = []

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        outputs = model(images)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    scheduler.step()
    epoch_loss = running_loss / len(train_loader)
    train_losses.append(epoch_loss)

    # Validation
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.numpy())

    val_acc = accuracy_score(all_labels, all_preds)
    val_accuracies.append(val_acc)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Val Accuracy: {val_acc:.4f}")


Epoch [1/20], Loss: 1.1676, Val Accuracy: 0.6227
Epoch [2/20], Loss: 0.8948, Val Accuracy: 0.6478
Epoch [3/20], Loss: 0.6952, Val Accuracy: 0.6459
Epoch [4/20], Loss: 0.4783, Val Accuracy: 0.6365
Epoch [5/20], Loss: 0.3060, Val Accuracy: 0.6321


KeyboardInterrupt: 

In [6]:

# Test evaluation
model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        test_preds.extend(predicted.cpu().numpy())
        test_labels.extend(labels.numpy())

test_acc = accuracy_score(test_labels, test_preds)
print(f"Test Accuracy: {test_acc:.4f}")


Test Accuracy: 0.6641


In [7]:

# Predict single image
test_image_path = 'my_image2.jpg'

image = Image.open(test_image_path).convert('L').convert('RGB')
image = test_transform(image).unsqueeze(0).to(device)

model.eval()
with torch.no_grad():
    output = model(image)
    probs = torch.softmax(output, dim=1)
    _, predicted = torch.max(probs, 1)
    predicted_class = class_names[predicted.item()]
    confidence = probs[0][predicted.item()].item()

print(f"Predicted Emotion: {predicted_class} ({confidence*100:.2f}% confidence)")


Predicted Emotion: surprise (87.82% confidence)
