In [12]:
import os
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torchvision import transforms, datasets
from PIL import Image
import matplotlib.pyplot as plt
import cv2
from tqdm import tqdm

In [45]:

"""

NOTES:
transformations have 2 purposes:
1. data augmentation — used during training to improve generalization and reduce overfitting
2. standardization — used in all phases (train/val/test) to normalize pixel values and convert to tensors
For the first we should use:
transforms.ToTensor() - train/test - converts PILImage to tensor, scales to [0,1]
For the second we should use:
transforms.Normalize(mean, std) - train/test - helps network converge faster, stabilizes training
grayscale images - (mean, std) - ((0.5,), (0.5,))
color images - (mean, std) - ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

Customize the rest:
1. Based on image type
grayscale images	            simple transforms (no color jittering)
color images	                add ColorJitter, RandomGrayscale, etc.
medical/scientific	            avoid strong augmentations
natural photos	                more aggressive augmentation is okay

2. Based on image resolution
small (e.g. 48x48)	            avoid aggressive cropping or resizing
large (e.g. 224x224 or more)	you can use RandomCrop, Resize, etc.

3. Based on task type
classification	                horizontal flip, rotation, crop
object detection                use bounding-box aware augmentations
segmentation	                need to apply the same transform to mask
facial emotion recognition	    very limited changes — don’t alter facial structure

4. Based on dataset size
small	                        apply more augmentation to simulate more data
large	                        minimal augmentation, rely on real variation

"""

class FER2013Dataset:
    # constructor method used to initialize the dataset loader class
    # params: base_path (path where the dataset is located)
    # params: batch_size (number of samples per batch to load during training)
    def __init__(self, base_path='/Users/mariakalianova/PycharmProjects/PythonProject/CNN/archive', batch_size=64):
        # ------------------------------------------------
        # DEFINING MAIN PARAMS
        # ------------------------------------------------
        # build full paths to the training and test (used as validation) directories
        self.train_dir = os.path.join(base_path, 'train')
        self.val_dir = os.path.join(base_path, 'test')
        self.batch_size = batch_size

        # ------------------------------------------------
        # INTRODUCE TRANSFORMATIONS FOR TEST AND TRAIN DATASET
        # ------------------------------------------------
        # training transforms include:
        # - random horizontal flip (for data augmentation, so it works on 'mirrored' images)
        # - random rotation (for robustness to small image rotations)
        # - tensor conversion (turns PIL image into torch tensor)
        # - normalization (scales pixel values to [-1, 1] range)
        self.train_transforms = transforms.Compose([
            transforms.Grayscale(num_output_channels=1),
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(10),
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,))  # normalization for grayscale images
        ])

        # define preprocessing for validation images
        # no augmentation is used here to keep evaluation consistent
        self.val_transforms = transforms.Compose([
            transforms.Grayscale(num_output_channels=1),
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,))
        ])

        # ------------------------------------------------
        # CREATE TRANSFORMED DATASETS
        # ------------------------------------------------
        # create the training and testing dataset using the folder structure and training transforms
        self.train_dataset = ImageFolder(root=self.train_dir, transform=self.train_transforms)
        self.val_dataset = ImageFolder(root=self.val_dir, transform=self.val_transforms)

        # ------------------------------------------------
        # WRAP TRANSFORMED DATASETS
        # ------------------------------------------------
        self.train_loader = DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)
        self.val_loader = DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False)

    # returns training and validation dataloaders
    # access to preloaded dataloaders for use in training and evaluation loops
    def get_loaders(self):
        return self.train_loader, self.val_loader

    # returns a list of class names
    def get_classes(self):
        return self.train_dataset.classes

    # returns a dictionary mapping class names to numeric labels
    def get_class_to_idx(self):
        return self.train_dataset.class_to_idx

    # returns number of samples in training dataset
    def __len__(self):
        return len(self.train_dataset)


In [49]:
dat = FER2013Dataset()
print(dat.get_class_to_idx())

{'angry': 0, 'disgust': 1, 'fear': 2, 'happy': 3, 'neutral': 4, 'sad': 5, 'surprise': 6}


In [16]:
class EmotionRecognitionCNN(nn.Module):
    def __init__(self):
        super(EmotionRecognitionCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)

        self.fc1 = nn.Linear(256 * 6 * 6, 1024)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(1024, 7)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # 48 -> 24
        x = self.pool(F.relu(self.conv2(x)))  # 24 -> 12
        x = self.pool(F.relu(self.conv3(x)))  # 12 -> 6
        x = x.view(-1, 256 * 6 * 6)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x


In [18]:
model = EmotionRecognitionCNN()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [20]:

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load dataset
dataset = FER2013Dataset(base_path="/Users/mariakalianova/PycharmProjects/PythonProject/CNN/archive", batch_size=64)
train_loader, val_loader = dataset.get_loaders()

# Initialize model
model = EmotionRecognitionCNN().to(device)

# Loss and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", unit="batch")

    for images, labels in progress_bar:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Accuracy tracking
        _, predicted = torch.max(outputs, 1)
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

        progress_bar.set_postfix(
            loss=running_loss / len(train_loader),
            accuracy=correct_predictions / total_predictions
        )

    avg_loss = running_loss / len(train_loader)
    avg_accuracy = correct_predictions / total_predictions
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.4f}")


Epoch 1/10: 100%|█| 449/449 [01:07<00:00,  6.61batch/s, accuracy=0.339, loss=1.6


Epoch 1/10 - Loss: 1.6540, Accuracy: 0.3389


Epoch 2/10: 100%|█| 449/449 [01:06<00:00,  6.76batch/s, accuracy=0.45, loss=1.43


Epoch 2/10 - Loss: 1.4331, Accuracy: 0.4502


Epoch 3/10: 100%|█| 449/449 [01:05<00:00,  6.80batch/s, accuracy=0.496, loss=1.3


Epoch 3/10 - Loss: 1.3168, Accuracy: 0.4960


Epoch 4/10: 100%|█| 449/449 [01:06<00:00,  6.77batch/s, accuracy=0.526, loss=1.2


Epoch 4/10 - Loss: 1.2539, Accuracy: 0.5259


Epoch 5/10: 100%|█| 449/449 [01:05<00:00,  6.84batch/s, accuracy=0.542, loss=1.2


Epoch 5/10 - Loss: 1.2048, Accuracy: 0.5419


Epoch 6/10: 100%|█| 449/449 [01:05<00:00,  6.88batch/s, accuracy=0.556, loss=1.1


Epoch 6/10 - Loss: 1.1648, Accuracy: 0.5563


Epoch 7/10: 100%|█| 449/449 [01:04<00:00,  6.97batch/s, accuracy=0.573, loss=1.1


Epoch 7/10 - Loss: 1.1228, Accuracy: 0.5734


Epoch 8/10: 100%|█| 449/449 [01:05<00:00,  6.89batch/s, accuracy=0.585, loss=1.1


Epoch 8/10 - Loss: 1.0989, Accuracy: 0.5849


Epoch 9/10: 100%|█| 449/449 [01:05<00:00,  6.82batch/s, accuracy=0.593, loss=1.0


Epoch 9/10 - Loss: 1.0755, Accuracy: 0.5932


Epoch 10/10: 100%|█| 449/449 [01:07<00:00,  6.66batch/s, accuracy=0.606, loss=1.

Epoch 10/10 - Loss: 1.0424, Accuracy: 0.6058





In [22]:
# Evaluate on test data
model.eval()  # Set the model to evaluation mode
val_loss = 0.0
val_correct = 0
val_total = 0

In [25]:
with torch.no_grad():
    for images, labels in val_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels)
        val_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        val_correct += (predicted == labels).sum().item()
        val_total += labels.size(0)

val_accuracy = val_correct / val_total
val_loss_avg = val_loss / len(val_loader)

print(f"Validation Loss: {val_loss_avg:.4f}, Validation Accuracy: {val_accuracy:.4f}")


Validation Loss: 2.1968, Validation Accuracy: 0.5893


In [29]:
torch.save(model.state_dict(), "emotion_model2_58.pth")
print("Model saved as 'emotion_model2.pth'")

Model saved as 'emotion_model2.pth'


In [31]:
model = EmotionRecognitionCNN()
model.load_state_dict(torch.load("emotion_model2_58.pth", map_location=device))
model.to(device)
model.eval()

EmotionRecognitionCNN(
  (conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=9216, out_features=1024, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=1024, out_features=7, bias=True)
)

In [58]:
# Instantiate your dataset object
dataset = FER2013Dataset(base_path="/Users/mariakalianova/PycharmProjects/PythonProject/CNN/archive")

# Get mapping: {'angry': 0, ..., 'surprise': 6}
class_to_idx = dataset.get_class_to_idx()

# Reverse it: {0: 'angry', ..., 6: 'surprise'}
emotion_dict = {v: k for k, v in class_to_idx.items()}

# Preprocessing: match training
preprocess = transforms.Compose([
    transforms.Grayscale(),
    transforms.Resize((48, 48)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

# Start webcam
cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    if not ret:
        break

    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Detect faces
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=5)

    for (x, y, w, h) in faces:
        face = gray[y:y+h, x:x+w]
        face_pil = Image.fromarray(face)
        face_tensor = preprocess(face_pil).unsqueeze(0).to(device)

        with torch.no_grad():
            output = model(face_tensor)
            prediction = torch.argmax(output, 1).item()
            emotion = emotion_dict[prediction]

        # Draw box and label
        cv2.reqctangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
        cv2.putText(frame, emotion, (x, y - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36, 255, 12), 2)

    cv2.imshow('Real-Time Emotion Recognition', frame)

    # Exit on 'q' key
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

In [1]:
import seaborn as sns
from sklearn.metrics import confusion_matrix


conf_mat = confusion_matrix(true_labels, predicted_labels)

# confusion matrix
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues', 
            xticklabels=train_data.classes, yticklabels=train_data.classes, ax=ax)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()

NameError: name 'true_labels' is not defined