# Architecture A

**Architecture Specifications:**
- **Image Input**: 100×100 grayscale → 224×224 RGB (ResNet-18 compatible)
- **Image Backbone**: ResNet-18 (pretrained) → 512-D image feature
- **Text Input**: Short text metadata (tokenized)
- **Text Encoder**: GRU (RNN) → 512-D text embedding
- **Fusion**: Concatenate [512-D image, 512-D text] → 1024-D
- **Dropout**: p=0.5 (randomly drops 50% of fused features during training)
- **Head**: Linear (1024 → 7), Softmax for probabilities
- **Loss**: Cross-Entropy


In [49]:
# Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
import random
import re
from collections import Counter
from tqdm.notebook import tqdm
from datasets import load_dataset
import matplotlib.pyplot as plt
from PIL import Image
import os


## Dataset Setup

In [50]:
# Image Dataset Setup
IMG_DATASET_LOCATION = '../../../Technical/datasets/archive/RAFDB'

# Load Hugging Face emotion dataset (text)
emotion_dataset = load_dataset("emotion")
df_emotion = emotion_dataset['train'].to_pandas()
print(f"✓ Loaded text emotion dataset: {len(df_emotion)} samples")

# Transforms
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    #transforms.Grayscale(num_output_channels=3),  # Convert grayscale to RGB
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

label_dict = {0: "Surprise", 1: "Fear", 2: "Disgust", 3: "Happiness", 4: "Sadness", 5: "Anger", 6: "Neutral"}

# Text Processing Functions
def simple_tokenize(text):
    return re.sub(r'[^a-zA-Z\s]', '', text.lower()).split()

def build_vocab(texts, min_freq=1):
    word_counts = Counter()
    for text in texts:
        word_counts.update(simple_tokenize(text))
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for word, count in word_counts.items():
        if count >= min_freq:
            vocab[word] = len(vocab)
    return vocab

def tokenize_text(text, vocab, max_length=10):
    tokens = simple_tokenize(text)
    token_ids = [vocab.get(token, vocab['<UNK>']) for token in tokens]
    if len(token_ids) > max_length:
        token_ids = token_ids[:max_length]
    else:
        token_ids.extend([vocab['<PAD>']] * (max_length - len(token_ids)))
    return torch.tensor(token_ids, dtype=torch.long)
    

# Clean Multimodal Dataset - HF Emotion Data Only
class MultimodalDataset(Dataset):
    def __init__(self, data_directory, transform=None):
        self.data = ImageFolder(data_directory, transform=transform)
        self.text_metadata = self._generate_text()

    def _generate_text(self):
        """Generate text from Hugging Face emotion dataset"""
        # Map HF emotion labels to RAF-DB emotions
        hf_to_raf_mapping = {
            0: 5,  # anger -> anger (folder 6)
            1: 1,  # fear -> fear (folder 2)
            2: 3,  # joy -> happiness (folder 4)
            3: 3,  # love -> happiness (folder 4) - could also be sadness
            4: 4,  # sadness -> sadness (folder 5)
            5: 0   # surprise -> surprise (folder 1)
        }
        
        texts = []
        for i in range(len(self.data)):
            _, original_label = self.data[i]
            
            # Find matching HF emotion texts
            matching_hf_label = None
            for hf_label, raf_label in hf_to_raf_mapping.items():
                if raf_label == original_label:
                    matching_hf_label = hf_label
                    break
            
            if matching_hf_label is not None:
                matching_texts = df_emotion[df_emotion['label'] == matching_hf_label]['text'].tolist()
                if matching_texts:
                    texts.append(random.choice(matching_texts))
                else:
                    texts.append(f"person expressing {self.data.classes[original_label]} emotion")
            else:
                texts.append(f"person expressing {self.data.classes[original_label]} emotion")
        
        return texts
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        image, label = self.data[idx]
        text = self.text_metadata[idx]
        return image, label, text


# Architecture: ResNet-18 + GRU + Dropout + Linear
class MultimodalEmotionClassifier(nn.Module):
    def __init__(self, vocab_size=1000, num_classes=7, dropout_p=0.5):
        super().__init__()
        
        # Image Backbone: ResNet-18 (pretrained) → 512-D image feature
        self.image_backbone = torchvision.models.resnet18(pretrained=True)
        self.image_backbone = nn.Sequential(*list(self.image_backbone.children())[:-1])
        
        # Text Encoder: GRU (RNN) → 512-D text embedding
        self.text_embedding = nn.Embedding(vocab_size, 512, padding_idx=0)
        self.text_encoder = nn.GRU(input_size=512, hidden_size=512, 
                                 num_layers=1, batch_first=True, bidirectional=False)
        
        # Fusion: Concatenate [512-D image, 512-D text] → 1024-D
        # Dropout (p=0.5): randomly drops 50% of fused features during training
        self.dropout = nn.Dropout(p=dropout_p)
        
        # Head: Linear (1024 → 7), Softmax for probabilities
        self.classifier = nn.Linear(1024, num_classes)
        self.softmax = nn.Softmax(dim=1)
        
 #This is the fusion layer       
    def forward(self, images, text_tokens):
        # Image processing: ResNet-18 → 512-D
        image_features = self.image_backbone(images).view(images.size(0), -1)
        
        # Text processing: GRU → 512-D
        text_embedded = self.text_embedding(text_tokens)
        text_output, _ = self.text_encoder(text_embedded)
        text_features = text_output[:, -1, :]
        
        # Fusion: Concatenate [512-D image, 512-D text] → 1024-D
        fused_features = torch.cat([image_features, text_features], dim=1)
        
        # Dropout and classification
        fused_features = self.dropout(fused_features)
        logits = self.classifier(fused_features)
        probabilities = self.softmax(logits)
        
        return logits, probabilities




✓ Loaded text emotion dataset: 16000 samples


## Data Loading

In [51]:
# Data Loading & Model Setup
# IMG_DATASET_LOCATION has test and train subdirectories, each with dir 1-7 containing the emotion


dataset_train = MultimodalDataset(f"{IMG_DATASET_LOCATION}/train", transform)
dataset_test = MultimodalDataset(f"{IMG_DATASET_LOCATION}/test", transform)

# Split training data into train/validation (80/20 split)
train_size = int(0.8 * len(dataset_train))
val_size = len(dataset_train) - train_size
dataset_train, dataset_valid = random_split(dataset_train, [train_size, val_size])

# Build vocabulary from training texts
all_texts = [dataset_train[i][2] for i in range(len(dataset_train))]
vocab = build_vocab(all_texts, min_freq=1)
vocab_size = len(vocab)


def collate_fn(batch):
    images, labels, texts = zip(*batch)
    images = torch.stack(images)
    labels = torch.tensor(labels)
    text_tokens = torch.stack([tokenize_text(text, vocab) for text in texts])
    return images, labels, text_tokens

# Create model
model = MultimodalEmotionClassifier(vocab_size=vocab_size, num_classes=7)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)


class_weights = torch.FloatTensor([
    0.6571550965309143,  # Class 0: Surprise
    3.0168328285217285,  # Class 1: Fear (highest weight - rarest class)
    1.1823291778564453,  # Class 2: Disgust
    0.1776466965675354,  # Class 3: Happiness (lowest weight - most common)
    0.427714467048645,   # Class 4: Sadness
    1.2024539709091187,  # Class 5: Anger (higher weight - helps with your issue)
    0.335867702960968    # Class 6: Neutral
])

class_weights = class_weights.to(device)


# Training Setup
batch_size = 128 
num_epochs = 5
learning_rate = 0.003
optimizer = optim.Adam(model.parameters(), lr = learning_rate)
criterion = nn.CrossEntropyLoss(weight=class_weights)


loader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
loader_valid = DataLoader(dataset_valid, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
loader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

print(f"Dataset sizes: Train={len(dataset_train)}, Valid={len(dataset_valid)}, Test={len(dataset_test)}")
print(f"Vocabulary size: {vocab_size}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters())}")
print(f"Using device: {device}")

print(f"Training Configuration:")
print(f"  Batch Size: {batch_size}")
print(f"  Learning Rate: {learning_rate}")
print(f"  Epochs: {num_epochs}")
print(f"  GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
print()

training_losses = []
validation_losses = []




Dataset sizes: Train=9816, Valid=2455, Test=3068
Vocabulary size: 6796
Model parameters: 16239175
Using device: cuda:0
Training Configuration:
  Batch Size: 128
  Learning Rate: 0.003
  Epochs: 5
  GPU: NVIDIA GeForce RTX 5060 Ti



## Training

In [52]:
# Training Loop
for epoch in range(num_epochs):
    # Training Phase
    model.train()
    running_loss = 0.0
    
    for images, labels, text_tokens in tqdm(loader_train, desc=f'Epoch {epoch+1}/{num_epochs} - Training'):
        images = images.to(device)
        labels = labels.to(device)
        text_tokens = text_tokens.to(device)
        
        optimizer.zero_grad()
        logits, probabilities = model(images, text_tokens)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * labels.size(0)
    
    train_loss = running_loss / len(loader_train.dataset)
    training_losses.append(train_loss)
    
    # Validation Phase
    model.eval()
    running_loss = 0.0
    
    with torch.no_grad():
        for images, labels, text_tokens in tqdm(loader_valid, desc=f'Epoch {epoch+1}/{num_epochs} - Validation'):
            images = images.to(device)
            labels = labels.to(device)
            text_tokens = text_tokens.to(device)
            
            logits, probabilities = model(images, text_tokens)
            loss = criterion(logits, labels)
            running_loss += loss.item() * labels.size(0)
    
    valid_loss = running_loss / len(loader_valid.dataset)
    validation_losses.append(valid_loss)
    
    print(f"Epoch {epoch+1}/{num_epochs} - Train loss: {train_loss:.4f}, Validation loss: {valid_loss:.4f}")

print("Training completed!")


Epoch 1/5 - Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch 1/5 - Validation:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 1/5 - Train loss: 1.4679, Validation loss: 5.1550


Epoch 2/5 - Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch 2/5 - Validation:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 2/5 - Train loss: 0.7794, Validation loss: 0.8147


Epoch 3/5 - Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch 3/5 - Validation:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 3/5 - Train loss: 0.4071, Validation loss: 0.7468


Epoch 4/5 - Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch 4/5 - Validation:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 4/5 - Train loss: 0.2982, Validation loss: 0.8570


Epoch 5/5 - Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch 5/5 - Validation:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 5/5 - Train loss: 0.2510, Validation loss: 0.8976
Training completed!


## Testing

In [53]:
# Test Model
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels, text_tokens in tqdm(loader_test, desc='Testing'):
        images = images.to(device)
        labels = labels.to(device)
        text_tokens = text_tokens.to(device)
        
        logits, probabilities = model(images, text_tokens)
        _, predicted = torch.max(logits.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Test Accuracy: {accuracy:.2f}%')


Testing:   0%|          | 0/24 [00:00<?, ?it/s]

Test Accuracy: 87.45%


In [58]:
# Test Single Image Prediction

image_path = "imgtest1.jpg"

def predict_single_image(model, image_path, transform, vocab, device, label_dict):
    """
    Predict emotion for a single image
    """
    # Load and preprocess image
    image = Image.open(image_path).convert("RGB")
    image_tensor = transform(image).unsqueeze(0).to(device)
    
    # Generate text for the image
    text = "person showing emotional expression"
    text_tokens = tokenize_text(text, vocab).unsqueeze(0).to(device)
    
    # Make prediction
    model.eval()
    with torch.no_grad():
        logits, probabilities = model(image_tensor, text_tokens)
        confidence, predicted_class = torch.max(probabilities, 1)
        
        predicted_class = predicted_class.cpu().item()
        confidence = confidence.cpu().item()
        probabilities = probabilities.cpu().numpy()[0]
    
    return predicted_class, confidence, probabilities

def test_image_prediction(image_path):
    """
    Test prediction on a single image
    """
    if not os.path.exists(image_path):
        print(f"❌ Image not found: {image_path}")
        return
    
    print(f"🔍 Testing image: {image_path}")
    
    # Make prediction
    predicted_class, confidence, probabilities = predict_single_image(
        model, image_path, transform, vocab, device, label_dict
    )
    
    # Show results
    print(f"Predicted Emotion: {label_dict[predicted_class]}")
    print(f"Confidence: {confidence:.3f}")
    print("All Class Probabilities:")
    for i, (emotion, prob) in enumerate(zip(list(label_dict.values()), probabilities)):
        print(f"  {emotion}: {prob:.3f}")
    
    return predicted_class, confidence, probabilities

print("✅ Image prediction functions ready!")
print("Usage:")
print(test_image_prediction(image_path))

✅ Image prediction functions ready!
Usage:
🔍 Testing image: imgtest1.jpg
Predicted Emotion: Neutral
Confidence: 0.672
All Class Probabilities:
  Surprise: 0.000
  Fear: 0.000
  Disgust: 0.326
  Happiness: 0.000
  Sadness: 0.001
  Anger: 0.000
  Neutral: 0.672
(6, 0.6717141270637512, array([1.7338122e-04, 1.7356584e-04, 3.2637322e-01, 3.6848392e-04,
       8.0341188e-04, 3.9380474e-04, 6.7171413e-01], dtype=float32))
