# Architecture B

<!-- MODIFIED: Updated from Architecture A to Architecture B with Transformer encoder -->

**Architecture Specifications:**
- **Image Input**: 100×100 grayscale → 224×224 RGB (ResNet-18 compatible)
- **Image Backbone**: ResNet-18 (pretrained) → 512-D image feature
- **Text Input**: Short text metadata (tokenized with subword units, e.g., BPE)
- **Text Encoder**: Transformer encoder (2–4 layers, 4–8 heads) → 512-D text embedding
- **Fusion**: Concatenate [512-D image, 512-D text] → 1024-D
- **Dropout**: p=0.3 (randomly drops ~30% of fused features during training)
- **Head**: Linear (1024 → 7), Softmax for probabilities
- **Loss**: Cross-Entropy

## Specific Imports
Imports used for the model-specific tasks in this notebook.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
import timm

import pandas as pd
import numpy as np
import random

import re
from datasets import load_dataset

import sys
import os
from tqdm.notebook import tqdm

import kaggle
import kagglehub
from kagglehub import KaggleDatasetAdapter
import time
import math

# Choose the compute device once; later cells move the models and batches onto it.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Report what was found: GPU details when CUDA is usable, a CPU notice otherwise.
if not torch.cuda.is_available():
    print("CUDA is not available, using CPU")
else:
    print("CUDA is available!")
    print(f"GPU count: {torch.cuda.device_count()}")
    print(f"Current GPU: {torch.cuda.current_device()}")
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1024**3 and shown with a GB suffix.
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")


Using device: cuda:0
CUDA is available!
GPU count: 1
Current GPU: 0
GPU name: NVIDIA GeForce RTX 5060 Ti
GPU memory: 15.5 GB


In [None]:
# ADDED: BPE (Byte Pair Encoding) Tokenization for Architecture B
def simple_tokenize(text):
    """Split *text* into lower-case, letters-only word tokens.

    The text is lower-cased, every character that is neither an ASCII letter
    nor whitespace is deleted (not replaced by a space), and what remains is
    split on runs of whitespace.
    """
    lowered = text.lower()
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', lowered)
    return letters_and_spaces.split()

def build_vocab(text_in, min_freq=1):
    """Build a token -> id vocabulary of whole words plus short substrings.

    This is a simplified, BPE-like scheme: no merges are learned. Every word
    that occurs at least ``min_freq`` times is added, together with each of
    its substrings of length 1 to 3.

    Args:
        text_in: Iterable of raw text strings; each is split with
            ``simple_tokenize``.
        min_freq: Minimum number of occurrences a word needs to be included.

    Returns:
        dict mapping token string to a unique integer id. Ids 0-3 are the
        special tokens ``<PAD>``, ``<UNK>``, ``<BOS>`` and ``<EOS>``.
    """
    # Function-scope import: the notebook's import cell does not provide Counter.
    from collections import Counter

    word_counts = Counter()
    for text in text_in:
        word_counts.update(simple_tokenize(text))

    vocab = {'<PAD>': 0, '<UNK>': 1, '<BOS>': 2, '<EOS>': 3}

    for word, count in word_counts.items():
        if count < min_freq:
            continue
        # setdefault keeps the id of a token that is already present. A plain
        # assignment here would move a word that was registered earlier as a
        # substring of another word, and the next new token would then be
        # given the same id.
        vocab.setdefault(word, len(vocab))
        # Substrings of 1 to 3 characters starting at every position.
        for start in range(len(word)):
            for end in range(start + 1, min(start + 4, len(word) + 1)):
                vocab.setdefault(word[start:end], len(vocab))

    return vocab

def tokenize_text(text, vocab, max_length=15):
    """Convert *text* into a fixed-length tensor of vocabulary ids.

    A word that is present in *vocab* maps to its own id. A word that is
    missing is segmented left to right, taking at each position the longest
    piece (3, then 2, then 1 characters) found in *vocab*; a stretch that no
    piece covers contributes a single ``<UNK>`` id. The id sequence is then
    truncated or right-padded with ``<PAD>`` to ``max_length``.

    Args:
        text: Raw input string; it is split with ``simple_tokenize``.
        vocab: Mapping from token string to integer id. Must contain the
            ``'<PAD>'`` and ``'<UNK>'`` keys.
        max_length: Length of the returned sequence.

    Returns:
        1-D ``torch.long`` tensor holding ``max_length`` ids.
    """
    unk_id = vocab['<UNK>']
    pad_id = vocab['<PAD>']
    # The subword build_vocab in this cell registers substrings of at most
    # 3 characters, so longer pieces can never match.
    max_piece = 3

    token_ids = []
    for token in simple_tokenize(text):
        if token in vocab:
            token_ids.append(vocab[token])
            continue

        # Out-of-vocabulary word: cover all of it with known pieces instead of
        # keeping only the first matching fragment.
        pos = 0
        in_unknown_run = False
        while pos < len(token):
            for size in range(min(max_piece, len(token) - pos), 0, -1):
                piece = token[pos:pos + size]
                if piece in vocab:
                    token_ids.append(vocab[piece])
                    pos += size
                    in_unknown_run = False
                    break
            else:
                # No piece starts here; emit one <UNK> per uncovered stretch.
                if not in_unknown_run:
                    token_ids.append(unk_id)
                    in_unknown_run = True
                pos += 1

    # Truncate or pad to max_length
    if len(token_ids) > max_length:
        token_ids = token_ids[:max_length]
    else:
        token_ids.extend([pad_id] * (max_length - len(token_ids)))

    return torch.tensor(token_ids, dtype=torch.long)


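# Text Processing Functions

Word-level baseline tokenizer. Note: this cell reuses the names `simple_tokenize`, `build_vocab`, and `tokenize_text`, so running it after the subword cell above replaces those definitions.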

In [3]:
def simple_tokenize(text):
    """Return the lower-cased words of *text* with non-letter characters removed."""
    # Delete everything except ASCII letters and whitespace, then split on whitespace.
    cleaned = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return cleaned.split()

def build_vocab(text_in, min_freq=1):
    """Build a word-level token -> id vocabulary.

    Args:
        text_in: Iterable of raw text strings; each is split with
            ``simple_tokenize``.
        min_freq: Minimum number of occurrences a word needs to be included.

    Returns:
        dict mapping each kept word to an integer id. Id 0 is ``<PAD>`` and
        id 1 is ``<UNK>``; words follow in first-seen order.
    """
    # Function-scope import: the notebook's import cell does not provide Counter.
    from collections import Counter

    word_counts = Counter()
    for text in text_in:
        word_counts.update(simple_tokenize(text))

    vocab = {'<PAD>': 0, '<UNK>': 1}

    for word, count in word_counts.items():
        if count >= min_freq:
            vocab[word] = len(vocab)

    return vocab

def tokenize_text(text, vocab, max_length=10):
    """Map *text* to a ``torch.long`` tensor of word ids, cut or padded to ``max_length``.

    Words missing from *vocab* become ``vocab['<UNK>']``; short sequences are
    right-padded with ``vocab['<PAD>']``.
    """
    ids = [vocab.get(word, vocab['<UNK>']) for word in simple_tokenize(text)]

    shortfall = max_length - len(ids)
    if shortfall < 0:
        # Too long: keep the leading max_length ids.
        ids = ids[:max_length]
    else:
        # Too short (or exact): append as many pad ids as are missing.
        ids.extend([vocab['<PAD>']] * shortfall)

    return torch.tensor(ids, dtype=torch.long)
    

# Dataset Setup

In [4]:
#Extremely simple data set class.
class OurDataSetA(Dataset):
    """Dataset that forwards every operation to a wrapped ``ImageFolder``."""

    def __init__(self, data_directory, transform=None):
        # ImageFolder reads the directory; the transform is handed to it unchanged.
        self.data = ImageFolder(data_directory, transform=transform)

    @property
    def classes(self):
        """Class list exposed by the wrapped ImageFolder."""
        return self.data.classes

    def __getitem__(self, at_index):
        """Return the wrapped dataset's item at ``at_index``."""
        return self.data[at_index]

    def __len__(self):
        """Number of items in the wrapped dataset."""
        return len(self.data)
#END CLASS

# Fetch the image dataset through kagglehub; `path` is the local directory it reports.
path = kagglehub.dataset_download("dollyprajapati182/balanced-raf-db-dataset-7575-grayscale")

print("Path to dataset files:", path)


# One sub-directory per split.
str_data_dir_train = "/".join([path, "train"])
str_data_dir_valid = "/".join([path, "val"])
str_data_dir_test = "/".join([path, "test"])


# Transform A - for the balanced grey-scale RAF data set: tensor conversion only.
transform_a = transforms.Compose([transforms.ToTensor()])

# Transform B - for the RAF data set: resize, light augmentation, tensor conversion, normalization.
# NOTE(review): the flip and colour jitter are random; if this transform is ever selected
# below it would also be applied to the validation and test splits - confirm that is intended.
transform_b = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ColorJitter(brightness=0.1, contrast=0.1),
        transforms.ToTensor(),
        # Presumably the usual ImageNet channel statistics - confirm.
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Single switch that decides which transform every split uses.
transform_used = transform_a

# Build the three splits with the same dataset class and the selected transform.
dataset_train, dataset_valid, dataset_test = (
    OurDataSetA(split_dir, transform_used)
    for split_dir in (str_data_dir_train, str_data_dir_valid, str_data_dir_test)
)

# Label dictionary: class index -> emotion name.
# NOTE(review): assumes the dataset's class indices follow this order - confirm
# against dataset_train.classes.
label_dict = {
    0: "Angry",
    1: "Disgust",
    2: "Fear",
    3: "Happy",
    4: "Neutral",
    5: "Sad",
    6: "Surprise",
}

# Pick one random training sample and display it for demo purposes.
# randrange(len(...)) draws from 0..len-1, so the index is always valid and the
# first sample can be chosen; the previous hard-coded, inclusive upper bound
# depended on the dataset having an exact size.
var_rand = random.randrange(len(dataset_train))
image, label = dataset_train[var_rand]
print("index used: ", var_rand)
print(label_dict[label])
image

Path to dataset files: /home/amherscher/.cache/kagglehub/datasets/dollyprajapati182/balanced-raf-db-dataset-7575-grayscale/versions/1
index used:  9309
Fear


tensor([[[0.1255, 0.1294, 0.2039,  ..., 0.0275, 0.0275, 0.0275],
         [0.1020, 0.1255, 0.1725,  ..., 0.0275, 0.0275, 0.0275],
         [0.0980, 0.1451, 0.1686,  ..., 0.0275, 0.0275, 0.0275],
         ...,
         [0.1569, 0.0471, 0.1255,  ..., 0.0627, 0.0314, 0.0275],
         [0.1647, 0.0667, 0.0980,  ..., 0.0275, 0.0275, 0.0275],
         [0.1647, 0.0902, 0.0784,  ..., 0.0275, 0.0275, 0.0353]],

        [[0.1255, 0.1294, 0.2039,  ..., 0.0275, 0.0275, 0.0275],
         [0.1020, 0.1255, 0.1725,  ..., 0.0275, 0.0275, 0.0275],
         [0.0980, 0.1451, 0.1686,  ..., 0.0275, 0.0275, 0.0275],
         ...,
         [0.1569, 0.0471, 0.1255,  ..., 0.0627, 0.0314, 0.0275],
         [0.1647, 0.0667, 0.0980,  ..., 0.0275, 0.0275, 0.0275],
         [0.1647, 0.0902, 0.0784,  ..., 0.0275, 0.0275, 0.0353]],

        [[0.1255, 0.1294, 0.2039,  ..., 0.0275, 0.0275, 0.0275],
         [0.1020, 0.1255, 0.1725,  ..., 0.0275, 0.0275, 0.0275],
         [0.0980, 0.1451, 0.1686,  ..., 0.0275, 0.0275, 0.

## Multi Modal Dataset

In [5]:
class OurMultiModalDataSet(Dataset):
    """Pairs each ``ImageFolder`` sample with the text entry at the same index.

    Args:
        data_directory: Root directory handed to ``ImageFolder``.
        text_data: Indexable, sized collection of text samples.
        transform: Optional transform forwarded to ``ImageFolder``.
    """

    def __init__(self, data_directory, text_data, transform=None):
        self.data_image = ImageFolder(data_directory, transform=transform)
        self.data_text = text_data

    def __len__(self):
        # __getitem__ uses one index for both sources, so only positions that
        # exist in both of them are valid.
        return min(len(self.data_image), len(self.data_text))

    def __getitem__(self, at_index):
        """Return ``(image_sample, text_sample)`` for ``at_index``."""
        return self.data_image[at_index], self.data_text[at_index]

    @property
    def classes(self):
        """Class list exposed by the wrapped ImageFolder."""
        return self.data_image.classes
#END CLASS

# Data Loaders
Starting with images.

In [6]:
# The stock DataLoader class is sufficient here, so no custom loader is defined.
batch_size = 128

# Only the training split is reshuffled; validation and test keep dataset order.
loader_train, loader_valid, loader_test = (
    DataLoader(split, batch_size=batch_size, shuffle=reshuffle)
    for split, reshuffle in (
        (dataset_train, True),
        (dataset_valid, False),
        (dataset_test, False),
    )
)

# Text data loading

## Translation Dictionary
The text dataset uses different class indices than the image dataset, so we create a dictionary that translates text labels into image labels.

In [7]:
# Text class number -> image class number.
# Look up a text-dataset label to get the label the image dataset uses for it.
# The position of each entry below is the text class number.
translation_dictionary = dict(
    enumerate(
        (
            5,  # 0: Sadness  -> Sad
            3,  # 1: Joy      -> Happy
            3,  # 2: Love     -> Happy
            0,  # 3: Anger    -> Angry
            2,  # 4: Fear     -> Fear
            6,  # 5: Surprise -> Surprise
        )
    )
)

## Reading and Splitting
Download the text dataset, load its CSV into a DataFrame, and remap its labels to the image label indices.

In [8]:
str_text_data_dir = "bhavikjikadara/emotions-dataset"

# Download the dataset first
dataset_path = kagglehub.dataset_download(str_text_data_dir)
print("Dataset downloaded to:", dataset_path)

# Load the CSV file from the downloaded dataset.
# Sorting makes the choice deterministic when several CSVs are present
# (os.listdir returns names in arbitrary order).
csv_files = sorted(f for f in os.listdir(dataset_path) if f.endswith('.csv'))
if not csv_files:
    # Fail here with a clear message instead of a NameError on complete_csv below.
    raise FileNotFoundError(f"No .csv file found in {dataset_path}")
csv_path = os.path.join(dataset_path, csv_files[0])
complete_csv = pd.read_csv(csv_path)

# Show the data-frame that was read
print("Preview of the CSV contents:")
print(complete_csv)
print("-- -- -- -- -- -- --")

# Fix the class labeling mismatch: rewrite text labels as image labels
complete_csv['label'] = complete_csv['label'].replace(translation_dictionary)
print("Preview of altered CSV contents:")
print(complete_csv)
print("-- -- -- -- -- -- --")

#Split CSV into segments for Testing, Training, and Validation.


Dataset downloaded to: /home/amherscher/.cache/kagglehub/datasets/bhavikjikadara/emotions-dataset/versions/1
Preview of the CSV contents:
                                                     text  label
0           i just feel really helpless and heavy hearted      4
1       ive enjoyed being able to slouch about relax a...      0
2       i gave up my internship with the dmrg and am f...      4
3                              i dont know i feel so lost      0
4       i am a kindergarten teacher and i am thoroughl...      4
...                                                   ...    ...
416804  i feel like telling these horny devils to find...      2
416805  i began to realize that when i was feeling agi...      3
416806  i feel very curious be why previous early dawn...      5
416807  i feel that becuase of the tyranical nature of...      3
416808  i think that after i had spent some time inves...      5

[416809 rows x 2 columns]
-- -- -- -- -- -- --
Preview of altered CSV contents:
 

# Classifier
The models themselves: a Transformer text encoder and an image-only emotion classifier.

In [9]:
# Transformer Encoder for Text Processing (Architecture B)
class TransformerEncoder(nn.Module):
    """Transformer text encoder that turns token ids into one 512-D vector per sequence.

    Token ids are embedded, combined with fixed sinusoidal positional
    encodings, passed through a stack of ``nn.TransformerEncoderLayer``
    blocks, mean-pooled over the non-padding positions and projected to 512
    dimensions. Token id 0 is treated as padding throughout.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding / model width.
        nhead: Number of attention heads per layer.
        num_layers: Number of encoder layers.
        dropout: Dropout probability inside each encoder layer.
        max_len: Longest sequence the positional-encoding table supports.
    """

    def __init__(self, vocab_size, d_model=512, nhead=8, num_layers=3, dropout=0.1, max_len=100):
        super().__init__()
        self.d_model = d_model

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        # Registered as a buffer so it follows the module across .to(device)
        # calls instead of being copied on every forward pass. It is
        # non-persistent so that state_dict keys stay as they were.
        self.register_buffer(
            "pos_encoding",
            self._create_positional_encoding(d_model, max_len),
            persistent=False,
        )

        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=2048,
            dropout=dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Output projection to 512-D
        self.output_proj = nn.Linear(d_model, 512)

    def _create_positional_encoding(self, d_model, max_len=100):
        """Return a ``(1, max_len, d_model)`` table of sinusoidal position encodings."""
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # even feature indices
        pe[:, 1::2] = torch.cos(position * div_term)  # odd feature indices
        return pe.unsqueeze(0)

    def forward(self, text_tokens):
        """Encode a ``(batch, seq_len)`` tensor of token ids into ``(batch, 512)`` features.

        Raises:
            ValueError: If ``seq_len`` exceeds the positional-encoding table.
        """
        seq_len = text_tokens.size(1)
        if seq_len > self.pos_encoding.size(1):
            raise ValueError(
                f"Sequence length {seq_len} exceeds the positional encoding "
                f"table size {self.pos_encoding.size(1)}"
            )

        # Embedding (scaled by sqrt(d_model)) + positional encoding
        embedded = self.embedding(text_tokens) * math.sqrt(self.d_model)
        embedded = embedded + self.pos_encoding[:, :seq_len, :]

        # True marks padding positions, which attention must ignore
        is_padding = text_tokens == 0

        # Transformer encoding
        transformer_output = self.transformer(embedded, src_key_padding_mask=is_padding)

        # Mean over the non-padding positions. Padding positions are zeroed
        # before summing and the count is clamped to at least 1, so a row made
        # only of padding pools to a zero vector instead of dividing 0 by 0.
        summed = transformer_output.masked_fill(is_padding.unsqueeze(-1), 0.0).sum(dim=1)
        counts = (~is_padding).sum(dim=1, keepdim=True).clamp(min=1)
        text_features = summed / counts.to(summed.dtype)

        # Project to 512-D
        text_features = self.output_proj(text_features)

        return text_features


In [10]:
class EmotionClassifier(nn.Module):
    """Image-only emotion classifier: ResNet-18 feature extractor plus a linear head.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=7):
        super().__init__()
        # Use torchvision instead of timm.
        # `weights=` replaces the deprecated `pretrained=True`; IMAGENET1K_V1 is
        # the checkpoint that flag selected.
        self.base_model = torchvision.models.resnet18(
            weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1
        )
        # Every child module except the last one. These are the same module
        # objects as in base_model, so the parameters are shared; base_model's
        # last child stays registered but is not used in forward.
        self.features = nn.Sequential(*list(self.base_model.children())[:-1])

        enet_out_size = 512  # input width of the linear head

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(enet_out_size, num_classes)
        )

    def forward(self, x):
        """Return ``num_classes`` logits for a batch of images ``x``."""
        x = self.features(x)
        output = self.classifier(x)
        return output
#END CLASS

# Build the image-only classifier (model_one) and place it on the selected device.
model_one = EmotionClassifier(num_classes=7).to(device)
print(f"MultiModal model moved to {device}")

# Show only the first 300 characters of the layout; the full text is long.
print(f"{model_one}"[:300])
        

MultiModal model moved to cuda:0
EmotionClassifier(
  (base_model): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, strid




## MultiModal Classifier

In [11]:
# Architecture B: MultiModal Emotion Classifier with Transformer
class MultiModalEmotionClassifierB(nn.Module):
    """Architecture B: ResNet-18 image features fused with Transformer text features.

    The 512-D image feature and the 512-D text feature are concatenated,
    passed through dropout and classified by one linear layer.

    Args:
        num_classes: Number of output logits.
        vocab_size: Vocabulary size handed to the text encoder's embedding.
        dropout_p: Dropout probability applied to the fused features.
    """

    def __init__(self, num_classes=7, vocab_size=1000, dropout_p=0.3):
        super().__init__()
        enet_out_size = 512  # width of each modality's feature vector

        # Image model (ResNet-18). `weights=` replaces the deprecated
        # `pretrained=True`; IMAGENET1K_V1 is the checkpoint that flag selected.
        self.base_image_model = torchvision.models.resnet18(
            weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1
        )
        # Every child module except the last one (shared with base_image_model).
        self.features = nn.Sequential(*list(self.base_image_model.children())[:-1])

        # Text model (Transformer) - Architecture B
        self.text_encoder = TransformerEncoder(vocab_size=vocab_size, d_model=512, nhead=8, num_layers=3)

        # Dropout on the fused features
        self.dropout = nn.Dropout(p=dropout_p)

        # Head over the concatenated features (512 image + 512 text = 1024)
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(2 * enet_out_size, num_classes)
        )

    def forward(self, images, text_tokens):
        """Return ``num_classes`` logits for a batch of images and their token ids."""
        # Image processing: flatten everything after the batch dimension
        image_features = torch.flatten(self.features(images), 1)

        # Text processing with the Transformer
        text_features = self.text_encoder(text_tokens)

        # Fusion: concatenate [512-D image, 512-D text] -> 1024-D
        fused_features = torch.cat([image_features, text_features], dim=1)

        # Dropout: randomly zero a fraction p of the fused features while training
        fused_features = self.dropout(fused_features)

        # Classify
        output = self.classifier(fused_features)
        return output
#END CLASS

# Build the Architecture B model and place it on the selected device.
# NOTE(review): vocab_size is left at the class default here - confirm it matches
# the vocabulary that will actually be used.
model_multi_b = MultiModalEmotionClassifierB(num_classes=7).to(device)

# Show only the first 500 characters of the layout; the full text is long.
print("Architecture B - Transformer-based Multimodal Model:")
print(f"{model_multi_b}"[:500])


Architecture B - Transformer-based Multimodal Model:
MultiModalEmotionClassifierB(
  (base_image_model): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=F


In [12]:
class MultiModalEmotionClassifier(nn.Module):
    """GRU-based multimodal classifier: ResNet-18 image features fused with GRU text features.

    Args:
        num_classes: Number of output logits.
        vocab_size: Number of rows in the text embedding table.
        dropout_p: Dropout probability applied to the fused features.
    """

    def __init__(self, num_classes=7, vocab_size=1000, dropout_p=0.5):
        super().__init__()
        enet_out_size = 512  # width of each modality's feature vector

        # Image model (ResNet-18). `weights=` replaces the deprecated
        # `pretrained=True`; IMAGENET1K_V1 is the checkpoint that flag selected.
        self.base_image_model = torchvision.models.resnet18(
            weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1
        )
        # Every child module except the last one (shared with base_image_model).
        self.features = nn.Sequential(*list(self.base_image_model.children())[:-1])

        # Text model (embedding + GRU)
        self.base_text_model = nn.Embedding(vocab_size, 512, padding_idx=0)
        self.text_Encoder = nn.GRU(512, 512, num_layers=1, batch_first=True, bidirectional=False)

        # Dropout on the fused features
        self.dropout = nn.Dropout(p=dropout_p)

        # The head receives the image and text features concatenated, so its
        # input width is twice the per-modality width.
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(2 * enet_out_size, num_classes)
        )

    def forward(self, images, text):
        """Return ``num_classes`` logits for a batch of images and their token ids."""
        # Image processing: flatten everything after the batch dimension so the
        # result can be concatenated with the 2-D text features
        image_features = torch.flatten(self.features(images), 1)

        # Text processing
        text_embedded = self.base_text_model(text)
        text_output, _ = self.text_Encoder(text_embedded)
        # GRU output at the final sequence position.
        # NOTE(review): with right-padded input this position may be padding - confirm.
        text_features = text_output[:, -1, :]

        # Fusion
        fused_features = torch.cat([image_features, text_features], dim=1)

        # Dropout: randomly zero a fraction p of the fused features while training
        fused_features = self.dropout(fused_features)

        # Classify
        output = self.classifier(fused_features)
        return output
#END CLASS

# Build the multimodal model (hence the name model_multi) and place it on the selected device.
model_multi = MultiModalEmotionClassifier(num_classes=7).to(device)

# Show only the first 300 characters of the layout; the full text is long.
print(f"{model_multi}"[:300])

MultiModalEmotionClassifier(
  (base_image_model): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kern


# Training
Cue "Eye of the Tiger".

In [13]:
# Configuration for the training loop over the image-only model (model_one).

# How many passes over the training data
number_of_epochs = 5

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model_one.parameters(), lr=1e-3)

# Per-epoch loss history, filled in by the loop
training_losses, validation_losses = [], []

model_one.to(device)
# A final print keeps the notebook from echoing the whole model layout as the cell result.
print('--')


--


In [None]:
# The actual loop: train model_one for number_of_epochs, validating after each epoch.
print(f"Starting training for {number_of_epochs} epochs...")
print(f"Training batches: {len(loader_train)}")
print(f"Validation batches: {len(loader_valid)}")

# Start total timer
total_start_time = time.time()

for epoch in range(number_of_epochs):
    # Start epoch timer
    epoch_start_time = time.time()
    print(f"\n=== EPOCH {epoch+1}/{number_of_epochs} ===")

    # Training phase
    model_one.train()  # Signal to the model that we're training
    running_loss = 0.0  # Sum of per-sample losses for this phase
    for images, labels in tqdm(loader_train, desc=f'Epoch {epoch+1}/{number_of_epochs} - Training'):
        # Move images and labels to the device
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model_one(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Weight the batch loss by batch size so the epoch figure is a per-sample mean
        running_loss += loss.item() * labels.size(0)

    # Loss tracking
    train_loss = running_loss / len(loader_train.dataset)
    training_losses.append(train_loss)

    # Validation phase
    model_one.eval()  # Signal to the model that we're not training
    running_loss = 0.0
    with torch.no_grad():
        for images, labels in tqdm(loader_valid, desc=f'Epoch {epoch+1}/{number_of_epochs} - Validation'):
            # Move inputs and labels to the device
            images = images.to(device)
            labels = labels.to(device)

            outputs = model_one(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item() * labels.size(0)

    # Loss tracking
    valid_loss = running_loss / len(loader_valid.dataset)
    validation_losses.append(valid_loss)

    # Calculate epoch time
    epoch_time = time.time() - epoch_start_time
    total_time = time.time() - total_start_time

    print(f"Epoch {epoch+1}/{number_of_epochs} - Train loss: {train_loss:.4f}, Validation loss: {valid_loss:.4f}")
    print(f"Epoch time: {epoch_time:.2f}s | Total time: {total_time:.2f}s")

    # Epoch progress bar (previously built but never shown)
    progress = (epoch + 1) / number_of_epochs
    bar_length = 30
    filled_length = int(bar_length * progress)
    bar = '█' * filled_length + '-' * (bar_length - filled_length)
    print(f"Progress: |{bar}| {progress:.0%}")


# Final timing
total_training_time = time.time() - total_start_time
print("\n🎉 Training completed!")
print(f"Total training time: {total_training_time:.2f}s ({total_training_time/60:.1f} minutes)")
# max(..., 1) keeps the average defined when the loop ran for zero epochs
print(f"Average time per epoch: {total_training_time/max(number_of_epochs, 1):.2f}s")


# Notes

In [None]:
# KAGGLE setup instructions:
# 1. Get Kaggle API key from https://www.kaggle.com/account
#    Go to https://www.kaggle.com/account
#    Click "Create New API Token"
#    Download kaggle.json
# 2. Place kaggle.json in ~/.kaggle/ directory
# 3. Run this notebook - datasets will download automatically


#For Model B:
    #Changed text encoder from GRU to Transformer
    #Updated dropout from p=0.5 to p=0.3
    #Updated text input to mention BPE tokenization
    #Added comment: # ADDED: BPE (Byte Pair Encoding) Tokenization for Architecture B
    #Complete Transformer encoder implementation
    #3 layers, 8 attention heads, 512-D model
    #Positional encoding and attention masking


