
# Model A : Multimodal Emotion Recognition (ResNet18 + GRU)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models


# Architecture Overview

### Image Backbone : ResNet-18 (pretrained) → 512-D image feature
### Text Encoder   : GRU (Embedding + GRU)  → 512-D text embedding
### Fusion         : Concatenate [512 + 512] → 1024-D
### Dropout        : p = 0.5
### Head           : Linear(1024 → 7) + Softmax
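### Loss           : CrossEntropy (note: PyTorch's `nn.CrossEntropyLoss` applies log-softmax internally, so it must be fed the pre-Softmax logits, not the Softmax probabilities)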


In [None]:
class ModelA_MultimodalEmotionNet(nn.Module):
    """Two-branch image + text classifier (ResNet-18 image path, GRU text path).

    The image branch runs a pretrained ResNet-18 with its final classification
    layer removed and projects the result to ``hidden_dim``. The text branch
    embeds token ids and keeps the final GRU hidden state. Both features are
    concatenated, passed through dropout (p=0.5) and a linear head.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the image projection and of the GRU hidden state.
        num_classes: Number of output classes.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=512, num_classes=7):
        super().__init__()

        # --- Image path ---
        # Newer torchvision releases deprecate `pretrained=True` in favour of
        # `weights=`; fall back to the legacy flag when the enum is missing.
        weights_enum = getattr(models, "ResNet18_Weights", None)
        if weights_enum is not None:
            base_resnet = models.resnet18(weights=weights_enum.IMAGENET1K_V1)
        else:
            base_resnet = models.resnet18(pretrained=True)
        # Width of the features feeding the removed classifier (512 for ResNet-18).
        backbone_dim = base_resnet.fc.in_features
        # Remove the final classification layer, keep everything before it.
        self.image_encoder = nn.Sequential(*list(base_resnet.children())[:-1])
        self.image_fc = nn.Linear(backbone_dim, hidden_dim)

        # --- Text path ---
        # Index 0 is reserved for padding tokens.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True)

        # --- Fusion + classification ---
        self.dropout = nn.Dropout(p=0.5)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, images, text, return_logits=False):
        """Classify a batch of (image, text) pairs.

        Args:
            images: Image batch; the notebook example passes a float tensor of
                shape (batch, 3, 224, 224).
            text: Token-id batch; the notebook example passes an integer
                tensor of shape (batch, seq_len).
            return_logits: When True, return the raw scores of the linear head
                instead of probabilities. Use this when the output feeds
                ``nn.CrossEntropyLoss``, which applies log-softmax itself.

        Returns:
            Tensor of shape (batch, num_classes): softmax probabilities by
            default, or raw logits when ``return_logits`` is True.
        """
        # Image pipeline: flatten everything after the batch dimension.
        img_feat = torch.flatten(self.image_encoder(images), start_dim=1)
        img_feat = self.image_fc(img_feat)

        # Text pipeline: keep only the final hidden state of the GRU.
        embedded = self.embedding(text)
        _, hidden = self.gru(embedded)
        # hidden is (num_layers, batch, hidden_dim); take the last layer.
        text_feat = hidden[-1]

        # Fusion
        fused = torch.cat((img_feat, text_feat), dim=1)
        fused = self.dropout(fused)

        # Classification
        logits = self.fc(fused)
        if return_logits:
            return logits
        return self.softmax(logits)



# Example Initialization



In [None]:
if __name__ == "__main__":
    # Smoke test: push one random batch through the model and print the
    # shape of the result.
    vocab_size = 10000
    batch_size, seq_len = 8, 20

    modelA = ModelA_MultimodalEmotionNet(vocab_size=vocab_size, num_classes=7)
    sample_image = torch.randn(batch_size, 3, 224, 224)   # batch of 8 images
    # randint's upper bound is exclusive, so ids cover 0 .. vocab_size - 1.
    sample_text = torch.randint(0, vocab_size, (batch_size, seq_len))  # batch of 8 text sequences

    # Run the demo as inference: dropout off and no autograd bookkeeping.
    modelA.eval()
    with torch.no_grad():
        output = modelA(sample_image, sample_text)
    # Put the model back in training mode, the state it had after construction.
    modelA.train()

    print("Output shape:", output.shape)  # expected [8, 7]