In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import pandas as pd

from transformers import BertTokenizer, VisualBertModel,  BertConfig, BertModel

In [2]:
import os
import numpy as np
import pandas as pd

# torch
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

# from visualbert
from processing_image import Preprocess
from utils import Config
from modeling_frcnn import GeneralizedRCNN

In [3]:
class ImageProcessor:
    def __init__(self, device='cuda'):
        frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
        frcnn_cfg.MODEL.DEVICE = device
        self.device = device

        self.frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg)

        self.frcnn_cfg = frcnn_cfg
        self.image_preprocess = Preprocess(frcnn_cfg)

    def get_visual_embeddings(self, image_path):
        # run frcnn
        images, sizes, scales_yx = self.image_preprocess(image_path)

        output_dict = self.frcnn(
            images,
            sizes,
            scales_yx=scales_yx,
            padding="max_detections",
            max_detections=self.frcnn_cfg.max_detections,
            return_tensors="pt",
        )
        features = output_dict.get("roi_features").detach().cpu()
        return features

## View Configuration of Teacher Model

In [4]:
class TeacherModel(nn.Module):
    def __init__(self, visualbert_model):
        super(TeacherModel, self).__init__()
        self.visualbert = visualbert_model
        self.fc = nn.Linear(self.visualbert.config.hidden_size, num_emotions)

    def forward(self, input_ids, token_type_ids, attention_mask, visual_embeds, visual_token_type_ids, visual_attention_mask, labels):
        visualbert_outputs = self.visualbert(input_ids=input_ids.squeeze(1),
                                             attention_mask=attention_mask.squeeze(1),
                                             token_type_ids=token_type_ids.squeeze(1),
                                             visual_embeds=visual_embeds.squeeze(1),
                                            visual_token_type_ids=visual_token_type_ids.squeeze(1),
                                            visual_attention_mask=visual_attention_mask.squeeze(1))
        pooled_output = visualbert_outputs['pooler_output']

        # Emotion prediction
        logits = self.fc(pooled_output) # Loss function operates from logits
        
        # probabilities = F.softmax(logits, dim=-1)
        return logits

In [5]:
visualbert_model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")

In [6]:
for param in visualbert_model.parameters():
    param.requires_grad = False

In [7]:
num_emotions = 29

In [8]:
# Load the trained teacher model
teacher_model = TeacherModel(visualbert_model)

# Load the state_dict of the model from the .pth file
teacher_model_path = 'trained_teacher_model.pth'
state_dict = torch.load(teacher_model_path)

# Load the state_dict into the model
teacher_model.load_state_dict(state_dict)

# Print the model architecture
print(teacher_model)

TeacherModel(
  (visualbert): VisualBertModel(
    (embeddings): VisualBertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (visual_token_type_embeddings): Embedding(2, 768)
      (visual_position_embeddings): Embedding(512, 768)
      (visual_projection): Linear(in_features=2048, out_features=768, bias=True)
    )
    (encoder): VisualBertEncoder(
      (layer): ModuleList(
        (0-11): 12 x VisualBertLayer(
          (attention): VisualBertAttention(
            (self): VisualBertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropo

## View Configuration of Student Model

In [9]:
class SimpleMultimodalModel(nn.Module):
    def __init__(self, num_classes, embedding_dim=128):
        super(SimpleMultimodalModel, self).__init__()

        # Text encoder: Embedding layer + GRU
        self.embedding = nn.Embedding(30522, embedding_dim)  # Using 30522 for BERT's Base uncased tokenizer
        self.gru = nn.GRU(embedding_dim, 256, batch_first=True)

        # Image encoder: Simplified CNN
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1)
        self.adaptive_pool = nn.AdaptiveAvgPool2d((1, 1))

        # Fusion layer and classifier
        self.fc1 = nn.Linear(64 + 256, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, input_ids, attention_mask, images):
        # Process text input through embedding and GRU
        x_text = self.embedding(input_ids)
        _, x_text = self.gru(x_text)
        x_text = x_text.squeeze(0)  # Remove sequence dimension

        # Process images through simplified CNN
        x_img = F.relu(self.conv1(images))
        x_img = F.relu(self.conv2(x_img))
        x_img = F.relu(self.conv3(x_img))
        x_img = self.adaptive_pool(x_img)
        x_img = torch.flatten(x_img, 1)  # Flatten all dimensions except batch

        # Fusion and classification
        combined_features = torch.cat((x_img, x_text), dim=1)
        fused_features = F.relu(self.fc1(combined_features))
        logits = self.fc2(fused_features)

        # Apply sigmoid to convert logits to probabilities
        probabilities = torch.sigmoid(logits)
        
        return probabilities

In [10]:
# Load the trained teacher model
student_model = SimpleMultimodalModel(num_emotions)

# Load the state_dict of the model from the .pth file
student_model_path = 'trained_student_model.pth'
state_dict = torch.load(student_model_path)

# Load the state_dict into the model
student_model.load_state_dict(state_dict)

# Print the model architecture
print(student_model)

SimpleMultimodalModel(
  (embedding): Embedding(30522, 128)
  (gru): GRU(128, 256, batch_first=True)
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (adaptive_pool): AdaptiveAvgPool2d(output_size=(1, 1))
  (fc1): Linear(in_features=320, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=29, bias=True)
)


# Transfer Learning Overview
Given our source dataset, COCO (A), and target dataset, Socratis (B), we will experiment with 2 finetuning approaches based on [Yosinski et al](https://proceedings.neurips.cc/paper_files/paper/2014/file/375c71349b295fbe2dcdca9206f20a06-Paper.pdf). In both cases, initialize the parameters of the teacher model (B) with the pre-trained weights of the student model (A).
- AnB: finetuning but keeping all the layers frozen
- AnB+: finetuning but not freezing the layers

However, we can see that the architectures and dimensions for models A and B are very different so here is how we account for that:
we initialize the last layer (fc) in model B with a modified version of the last two layers (fc1 and fc2) in model A. This involves adapting the dimensions of the weights to ensure compatibility between the models. In the AnB case, we would freeze all layers in model B except the last layer (fc), allowing it to adapt to the target task while keeping the rest of the model fixed. Conversely, in the AnB+ case, we would not freeze any layer in model B, enabling all layers, including the last layer (fc), to be fine-tuned on the target dataset. 

In [11]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import BertTokenizer
from transformers import BertModel
from PIL import Image
import requests
from io import BytesIO
import torch.optim as optim
import torch.nn as nn
import os
import pickle
from torchvision import models
from PIL import Image
import torch.nn.functional as F

## Load Datasets A and B

### Dataset A

In [12]:
df = pd.read_csv('coco_predictions.csv')

In [13]:
# DatasetA --> COCO
class DatasetA(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe
        self.transform = transform = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
            ])
        self.tokenizer = tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image_url = row['coco_url']
        caption = eval(row['captions'])[0]  # Evaluating the string to get the list and taking the first item

        # Image processing
        response = requests.get(image_url)
        image = Image.open(BytesIO(response.content)).convert('RGB')  # Convert image to RGB
        if self.transform:
            image = self.transform(image)

        # Text processing
        inputs = self.tokenizer(caption, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
        input_ids, attention_mask = inputs['input_ids'].squeeze(0), inputs['attention_mask'].squeeze(0)

        # Labels - extracting the last 29 columns as classes
        labels = torch.tensor(row[2:].values.astype(float), dtype=torch.float32)

        return input_ids, attention_mask, image, labels

In [14]:
def preprocess_image(url):
    # Load image from URL
    response = requests.get(url)
    img = Image.open(BytesIO(response.content))
    img = transform(img).unsqueeze(0)  # Add batch dimension
    return img

def prepare_text(text):
    # Tokenize text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    return inputs['input_ids'], inputs['attention_mask']

In [15]:
dataset_A = DatasetA(df.sample(frac=0.1, random_state=42) )
dataloader_A = DataLoader(dataset_A, batch_size=64, shuffle=False)

In [16]:
dataset_A[0]

(tensor([  101,  1037, 18097, 13184,  2099,  2279,  2000,  1037, 16247,  2327,
         17428,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

### Dataset B

In [17]:
df = pd.read_csv('cleaned_data.csv')

In [18]:
# DatasetB --> Cleaned Socratis
class DatasetB(Dataset):
    def __init__(self, data, images_base_path, device='cuda'):
        self.df = data
        self.images_base_path = images_base_path

        # feature extractors
        self.tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
        self.visual_extractor = ImageProcessor(device=device)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        captions, image_name = self.df.iloc[idx]['caption'], self.df.iloc[idx]['image_name']

        # get image embedings
        image_path = os.path.join(self.images_base_path, image_name)
        visual_embeds = self.visual_extractor.get_visual_embeddings(image_path)
        visual_token_type_ids = torch.ones(
            visual_embeds.shape[:-1], dtype=torch.long)
        visual_attention_mask = torch.ones(
            visual_embeds.shape[:-1], dtype=torch.float)

        # get text embeddings
        inputs = self.tokenizer(captions, return_tensors="pt", max_length=32, truncation=True, padding='max_length')
        input_ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        attention_mask = inputs["attention_mask"]

        emotions_string = self.df['emotions'][idx]
        emotions_list = eval(emotions_string)
        one_hot_encoded = torch.zeros(len(label_map))
        for emotion in emotions_list:
            if emotion in label_map:
                idx = label_map[emotion]
                one_hot_encoded[idx] = 1

        labels = one_hot_encoded

        return (input_ids, token_type_ids, attention_mask,
                visual_embeds, visual_token_type_ids, visual_attention_mask,
                labels)

In [19]:
label_map = {'curious': 0,
 'amazed': 1,
 'fear': 2,
 'awe': 3,
 'neutral': 4,
 'disgusted': 5,
 'worried': 6,
 'intrigued': 7,
 'confused': 8,
 'beautiful': 9,
 'happy': 10,
 'annoyed': 11,
 'impressed': 12,
 'sad': 13,
 'proud': 14,
 'inspired': 15,
 'angry': 16,
 'excited': 17,
 'nostalgic': 18,
 'upset': 19,
 'concerned': 20,
 'good': 21,
 'hopeful': 22,
 'anger': 23,
 'joy': 24,
 'interested': 25,
 'calm': 26,
 'bored': 27,
 'scared': 28}

In [20]:
images_path = './images'
dataset_B = DatasetB(df, images_path)
dataloader_B = DataLoader(dataset_B, batch_size=64, shuffle=False)

loading configuration file cache
loading weights file https://cdn.huggingface.co/unc-nlp/frcnn-vg-finetuned/pytorch_model.bin from cache at /home/ota231/.cache/torch/transformers/57f6df6abe353be2773f2700159c65615babf39ab5b48114d2b49267672ae10f.77b59256a4cf8343ae0f923246a81489fc8d82f98d082edc2d2037c977c0d9d0
All model checkpoint weights were used when initializing GeneralizedRCNN.

All the weights of GeneralizedRCNN were initialized from the model checkpoint at unc-nlp/frcnn-vg-finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GeneralizedRCNN for predictions without further training.


In [21]:
dataset_B[0]

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


(tensor([[ 101, 2711, 2006, 2143, 2839, 2096, 3403, 2005, 3185, 2000, 2272, 2041,
           102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0]]),
 tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0]]),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0]]),
 tensor([[[6.1835e-02, 0.0000e+00, 3.7392e-01,  ..., 0.0000e+00,
           2.6447e-01, 1.6728e+00],
          [0.0000e+00, 0.0000e+00, 1.1359e-01,  ..., 1.3758e-02,
           0.0000e+00, 2.9012e-01],
          [0.0000e+00, 0.0000e+00, 1.4732e-01,  ..., 0.0000e+00,
           8.1740e-04, 2.3377e+00],
          ...,
          [0.0000e+00, 5.5369e-02, 7.3985e-02,  ..., 2.1606e-01,
           2.9640e-01, 1.2079e-02],
          [0.0000e+00, 0.0000e+00, 1.1523e-01,  ..., 0.0000e+00,
           2.5727e+00, 8.4765e-01],
          [3.3876

## Transfer Learning: Account for Architectural Differences

In [22]:
# Load pre-trained weights for both student (model A) and teacher (model B)
model_A = SimpleMultimodalModel(num_classes=num_emotions)
model_A.load_state_dict(torch.load(student_model_path))

model_B = TeacherModel(visualbert_model)
model_B.load_state_dict(torch.load(teacher_model_path))

<All keys matched successfully>

In [23]:
model_A.fc1.weight.data.shape

torch.Size([512, 320])

In [24]:
model_A.fc2.weight.data.shape

torch.Size([29, 512])

In [25]:
# transpose first
modified_fc1_weights = model_A.fc1.weight.data.T.mm(model_A.fc2.weight.data.T)
modified_fc1_bias = model_A.fc2.bias.data.unsqueeze(0)

In [26]:
modified_fc1_weights.shape, modified_fc1_bias.shape

(torch.Size([320, 29]), torch.Size([1, 29]))

In [27]:
model_B.fc.weight.data.shape, model_B.fc.bias.data.shape

(torch.Size([29, 768]), torch.Size([29]))

In [28]:
# Perform linear transformation to match dimensions of fc in model B
transformed_weights = model_B.fc.weight.data.T.mm(modified_fc1_weights.T).mm(modified_fc1_weights)

In [29]:
transformed_weights.shape, modified_fc1_bias.shape

(torch.Size([768, 29]), torch.Size([1, 29]))

In [30]:
with torch.no_grad():
    model_B.fc.weight.data.copy_(transformed_weights.T)
    model_B.fc.bias.data.copy_(modified_fc1_bias.data.squeeze())

In [31]:
learning_rate = 0.1
num_epochs = 1

## Transfer Learning AnB

In [32]:
# Freeze pre-trained layers except the final linear layer of the teacher model (B)
for param in model_B.parameters():
    param.requires_grad = False
model_B.fc.weight.requires_grad = True
model_B.fc.bias.requires_grad = True

In [33]:
optimizer = torch.optim.Adam(model_B.parameters(), lr=learning_rate)
criterion = torch.nn.BCEWithLogitsLoss()

In [34]:
from tqdm import tqdm

In [36]:
# Training loop
for epoch in range(num_epochs):
    model_B.train()
    running_loss = 0.
    
    # Wrap the dataloader_A with tqdm for progress bar
    with tqdm(total=len(dataloader_A), desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batch') as pbar:
        for batch in dataloader_A:
            input_ids, attention_mask, images, labels = batch
            optimizer.zero_grad()

            # Forward pass
            probabilities = model_B(input_ids,  attention_mask, images)

            # Calculate loss
            loss = criterion(probabilities, labels)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            
            pbar.update(1)  # Update tqdm progress bar

    # Calculate average training loss
    epoch_loss = running_loss / len(dataloader_A.dataset)
    
    print(f'Train Loss: {epoch_loss:.4f}')

Epoch 1/1:   0%|                                 | 0/185 [00:10<?, ?batch/s]


TypeError: TeacherModel.forward() missing 4 required positional arguments: 'visual_embeds', 'visual_token_type_ids', 'visual_attention_mask', and 'labels'

In [None]:
torch.save(model_B.state_dict(), 'transfer_anb.pth')