In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#default_exp model

# Model
> Our project will require the use of 4 separate types of models: an image model, a text model, a tabular model, and a decoder network. The relationship between the 4 can be seen in the figure below. 

![](model_diagram.jpg)

In [None]:
#exporti

from transformers import DistilBertModel, DistilBertTokenizer
from DSAI_proj.dataset import *
from torch import nn
from torch.utils.data import DataLoader
from functools import partial
from PIL import Image
import torchvision.models as models

In [None]:
#exporti

def freeze_all_but_layer(m, layer):
    """Turn off gradients for `m`'s own weight and bias unless `m` is a `layer`.

    Only the `weight` and `bias` attributes found directly on `m` are touched,
    so the function is meant to be mapped over every sub-module, e.g. with
    `nn.Module.apply`.

    Args:
        m: The module to inspect.
        layer: A module class (or tuple of classes) whose instances are left
            untouched, i.e. stay trainable.
    """
    if isinstance(m, layer):
        return
    for attr_name in ('weight', 'bias'):
        param = getattr(m, attr_name, None)
        # Modules may lack the attribute entirely or hold None (e.g. bias=False).
        if param is not None:
            param.requires_grad_(False)

We first design a `cnn_encoder` module based on a pretrained ResNet-18 architecture. We keep the pretrained weights frozen, as we do not want them to drift too far during training. The batch-norm layers are the exception: we leave them trainable, as they have been shown to adapt better to the new data distribution when unfrozen during fine-tuning.

In [None]:
#export

def cnn_encoder(pretrained: bool, in_channels: int, out_channels: int):
    """Build the image encoder: a frozen ResNet-18 trunk plus a trainable projection.

    The trunk's parameters are frozen except for its `BatchNorm2d` layers.
    The appended 3x3 convolution and global average pool are created fresh,
    so they are left trainable: a randomly initialised projection that is
    frozen could never learn a useful mapping.

    Args:
        pretrained: Forwarded to `models.resnet18` to select pretrained weights.
        in_channels: Input channels of the projection convolution; must match
            the channel count produced by the ResNet trunk.
        out_channels: Output channels of the projection convolution.

    Returns:
        An `nn.Sequential` of the ResNet children (minus the last two), the
        projection convolution and `nn.AdaptiveAvgPool2d(1)`.
    """
    resnet = models.resnet18(pretrained=pretrained)
    # Freeze before the new head is attached so that only the backbone is affected.
    resnet.apply(partial(freeze_all_but_layer, layer=nn.BatchNorm2d))
    projection = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                           kernel_size=3, padding=1, bias=False)
    # Drop the final two children of the torchvision ResNet (its own pooling
    # and classification head) and replace them with the projection + pool.
    trunk = list(resnet.children())[:-2]
    return nn.Sequential(*trunk, projection, nn.AdaptiveAvgPool2d(1))

Next, we design a `text_encoder` module, which consists of the encoder layers of DistilBERT. As with the `cnn_encoder`, we freeze all layers except the normalization layers, which in this case are `LayerNorm` layers.

In [None]:
#export

def text_encoder(model_type: str):
    """Load a DistilBERT model and freeze everything except its LayerNorm layers.

    Args:
        model_type: Checkpoint identifier passed to
            `DistilBertModel.from_pretrained`.

    Returns:
        The loaded model, with the `weight`/`bias` of every module that is
        not an `nn.LayerNorm` set to not require gradients.
    """
    encoder = DistilBertModel.from_pretrained(model_type)
    # Visit every sub-module; LayerNorm instances are skipped and stay trainable.
    encoder.apply(partial(freeze_all_but_layer, layer=nn.LayerNorm))
    return encoder

We also need a module for our tabular metadata, so we use a simple linear layer that maps the input metadata to the required output dimension.

In [None]:
#export

def meta_encoder(in_channels: int, out_channels: int):
    """Build the tabular-metadata encoder: a single linear layer.

    Args:
        in_channels: Number of input features.
        out_channels: Number of output features.

    Returns:
        An `nn.Linear` mapping `in_channels` features to `out_channels`.
    """
    return nn.Linear(in_channels, out_channels)

The last piece of the puzzle is a decoder network that decodes the outputs of the three encoder modules above and produces the predicted scores, whose last dimension equals the vocabulary size of the model. In other words, these are the raw logits over all possible words, and a softmax is applied to determine the most likely word. In our case, we use a simple linear layer as the decoder.

In [None]:
#export

def decoder(hidden_dim: int, vocab_size: int):
    """Build the decoder: a linear layer from hidden features to vocabulary logits.

    Args:
        hidden_dim: Size of the incoming feature vectors.
        vocab_size: Number of output logits.

    Returns:
        An `nn.Linear` mapping `hidden_dim` features to `vocab_size` outputs.
    """
    projection = nn.Linear(hidden_dim, vocab_size)
    return projection

In [None]:
class TaglinePredictorModel(nn.Module):
    """Multi-modal tagline predictor: image, text and metadata encoders sharing one decoder.

    Every encoder produces 768-dimensional features, and a single linear
    decoder maps each of them to logits over `vocab_size` tokens.

    Args:
        vocab_size: Size of the output vocabulary (width of the decoder logits).
        meta_features: Number of input features of the tabular metadata.
    """

    def __init__(self, vocab_size: int, meta_features: int):
        super().__init__()
        self.cnn_encoder = cnn_encoder(pretrained=True, in_channels=512, out_channels=768)
        self.text_encoder = text_encoder(model_type='distilbert-base-uncased')
        self.meta_encoder = meta_encoder(in_channels=meta_features, out_channels=768)
        self.decoder = decoder(hidden_dim=768, vocab_size=vocab_size)

    def _encode_image(self, img):
        """Encode an image batch into a `(batch, 1, channels)` feature."""
        # The encoder ends in AdaptiveAvgPool2d(1), i.e. (batch, channels, 1, 1):
        # drop the last spatial axis, then move channels to the final axis.
        return self.cnn_encoder(img).squeeze(-1).permute(0, 2, 1)

    @staticmethod
    def _flatten_text_input(tensor):
        """Collapse extra leading dims so the tensor is `(batch, seq_len)`.

        Batches whose samples were tokenized with their own leading
        dimension arrive as `(batch, 1, seq_len)`, which the text encoder
        cannot consume.
        """
        if tensor.dim() > 2:
            return tensor.reshape(-1, tensor.size(-1))
        return tensor

    def forward(self, x: dict):
        """Compute vocabulary logits for every input modality.

        Args:
            x: Batch dict with keys `'poster_img'`, `'backdrop_img'`,
                `'text_inputs'` (a dict of tensors forwarded as keyword
                arguments to the text encoder) and `'meta'`.

        Returns:
            A 4-tuple of logits `(poster, backdrop, text, meta)`, each with
            `vocab_size` as its last dimension.
        """
        poster_feature = self._encode_image(x['poster_img'])
        backdrop_feature = self._encode_image(x['backdrop_img'])

        text_inputs = {name: self._flatten_text_input(tensor)
                       for name, tensor in x['text_inputs'].items()}
        # The encoder returns a model-output container; element 0 is the
        # last hidden state, which is the tensor the decoder needs.
        text_feature = self.text_encoder(**text_inputs)[0]

        # Add a length-1 sequence axis so the metadata matches the image features.
        meta_feature = self.meta_encoder(x['meta']).unsqueeze(1)

        poster_feat = self.decoder(poster_feature)
        backdrop_feat = self.decoder(backdrop_feature)
        text_feat = self.decoder(text_feature)
        meta_feat = self.decoder(meta_feature)
        return poster_feat, backdrop_feat, text_feat, meta_feat

The values for this Tagline model are mostly hard-coded as we are limited by architectural choices. As we will be using DistilBert for the text encoder, our hidden dimensions are limited to being 768, with a vocabulary size of 30522. Our choice of architecture for the image encoder is the resnet 18, which has a final output channel dimension of 512.

Now that we are finally finished creating the model class, let's test it out on an example from the dataset created in the previous section. 

In [None]:
# dataset creation
# Tokenizer for the same 'distilbert-base-uncased' checkpoint the text encoder loads.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
max_length = 80

# Target image size handed to the crop transform.
height = 128
width = height

# Normalisation statistics handed to NormalizeStandardize.
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

In [None]:
poster_img_dir = "poster_img/"
backdrop_img_dir = "backdrop_img/"

# Preprocessing pipeline handed to the dataset.
tfms = Compose([
    Tokenize(tokenizer=tokenizer, max_length=max_length),
    RandomResizeCrop(width=width, height=height, method=Image.BILINEAR),
    ToTensor(),
    NormalizeStandardize(mean=mean, std=std),
])

train_ds = MovieDataset(
    poster_img_dir=poster_img_dir,
    backdrop_img_dir=backdrop_img_dir,
    ds_type="train",
    transforms=tfms,
)

train dataset created!


In [None]:
# Pull a single batch (batch_size=1) from the training set to try the model on.
dls = DataLoader(train_ds, batch_size=1)
sample = next(iter(dls))

In [None]:
# Inspect the tokenized text inputs of the sampled batch.
sample['text_inputs']

{'input_ids': tensor([[[  101,  1996,  3587,  5783,   102,  1999, 14993,  2581,  1010,  1037,
           10095,  4062,  2003,  4895, 18447,  4765, 19301,  2135,  2445,  1996,
            4708,  1997,  7494,  1037,  2402,  2611,  2040,  2003,  2112,  1997,
            1996,  3145,  2008,  2097,  5676,  1996,  7691,  1997,  8438,  1012,
             102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]]),
 'attention_mask': tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]])}

In [None]:
# The batch is collated with a leading batch axis, so len(sample['meta']) would
# give the batch size; the feature count is the size of the last axis.
# NOTE(review): assumes sample['meta'] is a (batch, features) tensor - confirm against MovieDataset.
model = TaglinePredictorModel(vocab_size=30522, meta_features=sample['meta'].shape[-1])

In [None]:
# Run the sampled batch through the model; forward returns four tensors in the
# order poster, backdrop, text, meta.
res1, res2, res3, res4 = model(sample)
print(res1.shape, res2.shape, res3.shape, res4.shape)

ValueError: too many values to unpack (expected 3)