In [1]:
import pandas as pd
from collections import Counter
from pathlib import Path
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import torch
import torch.nn as nn

# Competition data directory (not read by any of the visible cells).
INPUT_PATH = Path("/mnt/storage_dimm2/kaggle_data/commonlitreadabilityprize")
# Passed as `cache_dir` when loading pretrained tokenizers / weights.
MODEL_CACHE = Path("/mnt/storage/model_cache/torch")

In [2]:
# https://www.kaggle.com/gogo827jz/roberta-model-parallel-fold-training-on-tpu
class AttentionBlock(nn.Module):
    """Attention pooling that collapses dim 1 of the input into a weighted sum.

    A score is computed per position as ``V(tanh(W(features)))``, normalised
    with a softmax over dim 1, and used to weight ``features`` before summing
    over dim 1.

    Args:
        in_features: Size of the last dimension of ``features``.
        middle_features: Width of the hidden projection ``W``.
        out_features: Width of the score projection ``V``. The scores are
            multiplied element-wise with ``features``, so this must broadcast
            against ``in_features`` (e.g. ``1``).
    """

    def __init__(self, in_features, middle_features, out_features):
        super().__init__()
        self.in_features = in_features
        self.middle_features = middle_features
        self.out_features = out_features
        self.W = nn.Linear(in_features, middle_features)
        self.V = nn.Linear(middle_features, out_features)

    def forward(self, features, attention_mask=None):
        """Pool ``features`` over dim 1.

        Args:
            features: Tensor expected to be ``(batch, seq, in_features)``.
            attention_mask: Optional ``(batch, seq)`` tensor, non-zero for
                positions to keep and zero for padding. When omitted every
                position takes part in the softmax (the original behaviour).

        Returns:
            Tensor of shape ``(batch, in_features)``.
        """
        scores = self.V(torch.tanh(self.W(features)))
        if attention_mask is not None:
            keep = attention_mask.to(torch.bool).unsqueeze(-1)
            # Use the dtype minimum rather than -inf so that a fully padded
            # row yields uniform weights instead of NaN.
            scores = scores.masked_fill(~keep, torch.finfo(scores.dtype).min)
        weights = torch.softmax(scores, dim=1)
        return torch.sum(weights * features, dim=1)


In [3]:
model_name = "roberta-base"

# Tokenizer, config and weights all resolve through the same cache directory.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=MODEL_CACHE)
config = AutoConfig.from_pretrained(model_name, cache_dir=MODEL_CACHE)

# Bare encoder (no task head) so custom heads can be tried on its raw outputs.
# output_hidden_states=True makes the forward pass also return the per-layer
# hidden states used further down.
model = AutoModel.from_pretrained(
    model_name,
    cache_dir=MODEL_CACHE,
    output_hidden_states=True,
)

model

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [4]:
# Tokenize a toy batch of two sentences, padded to a fixed length of 256.
# truncation=True is required alongside max_length: with padding="max_length"
# alone, texts longer than 256 tokens are NOT cut and the batch cannot be
# stacked into a (batch, 256) tensor.
inputs = tokenizer(
    ["Hello, my dog is cute", "My cat is called Annie"],
    return_tensors="pt",
    return_token_type_ids=True,
    max_length=256,
    padding="max_length",
    truncation=True,
)
# NOTE(review): shape (1, 1) does not match the batch of 2 above, and nothing
# in the visible cells reads `labels`.
labels = torch.tensor([1]).unsqueeze(0)
outputs = model(**inputs)

In [5]:
# Shapes of the first two model outputs (sequence output, pooled output).
tuple(outputs[idx].shape for idx in range(2))

(torch.Size([2, 256, 768]), torch.Size([2, 768]))

In [6]:
# Head for the pooled output: (batch, hidden) -> (batch, 1).
pooled_layers = [
    nn.LayerNorm(config.hidden_size),
    nn.Dropout(0.1),
    nn.Linear(config.hidden_size, 1),
]
pooled_head = nn.Sequential(*pooled_layers)

# Head for the sequence output: attention-pool over dim 1, then project
# to a single value per sample.
seq_attn_layers = [
    nn.LayerNorm(config.hidden_size),
    nn.Dropout(0.1),
    AttentionBlock(config.hidden_size, config.hidden_size, 1),
    nn.Dropout(0.1),
    nn.Linear(config.hidden_size, 1),
]
seq_attn_head = nn.Sequential(*seq_attn_layers)

In [7]:
# Shape check: pooled output through the pooled head -> one value per sample.
pooled_output = outputs[1]
out = pooled_head(pooled_output)
out.shape

torch.Size([2, 1])

In [8]:
# Shape check: sequence output through the attention head -> one value per sample.
sequence_output = outputs[0]
out = seq_attn_head(sequence_output)
out.shape

torch.Size([2, 1])

In [9]:
# Conv1d experiment.
# NOTE: nn.Conv1d reads its input as (batch, channels, length). Fed the
# (batch, 256, 768) sequence output, the 256 token positions act as channels
# and the kernel slides over the 768 hidden features - so this is NOT a
# convolution along the sequence axis. The hard-coded 256 has to equal the
# padded sequence length produced by the tokenizer call.
conv_attn_layers = [
    nn.LayerNorm(config.hidden_size),
    nn.Dropout(0.1),
    nn.Conv1d(256, 128, kernel_size=5, padding=2),  # -> (batch, 128, hidden)
    nn.BatchNorm1d(128),
    nn.Dropout(0.1),
    AttentionBlock(config.hidden_size, config.hidden_size, 1),  # pools over the 128 conv channels
    nn.Dropout(0.1),
    nn.Linear(config.hidden_size, 1),
]
seq_conv_attn_head = nn.Sequential(*conv_attn_layers)

In [10]:
# Shape check: sequence output through the conv + attention head.
sequence_output = outputs[0]
out = seq_conv_attn_head(sequence_output)
out.shape

torch.Size([2, 1])

In [11]:
# Mean-max pooling over tokens, ignoring padding positions.
# The batch is padded to a fixed length, so pooling over every position would
# let padding vectors dominate the mean and compete for the max.
lin = nn.Linear(config.hidden_size * 2, 1)
do = nn.Dropout(0.5)

token_states = outputs[0]  # (batch, seq, hidden)
# (batch, seq, 1): True for real tokens, False for padding.
token_mask = inputs["attention_mask"].unsqueeze(-1).bool()
token_mask_f = token_mask.to(token_states.dtype)

# Mean over real tokens only; clamp guards against an all-padding row.
out_mean = (token_states * token_mask_f).sum(dim=1) / token_mask_f.sum(dim=1).clamp(min=1)
# Push padded positions to the dtype minimum so they can never win the max.
out_max, _ = torch.max(
    token_states.masked_fill(~token_mask, torch.finfo(token_states.dtype).min),
    dim=1,
)
out = torch.cat((out_mean, out_max), dim=-1)
print(out.shape)

# Multisample Dropout https://github.com/heartkilla/kaggle_tweet/blob/72697f5ba210ca0eebbdc4e8672e66fc08f9c715/src/1st_level/roberta_base/models.py#L30-L34
# Average the linear head over 5 independent dropout masks.
logits = torch.mean(torch.stack([lin(do(out)) for _ in range(5)], dim=0), dim=0)

out.shape

torch.Size([2, 1536])


torch.Size([2, 1536])

In [12]:
# Layer count from the loaded config; used in the next cell to choose how many hidden states to stack.
config.num_hidden_layers

12

In [13]:
# Hidden states method from tweet sentiment
# outputs[2] holds the per-layer hidden states (returned because the model was
# loaded with output_hidden_states=True).
hidden_states = outputs[2]

# Stack the last `num_hidden_layers` entries, newest first:
# (layers, batch, seq, hidden).
layer_stack = torch.stack(
    tuple(hidden_states[-i - 1] for i in range(config.num_hidden_layers)), dim=0
)
print(layer_stack.shape)

# Pool across layers -> (batch, seq, 2 * hidden).
layer_mean = torch.mean(layer_stack, dim=0)
layer_max, _ = torch.max(layer_stack, dim=0)
layer_feats = torch.cat((layer_mean, layer_max), dim=-1)

# Pool across tokens, skipping padding positions -> (batch, 4 * hidden).
# (batch, seq, 1): True for real tokens, False for padding.
token_mask = inputs["attention_mask"].unsqueeze(-1).bool()
token_mask_f = token_mask.to(layer_feats.dtype)
# Mean over real tokens only; clamp guards against an all-padding row.
out_mean = (layer_feats * token_mask_f).sum(dim=1) / token_mask_f.sum(dim=1).clamp(min=1)
# Push padded positions to the dtype minimum so they can never win the max.
out_max, _ = torch.max(
    layer_feats.masked_fill(~token_mask, torch.finfo(layer_feats.dtype).min),
    dim=1,
)
out = torch.cat((out_mean, out_max), dim=-1)
out.shape

torch.Size([12, 2, 256, 768])


torch.Size([2, 3072])

In [14]:
# Sanity check: the 3072-wide feature is four hidden-size chunks
# (mean/max over layers, each then mean/max over tokens), so 3072 / 4 should equal 768.
3072 / 4

768.0