In [None]:
# --- Standard library ---------------------------------------------------------
import sys
from pathlib import Path

# Repository root: the parent of the current working directory, i.e. one level
# up from the notebook folder (assumes the kernel is started in that folder).
ROOT = Path().resolve().parents[0]

# Put the repository root on sys.path *before* the project imports below;
# with the insert placed after them, `src` cannot be resolved on a fresh
# kernel unless it happens to be importable some other way.
# The membership check keeps re-runs of this cell from stacking duplicates.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

# --- Third-party --------------------------------------------------------------
import torch
from torchinfo import summary

# --- Project ------------------------------------------------------------------
from src.textclf_transformer.training.training_loop import TrainingLoop
# NOTE(review): the helper and model names used by later cells are not imported
# explicitly anywhere in this cell, so they are expected to come from this
# wildcard import; consider listing them by name.
from src.textclf_transformer import *


In [85]:

# Base directory of the pretraining experiments. Composed with pathlib's `/`
# operator rather than string formatting, so no separator is ever doubled
# (the f-string form yields '//experiments/...' when ROOT is the filesystem root).
EXP_BASE = ROOT / 'experiments' / 'pretraining'
name = 'E3_pretraining_imdb_autotinys1_mha'

# The reader is given the base directory and the experiment name and hands back
# a pair: the experiment directory and its config (indexed by section below).
exp_dir, cfg = read_experiment_config(EXP_BASE, name)

# Seed comes from the "experiment" section; 42 is used when it has no "seed" key.
set_global_seed(cfg["experiment"].get("seed", 42))

# The tokenizer wrapper is built from the "tokenizer" section; its `.tokenizer`
# attribute is passed along when deriving the architecture kwargs.
wrapper = load_tokenizer_wrapper_from_cfg(cfg["tokenizer"])
arch_kw = arch_kwargs_from_cfg(cfg, wrapper.tokenizer)

# Settings for the masked-language-model head.
head = cfg["mlm_head"]


In [86]:
# Masked-LM model: the architecture kwargs are unpacked as-is, and the
# weight-tying flag is read from the MLM head settings.
model = TransformerForMaskedLM(tie_mlm_weights=head["tie_mlm_weights"], **arch_kw)

In [87]:


# Switch the model to evaluation mode before tracing it.
model.eval()

# Dummy batch dimensions: B rows of N token ids each.
B = 3
N = 100
vocab_size = arch_kw['vocab_size']
pad_id = arch_kw['pad_token_id']
device = "cpu"

# Random ids drawn from [1, vocab_size); the last four positions of every row
# are then overwritten with the pad id.
input_ids = torch.randint(low=1, high=vocab_size, size=(B, N), dtype=torch.long)
input_ids[:, -4:] = pad_id

# Boolean mask that is True wherever the id equals pad_id.
# NOTE(review): this marks *padding* positions as True; confirm the model's
# forward expects that convention rather than "True/1 = real token".
inputs = dict(
    input_ids=input_ids,
    attention_mask=input_ids.eq(pad_id),
)

# Parameter-count table, five nesting levels deep.
# Other columns torchinfo can show: "input_size", "output_size", "mult_adds".
# `return_sequence` is not a torchinfo option: extra keyword arguments are
# forwarded to the model's forward call.
info = summary(
    model,
    input_data=inputs,
    col_names=("num_params",),
    depth=5,
    device=device,
    verbose=1,
    return_sequence=False,
)

Layer (type:depth-idx)                             Param #
TransformerForMaskedLM                             --
├─TransformerTextEmbeddings: 1-1                   --
│    └─Embedding: 2-1                              17,214,408
│    └─LayerNorm: 2-2                              1,128
│    └─Dropout: 2-3                                --
├─ModuleList: 1-2                                  --
│    └─TransformerEncoderBlock: 2-4                --
│    │    └─AttentionBlock: 3-1                    --
│    │    │    └─MultiheadSelfAttention: 4-1       --
│    │    │    │    └─Linear: 5-1                  867,840
│    │    │    │    └─Linear: 5-2                  289,332
│    │    │    │    └─Dropout: 5-3                 --
│    │    │    └─LayerNorm: 4-2                    1,128
│    │    └─MLPBlock: 3-2                          --
│    │    │    └─Sequential: 4-3                   --
│    │    │    │    └─Linear: 5-4                  595,510
│    │    │    │    └─GELU: 5-5                 

In [89]:
# Base directory of the finetuning experiments. Composed with pathlib's `/`
# operator rather than string formatting, so no separator is ever doubled
# (the f-string form yields '//experiments/...' when ROOT is the filesystem root).
EXP_BASE = ROOT / 'experiments' / 'finetuning'
name = 'E3_finetuning_imdb_autotinys1_mha'

# The reader is given the base directory and the experiment name and hands back
# a pair: the experiment directory and its config (indexed by section below).
exp_dir, cfg = read_experiment_config(EXP_BASE, name)

# Seed comes from the "experiment" section; 42 is used when it has no "seed" key.
set_global_seed(cfg["experiment"].get("seed", 42))

# The tokenizer wrapper is built from the "tokenizer" section; its `.tokenizer`
# attribute is passed along when deriving the architecture kwargs.
wrapper = load_tokenizer_wrapper_from_cfg(cfg["tokenizer"])
arch_kw = arch_kwargs_from_cfg(cfg, wrapper.tokenizer)

# Settings for the sequence-classification head.
head = cfg['classification_head']

In [90]:
# Sequence-classification model: architecture kwargs are unpacked as-is, and
# the listed classification-head settings are forwarded under the same names.
model = TransformerForSequenceClassification(
    **arch_kw,
    **{
        key: head[key]
        for key in ('num_labels', 'classifier_dropout', 'pooling', 'pooler_type')
    },
)

In [91]:

# Switch the model to evaluation mode before tracing it.
model.eval()

# Dummy batch dimensions: B rows of N token ids each.
B = 3
N = 100
vocab_size = arch_kw['vocab_size']
pad_id = arch_kw['pad_token_id']
device = "cpu"

# Random ids drawn from [1, vocab_size); the last four positions of every row
# are then overwritten with the pad id.
input_ids = torch.randint(low=1, high=vocab_size, size=(B, N), dtype=torch.long)
input_ids[:, -4:] = pad_id

# Boolean mask that is True wherever the id equals pad_id.
# NOTE(review): this marks *padding* positions as True; confirm the model's
# forward expects that convention rather than "True/1 = real token".
inputs = dict(
    input_ids=input_ids,
    attention_mask=input_ids.eq(pad_id),
)

# Parameter-count table, three nesting levels deep.
# Other columns torchinfo can show: "input_size", "output_size", "mult_adds".
# `return_sequence` is not a torchinfo option: extra keyword arguments are
# forwarded to the model's forward call.
info = summary(
    model,
    input_data=inputs,
    col_names=("num_params",),
    depth=3,
    device=device,
    verbose=1,
    return_sequence=False,
)

Layer (type:depth-idx)                             Param #
TransformerForSequenceClassification               --
├─TransformerTextEmbeddings: 1-1                   --
│    └─Embedding: 2-1                              17,214,408
│    └─LayerNorm: 2-2                              1,128
│    └─Dropout: 2-3                                --
├─ModuleList: 1-2                                  --
│    └─TransformerEncoderBlock: 2-4                --
│    │    └─AttentionBlock: 3-1                    1,158,300
│    │    └─MLPBlock: 3-2                          1,191,658
│    └─TransformerEncoderBlock: 2-5                --
│    │    └─AttentionBlock: 3-3                    1,158,300
│    │    └─MLPBlock: 3-4                          1,191,658
│    └─TransformerEncoderBlock: 2-6                --
│    │    └─AttentionBlock: 3-5                    1,158,300
│    │    └─MLPBlock: 3-6                          1,191,658
│    └─TransformerEncoderBlock: 2-7                --
│    │    └─AttentionBlo