In [1]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig
import wandb
from tabulate import tabulate

# Model Exploration
##### 🧠 Full Layer Tree – `RobertaForSequenceClassification` *(BERTweet-style)*

<pre>
RobertaForSequenceClassification
├── roberta : RobertaModel
│   ├── embeddings : RobertaEmbeddings
│   │   ├── word_embeddings         : Embedding(64001, 768)
│   │   ├── position_embeddings     : Embedding(130, 768)
│   │   ├── token_type_embeddings   : Embedding(1, 768)
│   │   ├── LayerNorm               : LayerNorm((768,))
│   │   └── dropout                 : Dropout(p=0.1)
│   └── encoder : RobertaEncoder
│       └── layer : ModuleList (12 × RobertaLayer)
│           ├── attention : RobertaAttention
│           │   ├── self : RobertaSdpaSelfAttention
│           │   │   ├── query       : Linear(768 → 768)
│           │   │   ├── key         : Linear(768 → 768)
│           │   │   ├── value       : Linear(768 → 768)
│           │   │   └── dropout     : Dropout(p=0.1)
│           │   └── output : RobertaSelfOutput
│           │       ├── dense       : Linear(768 → 768)     🟢 attention.output.dense
│           │       ├── LayerNorm   : LayerNorm((768,))
│           │       └── dropout     : Dropout(p=0.1)
│           ├── intermediate : RobertaIntermediate
│           │   ├── dense       : Linear(768 → 3072)
│           │   └── activation  : GELU
│           └── output : RobertaOutput
│               ├── dense       : Linear(3072 → 768)        🔵 feedforward output.dense
│               ├── LayerNorm   : LayerNorm((768,))
│               └── dropout     : Dropout(p=0.1)
└── classifier : RobertaClassificationHead
    ├── dense       : Linear(768 → 768)
    ├── dropout     : Dropout(p=0.1)
    └── out_proj    : Linear(768 → 2)
</pre>

In [2]:
# Instantiate BERTweet with a two-label sequence-classification head.
# The classifier weights are not in the checkpoint, so they are freshly initialized.
model = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=2)

# One human-readable caption per row of the summary table (insertion order = row order).
descriptions = dict(
    embeddings="Token, position & type embeddings",
    encoder="12-layer Transformer blocks",
    classifier="Dense layers for classification",
    others="Extra/unclassified modules",
    total="Sum of all trainable params",
)

# Every row starts at zero; keys mirror `descriptions`.
param_counts = dict.fromkeys(descriptions, 0)

# Tally trainable parameters per section. A parameter is assigned to the first
# section keyword found in its qualified name, checked in this fixed order;
# anything matching none of them is counted under "others".
for param_name, tensor in model.named_parameters():
    if not tensor.requires_grad:
        continue  # frozen parameters are excluded from every row, including the total
    size = tensor.numel()
    bucket = next(
        (keyword for keyword in ("embeddings", "encoder", "classifier") if keyword in param_name),
        "others",
    )
    param_counts[bucket] += size
    param_counts["total"] += size

# Render the summary as a fixed-width text table.
header = f"{'Section':<12} | {'Params':>15} | {'% of Total':>10} | Description"
print()
print(header)
print("-" * 65)
for row_label, row_count in param_counts.items():
    if row_label == "total":
        # The total row is pinned to exactly 100 rather than computed.
        share = 100
    else:
        share = (row_count / param_counts["total"]) * 100
    print(f"{row_label:<12} | {row_count:>15,} | {share:>9.2f}% | {descriptions[row_label]}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Section      |          Params | % of Total | Description
-----------------------------------------------------------------
embeddings   |      49,254,912 |     36.51% | Token, position & type embeddings
encoder      |      85,054,464 |     63.05% | 12-layer Transformer blocks
classifier   |         592,130 |      0.44% | Dense layers for classification
others       |               0 |      0.00% | Extra/unclassified modules
total        |     134,901,506 |    100.00% | Sum of all trainable params
