In [1]:
from transformers import AutoModel , AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint identifier handed to from_pretrained() in the loading cell.
# NOTE(review): looks like a relative local directory — confirm it exists
# relative to the notebook's working directory.
model_name = "models/bert-base-uncased"

In [3]:
# Load the same checkpoint through two Auto classes; both objects are
# displayed in the following cells.
# The sequence-classification load emits a warning that classifier.weight and
# classifier.bias are newly initialized rather than read from the checkpoint.
model = AutoModel.from_pretrained(model_name)
classifier_model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at models/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Show the module tree of the object loaded with AutoModel.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [5]:
# Show the module tree of the object loaded with AutoModelForSequenceClassification.
classifier_model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [6]:
# List every named parameter tensor of `model` with its element count,
# one "name -> count" line per tensor.
for name, param in model.named_parameters():
    print(f"{name} -> {param.numel()}")

embeddings.word_embeddings.weight -> 23440896
embeddings.position_embeddings.weight -> 393216
embeddings.token_type_embeddings.weight -> 1536
embeddings.LayerNorm.weight -> 768
embeddings.LayerNorm.bias -> 768
encoder.layer.0.attention.self.query.weight -> 589824
encoder.layer.0.attention.self.query.bias -> 768
encoder.layer.0.attention.self.key.weight -> 589824
encoder.layer.0.attention.self.key.bias -> 768
encoder.layer.0.attention.self.value.weight -> 589824
encoder.layer.0.attention.self.value.bias -> 768
encoder.layer.0.attention.output.dense.weight -> 589824
encoder.layer.0.attention.output.dense.bias -> 768
encoder.layer.0.attention.output.LayerNorm.weight -> 768
encoder.layer.0.attention.output.LayerNorm.bias -> 768
encoder.layer.0.intermediate.dense.weight -> 2359296
encoder.layer.0.intermediate.dense.bias -> 3072
encoder.layer.0.output.dense.weight -> 2359296
encoder.layer.0.output.dense.bias -> 768
encoder.layer.0.output.LayerNorm.weight -> 768
encoder.layer.0.output.LayerNo

In [7]:
# Tally parameter element counts for `model`: overall, trainable, and per
# component bucket. Buckets are selected by substring match on the parameter
# name as reported by named_parameters().
total_params = 0            # every parameter element
total_learnable_params = 0  # elements of tensors whose requires_grad is set
total_embedding_params = 0  # names containing "embedding"
total_encoder_params = 0    # names containing "encoder"
total_pooler_params = 0     # names containing "pooler"

for name, param in model.named_parameters():
    count = param.numel()  # element count, computed once per tensor
    total_params += count
    if param.requires_grad:
        total_learnable_params += count
    # Independent checks (deliberately not elif): a name that matches more
    # than one substring is added to each matching bucket.
    if "embedding" in name:
        total_embedding_params += count
    if "encoder" in name:
        total_encoder_params += count
    if "pooler" in name:
        total_pooler_params += count

print(f"{total_params=}")
print(f"{total_learnable_params=}")
print(f"{total_embedding_params=}")
print(f"{total_encoder_params=}")
print(f"{total_pooler_params=}")

# Share of each bucket. The denominator is the sum of the three buckets, not
# total_params, so the printed fractions always add up to 1.
params = {
    "total_embedding_params": total_embedding_params,
    "total_encoder_params": total_encoder_params,
    "total_pooler_params": total_pooler_params,
}
all_params_num = sum(params.values())
# Distinct loop names so `name` / `param` keep referring to the last model
# parameter instead of being rebound to a label string and an int.
for bucket, bucket_count in params.items():
    print(bucket, bucket_count / all_params_num)

total_params=109482240
total_learnable_params=109482240
total_embedding_params=23837184
total_encoder_params=85054464
total_pooler_params=590592
total_embedding_params 0.21772649152958506
total_encoder_params 0.776879099295009
total_pooler_params 0.005394409175405983
