In [1]:
import torch
from transformers import GPT2LMHeadModel, RobertaForMaskedLM, AutoModel, RobertaModel,RobertaConfig

In [2]:
# Load the checkpoint wrapped in the masked-LM head purely to inspect its
# module hierarchy; every encoder module appears under a "roberta." prefix.
model = RobertaForMaskedLM.from_pretrained(
    "../input/roberta-transformers-pytorch/roberta-base"
)
for module_name, _module in model.named_modules():
    print(module_name)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at ../input/roberta-transformers-pytorch/roberta-base and are newly initialized: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



roberta
roberta.embeddings
roberta.embeddings.word_embeddings
roberta.embeddings.position_embeddings
roberta.embeddings.token_type_embeddings
roberta.embeddings.LayerNorm
roberta.embeddings.dropout
roberta.encoder
roberta.encoder.layer
roberta.encoder.layer.0
roberta.encoder.layer.0.attention
roberta.encoder.layer.0.attention.self
roberta.encoder.layer.0.attention.self.query
roberta.encoder.layer.0.attention.self.key
roberta.encoder.layer.0.attention.self.value
roberta.encoder.layer.0.attention.self.dropout
roberta.encoder.layer.0.attention.output
roberta.encoder.layer.0.attention.output.dense
roberta.encoder.layer.0.attention.output.LayerNorm
roberta.encoder.layer.0.attention.output.dropout
roberta.encoder.layer.0.intermediate
roberta.encoder.layer.0.intermediate.dense
roberta.encoder.layer.0.intermediate.intermediate_act_fn
roberta.encoder.layer.0.output
roberta.encoder.layer.0.output.dense
roberta.encoder.layer.0.output.LayerNorm
roberta.encoder.layer.0.output.dropout
roberta.encod

In [3]:
# Reload the same checkpoint as the bare encoder (no LM head). This instance
# is the teacher: its module names carry no "roberta." prefix, which is the
# naming the copy cells below rely on.
model = RobertaModel.from_pretrained(
    "../input/roberta-transformers-pytorch/roberta-base"
)
for module_name, _module in model.named_modules():
    print(module_name)

# Teacher tensors keyed by name, plus the (initially empty) subset that will
# be handed to the student.
state_dict = model.state_dict()
compressed_sd = dict()


embeddings
embeddings.word_embeddings
embeddings.position_embeddings
embeddings.token_type_embeddings
embeddings.LayerNorm
embeddings.dropout
encoder
encoder.layer
encoder.layer.0
encoder.layer.0.attention
encoder.layer.0.attention.self
encoder.layer.0.attention.self.query
encoder.layer.0.attention.self.key
encoder.layer.0.attention.self.value
encoder.layer.0.attention.self.dropout
encoder.layer.0.attention.output
encoder.layer.0.attention.output.dense
encoder.layer.0.attention.output.LayerNorm
encoder.layer.0.attention.output.dropout
encoder.layer.0.intermediate
encoder.layer.0.intermediate.dense
encoder.layer.0.intermediate.intermediate_act_fn
encoder.layer.0.output
encoder.layer.0.output.dense
encoder.layer.0.output.LayerNorm
encoder.layer.0.output.dropout
encoder.layer.1
encoder.layer.1.attention
encoder.layer.1.attention.self
encoder.layer.1.attention.self.query
encoder.layer.1.attention.self.key
encoder.layer.1.attention.self.value
encoder.layer.1.attention.self.dropout
encoder.

In [4]:
# Copy the teacher's embedding block into the student state dict unchanged.
# Keys carry no "roberta." prefix because the teacher was loaded as RobertaModel.

# `embeddings.position_ids` is a buffer rather than a learned weight. Newer
# transformers releases register it as non-persistent, in which case it is not
# present in state_dict(); copy it only when the installed version exposes it
# so this cell does not fail with a KeyError.
if 'embeddings.position_ids' in state_dict:
    compressed_sd['embeddings.position_ids'] = state_dict['embeddings.position_ids']

# The three embedding tables (token, position, token-type).
for w in ["word_embeddings", "position_embeddings", "token_type_embeddings"]:
    param_name = f"embeddings.{w}.weight"
    compressed_sd[param_name] = state_dict[param_name]

# LayerNorm applied on top of the summed embeddings.
for w in ["weight", "bias"]:
    param_name = f"embeddings.LayerNorm.{w}"
    compressed_sd[param_name] = state_dict[param_name]

In [5]:
# Fill the student's encoder from a subset of teacher layers: the i-th entry
# of the teacher index list becomes student layer i, so the six selected
# layers are renumbered 0..5.
sublayer_names = (
    "attention.self.query",
    "attention.self.key",
    "attention.self.value",
    "attention.output.dense",
    "attention.output.LayerNorm",
    "intermediate.dense",
    "output.dense",
    "output.LayerNorm",
)
for student_idx, teacher_idx in enumerate([0, 2, 4, 7, 9, 11]):
    for sublayer in sublayer_names:
        for tensor_kind in ("weight", "bias"):
            source_key = f"encoder.layer.{teacher_idx}.{sublayer}.{tensor_kind}"
            target_key = f"encoder.layer.{student_idx}.{sublayer}.{tensor_kind}"
            compressed_sd[target_key] = state_dict[source_key]

In [6]:
# Pooler weights are carried over under the same keys, unchanged.
for pooler_key in ("pooler.dense.weight", "pooler.dense.bias"):
    compressed_sd[pooler_key] = state_dict[pooler_key]


In [7]:
# Sanity check on the number of entries selected for the student:
# embeddings + 6 layers x 8 sub-modules x (weight, bias) + pooler.
print(f"Number of params transferred for distillation: {len(compressed_sd)}")

Number of params transferred for distillation: 104


In [8]:
# Display the selected keys to eyeball the renumbered encoder-layer names.
compressed_sd.keys()

dict_keys(['embeddings.position_ids', 'embeddings.word_embeddings.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.LayerNorm.weight', 'embeddings.LayerNorm.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.1.attention.self.query.weight', 'encoder.layer

In [9]:
# Write the selected weights to disk as a plain PyTorch checkpoint.
torch.save(obj=compressed_sd, f='roberta_base_6layers.pth')

In [10]:
# Student configuration: the roberta-base config with the encoder depth
# overridden to 6, matching the six teacher layers copied into compressed_sd.
config = RobertaConfig.from_pretrained(
    'roberta-base',
    num_hidden_layers=6,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [11]:
# Display the resolved config; "num_hidden_layers" should read 6.
config

RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.27.4",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [12]:
# Instantiate the student architecture from the config alone; the teacher
# weights are loaded into it in a later cell via load_state_dict.
model_student = AutoModel.from_config(config)

In [13]:
# List the student's modules to confirm its names line up with the keys
# stored in compressed_sd.
for module_name, _module in model_student.named_modules():
    print(module_name)


embeddings
embeddings.word_embeddings
embeddings.position_embeddings
embeddings.token_type_embeddings
embeddings.LayerNorm
embeddings.dropout
encoder
encoder.layer
encoder.layer.0
encoder.layer.0.attention
encoder.layer.0.attention.self
encoder.layer.0.attention.self.query
encoder.layer.0.attention.self.key
encoder.layer.0.attention.self.value
encoder.layer.0.attention.self.dropout
encoder.layer.0.attention.output
encoder.layer.0.attention.output.dense
encoder.layer.0.attention.output.LayerNorm
encoder.layer.0.attention.output.dropout
encoder.layer.0.intermediate
encoder.layer.0.intermediate.dense
encoder.layer.0.intermediate.intermediate_act_fn
encoder.layer.0.output
encoder.layer.0.output.dense
encoder.layer.0.output.LayerNorm
encoder.layer.0.output.dropout
encoder.layer.1
encoder.layer.1.attention
encoder.layer.1.attention.self
encoder.layer.1.attention.self.query
encoder.layer.1.attention.self.key
encoder.layer.1.attention.self.value
encoder.layer.1.attention.self.dropout
encoder.

In [14]:
# Estimate the student's in-memory footprint: bytes held by its parameters
# plus bytes held by its buffers, reported in MiB (1024**2 bytes).
param_size = sum(
    param.nelement() * param.element_size()
    for param in model_student.parameters()
)
# Buffers must be read from the student too. Reading them from the teacher
# (`model`) would mix two different models into a single measurement.
buffer_size = sum(
    buffer.nelement() * buffer.element_size()
    for buffer in model_student.buffers()
)

size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))

model size: 313.265MB


In [15]:
# Load the selected teacher weights into the student. strict=True (the
# default) makes any missing or unexpected key an error, so a successful load
# confirms the key mapping is complete.
model_student.load_state_dict(compressed_sd, strict=True)

<All keys matched successfully>

In [16]:
# Export the student in Hugging Face format (config.json + weights) so it can
# be reloaded with from_pretrained. `from_pt` is an option of from_pretrained
# (loading PyTorch weights into another framework), not of save_pretrained,
# where it has no effect — so it is not passed.
model_student.save_pretrained('roberta_base_6layers_student')

In [17]:
# Round-trip check: reload the exported student under a masked-LM head.
# The lm_head weights are reported as newly initialized because only the bare
# encoder was exported. The path is the same relative directory that
# save_pretrained wrote to; going through '../working/' would only resolve
# when the notebook's working directory happens to be named "working".
model_check = RobertaForMaskedLM.from_pretrained(
    'roberta_base_6layers_student',
    config=config,
)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at ../working/roberta_base_6layers_student and are newly initialized: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
!ls ../working/roberta_base_6layers_student

config.json  pytorch_model.bin
