In [1]:
import os
os.environ['HF_ENDPOINT']='https://hf-mirror.com'
os.environ['CUDA_VISIBLE_DEVICES']='1'
import torch
import datasets
import transformers
from transformers import AutoModelForSequenceClassification
from nn_pruning.patch_coordinator import ModelPatchingCoordinator
from nn_pruning.model_patcher import ModelPatcher
import torch
from nn_pruning.patch_coordinator import SparseTrainingArguments

datasets.logging.set_verbosity_error()
transformers.logging.set_verbosity_error()
print(f"Using transformers v{transformers.__version__} and datasets v{datasets.__version__} and torch v{torch.__version__}")

  from .autonotebook import tqdm as notebook_tqdm


Using transformers v4.34.0 and datasets v3.0.1 and torch v2.0.1+cu117


In [4]:
model_name = "t5-small"
model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2)



In [5]:
sparse_args = SparseTrainingArguments()
sparse_args

SparseTrainingArguments(mask_scores_learning_rate=0.01, dense_pruning_method='topK', attention_pruning_method='topK', ampere_pruning_method='disabled', attention_output_with_dense=True, bias_mask=True, mask_init='constant', mask_scale=0.0, dense_block_rows=1, dense_block_cols=1, attention_block_rows=1, attention_block_cols=1, initial_threshold=1.0, final_threshold=0.5, initial_warmup=1, final_warmup=2, initial_ampere_temperature=0.0, final_ampere_temperature=20.0, regularization='disabled', regularization_final_lambda=0.0, attention_lambda=1.0, dense_lambda=1.0, decoder_attention_lambda=None, decoder_dense_lambda=None, distil_teacher_name_or_path=None, distil_alpha_ce=0.5, distil_alpha_teacher=0.5, distil_temperature=2.0, final_finetune=False, layer_norm_patch=False, layer_norm_patch_steps=50000, layer_norm_patch_start_delta=0.99, gelu_patch=False, gelu_patch_steps=50000, linear_min_parameters=0.005, rewind_model_name_or_path=None, eval_with_current_patch_params=False, qat=False, qconf

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


mpc = ModelPatchingCoordinator(
    sparse_args=sparse_args, 
    device=device, 
    cache_dir="checkpoints", 
    logit_names="logits", 
    model_name_or_path=model_name,
    teacher_constructor=None)

# 对模型进行补丁
patched_model = mpc.patch_model(model)

# 保存补丁后的模型
# output_dir = "path/to/save/patched_t5_small"
# patched_model.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)

# print(f"Patched model saved to {output_dir}")



{'transformer\\.encoder\\.block\\.[0-9]+\\.layer\\.[0-9]+\\.SelfAttention\\.q': [Linear(in_features=512, out_features=512, bias=False), Linear(in_features=512, out_features=512, bias=False), Linear(in_features=512, out_features=512, bias=False), Linear(in_features=512, out_features=512, bias=False), Linear(in_features=512, out_features=512, bias=False), Linear(in_features=512, out_features=512, bias=False)], 'transformer\\.encoder\\.block\\.[0-9]+\\.layer\\.[0-9]+\\.SelfAttention\\.k': [Linear(in_features=512, out_features=512, bias=False), Linear(in_features=512, out_features=512, bias=False), Linear(in_features=512, out_features=512, bias=False), Linear(in_features=512, out_features=512, bias=False), Linear(in_features=512, out_features=512, bias=False), Linear(in_features=512, out_features=512, bias=False)], 'transformer\\.encoder\\.block\\.[0-9]+\\.layer\\.[0-9]+\\.SelfAttention\\.v': [Linear(in_features=512, out_features=512, bias=False), Linear(in_features=512, out_features=512, 

In [10]:
mpc.model_structure.ATTENTION_LAYERS
patterns = []
layer_types = ['query',
            'key',
            'value',
            'att_dense',
            'encoder_decoder_query',
            'encoder_decoder_key',
            'encoder_decoder_value',
            'encoder_decoder_att_dense',
            'interm_dense',
            'output_dense='
        ]
for layer_type in layer_types:
    layer = mpc.model_structure.LAYER_PATTERNS.get(layer_type)
    if layer is not None:
        layer_pattern = (mpc.model_structure.PATTERN_PREFIX + layer).replace(".", "\.")
        patterns.append(layer_pattern)

In [11]:
patterns

['(en|de)coder\\.block\\.[0-9]+\\.layer\\.[0-9]+\\.SelfAttention\\.q',
 '(en|de)coder\\.block\\.[0-9]+\\.layer\\.[0-9]+\\.SelfAttention\\.k',
 '(en|de)coder\\.block\\.[0-9]+\\.layer\\.[0-9]+\\.SelfAttention\\.v',
 '(en|de)coder\\.block\\.[0-9]+\\.layer\\.[0-9]+\\.SelfAttention\\.o',
 '(en|de)coder\\.block\\.[0-9]+\\.layer\\.[0-9]+\\.EncDecAttention\\.q',
 '(en|de)coder\\.block\\.[0-9]+\\.layer\\.[0-9]+\\.EncDecAttention\\.k',
 '(en|de)coder\\.block\\.[0-9]+\\.layer\\.[0-9]+\\.EncDecAttention\\.v',
 '(en|de)coder\\.block\\.[0-9]+\\.layer\\.[0-9]+\\.EncDecAttention\\.o',
 '(en|de)coder\\.block\\.[0-9]+\\.layer\\.[0-9]+\\.DenseReluDense\\.wi']

In [12]:
from nn_pruning.training_patcher import LinearModelPatcher
import re

In [None]:
'transformer.encoder.block.0.layer.0.SelfAttention.q'
'(en|de)coder\\.block\\.[0-9]+\\.layer\\.[0-9]+\\.SelfAttention\\.q'

## Datasets

In [2]:
import datasets
from datasets import load_dataset
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data_sets = load_dataset('glue','mrpc')

In [13]:
tokenizer = AutoTokenizer.from_pretrained('t5-small')
tokenizer

Using bos_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.


T5TokenizerFast(name_or_path='t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_i

In [6]:
def preprocess_function(examples):
    # Tokenize the texts
    args = (
        (examples['sentence1'],) if 'sentence2' == None else (
        examples['sentence1'], examples['sentence2'])
    )
    result = tokenizer(*args, truncation=True, padding='max_length',max_length=128)
    
    return result

In [11]:
mapped_dataset = data_sets.map(preprocess_function,batched=True,)

Using bos_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.


In [14]:
for inputs in mapped_dataset['train']:
    print(inputs)

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0, 'input_ids': [736, 9860, 23, 11970, 112, 4284, 3, 6, 4068, 3, 88, 718, 96, 8, 9051, 96, 3, 6, 13, 24067, 1227, 7279, 1222, 112, 2084, 3, 5, 1, 12250, 1007, 12, 376, 38, 163, 96, 8, 9051, 96, 3, 6, 736, 9860, 23, 11970, 112, 4284, 13, 24067, 1227, 7279, 1222, 112, 2084, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [15]:
tokenizer('Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .')

{'input_ids': [736, 9860, 23, 11970, 112, 4284, 3, 6, 4068, 3, 88, 718, 96, 8, 9051, 96, 3, 6, 13, 24067, 1227, 7279, 1222, 112, 2084, 3, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}