In [1]:
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
import os
# Set the environment first, then load the other libraries (the imports below
# must come after this block).
os.environ.update({
    # Thread-count settings, all pinned to the same value.
    'OMP_NUM_THREADS': '12',
    'OPENBLAS_NUM_THREADS': '12',
    'MKL_NUM_THREADS': '12',
    'VECLIB_MAXIMUM_THREADS': '12',
    'NUMEXPR_NUM_THREADS': '12',
    # Tokenizer / CUDA related settings.
    "TOKENIZERS_PARALLELISM": "false",
    "CUDA_VISIBLE_DEVICES": "1,2,5,7",
    "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
})

import time
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoModelForCausalLM, PreTrainedModel
from gptqmodel import GPTQModel, QuantizeConfig
import datasets
import accelerate

import logging
def refresh_log():
    """Reset the ``logging`` module and install a single INFO-level stream handler.

    Steps, in order:

    1. If the root logger already has handlers, log a notice, then for each
       handler print its name, detach it from the root logger and close it.
    2. Replace the logger manager's ``loggerDict`` with an empty dict, so the
       manager no longer tracks any previously registered named logger.
    3. Call ``logging.basicConfig`` with one ``StreamHandler``, the record and
       date formats below and level ``INFO``, then emit an "init log" record.
    """
    root_logger = logging.root
    if root_logger.handlers:
        logging.info("logging already init. reset all")
        # Iterate over a snapshot: the live list shrinks as handlers are removed.
        for stale_handler in tuple(root_logger.handlers):
            print(f"{stale_handler.name}")
            root_logger.removeHandler(stale_handler)
            stale_handler.close()

    # Drop the manager's registry of named loggers.
    logging.Logger.manager.loggerDict = {}

    record_format = (
        "%(levelname)s %(asctime)s.%(msecs)03d "
        "[%(process)d-%(threadName)s] "
        "(%(funcName)s@%(filename)s:%(lineno)03d) "
        "%(message)s"
    )
    date_format = "%Y-%m-%d %H:%M:%S"
    logging.basicConfig(
        format=record_format,
        datefmt=date_format,
        level=logging.INFO,
        handlers=[logging.StreamHandler()],
    )
    logging.info("init log")

# Install the logging configuration defined by refresh_log() above.
refresh_log()

import builtins
def print(*args, **kwargs):
    """Logging-backed stand-in for the built-in ``print``.

    The positional arguments are converted with ``str`` and joined with
    ``sep``; the result is emitted as one INFO record on the root logger.

    Keyword arguments (same names as the built-in):
        sep:   separator between arguments; ``None`` or missing means ``" "``.
        end:   text appended after the last argument. On the logging path the
               default is ``""`` (not ``"\\n"``), because the log handler
               already terminates each record.
        file:  when an explicit stream other than ``sys.stdout`` /
               ``sys.stderr`` is given, the text is written to that stream
               with the built-in's default ``end="\\n"`` instead of being
               logged. Without this, code that does ``print(..., file=fh)``
               would silently lose its output once this function is installed
               as ``builtins.print``.
        flush: honoured only on the explicit-stream path.

    Any other keyword argument is ignored.
    """
    import sys  # local import so the function does not depend on another cell

    sep = kwargs.get("sep")
    if sep is None:  # the built-in treats sep=None as "use the default"
        sep = " "
    end = kwargs.get("end")
    text = sep.join(map(str, args))

    target = kwargs.get("file")
    if target is not None and target is not sys.stdout and target is not sys.stderr:
        # The caller wants the text in a specific stream: behave like print().
        target.write(text + ("\n" if end is None else end))
        if kwargs.get("flush"):
            target.flush()
        return

    message = text + ("" if end is None else end)
    # stacklevel is unchanged from the original; it selects which calling
    # frame's function/file/line is reported in the log record.
    logging.info(message, stacklevel=3)

# Replace the built-in print for the whole interpreter, so print() calls made
# from any module are routed through the logging-backed print defined above.
builtins.print = print


# pretrained_model_dir = '/aigc-nas02/zpfcode/online_model/xdlovers_72b_0712'
# Source checkpoint; the quantized output directory is derived from it by
# appending "-gptq-int8".
pretrained_model_dir = '/aigc-nas02/hf_models/Qwen1.5-0.5B'
quantized_model_dir = f'{pretrained_model_dir}-gptq-int8'

# Tokenizer of the source checkpoint; used below to tokenize the calibration texts.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)


quantize_config = QuantizeConfig(
    bits=8,  # quantize the model to 8-bit (matches the "-gptq-int8" output suffix)
    group_size=128,  # it is recommended to set the value to 128
    desc_act=False,  # False can significantly speed up inference, at the cost of slightly worse perplexity
    damp_percent=0.01,
)

# max_memory={ 0:"10GIB", 1:"64GIB", 2:"64GIB", 3:"64GIB", "cpu":"200GIB"}

# # device_map="balanced_low_0",
# model = GPTQModel.from_pretrained(pretrained_model_dir, quantize_config, trust_remote_code=True,
#                                   max_memory=max_memory,
#                                      attn_implementation="flash_attention_2") 




INFO - start base
INFO 2024-07-17 20:26:58.042 [61754-MainThread] (refresh_log@4181186596.py:039) init log
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Token indices sequence length is longer than the specified maximum sequence length for this model (128838 > 32768). Running this sequence through the model will result in indexing errors
INFO 2024-07-17 20:26:59.867 [61754-MainThread] (<module>@4181186596.py:074) load data 300 examples 1.64


In [None]:

# Start with an empty example list, start the timer that the "load data" log
# line reports, and load the "validation" split from a local dataset directory.
examples = []
ds_st = time.perf_counter()
pile_dataset: datasets.Dataset = datasets.load_dataset(r"/aigc-nas02/cyj/pile-val-backup", split='validation')

def split_slow_tokenize(dataset: datasets.Dataset, num_samples: int = 300, max_length=None):
    """Tokenize the first ``num_samples`` texts of ``dataset``, one at a time.

    Args:
        dataset: dataset with a ``'text'`` column.
        num_samples: how many leading rows to tokenize. Defaults to 300, the
            value that used to be hard-coded here.
        max_length: optional token limit. When given, each text is tokenized
            with ``truncation=True, max_length=max_length``; when ``None``
            (default) texts are tokenized untruncated, as before. The recorded
            run of this notebook warned about a 128838-token sample against a
            model limit of 32768, which is the case this option is meant for.

    Returns:
        A list with one element per text: whatever the module-level
        ``tokenizer`` returns when called on that text.
    """
    texts = dataset[:num_samples]['text']
    if max_length is None:
        return [tokenizer(text) for text in texts]
    return [tokenizer(text, truncation=True, max_length=max_length) for text in texts]


# Tokenize the calibration texts, then log how many examples were produced and
# the seconds elapsed since ds_st was taken (dataset loading included).
examples = split_slow_tokenize(pile_dataset)
logging.info(f"load data {len(examples)} examples {time.perf_counter() - ds_st:.2f}")



In [4]:
# Load the unquantized model together with the quantization config.
# NOTE(review): the original comment here said "only use 0 gpu_device", but
# device_map="balanced_low_0" is, per the Accelerate documentation, the strategy
# that spreads the model over the visible GPUs while keeping GPU 0 as free as
# possible - confirm which behaviour is intended.
model = GPTQModel.from_pretrained(pretrained_model_dir, quantize_config, trust_remote_code=True,
                                  device_map="balanced_low_0") 
# Re-install our logging configuration after the model load.
# NOTE(review): presumably loading the model reconfigured logging - TODO confirm.
refresh_log()

logging.info(f"start quantize")
# quantize model, the examples should be list of dict whose keys can only be "input_ids" and "attention_mask"
# cache_examples_on_gpu=False
# The five returned values are what the later pack cell passes to model.pack().
quant_log, quantizers, force_layer_back_to_cpu, device_map, forward_pass_use_cache = model.quantize(examples, batch_size=1, calibration_enable_gpu_cache=False)


INFO 2024-07-17 20:20:43.803 [12438-MainThread] (<module>@2049556245.py:001) start quantize
INFO - start quant
Quantizing self_attn.k_proj in layer 1 of 24:   0%|          | 0/24 [00:01<?, ?it/s]INFO - {'layer': 1, 'module': 'self_attn.k_proj', 'avg_loss': '0.0039', 'time': '0.2490'}
Quantizing self_attn.v_proj in layer 1 of 24:   0%|          | 0/24 [00:01<?, ?it/s]INFO - {'layer': 1, 'module': 'self_attn.v_proj', 'avg_loss': '0.0001', 'time': '0.2508'}
Quantizing self_attn.q_proj in layer 1 of 24:   0%|          | 0/24 [00:01<?, ?it/s]INFO - {'layer': 1, 'module': 'self_attn.q_proj', 'avg_loss': '0.0058', 'time': '0.2535'}
Quantizing self_attn.o_proj in layer 1 of 24:   0%|          | 0/24 [00:03<?, ?it/s]INFO - {'layer': 1, 'module': 'self_attn.o_proj', 'avg_loss': '0.0000', 'time': '0.2501'}
Quantizing mlp.up_proj in layer 1 of 24:   0%|          | 0/24 [00:04<?, ?it/s]     INFO - {'layer': 1, 'module': 'mlp.up_proj', 'avg_loss': '0.0183', 'time': '0.2542'}
Quantizing mlp.gate_proj

In [5]:
# Save all variables: the model and every value returned by model.quantize()
# are written to joblib files (relative paths, i.e. the current working
# directory), one file per variable. The load cell below reads them back.
from joblib import dump, load
dump(model, 'model.joblib')
dump(quant_log, 'quant_log.joblib')
dump(quantizers, 'quantizers.joblib')
dump(force_layer_back_to_cpu, 'force_layer_back_to_cpu.joblib')
dump(device_map, 'device_map.joblib')
# Last expression of the cell: its return value is what the notebook displays.
dump(forward_pass_use_cache, 'forward_pass_use_cache.joblib')

['forward_pass_use_cache.joblib']

In [2]:
# Restore the model and the quantize() results from the joblib files written by
# the dump cell above.
# NOTE: joblib.load is pickle-based and can execute arbitrary code while
# loading; only load files produced by this notebook.
from joblib import dump, load
model = load('model.joblib')
quant_log = load('quant_log.joblib')
quantizers = load('quantizers.joblib')
force_layer_back_to_cpu = load('force_layer_back_to_cpu.joblib')
device_map = load('device_map.joblib')
forward_pass_use_cache = load('forward_pass_use_cache.joblib')

In [3]:
# Pack the quantized layers using the values restored above.
# NOTE(review): the fourth argument is None although a device_map was restored
# in the previous cell; judging by the order in which quantize() returned its
# values this is the device_map slot - confirm against pack()'s signature.
logging.info(f"start pack")
model.pack(quant_log, quantizers, force_layer_back_to_cpu, None, forward_pass_use_cache)

INFO 2024-07-17 20:27:58.837 [61754-MainThread] (<module>@3912912720.py:001) start pack
INFO 2024-07-17 20:27:58.838 [61754-MainThread] (select_quant_linear@importer.py:051) Auto choose the fastest one based on quant model compatibility: <class 'gptqmodel.nn_modules.qlinear.qlinear_tritonv2.TritonV2QuantLinear'>
INFO - Packing model...
INFO 2024-07-17 20:27:58.841 [61754-MainThread] (pack_model@model.py:278) Packing model...
INFO 2024-07-17 20:27:58.842 [61754-MainThread] (select_quant_linear@importer.py:051) Auto choose the fastest one based on quant model compatibility: <class 'gptqmodel.nn_modules.qlinear.qlinear_tritonv2.TritonV2QuantLinear'>
  0%|          | 0/168 [00:00<?, ?it/s]INFO 2024-07-17 20:28:04.998 [61754-MainThread] (pack_model@model.py:299) Processed model.layers.0.self_attn.o_proj to None
  1%|          | 1/168 [00:05<15:54,  5.71s/it]INFO 2024-07-17 20:28:05.052 [61754-MainThread] (pack_model@model.py:299) Processed model.layers.2.self_attn.v_proj to None
INFO 2024-0

[{'layer': 1,
  'module': 'self_attn.k_proj',
  'avg_loss': '0.0039',
  'time': '0.2490'},
 {'layer': 1,
  'module': 'self_attn.v_proj',
  'avg_loss': '0.0001',
  'time': '0.2508'},
 {'layer': 1,
  'module': 'self_attn.q_proj',
  'avg_loss': '0.0058',
  'time': '0.2535'},
 {'layer': 1,
  'module': 'self_attn.o_proj',
  'avg_loss': '0.0000',
  'time': '0.2501'},
 {'layer': 1, 'module': 'mlp.up_proj', 'avg_loss': '0.0183', 'time': '0.2542'},
 {'layer': 1,
  'module': 'mlp.gate_proj',
  'avg_loss': '0.0249',
  'time': '0.2524'},
 {'layer': 1,
  'module': 'mlp.down_proj',
  'avg_loss': '0.0009',
  'time': '0.7110'},
 {'layer': 2,
  'module': 'self_attn.k_proj',
  'avg_loss': '0.0487',
  'time': '0.2542'},
 {'layer': 2,
  'module': 'self_attn.v_proj',
  'avg_loss': '0.0056',
  'time': '0.2556'},
 {'layer': 2,
  'module': 'self_attn.q_proj',
  'avg_loss': '0.0434',
  'time': '0.2489'},
 {'layer': 2,
  'module': 'self_attn.o_proj',
  'avg_loss': '0.0001',
  'time': '0.2569'},
 {'layer': 2, 'm

In [4]:
# save quantized model using safetensors
# NOTE: in the recorded run this call failed with
# "RuntimeError: Only Tensors created explicitly by the user (graph leaves)
# support the deepcopy protocol at the moment"; the cells below are attempts to
# work around that error.
model.save_quantized(quantized_model_dir, use_safetensors=True)

RuntimeError: Only Tensors created explicitly by the user (graph leaves) support the deepcopy protocol at the moment.  If you were attempting to deepcopy a module, this may be because of a torch.nn.utils.weight_norm usage, see https://github.com/pytorch/pytorch/pull/103001

In [27]:
# Workaround attempt 1: move the model to the CPU and reload a deep copy of its
# own state dict before saving.
# NOTE: the recorded output shows the same RuntimeError as before, so this did
# not resolve the problem.
from torch import device
model.to(device("cpu"))
import copy
model.load_state_dict(copy.deepcopy(model.state_dict()))

# save quantized model using safetensors
model.save_quantized(quantized_model_dir, use_safetensors=True)

RuntimeError: Only Tensors created explicitly by the user (graph leaves) support the deepcopy protocol at the moment.  If you were attempting to deepcopy a module, this may be because of a torch.nn.utils.weight_norm usage, see https://github.com/pytorch/pytorch/pull/103001

In [22]:
def shallow_copy(x: dict):
    """Build a new dict with the keys of ``x``, each value passed through ``clone_if_tensor``."""
    return {key: clone_if_tensor(item) for key, item in x.items()}


def clone_if_tensor(value):
    """Recursively copy the tensors found in ``value``.

    * ``torch.Tensor``: returns ``value.detach().clone()``, an independent
      copy cut off from the autograd graph. Detaching matters: ``clone()`` of
      a tensor that requires grad yields a non-leaf tensor, and non-leaf
      tensors are exactly what ``copy.deepcopy`` rejects with "Only Tensors
      created explicitly by the user (graph leaves) support the deepcopy
      protocol".
    * ``list`` / ``dict``: returns a new list / dict whose items (dict values)
      were processed recursively; dict keys are reused as they are.
    * anything else (numbers, strings, ``None``, other objects): returned
      unchanged. The previous version ran ``copy.deepcopy`` on int/float/bool/
      str, which hands back the same object for those immutable types and
      additionally required ``copy`` to have been imported by another cell.
    """
    import torch  # local import, as in the original cell

    if isinstance(value, torch.Tensor):
        return value.detach().clone()
    if isinstance(value, list):
        return [clone_if_tensor(item) for item in value]
    if isinstance(value, dict):
        return {key: clone_if_tensor(item) for key, item in value.items()}
    return value

In [30]:
# Workaround attempt 2: reload the inner model's state dict from a copy built
# with shallow_copy() (tensors cloned one by one) before saving.
# NOTE: the recorded output shows the same RuntimeError as before.
model.model.load_state_dict(shallow_copy(model.model.state_dict()))
# save quantized model using safetensors
model.save_quantized(quantized_model_dir, use_safetensors=True)

RuntimeError: Only Tensors created explicitly by the user (graph leaves) support the deepcopy protocol at the moment.  If you were attempting to deepcopy a module, this may be because of a torch.nn.utils.weight_norm usage, see https://github.com/pytorch/pytorch/pull/103001

In [29]:
import torch
import torch.nn as nn
import torch.nn.utils as utils

# Workaround attempt 3, following the weight_norm hint in the error message.
# NOTE: the recorded run failed with "AttributeError: 'Qwen2ForCausalLM' object
# has no attribute 'weight'". NOTE(review): per the PyTorch docs weight_norm
# targets a parameter named "weight" by default, which would explain the error
# on the first call below - confirm.
# NOTE: `copy` is not imported in this cell; it comes from an earlier cell.
# model = utils.weight_norm(model.state_dict())
model2 = utils.weight_norm(model.model)

# Remove weight normalization before deep-copying
utils.remove_weight_norm(model2)

# Perform the deep copy
model_copy = copy.deepcopy(model2)

# Re-apply weight normalization
model_copy = utils.weight_norm(model_copy)

AttributeError: 'Qwen2ForCausalLM' object has no attribute 'weight'