In [1]:
!pip install llmcompressor

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
Collecting llmcompressor
  Downloading http://mirrors.aliyun.com/pypi/packages/9c/dd/953e05c25aed8a92355688bf118e9a5327b9a257a7d69ae5ac6ffcbf4af3/llmcompressor-0.4.1-py3-none-any.whl (255 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.1/255.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting loguru (from llmcompressor)
  Downloading http://mirrors.aliyun.com/pypi/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Collecting pynvml (from llmcompressor)
  Downloading http://mirrors.aliyun.com/pypi/packages/ed/df/f7cf07a65a96dd11d71f346f9c2863accdd4784da83af7181b067d556cbc/pynvml-12.0.0-py3-none-any.whl (26 kB)
Collecting compressed-tensors==0.9.2 (from llmcompressor)
  Downloading http://mirrors.

### Loading the Model

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "/root/autodl-fs/data2/anti_fraud/models/modelscope/hub/hub/Qwen/Qwen2-7B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

### Preparing Calibration Data

In [4]:
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [5]:
from datasets import load_dataset

NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load and preprocess the dataset
# TODO use chinese dataset
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

def preprocess(example):
    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
ds = ds.map(preprocess)

def tokenize(sample):
    return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
ds = ds.map(tokenize, remove_columns=ds.column_names)

README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

(…)-00000-of-00003-a3ecf92756993583.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

(…)-00001-of-00003-0a1804bcb6ae68c6.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

(…)-00002-of-00003-ee46ed25cfae92c6.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

(…)-00000-of-00001-f7dfac4afe5b93f4.parquet:   0%|          | 0.00/81.2M [00:00<?, ?B/s]

(…)-00000-of-00003-a6c9fb894be3e50b.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

(…)-00001-of-00003-d6a0402e417f35ca.parquet:   0%|          | 0.00/243M [00:00<?, ?B/s]

(…)-00002-of-00003-c0db75b92a2f48fd.parquet:   0%|          | 0.00/243M [00:00<?, ?B/s]

(…)-00000-of-00001-3d4cd8309148a71f.parquet:   0%|          | 0.00/80.4M [00:00<?, ?B/s]

Generating train_sft split:   0%|          | 0/207865 [00:00<?, ? examples/s]

Generating test_sft split:   0%|          | 0/23110 [00:00<?, ? examples/s]

Generating train_gen split:   0%|          | 0/256032 [00:00<?, ? examples/s]

Generating test_gen split:   0%|          | 0/28304 [00:00<?, ? examples/s]

Map:   0%|          | 0/512 [00:00<?, ? examples/s]

Map:   0%|          | 0/512 [00:00<?, ? examples/s]

### Applying Quantization

In [6]:
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

# Configure the quantization algorithms
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
]

# Apply quantization
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Save the compressed model
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

2025-03-14T12:44:06.466857+0800 | main | INFO - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_oneshot=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_conca

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


2025-03-14T12:44:06.954450+0800 | one_shot | INFO - *** One Shot ***
2025-03-14T12:44:06.962598+0800 | from_modifiers | INFO - Creating recipe from modifiers
2025-03-14T12:44:07.000031+0800 | _check_compile_recipe | INFO - Recipe compiled and 1 modifiers created
2025-03-14T12:44:07.000970+0800 | _infer_mappings_from_model | INFO - No SmoothQuantModifier.mappings provided, inferring from model...
2025-03-14T12:44:07.976885+0800 | _calibrate | INFO - Running SmoothQuantModifier calibration with 512 samples...


100%|██████████| 512/512 [00:52<00:00,  9.78it/s]

2025-03-14T12:45:00.351688+0800 | _apply_smoothing | INFO - Smoothing activation scales...





2025-03-14T12:45:00.552593+0800 | _build_quant_modifier | INFO - Building quantization modifier with args: {'targets': 'Linear', 'scheme': 'W8A8', 'ignore': ['lm_head']}
2025-03-14T12:45:00.582753+0800 | _check_calibration_data | INFO - Skipping QuantizationModifier calibration, it is not required for the provided quantization config.


Preparing intermediates cache: 100%|██████████| 512/512 [00:00<00:00, 714.36it/s]
(1/29): Calibrating: 100%|██████████| 512/512 [00:30<00:00, 16.56it/s]

2025-03-14T12:45:32.382909+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.0.self_attn.q_proj using 512 samples





2025-03-14T12:45:34.215049+0800 | compress | METRIC - time 1.83s
2025-03-14T12:45:34.217457+0800 | compress | METRIC - error 55.28
2025-03-14T12:45:34.219631+0800 | compress | METRIC - GPU 0 | usage: 25.86% | total memory: 85 GB
2025-03-14T12:45:34.220615+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T12:45:34.221700+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.0.self_attn.k_proj using 512 samples
2025-03-14T12:45:35.342784+0800 | compress | METRIC - time 1.12s
2025-03-14T12:45:35.344348+0800 | compress | METRIC - error 6.69
2025-03-14T12:45:35.345446+0800 | compress | METRIC - GPU 0 | usage: 25.86% | total memory: 85 GB
2025-03-14T12:45:35.346511+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T12:45:35.348842+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.0.self_attn.v_proj using 512 samples
2025-03-14T12:45:36.661451+0800 | compress | METRIC - time 1.31s
2025-03-14T12:45:36.663078+0800 | compres

(1/29): Propagating: 100%|██████████| 512/512 [00:06<00:00, 81.05it/s]
(2/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.83it/s]

2025-03-14T12:46:26.799923+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.1.self_attn.q_proj using 512 samples





2025-03-14T12:46:28.059322+0800 | compress | METRIC - time 1.26s
2025-03-14T12:46:28.060574+0800 | compress | METRIC - error 49.35
2025-03-14T12:46:28.062255+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:46:28.062988+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T12:46:28.064417+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.1.self_attn.k_proj using 512 samples
2025-03-14T12:46:29.159680+0800 | compress | METRIC - time 1.09s
2025-03-14T12:46:29.161217+0800 | compress | METRIC - error 13.56
2025-03-14T12:46:29.162915+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:46:29.163673+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T12:46:29.165152+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.1.self_attn.v_proj using 512 samples
2025-03-14T12:46:30.273928+0800 | compress | METRIC - time 1.11s
2025-03-14T12:46:30.275455+0800 | compre

(2/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 144.16it/s]
(3/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.89it/s]

2025-03-14T12:47:17.367342+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.2.self_attn.q_proj using 512 samples





2025-03-14T12:47:18.867027+0800 | compress | METRIC - time 1.50s
2025-03-14T12:47:18.869112+0800 | compress | METRIC - error 119.28
2025-03-14T12:47:18.870739+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:47:18.871437+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T12:47:18.872803+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.2.self_attn.k_proj using 512 samples
2025-03-14T12:47:19.985461+0800 | compress | METRIC - time 1.11s
2025-03-14T12:47:19.987171+0800 | compress | METRIC - error 33.55
2025-03-14T12:47:19.988781+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:47:19.989544+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T12:47:19.990903+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.2.self_attn.v_proj using 512 samples
2025-03-14T12:47:21.137716+0800 | compress | METRIC - time 1.15s
2025-03-14T12:47:21.139444+0800 | compr

(3/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 166.22it/s]
(4/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.86it/s]

2025-03-14T12:48:08.198056+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.3.self_attn.q_proj using 512 samples





2025-03-14T12:48:09.475376+0800 | compress | METRIC - time 1.28s
2025-03-14T12:48:09.477127+0800 | compress | METRIC - error 122.86
2025-03-14T12:48:09.478883+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:48:09.479631+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T12:48:09.481095+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.3.self_attn.k_proj using 512 samples
2025-03-14T12:48:10.583434+0800 | compress | METRIC - time 1.10s
2025-03-14T12:48:10.585202+0800 | compress | METRIC - error 41.19
2025-03-14T12:48:10.586630+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:48:10.587190+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T12:48:10.588267+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.3.self_attn.v_proj using 512 samples
2025-03-14T12:48:11.717085+0800 | compress | METRIC - time 1.13s
2025-03-14T12:48:11.720544+0800 | compr

(4/29): Propagating: 100%|██████████| 512/512 [00:02<00:00, 173.09it/s]
(5/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.71it/s]

2025-03-14T12:48:58.836995+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.4.self_attn.q_proj using 512 samples





2025-03-14T12:49:00.123529+0800 | compress | METRIC - time 1.28s
2025-03-14T12:49:00.125417+0800 | compress | METRIC - error 280.60
2025-03-14T12:49:00.126918+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:49:00.127655+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T12:49:00.128990+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.4.self_attn.k_proj using 512 samples
2025-03-14T12:49:01.234345+0800 | compress | METRIC - time 1.10s
2025-03-14T12:49:01.236172+0800 | compress | METRIC - error 82.86
2025-03-14T12:49:01.237883+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:49:01.238656+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T12:49:01.240112+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.4.self_attn.v_proj using 512 samples
2025-03-14T12:49:02.363235+0800 | compress | METRIC - time 1.12s
2025-03-14T12:49:02.364997+0800 | compr

(5/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 150.56it/s]
(6/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.84it/s]

2025-03-14T12:49:49.435791+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.5.self_attn.q_proj using 512 samples





2025-03-14T12:49:50.699347+0800 | compress | METRIC - time 1.26s
2025-03-14T12:49:50.700792+0800 | compress | METRIC - error 291.09
2025-03-14T12:49:50.701661+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:49:50.702369+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T12:49:50.704002+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.5.self_attn.k_proj using 512 samples
2025-03-14T12:49:51.801343+0800 | compress | METRIC - time 1.10s
2025-03-14T12:49:51.803142+0800 | compress | METRIC - error 69.55
2025-03-14T12:49:51.804772+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:49:51.805492+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T12:49:51.806943+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.5.self_attn.v_proj using 512 samples
2025-03-14T12:49:53.036834+0800 | compress | METRIC - time 1.23s
2025-03-14T12:49:53.038688+0800 | compr

(6/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 132.65it/s]
(7/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.90it/s]

2025-03-14T12:50:40.850819+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.6.self_attn.q_proj using 512 samples





2025-03-14T12:50:42.149291+0800 | compress | METRIC - time 1.30s
2025-03-14T12:50:42.151262+0800 | compress | METRIC - error 260.44
2025-03-14T12:50:42.152901+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:50:42.153633+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T12:50:42.155074+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.6.self_attn.k_proj using 512 samples
2025-03-14T12:50:43.277830+0800 | compress | METRIC - time 1.12s
2025-03-14T12:50:43.279527+0800 | compress | METRIC - error 50.04
2025-03-14T12:50:43.281088+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:50:43.281815+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T12:50:43.283115+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.6.self_attn.v_proj using 512 samples
2025-03-14T12:50:44.423722+0800 | compress | METRIC - time 1.14s
2025-03-14T12:50:44.425741+0800 | compr

(7/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 134.12it/s]
(8/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.85it/s]

2025-03-14T12:51:32.002550+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.7.self_attn.q_proj using 512 samples





2025-03-14T12:51:33.488839+0800 | compress | METRIC - time 1.49s
2025-03-14T12:51:33.490421+0800 | compress | METRIC - error 405.46
2025-03-14T12:51:33.491958+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:51:33.492635+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T12:51:33.493945+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.7.self_attn.k_proj using 512 samples
2025-03-14T12:51:34.812181+0800 | compress | METRIC - time 1.32s
2025-03-14T12:51:34.814337+0800 | compress | METRIC - error 75.62
2025-03-14T12:51:34.815389+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:51:34.816887+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T12:51:34.818087+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.7.self_attn.v_proj using 512 samples
2025-03-14T12:51:36.150376+0800 | compress | METRIC - time 1.33s
2025-03-14T12:51:36.152393+0800 | compr

(8/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 137.64it/s]
(9/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.84it/s]

2025-03-14T12:52:23.639308+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.8.self_attn.q_proj using 512 samples





2025-03-14T12:52:24.926423+0800 | compress | METRIC - time 1.28s
2025-03-14T12:52:24.928193+0800 | compress | METRIC - error 524.36
2025-03-14T12:52:24.929858+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:52:24.930545+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T12:52:24.931821+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.8.self_attn.k_proj using 512 samples
2025-03-14T12:52:26.058254+0800 | compress | METRIC - time 1.13s
2025-03-14T12:52:26.060089+0800 | compress | METRIC - error 100.43
2025-03-14T12:52:26.061408+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:52:26.062029+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T12:52:26.063173+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.8.self_attn.v_proj using 512 samples
2025-03-14T12:52:27.207722+0800 | compress | METRIC - time 1.14s
2025-03-14T12:52:27.209873+0800 | comp

(9/29): Propagating: 100%|██████████| 512/512 [00:04<00:00, 117.53it/s]
(10/29): Calibrating: 100%|██████████| 512/512 [00:33<00:00, 15.42it/s]

2025-03-14T12:53:16.652523+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.9.self_attn.q_proj using 512 samples





2025-03-14T12:53:17.927948+0800 | compress | METRIC - time 1.27s
2025-03-14T12:53:17.930225+0800 | compress | METRIC - error 619.90
2025-03-14T12:53:17.932463+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:53:17.933710+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T12:53:17.935029+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.9.self_attn.k_proj using 512 samples
2025-03-14T12:53:19.271385+0800 | compress | METRIC - time 1.34s
2025-03-14T12:53:19.273198+0800 | compress | METRIC - error 106.11
2025-03-14T12:53:19.274108+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:53:19.274727+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T12:53:19.275898+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.9.self_attn.v_proj using 512 samples
2025-03-14T12:53:20.545418+0800 | compress | METRIC - time 1.27s
2025-03-14T12:53:20.547269+0800 | comp

(10/29): Propagating: 100%|██████████| 512/512 [00:04<00:00, 126.36it/s]
(11/29): Calibrating: 100%|██████████| 512/512 [00:33<00:00, 15.38it/s]

2025-03-14T12:54:09.261815+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.10.self_attn.q_proj using 512 samples





2025-03-14T12:54:10.796087+0800 | compress | METRIC - time 1.53s
2025-03-14T12:54:10.798391+0800 | compress | METRIC - error 409.10
2025-03-14T12:54:10.799821+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:54:10.800437+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T12:54:10.801571+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.10.self_attn.k_proj using 512 samples
2025-03-14T12:54:12.086619+0800 | compress | METRIC - time 1.28s
2025-03-14T12:54:12.088915+0800 | compress | METRIC - error 73.19
2025-03-14T12:54:12.090624+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:54:12.091426+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T12:54:12.093069+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.10.self_attn.v_proj using 512 samples
2025-03-14T12:54:13.232066+0800 | compress | METRIC - time 1.14s
2025-03-14T12:54:13.234218+0800 | com

(11/29): Propagating: 100%|██████████| 512/512 [00:04<00:00, 114.76it/s]
(12/29): Calibrating: 100%|██████████| 512/512 [00:33<00:00, 15.37it/s]

2025-03-14T12:55:03.329740+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.11.self_attn.q_proj using 512 samples





2025-03-14T12:55:04.632207+0800 | compress | METRIC - time 1.30s
2025-03-14T12:55:04.634192+0800 | compress | METRIC - error 426.52
2025-03-14T12:55:04.635837+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:55:04.636479+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T12:55:04.637595+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.11.self_attn.k_proj using 512 samples
2025-03-14T12:55:05.748076+0800 | compress | METRIC - time 1.11s
2025-03-14T12:55:05.749932+0800 | compress | METRIC - error 90.15
2025-03-14T12:55:05.751396+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:55:05.751996+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T12:55:05.753382+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.11.self_attn.v_proj using 512 samples
2025-03-14T12:55:06.866980+0800 | compress | METRIC - time 1.11s
2025-03-14T12:55:06.868919+0800 | com

(12/29): Propagating: 100%|██████████| 512/512 [00:04<00:00, 126.55it/s]
(13/29): Calibrating: 100%|██████████| 512/512 [00:33<00:00, 15.50it/s]

2025-03-14T12:55:55.161827+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.12.self_attn.q_proj using 512 samples





2025-03-14T12:55:56.444964+0800 | compress | METRIC - time 1.28s
2025-03-14T12:55:56.446977+0800 | compress | METRIC - error 460.08
2025-03-14T12:55:56.448045+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:55:56.448885+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T12:55:56.450843+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.12.self_attn.k_proj using 512 samples
2025-03-14T12:55:57.554560+0800 | compress | METRIC - time 1.10s
2025-03-14T12:55:57.556250+0800 | compress | METRIC - error 99.49
2025-03-14T12:55:57.557931+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:55:57.558722+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T12:55:57.560146+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.12.self_attn.v_proj using 512 samples
2025-03-14T12:55:58.684403+0800 | compress | METRIC - time 1.12s
2025-03-14T12:55:58.685719+0800 | com

(13/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 128.76it/s]
(14/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.59it/s]

2025-03-14T12:56:47.146367+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.13.self_attn.q_proj using 512 samples





2025-03-14T12:56:48.425684+0800 | compress | METRIC - time 1.28s
2025-03-14T12:56:48.427174+0800 | compress | METRIC - error 463.04
2025-03-14T12:56:48.428101+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:56:48.428523+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T12:56:48.429279+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.13.self_attn.k_proj using 512 samples
2025-03-14T12:56:49.525719+0800 | compress | METRIC - time 1.10s
2025-03-14T12:56:49.527385+0800 | compress | METRIC - error 106.60
2025-03-14T12:56:49.528860+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:56:49.529508+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T12:56:49.530796+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.13.self_attn.v_proj using 512 samples
2025-03-14T12:56:50.620787+0800 | compress | METRIC - time 1.09s
2025-03-14T12:56:50.622536+0800 | co

(14/29): Propagating: 100%|██████████| 512/512 [00:04<00:00, 122.37it/s]
(15/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.77it/s]

2025-03-14T12:57:38.984491+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.14.self_attn.q_proj using 512 samples





2025-03-14T12:57:40.356958+0800 | compress | METRIC - time 1.37s
2025-03-14T12:57:40.358945+0800 | compress | METRIC - error 673.45
2025-03-14T12:57:40.360447+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:57:40.361716+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T12:57:40.364597+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.14.self_attn.k_proj using 512 samples
2025-03-14T12:57:41.483661+0800 | compress | METRIC - time 1.12s
2025-03-14T12:57:41.485565+0800 | compress | METRIC - error 174.74
2025-03-14T12:57:41.486732+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:57:41.487761+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T12:57:41.490719+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.14.self_attn.v_proj using 512 samples
2025-03-14T12:57:42.610163+0800 | compress | METRIC - time 1.12s
2025-03-14T12:57:42.612140+0800 | co

(15/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 136.83it/s]
(16/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.65it/s]

2025-03-14T12:58:30.872203+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.15.self_attn.q_proj using 512 samples





2025-03-14T12:58:32.166413+0800 | compress | METRIC - time 1.29s
2025-03-14T12:58:32.168788+0800 | compress | METRIC - error 472.63
2025-03-14T12:58:32.170316+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:58:32.170946+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T12:58:32.172058+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.15.self_attn.k_proj using 512 samples
2025-03-14T12:58:33.270724+0800 | compress | METRIC - time 1.10s
2025-03-14T12:58:33.272538+0800 | compress | METRIC - error 130.28
2025-03-14T12:58:33.273505+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:58:33.274310+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T12:58:33.276139+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.15.self_attn.v_proj using 512 samples
2025-03-14T12:58:34.388858+0800 | compress | METRIC - time 1.11s
2025-03-14T12:58:34.390725+0800 | co

(16/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 147.88it/s]
(17/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.73it/s]

2025-03-14T12:59:22.043519+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.16.self_attn.q_proj using 512 samples





2025-03-14T12:59:23.560618+0800 | compress | METRIC - time 1.52s
2025-03-14T12:59:23.562810+0800 | compress | METRIC - error 603.14
2025-03-14T12:59:23.564433+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:59:23.565158+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T12:59:23.566718+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.16.self_attn.k_proj using 512 samples
2025-03-14T12:59:24.892854+0800 | compress | METRIC - time 1.33s
2025-03-14T12:59:24.894893+0800 | compress | METRIC - error 207.77
2025-03-14T12:59:24.896371+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T12:59:24.897083+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T12:59:24.898519+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.16.self_attn.v_proj using 512 samples
2025-03-14T12:59:26.115570+0800 | compress | METRIC - time 1.22s
2025-03-14T12:59:26.117377+0800 | co

(17/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 154.29it/s]
(18/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.75it/s]

2025-03-14T13:00:13.605574+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.17.self_attn.q_proj using 512 samples





2025-03-14T13:00:14.868746+0800 | compress | METRIC - time 1.26s
2025-03-14T13:00:14.870576+0800 | compress | METRIC - error 548.60
2025-03-14T13:00:14.872213+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T13:00:14.872951+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T13:00:14.874376+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.17.self_attn.k_proj using 512 samples
2025-03-14T13:00:15.958799+0800 | compress | METRIC - time 1.08s
2025-03-14T13:00:15.960698+0800 | compress | METRIC - error 144.49
2025-03-14T13:00:15.962388+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T13:00:15.963155+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T13:00:15.964602+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.17.self_attn.v_proj using 512 samples
2025-03-14T13:00:17.060074+0800 | compress | METRIC - time 1.09s
2025-03-14T13:00:17.062215+0800 | co

(18/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 138.31it/s]
(21/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.86it/s]

2025-03-14T13:02:47.643947+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.20.self_attn.q_proj using 512 samples





2025-03-14T13:02:49.204509+0800 | compress | METRIC - time 1.56s
2025-03-14T13:02:49.206137+0800 | compress | METRIC - error 469.20
2025-03-14T13:02:49.207280+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T13:02:49.208734+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T13:02:49.209882+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.20.self_attn.k_proj using 512 samples
2025-03-14T13:02:50.525348+0800 | compress | METRIC - time 1.31s
2025-03-14T13:02:50.526991+0800 | compress | METRIC - error 151.65
2025-03-14T13:02:50.528633+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T13:02:50.529368+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T13:02:50.530877+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.20.self_attn.v_proj using 512 samples
2025-03-14T13:02:51.863937+0800 | compress | METRIC - time 1.33s
2025-03-14T13:02:51.866095+0800 | co

(21/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 131.68it/s]
(22/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.86it/s]

2025-03-14T13:03:39.928712+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.21.self_attn.q_proj using 512 samples





2025-03-14T13:03:41.429897+0800 | compress | METRIC - time 1.50s
2025-03-14T13:03:41.431895+0800 | compress | METRIC - error 786.42
2025-03-14T13:03:41.433107+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T13:03:41.434099+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T13:03:41.435560+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.21.self_attn.k_proj using 512 samples
2025-03-14T13:03:42.750296+0800 | compress | METRIC - time 1.31s
2025-03-14T13:03:42.752338+0800 | compress | METRIC - error 213.99
2025-03-14T13:03:42.753516+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T13:03:42.754285+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T13:03:42.755718+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.21.self_attn.v_proj using 512 samples
2025-03-14T13:03:44.095914+0800 | compress | METRIC - time 1.34s
2025-03-14T13:03:44.098034+0800 | co

(22/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 140.55it/s]
(23/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.85it/s]

2025-03-14T13:04:32.499137+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.22.self_attn.q_proj using 512 samples





2025-03-14T13:04:33.788362+0800 | compress | METRIC - time 1.29s
2025-03-14T13:04:33.790119+0800 | compress | METRIC - error 1017.86
2025-03-14T13:04:33.791728+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T13:04:33.792400+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T13:04:33.793700+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.22.self_attn.k_proj using 512 samples
2025-03-14T13:04:34.885620+0800 | compress | METRIC - time 1.09s
2025-03-14T13:04:34.887330+0800 | compress | METRIC - error 272.80
2025-03-14T13:04:34.888994+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T13:04:34.890002+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T13:04:34.891407+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.22.self_attn.v_proj using 512 samples
2025-03-14T13:04:36.235004+0800 | compress | METRIC - time 1.34s
2025-03-14T13:04:36.237246+0800 | c

(23/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 146.31it/s]
(24/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.79it/s]

2025-03-14T13:05:24.350160+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.23.self_attn.q_proj using 512 samples





2025-03-14T13:05:25.649430+0800 | compress | METRIC - time 1.30s
2025-03-14T13:05:25.651408+0800 | compress | METRIC - error 930.52
2025-03-14T13:05:25.653553+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T13:05:25.654507+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T13:05:25.655903+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.23.self_attn.k_proj using 512 samples
2025-03-14T13:05:26.969386+0800 | compress | METRIC - time 1.31s
2025-03-14T13:05:26.971341+0800 | compress | METRIC - error 252.67
2025-03-14T13:05:26.972558+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T13:05:26.974548+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T13:05:26.975973+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.23.self_attn.v_proj using 512 samples
2025-03-14T13:05:28.321143+0800 | compress | METRIC - time 1.34s
2025-03-14T13:05:28.323598+0800 | co

(24/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 145.01it/s]
(25/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.76it/s]

2025-03-14T13:06:16.202248+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.24.self_attn.q_proj using 512 samples





2025-03-14T13:06:17.716970+0800 | compress | METRIC - time 1.51s
2025-03-14T13:06:17.719207+0800 | compress | METRIC - error 744.78
2025-03-14T13:06:17.720554+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T13:06:17.722532+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T13:06:17.723735+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.24.self_attn.k_proj using 512 samples
2025-03-14T13:06:18.942618+0800 | compress | METRIC - time 1.22s
2025-03-14T13:06:18.944396+0800 | compress | METRIC - error 166.20
2025-03-14T13:06:18.945391+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T13:06:18.946022+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T13:06:18.947413+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.24.self_attn.v_proj using 512 samples
2025-03-14T13:06:20.046020+0800 | compress | METRIC - time 1.10s
2025-03-14T13:06:20.048353+0800 | co

(25/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 153.98it/s]
(26/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.73it/s]

2025-03-14T13:07:07.799468+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.25.self_attn.q_proj using 512 samples





2025-03-14T13:07:09.088937+0800 | compress | METRIC - time 1.29s
2025-03-14T13:07:09.090854+0800 | compress | METRIC - error 747.56
2025-03-14T13:07:09.092524+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T13:07:09.093321+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T13:07:09.095267+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.25.self_attn.k_proj using 512 samples
2025-03-14T13:07:10.196648+0800 | compress | METRIC - time 1.10s
2025-03-14T13:07:10.199575+0800 | compress | METRIC - error 178.98
2025-03-14T13:07:10.201057+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T13:07:10.201864+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T13:07:10.203838+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.25.self_attn.v_proj using 512 samples
2025-03-14T13:07:11.316520+0800 | compress | METRIC - time 1.11s
2025-03-14T13:07:11.318304+0800 | co

(26/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 157.71it/s]
(27/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.77it/s]

2025-03-14T13:07:58.938736+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.26.self_attn.q_proj using 512 samples





2025-03-14T13:08:00.212708+0800 | compress | METRIC - time 1.27s
2025-03-14T13:08:00.214501+0800 | compress | METRIC - error 1150.32
2025-03-14T13:08:00.215669+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T13:08:00.216683+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T13:08:00.218443+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.26.self_attn.k_proj using 512 samples
2025-03-14T13:08:01.319475+0800 | compress | METRIC - time 1.10s
2025-03-14T13:08:01.321187+0800 | compress | METRIC - error 208.67
2025-03-14T13:08:01.322773+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T13:08:01.323513+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T13:08:01.324276+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.26.self_attn.v_proj using 512 samples
2025-03-14T13:08:02.436191+0800 | compress | METRIC - time 1.11s
2025-03-14T13:08:02.437835+0800 | c

(27/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 151.96it/s]
(28/29): Calibrating: 100%|██████████| 512/512 [00:32<00:00, 15.80it/s]

2025-03-14T13:08:50.649161+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.27.self_attn.q_proj using 512 samples





2025-03-14T13:08:51.936140+0800 | compress | METRIC - time 1.28s
2025-03-14T13:08:51.937848+0800 | compress | METRIC - error 1649.36
2025-03-14T13:08:51.938544+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T13:08:51.939091+0800 | compress | METRIC - Compressed module size: 25.708032 MB
2025-03-14T13:08:51.940203+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.27.self_attn.k_proj using 512 samples
2025-03-14T13:08:53.035737+0800 | compress | METRIC - time 1.10s
2025-03-14T13:08:53.037154+0800 | compress | METRIC - error 201.83
2025-03-14T13:08:53.038205+0800 | compress | METRIC - GPU 0 | usage: 27.53% | total memory: 85 GB
2025-03-14T13:08:53.038905+0800 | compress | METRIC - Compressed module size: 3.672576 MB
2025-03-14T13:08:53.040644+0800 | on_sequential_batch_end | INFO - Quantizing model.layers.27.self_attn.v_proj using 512 samples
2025-03-14T13:08:54.136834+0800 | compress | METRIC - time 1.10s
2025-03-14T13:08:54.139102+0800 | c

(28/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 134.64it/s]
(29/29): Calibrating: 100%|██████████| 512/512 [00:03<00:00, 167.17it/s]
(29/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 166.50it/s]
manager stage: Modifiers initialized


2025-03-14T13:09:16.906442+0800 | initialize | INFO - Compression lifecycle initialized for 1 modifiers


manager stage: Modifiers finalized


2025-03-14T13:09:16.910122+0800 | finalize | INFO - Compression lifecycle finalized for 1 modifiers


Checking whether model follows 2:4 sparsity structure: 100%|██████████| 197/197 [00:15<00:00, 12.35it/s]


2025-03-14T13:11:06.663747+0800 | get_model_compressor | INFO - Inferring a sparsity configuration requires a global sparsity calculation. This can be costly for large models. To skip the calculation of compression statistics set skip_compression_stats=True


Calculating model sparsity: 100%|██████████| 731/731 [00:11<00:00, 63.85it/s]
Calculating quantization compression ratio: 284it [00:00, 428.08it/s]
Quantized Compression: 100%|██████████| 731/731 [00:07<00:00, 100.42it/s]


('root-W8A8-Dynamic-Per-Token/tokenizer_config.json',
 'root-W8A8-Dynamic-Per-Token/special_tokens_map.json',
 'root-W8A8-Dynamic-Per-Token/vocab.json',
 'root-W8A8-Dynamic-Per-Token/merges.txt',
 'root-W8A8-Dynamic-Per-Token/added_tokens.json',
 'root-W8A8-Dynamic-Per-Token/tokenizer.json')

### Evaluating Accuracy

In [9]:
!pip install lm-eval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: http://mirrors.aliyun.com/pypi/simple
Collecting lm-eval
  Downloading http://mirrors.aliyun.com/pypi/packages/c3/0b/36d6117f644f3685e6b87005ecd7051d01e9cdcf617e8e671102c1546de2/lm_eval-0.4.8-py3-none-any.whl (3.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting evaluate (from lm-eval)
  Downloading http://mirrors.aliyun.com/pypi/packages/a2/e7/cbca9e2d2590eb9b5aa8f7ebabe1beb1498f9462d2ecede5c9fd9735faaf/evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
Collecting jsonlines (from lm-eval)
  Downloading http://mirrors.aliyun.com/pypi/packages/f8/62/d9ba6323b9202dd2fe166beab8a86d29465c41a0288cbe229fac60c1ab8d/jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Collecting numexpr (from lm-eval)
  Downloading http://mirrors.aliyun.com/pypi/packages/7d/9c/6b671dd3fb67d7e

In [10]:
lm_eval --model vllm   --model_args pretrained="/root/autodl-fs/data2/anti_fraud/models/modelscope/hub/hub/Qwen2-7B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true   --tasks gsm8k   --num_fewshot 5   --limit 250   --batch_size 'auto'

SyntaxError: invalid syntax (2781918001.py, line 1)