In [1]:
!pip install -q gptqmodel==1.7.4 datasets

In [2]:
import torch
import gc
from gptqmodel import GPTQModel, QuantizeConfig
from datasets import load_dataset
import os

In [3]:
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

In [4]:
gc.collect()
torch.cuda.empty_cache()

In [5]:
MODEL_ID = "microsoft/phi-2"
QUANT_OUTPUT_DIR = "./microsoft/phi-2-gptq-4bit"

In [6]:
BITS = 4
GROUP_SIZE = 128
BATCH_SIZE = 1
NUM_SAMPLES = 256

In [7]:
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
calibration_data = [t for t in dataset["text"] if len(t) > 200][:NUM_SAMPLES]

In [8]:
len(calibration_data)

256

In [9]:
quantize_config = QuantizeConfig(
    bits=BITS,
    group_size=GROUP_SIZE,
    desc_act=False,  # Disable for memory saving
    sym=True,
    true_sequential=True,
    damp_percent=0.01,
)

In [10]:
gc.collect()
torch.cuda.empty_cache()

In [11]:
model = GPTQModel.load(
    MODEL_ID,
    quantize_config=quantize_config,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
)

INFO - Estimated Quantization BPW (bits per weight): 4.1875 bpw, based on [bits: 4, group_size: 128]


Fetching 17 files:   0%|          | 0/17 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
gc.collect()
torch.cuda.empty_cache()

In [13]:
import time
start_time = time.time()

In [14]:
model.quantize(
    calibration_data,
    batch_size=BATCH_SIZE,
)

INFO - Auto pick kernel based on compatibility: <class 'gptqmodel.nn_modules.qlinear.exllama.ExllamaQuantLinear'>


  |█---------------------------------------| 0:00:00 / 0:00:00 [1/32] 3.1%

INFO - {'layer': 0, 'module': 'self_attn.q_proj', 'loss': '2.11823', 'damp': '0.01000', 'time': '2.918', 'fwd_time': '2.184'}
INFO - {'layer': 0, 'module': 'self_attn.k_proj', 'loss': '2.69769', 'damp': '0.01000', 'time': '1.671', 'fwd_time': '1.196'}
INFO - {'layer': 0, 'module': 'self_attn.v_proj', 'loss': '0.74253', 'damp': '0.01000', 'time': '1.669', 'fwd_time': '0.882'}
INFO - {'layer': 0, 'module': 'self_attn.dense', 'loss': '0.41192', 'damp': '0.01000', 'time': '1.785', 'fwd_time': '2.578'}
INFO - {'layer': 0, 'module': 'mlp.fc1', 'loss': '2.89263', 'damp': '0.01000', 'time': '1.269', 'fwd_time': '0.997'}
INFO - {'layer': 0, 'module': 'mlp.fc2', 'loss': '0.17488', 'damp': '0.01000', 'time': '5.149', 'fwd_time': '4.618'}


 Quantizing mlp.fc2 in layer 0 of 31 |██--------------------------------------| 0:00:32 / 0:08:32 [2/32] 6.2%

INFO - {'layer': 1, 'module': 'self_attn.q_proj', 'loss': '2.02360', 'damp': '0.01000', 'time': '1.021', 'fwd_time': '0.777'}
INFO - {'layer': 1, 'module': 'self_attn.k_proj', 'loss': '2.12476', 'damp': '0.01000', 'time': '1.016', 'fwd_time': '0.777'}
INFO - {'layer': 1, 'module': 'self_attn.v_proj', 'loss': '0.51589', 'damp': '0.01000', 'time': '1.366', 'fwd_time': '0.777'}
INFO - {'layer': 1, 'module': 'self_attn.dense', 'loss': '0.21705', 'damp': '0.01000', 'time': '0.968', 'fwd_time': '0.784'}
INFO - {'layer': 1, 'module': 'mlp.fc1', 'loss': '6.51464', 'damp': '0.01000', 'time': '1.146', 'fwd_time': '0.788'}
INFO - {'layer': 1, 'module': 'mlp.fc2', 'loss': '0.43496', 'damp': '0.01000', 'time': '5.226', 'fwd_time': '4.663'}


 Quantizing mlp.fc2 in layer 1 of 31 |███-------------------------------------| 0:00:55 / 0:09:46 [3/32] 9.4%

INFO - {'layer': 2, 'module': 'self_attn.q_proj', 'loss': '3.04276', 'damp': '0.01000', 'time': '0.971', 'fwd_time': '0.771'}
INFO - {'layer': 2, 'module': 'self_attn.k_proj', 'loss': '3.12524', 'damp': '0.01000', 'time': '0.965', 'fwd_time': '0.787'}
INFO - {'layer': 2, 'module': 'self_attn.v_proj', 'loss': '1.22280', 'damp': '0.01000', 'time': '1.263', 'fwd_time': '0.794'}
INFO - {'layer': 2, 'module': 'self_attn.dense', 'loss': '0.17017', 'damp': '0.01000', 'time': '0.949', 'fwd_time': '0.800'}
INFO - {'layer': 2, 'module': 'mlp.fc1', 'loss': '10.79899', 'damp': '0.01000', 'time': '1.189', 'fwd_time': '0.796'}
INFO - {'layer': 2, 'module': 'mlp.fc2', 'loss': '0.52976', 'damp': '0.01000', 'time': '5.330', 'fwd_time': '4.724'}


 Quantizing mlp.fc2 in layer 2 of 31 |█████-----------------------------------| 0:01:19 / 0:10:32 [4/32] 12.5%

INFO - {'layer': 3, 'module': 'self_attn.q_proj', 'loss': '3.52180', 'damp': '0.01000', 'time': '0.978', 'fwd_time': '0.799'}
INFO - {'layer': 3, 'module': 'self_attn.k_proj', 'loss': '3.35519', 'damp': '0.01000', 'time': '0.974', 'fwd_time': '0.815'}
INFO - {'layer': 3, 'module': 'self_attn.v_proj', 'loss': '2.54775', 'damp': '0.01000', 'time': '1.309', 'fwd_time': '0.802'}
INFO - {'layer': 3, 'module': 'self_attn.dense', 'loss': '0.14040', 'damp': '0.01000', 'time': '0.963', 'fwd_time': '0.844'}
INFO - {'layer': 3, 'module': 'mlp.fc1', 'loss': '17.11711', 'damp': '0.01000', 'time': '1.194', 'fwd_time': '0.808'}
INFO - {'layer': 3, 'module': 'mlp.fc2', 'loss': '0.76169', 'damp': '0.01000', 'time': '5.347', 'fwd_time': '4.753'}


 Quantizing mlp.fc2 in layer 3 of 31 |██████----------------------------------| 0:01:43 / 0:10:59 [5/32] 15.6%

INFO - {'layer': 4, 'module': 'self_attn.q_proj', 'loss': '4.75769', 'damp': '0.01000', 'time': '0.967', 'fwd_time': '0.791'}
INFO - {'layer': 4, 'module': 'self_attn.k_proj', 'loss': '4.51278', 'damp': '0.01000', 'time': '0.976', 'fwd_time': '0.795'}
INFO - {'layer': 4, 'module': 'self_attn.v_proj', 'loss': '3.13824', 'damp': '0.01000', 'time': '3.792', 'fwd_time': '2.367'}
INFO - {'layer': 4, 'module': 'self_attn.dense', 'loss': '0.18074', 'damp': '0.01000', 'time': '1.171', 'fwd_time': '1.460'}
INFO - {'layer': 4, 'module': 'mlp.fc1', 'loss': '18.47416', 'damp': '0.01000', 'time': '1.154', 'fwd_time': '0.789'}
INFO - {'layer': 4, 'module': 'mlp.fc2', 'loss': '1.05056', 'damp': '0.01000', 'time': '4.806', 'fwd_time': '4.697'}


 Quantizing mlp.fc2 in layer 4 of 31 |███████---------------------------------| 0:02:13 / 0:11:49 [6/32] 18.8%

INFO - {'layer': 5, 'module': 'self_attn.q_proj', 'loss': '4.43362', 'damp': '0.01000', 'time': '1.278', 'fwd_time': '0.797'}
INFO - {'layer': 5, 'module': 'self_attn.k_proj', 'loss': '4.18736', 'damp': '0.01000', 'time': '0.939', 'fwd_time': '0.784'}
INFO - {'layer': 5, 'module': 'self_attn.v_proj', 'loss': '2.84038', 'damp': '0.01000', 'time': '0.922', 'fwd_time': '0.789'}
INFO - {'layer': 5, 'module': 'self_attn.dense', 'loss': '0.23088', 'damp': '0.01000', 'time': '0.937', 'fwd_time': '0.792'}
INFO - {'layer': 5, 'module': 'mlp.fc1', 'loss': '16.17841', 'damp': '0.01000', 'time': '1.163', 'fwd_time': '0.794'}
INFO - {'layer': 5, 'module': 'mlp.fc2', 'loss': '1.10820', 'damp': '0.01000', 'time': '4.864', 'fwd_time': '4.727'}


 Quantizing mlp.fc2 in layer 5 of 31 |████████--------------------------------| 0:02:36 / 0:11:53 [7/32] 21.9%

INFO - {'layer': 6, 'module': 'self_attn.q_proj', 'loss': '5.25388', 'damp': '0.01000', 'time': '1.116', 'fwd_time': '0.792'}
INFO - {'layer': 6, 'module': 'self_attn.k_proj', 'loss': '4.99336', 'damp': '0.01000', 'time': '1.049', 'fwd_time': '0.809'}
INFO - {'layer': 6, 'module': 'self_attn.v_proj', 'loss': '3.46194', 'damp': '0.01000', 'time': '0.949', 'fwd_time': '0.790'}
INFO - {'layer': 6, 'module': 'self_attn.dense', 'loss': '0.22624', 'damp': '0.01000', 'time': '0.945', 'fwd_time': '0.789'}
INFO - {'layer': 6, 'module': 'mlp.fc1', 'loss': '17.26792', 'damp': '0.01000', 'time': '1.129', 'fwd_time': '0.799'}
INFO - {'layer': 6, 'module': 'mlp.fc2', 'loss': '1.21729', 'damp': '0.01000', 'time': '4.843', 'fwd_time': '4.737'}


 Quantizing mlp.fc2 in layer 6 of 31 |██████████------------------------------| 0:02:59 / 0:11:56 [8/32] 25.0%

INFO - {'layer': 7, 'module': 'self_attn.q_proj', 'loss': '5.63040', 'damp': '0.01000', 'time': '0.969', 'fwd_time': '0.788'}
INFO - {'layer': 7, 'module': 'self_attn.k_proj', 'loss': '5.36676', 'damp': '0.01000', 'time': '1.365', 'fwd_time': '0.802'}
INFO - {'layer': 7, 'module': 'self_attn.v_proj', 'loss': '3.53270', 'damp': '0.01000', 'time': '0.963', 'fwd_time': '0.787'}
INFO - {'layer': 7, 'module': 'self_attn.dense', 'loss': '0.34166', 'damp': '0.01000', 'time': '0.996', 'fwd_time': '0.801'}
INFO - {'layer': 7, 'module': 'mlp.fc1', 'loss': '17.49547', 'damp': '0.01000', 'time': '1.138', 'fwd_time': '0.792'}
INFO - {'layer': 7, 'module': 'mlp.fc2', 'loss': '1.31532', 'damp': '0.01000', 'time': '4.805', 'fwd_time': '4.720'}


 Quantizing mlp.fc2 in layer 7 of 31 |███████████-----------------------------| 0:03:22 / 0:11:58 [9/32] 28.1%

INFO - {'layer': 8, 'module': 'self_attn.q_proj', 'loss': '5.46586', 'damp': '0.01000', 'time': '0.976', 'fwd_time': '0.787'}
INFO - {'layer': 8, 'module': 'self_attn.k_proj', 'loss': '5.32433', 'damp': '0.01000', 'time': '1.299', 'fwd_time': '0.782'}
INFO - {'layer': 8, 'module': 'self_attn.v_proj', 'loss': '4.10347', 'damp': '0.01000', 'time': '0.938', 'fwd_time': '0.798'}
INFO - {'layer': 8, 'module': 'self_attn.dense', 'loss': '0.24364', 'damp': '0.01000', 'time': '0.961', 'fwd_time': '0.796'}
INFO - {'layer': 8, 'module': 'mlp.fc1', 'loss': '17.23416', 'damp': '0.01000', 'time': '1.160', 'fwd_time': '0.788'}
INFO - {'layer': 8, 'module': 'mlp.fc2', 'loss': '1.38494', 'damp': '0.01000', 'time': '4.818', 'fwd_time': '4.713'}


 Quantizing mlp.fc2 in layer 8 of 31 |████████████----------------------------| 0:03:45 / 0:12:00 [10/32] 31.2%

INFO - {'layer': 9, 'module': 'self_attn.q_proj', 'loss': '5.53879', 'damp': '0.01000', 'time': '0.960', 'fwd_time': '0.785'}
INFO - {'layer': 9, 'module': 'self_attn.k_proj', 'loss': '5.33252', 'damp': '0.01000', 'time': '0.974', 'fwd_time': '0.784'}
INFO - {'layer': 9, 'module': 'self_attn.v_proj', 'loss': '4.10792', 'damp': '0.01000', 'time': '1.208', 'fwd_time': '0.805'}
INFO - {'layer': 9, 'module': 'self_attn.dense', 'loss': '0.25011', 'damp': '0.01000', 'time': '0.978', 'fwd_time': '0.792'}
INFO - {'layer': 9, 'module': 'mlp.fc1', 'loss': '16.97293', 'damp': '0.01000', 'time': '1.147', 'fwd_time': '0.795'}
INFO - {'layer': 9, 'module': 'mlp.fc2', 'loss': '1.39178', 'damp': '0.01000', 'time': '5.113', 'fwd_time': '4.709'}


 Quantizing mlp.fc2 in layer 9 of 31 |█████████████---------------------------| 0:04:08 / 0:12:01 [11/32] 34.4%

INFO - {'layer': 10, 'module': 'self_attn.q_proj', 'loss': '5.72229', 'damp': '0.01000', 'time': '0.950', 'fwd_time': '0.785'}
INFO - {'layer': 10, 'module': 'self_attn.k_proj', 'loss': '5.61028', 'damp': '0.01000', 'time': '0.950', 'fwd_time': '0.782'}
INFO - {'layer': 10, 'module': 'self_attn.v_proj', 'loss': '3.59382', 'damp': '0.01000', 'time': '1.288', 'fwd_time': '0.806'}
INFO - {'layer': 10, 'module': 'self_attn.dense', 'loss': '0.35792', 'damp': '0.01000', 'time': '0.945', 'fwd_time': '0.787'}
INFO - {'layer': 10, 'module': 'mlp.fc1', 'loss': '16.48306', 'damp': '0.01000', 'time': '1.149', 'fwd_time': '0.797'}
INFO - {'layer': 10, 'module': 'mlp.fc2', 'loss': '1.37695', 'damp': '0.01000', 'time': '5.264', 'fwd_time': '4.711'}


 Quantizing mlp.fc2 in layer 10 of 31 |███████████████-------------------------| 0:04:32 / 0:12:05 [12/32] 37.5%

INFO - {'layer': 11, 'module': 'self_attn.q_proj', 'loss': '5.93744', 'damp': '0.01000', 'time': '0.939', 'fwd_time': '0.788'}
INFO - {'layer': 11, 'module': 'self_attn.k_proj', 'loss': '5.79656', 'damp': '0.01000', 'time': '0.943', 'fwd_time': '0.785'}
INFO - {'layer': 11, 'module': 'self_attn.v_proj', 'loss': '3.91025', 'damp': '0.01000', 'time': '1.280', 'fwd_time': '0.788'}
INFO - {'layer': 11, 'module': 'self_attn.dense', 'loss': '0.42792', 'damp': '0.01000', 'time': '0.960', 'fwd_time': '0.814'}
INFO - {'layer': 11, 'module': 'mlp.fc1', 'loss': '15.86397', 'damp': '0.01000', 'time': '1.176', 'fwd_time': '0.788'}
INFO - {'layer': 11, 'module': 'mlp.fc2', 'loss': '1.39113', 'damp': '0.01000', 'time': '5.317', 'fwd_time': '4.711'}


 Quantizing mlp.fc2 in layer 11 of 31 |████████████████------------------------| 0:04:55 / 0:12:06 [13/32] 40.6%

INFO - {'layer': 12, 'module': 'self_attn.q_proj', 'loss': '6.15305', 'damp': '0.01000', 'time': '0.974', 'fwd_time': '0.786'}
INFO - {'layer': 12, 'module': 'self_attn.k_proj', 'loss': '6.06291', 'damp': '0.01000', 'time': '0.945', 'fwd_time': '0.791'}
INFO - {'layer': 12, 'module': 'self_attn.v_proj', 'loss': '4.03690', 'damp': '0.01000', 'time': '1.261', 'fwd_time': '0.789'}
INFO - {'layer': 12, 'module': 'self_attn.dense', 'loss': '0.42181', 'damp': '0.01000', 'time': '0.960', 'fwd_time': '0.822'}
INFO - {'layer': 12, 'module': 'mlp.fc1', 'loss': '15.87794', 'damp': '0.01000', 'time': '1.190', 'fwd_time': '0.793'}
INFO - {'layer': 12, 'module': 'mlp.fc2', 'loss': '1.46027', 'damp': '0.01000', 'time': '5.424', 'fwd_time': '4.703'}


 Quantizing mlp.fc2 in layer 12 of 31 |█████████████████-----------------------| 0:05:19 / 0:12:09 [14/32] 43.8%

INFO - {'layer': 13, 'module': 'self_attn.q_proj', 'loss': '5.96786', 'damp': '0.01000', 'time': '0.968', 'fwd_time': '0.787'}
INFO - {'layer': 13, 'module': 'self_attn.k_proj', 'loss': '5.95349', 'damp': '0.01000', 'time': '0.951', 'fwd_time': '0.785'}
INFO - {'layer': 13, 'module': 'self_attn.v_proj', 'loss': '3.97185', 'damp': '0.01000', 'time': '1.184', 'fwd_time': '0.790'}
INFO - {'layer': 13, 'module': 'self_attn.dense', 'loss': '0.39777', 'damp': '0.01000', 'time': '0.943', 'fwd_time': '0.839'}
INFO - {'layer': 13, 'module': 'mlp.fc1', 'loss': '16.30511', 'damp': '0.01000', 'time': '1.187', 'fwd_time': '0.792'}
INFO - {'layer': 13, 'module': 'mlp.fc2', 'loss': '1.41889', 'damp': '0.01000', 'time': '5.460', 'fwd_time': '4.708'}


 Quantizing mlp.fc2 in layer 13 of 31 |██████████████████----------------------| 0:05:43 / 0:12:11 [15/32] 46.9%

INFO - {'layer': 14, 'module': 'self_attn.q_proj', 'loss': '7.10151', 'damp': '0.01000', 'time': '0.944', 'fwd_time': '0.789'}
INFO - {'layer': 14, 'module': 'self_attn.k_proj', 'loss': '5.87626', 'damp': '0.01000', 'time': '0.962', 'fwd_time': '0.795'}
INFO - {'layer': 14, 'module': 'self_attn.v_proj', 'loss': '3.92800', 'damp': '0.01000', 'time': '1.042', 'fwd_time': '0.788'}
INFO - {'layer': 14, 'module': 'self_attn.dense', 'loss': '0.42202', 'damp': '0.01000', 'time': '1.112', 'fwd_time': '0.829'}
INFO - {'layer': 14, 'module': 'mlp.fc1', 'loss': '15.96297', 'damp': '0.01000', 'time': '1.180', 'fwd_time': '0.794'}
INFO - {'layer': 14, 'module': 'mlp.fc2', 'loss': '1.41793', 'damp': '0.01000', 'time': '5.416', 'fwd_time': '4.709'}


 Quantizing mlp.fc2 in layer 14 of 31 |████████████████████--------------------| 0:06:06 / 0:12:12 [16/32] 50.0%

INFO - {'layer': 15, 'module': 'self_attn.q_proj', 'loss': '5.88884', 'damp': '0.01000', 'time': '0.937', 'fwd_time': '0.788'}
INFO - {'layer': 15, 'module': 'self_attn.k_proj', 'loss': '5.87015', 'damp': '0.01000', 'time': '0.939', 'fwd_time': '0.784'}
INFO - {'layer': 15, 'module': 'self_attn.v_proj', 'loss': '3.72644', 'damp': '0.01000', 'time': '0.957', 'fwd_time': '0.784'}
INFO - {'layer': 15, 'module': 'self_attn.dense', 'loss': '0.41943', 'damp': '0.01000', 'time': '1.292', 'fwd_time': '0.830'}
INFO - {'layer': 15, 'module': 'mlp.fc1', 'loss': '15.15756', 'damp': '0.01000', 'time': '1.183', 'fwd_time': '0.796'}
INFO - {'layer': 15, 'module': 'mlp.fc2', 'loss': '1.50060', 'damp': '0.01000', 'time': '5.455', 'fwd_time': '4.701'}


 Quantizing mlp.fc2 in layer 15 of 31 |█████████████████████-------------------| 0:06:30 / 0:12:14 [17/32] 53.1%

INFO - {'layer': 16, 'module': 'self_attn.q_proj', 'loss': '6.06117', 'damp': '0.01000', 'time': '0.961', 'fwd_time': '0.788'}
INFO - {'layer': 16, 'module': 'self_attn.k_proj', 'loss': '6.02609', 'damp': '0.01000', 'time': '0.951', 'fwd_time': '0.782'}
INFO - {'layer': 16, 'module': 'self_attn.v_proj', 'loss': '3.92634', 'damp': '0.01000', 'time': '0.977', 'fwd_time': '0.788'}
INFO - {'layer': 16, 'module': 'self_attn.dense', 'loss': '0.40726', 'damp': '0.01000', 'time': '1.337', 'fwd_time': '0.815'}
INFO - {'layer': 16, 'module': 'mlp.fc1', 'loss': '15.49315', 'damp': '0.01000', 'time': '1.179', 'fwd_time': '0.795'}
INFO - {'layer': 16, 'module': 'mlp.fc2', 'loss': '1.54409', 'damp': '0.01000', 'time': '5.389', 'fwd_time': '4.705'}


 Quantizing mlp.fc2 in layer 16 of 31 |██████████████████████------------------| 0:06:54 / 0:12:16 [18/32] 56.2%

INFO - {'layer': 17, 'module': 'self_attn.q_proj', 'loss': '5.69675', 'damp': '0.01000', 'time': '0.940', 'fwd_time': '0.793'}
INFO - {'layer': 17, 'module': 'self_attn.k_proj', 'loss': '5.67979', 'damp': '0.01000', 'time': '0.941', 'fwd_time': '0.787'}
INFO - {'layer': 17, 'module': 'self_attn.v_proj', 'loss': '3.60586', 'damp': '0.01000', 'time': '0.994', 'fwd_time': '0.791'}
INFO - {'layer': 17, 'module': 'self_attn.dense', 'loss': '0.42723', 'damp': '0.01000', 'time': '1.281', 'fwd_time': '0.813'}
INFO - {'layer': 17, 'module': 'mlp.fc1', 'loss': '14.23634', 'damp': '0.01000', 'time': '1.185', 'fwd_time': '0.799'}
INFO - {'layer': 17, 'module': 'mlp.fc2', 'loss': '1.58931', 'damp': '0.01000', 'time': '5.818', 'fwd_time': '4.717'}


 Quantizing mlp.fc2 in layer 17 of 31 |███████████████████████-----------------| 0:07:18 / 0:12:17 [19/32] 59.4%

INFO - {'layer': 18, 'module': 'self_attn.q_proj', 'loss': '6.59250', 'damp': '0.01000', 'time': '1.013', 'fwd_time': '0.785'}
INFO - {'layer': 18, 'module': 'self_attn.k_proj', 'loss': '6.57154', 'damp': '0.01000', 'time': '0.976', 'fwd_time': '0.785'}
INFO - {'layer': 18, 'module': 'self_attn.v_proj', 'loss': '3.72203', 'damp': '0.01000', 'time': '0.961', 'fwd_time': '0.791'}
INFO - {'layer': 18, 'module': 'self_attn.dense', 'loss': '0.45506', 'damp': '0.01000', 'time': '1.291', 'fwd_time': '0.830'}
INFO - {'layer': 18, 'module': 'mlp.fc1', 'loss': '14.23481', 'damp': '0.01000', 'time': '1.233', 'fwd_time': '0.793'}
INFO - {'layer': 18, 'module': 'mlp.fc2', 'loss': '1.65092', 'damp': '0.01000', 'time': '5.540', 'fwd_time': '4.722'}


 Quantizing mlp.fc2 in layer 18 of 31 |█████████████████████████---------------| 0:07:42 / 0:12:19 [20/32] 62.5%

INFO - {'layer': 19, 'module': 'self_attn.q_proj', 'loss': '6.04805', 'damp': '0.01000', 'time': '0.944', 'fwd_time': '0.784'}
INFO - {'layer': 19, 'module': 'self_attn.k_proj', 'loss': '6.12464', 'damp': '0.01000', 'time': '0.930', 'fwd_time': '0.787'}
INFO - {'layer': 19, 'module': 'self_attn.v_proj', 'loss': '3.78278', 'damp': '0.01000', 'time': '0.951', 'fwd_time': '0.791'}
INFO - {'layer': 19, 'module': 'self_attn.dense', 'loss': '0.45198', 'damp': '0.01000', 'time': '1.292', 'fwd_time': '0.819'}
INFO - {'layer': 19, 'module': 'mlp.fc1', 'loss': '15.16984', 'damp': '0.01000', 'time': '1.226', 'fwd_time': '0.796'}
INFO - {'layer': 19, 'module': 'mlp.fc2', 'loss': '1.86016', 'damp': '0.01000', 'time': '5.426', 'fwd_time': '4.715'}


 Quantizing mlp.fc2 in layer 19 of 31 |██████████████████████████--------------| 0:08:06 / 0:12:20 [21/32] 65.6%

INFO - {'layer': 20, 'module': 'self_attn.q_proj', 'loss': '7.41917', 'damp': '0.01000', 'time': '0.957', 'fwd_time': '0.786'}
INFO - {'layer': 20, 'module': 'self_attn.k_proj', 'loss': '6.10200', 'damp': '0.01000', 'time': '0.975', 'fwd_time': '0.788'}
INFO - {'layer': 20, 'module': 'self_attn.v_proj', 'loss': '3.55304', 'damp': '0.01000', 'time': '0.950', 'fwd_time': '0.789'}
INFO - {'layer': 20, 'module': 'self_attn.dense', 'loss': '0.51224', 'damp': '0.01000', 'time': '1.295', 'fwd_time': '0.818'}
INFO - {'layer': 20, 'module': 'mlp.fc1', 'loss': '14.95535', 'damp': '0.01000', 'time': '1.200', 'fwd_time': '0.797'}
INFO - {'layer': 20, 'module': 'mlp.fc2', 'loss': '1.93493', 'damp': '0.01000', 'time': '5.468', 'fwd_time': '4.707'}


 Quantizing mlp.fc2 in layer 20 of 31 |███████████████████████████-------------| 0:08:30 / 0:12:21 [22/32] 68.8%

INFO - {'layer': 21, 'module': 'self_attn.q_proj', 'loss': '5.94180', 'damp': '0.01000', 'time': '0.952', 'fwd_time': '0.788'}
INFO - {'layer': 21, 'module': 'self_attn.k_proj', 'loss': '5.74323', 'damp': '0.01000', 'time': '0.946', 'fwd_time': '0.793'}
INFO - {'layer': 21, 'module': 'self_attn.v_proj', 'loss': '4.23128', 'damp': '0.01000', 'time': '0.934', 'fwd_time': '0.789'}
INFO - {'layer': 21, 'module': 'self_attn.dense', 'loss': '0.42899', 'damp': '0.01000', 'time': '1.268', 'fwd_time': '0.803'}
INFO - {'layer': 21, 'module': 'mlp.fc1', 'loss': '15.80085', 'damp': '0.01000', 'time': '1.195', 'fwd_time': '0.822'}
INFO - {'layer': 21, 'module': 'mlp.fc2', 'loss': '2.14384', 'damp': '0.01000', 'time': '5.427', 'fwd_time': '4.711'}


 Quantizing mlp.fc2 in layer 21 of 31 |████████████████████████████------------| 0:08:53 / 0:12:21 [23/32] 71.9%

INFO - {'layer': 22, 'module': 'self_attn.q_proj', 'loss': '6.84453', 'damp': '0.01000', 'time': '0.952', 'fwd_time': '0.787'}
INFO - {'layer': 22, 'module': 'self_attn.k_proj', 'loss': '5.69291', 'damp': '0.01000', 'time': '0.956', 'fwd_time': '0.791'}
INFO - {'layer': 22, 'module': 'self_attn.v_proj', 'loss': '4.38583', 'damp': '0.01000', 'time': '0.969', 'fwd_time': '0.784'}
INFO - {'layer': 22, 'module': 'self_attn.dense', 'loss': '0.32934', 'damp': '0.01000', 'time': '1.314', 'fwd_time': '0.793'}
INFO - {'layer': 22, 'module': 'mlp.fc1', 'loss': '16.28018', 'damp': '0.01000', 'time': '1.209', 'fwd_time': '0.821'}
INFO - {'layer': 22, 'module': 'mlp.fc2', 'loss': '2.21291', 'damp': '0.01000', 'time': '5.432', 'fwd_time': '4.714'}


 Quantizing mlp.fc2 in layer 22 of 31 |██████████████████████████████----------| 0:09:17 / 0:12:22 [24/32] 75.0%

INFO - {'layer': 23, 'module': 'self_attn.q_proj', 'loss': '7.18606', 'damp': '0.01000', 'time': '0.939', 'fwd_time': '0.785'}
INFO - {'layer': 23, 'module': 'self_attn.k_proj', 'loss': '6.27305', 'damp': '0.01000', 'time': '0.950', 'fwd_time': '0.785'}
INFO - {'layer': 23, 'module': 'self_attn.v_proj', 'loss': '4.22199', 'damp': '0.01000', 'time': '0.978', 'fwd_time': '0.787'}
INFO - {'layer': 23, 'module': 'self_attn.dense', 'loss': '0.51650', 'damp': '0.01000', 'time': '1.298', 'fwd_time': '0.795'}
INFO - {'layer': 23, 'module': 'mlp.fc1', 'loss': '17.01013', 'damp': '0.01000', 'time': '1.200', 'fwd_time': '0.820'}
INFO - {'layer': 23, 'module': 'mlp.fc2', 'loss': '2.44517', 'damp': '0.01000', 'time': '5.512', 'fwd_time': '4.721'}


 Quantizing mlp.fc2 in layer 23 of 31 |███████████████████████████████---------| 0:09:41 / 0:12:23 [25/32] 78.1%

INFO - {'layer': 24, 'module': 'self_attn.q_proj', 'loss': '6.11543', 'damp': '0.01000', 'time': '0.956', 'fwd_time': '0.786'}
INFO - {'layer': 24, 'module': 'self_attn.k_proj', 'loss': '6.10191', 'damp': '0.01000', 'time': '0.950', 'fwd_time': '0.785'}
INFO - {'layer': 24, 'module': 'self_attn.v_proj', 'loss': '5.12446', 'damp': '0.01000', 'time': '0.949', 'fwd_time': '0.788'}
INFO - {'layer': 24, 'module': 'self_attn.dense', 'loss': '0.34563', 'damp': '0.01000', 'time': '1.216', 'fwd_time': '0.791'}
INFO - {'layer': 24, 'module': 'mlp.fc1', 'loss': '18.40093', 'damp': '0.01000', 'time': '1.213', 'fwd_time': '0.838'}
INFO - {'layer': 24, 'module': 'mlp.fc2', 'loss': '2.68705', 'damp': '0.01000', 'time': '5.407', 'fwd_time': '4.712'}


 Quantizing mlp.fc2 in layer 24 of 31 |████████████████████████████████--------| 0:10:05 / 0:12:24 [26/32] 81.2%

INFO - {'layer': 25, 'module': 'self_attn.q_proj', 'loss': '7.89439', 'damp': '0.01000', 'time': '0.964', 'fwd_time': '0.783'}
INFO - {'layer': 25, 'module': 'self_attn.k_proj', 'loss': '6.33252', 'damp': '0.01000', 'time': '0.971', 'fwd_time': '0.791'}
INFO - {'layer': 25, 'module': 'self_attn.v_proj', 'loss': '5.30663', 'damp': '0.01000', 'time': '0.938', 'fwd_time': '0.787'}
INFO - {'layer': 25, 'module': 'self_attn.dense', 'loss': '0.50293', 'damp': '0.01000', 'time': '1.212', 'fwd_time': '0.791'}
INFO - {'layer': 25, 'module': 'mlp.fc1', 'loss': '19.94388', 'damp': '0.01000', 'time': '1.188', 'fwd_time': '0.842'}
INFO - {'layer': 25, 'module': 'mlp.fc2', 'loss': '2.74015', 'damp': '0.01000', 'time': '5.583', 'fwd_time': '4.700'}


 Quantizing mlp.fc2 in layer 25 of 31 |█████████████████████████████████-------| 0:10:29 / 0:12:25 [27/32] 84.4%

INFO - {'layer': 26, 'module': 'self_attn.q_proj', 'loss': '9.15390', 'damp': '0.01000', 'time': '0.987', 'fwd_time': '0.786'}
INFO - {'layer': 26, 'module': 'self_attn.k_proj', 'loss': '6.62974', 'damp': '0.01000', 'time': '0.980', 'fwd_time': '0.789'}
INFO - {'layer': 26, 'module': 'self_attn.v_proj', 'loss': '6.04221', 'damp': '0.01000', 'time': '0.937', 'fwd_time': '0.789'}
INFO - {'layer': 26, 'module': 'self_attn.dense', 'loss': '0.79446', 'damp': '0.01000', 'time': '1.306', 'fwd_time': '0.797'}
INFO - {'layer': 26, 'module': 'mlp.fc1', 'loss': '19.91374', 'damp': '0.01000', 'time': '1.517', 'fwd_time': '0.993'}
INFO - {'layer': 26, 'module': 'mlp.fc2', 'loss': '3.12733', 'damp': '0.01000', 'time': '5.455', 'fwd_time': '4.699'}


 Quantizing mlp.fc2 in layer 26 of 31 |███████████████████████████████████-----| 0:10:53 / 0:12:26 [28/32] 87.5%

INFO - {'layer': 27, 'module': 'self_attn.q_proj', 'loss': '7.80589', 'damp': '0.01000', 'time': '0.977', 'fwd_time': '0.791'}
INFO - {'layer': 27, 'module': 'self_attn.k_proj', 'loss': '9.00500', 'damp': '0.01000', 'time': '0.948', 'fwd_time': '0.784'}
INFO - {'layer': 27, 'module': 'self_attn.v_proj', 'loss': '6.47372', 'damp': '0.01000', 'time': '0.970', 'fwd_time': '0.789'}
INFO - {'layer': 27, 'module': 'self_attn.dense', 'loss': '0.70859', 'damp': '0.01000', 'time': '1.147', 'fwd_time': '0.793'}
INFO - {'layer': 27, 'module': 'mlp.fc1', 'loss': '21.75935', 'damp': '0.01000', 'time': '1.244', 'fwd_time': '0.850'}
INFO - {'layer': 27, 'module': 'mlp.fc2', 'loss': '3.99460', 'damp': '0.01000', 'time': '5.442', 'fwd_time': '4.707'}


 Quantizing mlp.fc2 in layer 27 of 31 |████████████████████████████████████----| 0:11:17 / 0:12:27 [29/32] 90.6%

INFO - {'layer': 28, 'module': 'self_attn.q_proj', 'loss': '6.75612', 'damp': '0.01000', 'time': '0.990', 'fwd_time': '0.787'}
INFO - {'layer': 28, 'module': 'self_attn.k_proj', 'loss': '6.88027', 'damp': '0.01000', 'time': '0.981', 'fwd_time': '0.787'}
INFO - {'layer': 28, 'module': 'self_attn.v_proj', 'loss': '6.79009', 'damp': '0.01000', 'time': '0.951', 'fwd_time': '0.790'}
INFO - {'layer': 28, 'module': 'self_attn.dense', 'loss': '0.80733', 'damp': '0.01000', 'time': '1.164', 'fwd_time': '0.797'}
INFO - {'layer': 28, 'module': 'mlp.fc1', 'loss': '22.58532', 'damp': '0.01000', 'time': '1.363', 'fwd_time': '0.836'}
INFO - {'layer': 28, 'module': 'mlp.fc2', 'loss': '4.88541', 'damp': '0.01000', 'time': '5.915', 'fwd_time': '4.706'}


 Quantizing mlp.fc2 in layer 28 of 31 |█████████████████████████████████████---| 0:11:42 / 0:12:28 [30/32] 93.8%

INFO - {'layer': 29, 'module': 'self_attn.q_proj', 'loss': '143.12994', 'damp': '0.01000', 'time': '1.071', 'fwd_time': '0.793'}
INFO - {'layer': 29, 'module': 'self_attn.k_proj', 'loss': '53.28530', 'damp': '0.01000', 'time': '1.033', 'fwd_time': '0.784'}
INFO - {'layer': 29, 'module': 'self_attn.v_proj', 'loss': '4.23145', 'damp': '0.01000', 'time': '1.030', 'fwd_time': '0.782'}
INFO - {'layer': 29, 'module': 'self_attn.dense', 'loss': '2.60639', 'damp': '0.01000', 'time': '1.372', 'fwd_time': '0.795'}
INFO - {'layer': 29, 'module': 'mlp.fc1', 'loss': '27.76401', 'damp': '0.01000', 'time': '1.264', 'fwd_time': '0.887'}
INFO - {'layer': 29, 'module': 'mlp.fc2', 'loss': '5.47490', 'damp': '0.01000', 'time': '5.658', 'fwd_time': '4.725'}


 Quantizing mlp.fc2 in layer 29 of 31 |██████████████████████████████████████--| 0:12:06 / 0:12:29 [31/32] 96.9%

INFO - {'layer': 30, 'module': 'self_attn.q_proj', 'loss': '623.72046', 'damp': '0.01000', 'time': '1.004', 'fwd_time': '0.786'}
INFO - {'layer': 30, 'module': 'self_attn.k_proj', 'loss': '59.48251', 'damp': '0.01000', 'time': '1.035', 'fwd_time': '0.797'}
INFO - {'layer': 30, 'module': 'self_attn.v_proj', 'loss': '3.22752', 'damp': '0.01000', 'time': '1.009', 'fwd_time': '0.796'}
INFO - {'layer': 30, 'module': 'self_attn.dense', 'loss': '1.87075', 'damp': '0.01000', 'time': '1.316', 'fwd_time': '0.814'}
INFO - {'layer': 30, 'module': 'mlp.fc1', 'loss': '30.52082', 'damp': '0.01000', 'time': '1.327', 'fwd_time': '0.835'}
INFO - {'layer': 30, 'module': 'mlp.fc2', 'loss': '5.91598', 'damp': '0.01000', 'time': '5.724', 'fwd_time': '4.738'}


 Quantizing mlp.fc2 in layer 30 of 31 |████████████████████████████████████████| 0:12:31 / 0:12:31 [32/32] 100.0%

INFO - {'layer': 31, 'module': 'self_attn.q_proj', 'loss': '157.42490', 'damp': '0.01000', 'time': '1.020', 'fwd_time': '0.784'}
INFO - {'layer': 31, 'module': 'self_attn.k_proj', 'loss': '28.94958', 'damp': '0.01000', 'time': '1.033', 'fwd_time': '0.789'}
INFO - {'layer': 31, 'module': 'self_attn.v_proj', 'loss': '1.76336', 'damp': '0.01000', 'time': '0.990', 'fwd_time': '0.782'}
INFO - {'layer': 31, 'module': 'self_attn.dense', 'loss': '0.59219', 'damp': '0.01000', 'time': '1.440', 'fwd_time': '0.828'}
INFO - {'layer': 31, 'module': 'mlp.fc1', 'loss': '18.25520', 'damp': '0.01000', 'time': '1.243', 'fwd_time': '0.795'}
INFO - {'layer': 31, 'module': 'mlp.fc2', 'loss': '3.60578', 'damp': '0.01000', 'time': '5.695', 'fwd_time': '4.707'}
INFO - Quantization summary:
[{'layer': 0, 'module': 'self_attn.q_proj', 'loss': '2.11823', 'damp': '0.01000', 'time': '2.918', 'fwd_time': '2.184'}, {'layer': 0, 'module': 'self_attn.k_proj', 'loss': '2.69769', 'damp': '0.01000', 'time': '1.671', 'fwd_

 Packing model.layers.31.mlp.fc2 |----------------------------------------| 100.0%

INFO - Model packed.


 Quantizing mlp.fc2 in layer 31 of 31 |----------------------------------------| 100.0%

[{'layer': 0,
  'module': 'self_attn.q_proj',
  'loss': '2.11823',
  'damp': '0.01000',
  'time': '2.918',
  'fwd_time': '2.184'},
 {'layer': 0,
  'module': 'self_attn.k_proj',
  'loss': '2.69769',
  'damp': '0.01000',
  'time': '1.671',
  'fwd_time': '1.196'},
 {'layer': 0,
  'module': 'self_attn.v_proj',
  'loss': '0.74253',
  'damp': '0.01000',
  'time': '1.669',
  'fwd_time': '0.882'},
 {'layer': 0,
  'module': 'self_attn.dense',
  'loss': '0.41192',
  'damp': '0.01000',
  'time': '1.785',
  'fwd_time': '2.578'},
 {'layer': 0,
  'module': 'mlp.fc1',
  'loss': '2.89263',
  'damp': '0.01000',
  'time': '1.269',
  'fwd_time': '0.997'},
 {'layer': 0,
  'module': 'mlp.fc2',
  'loss': '0.17488',
  'damp': '0.01000',
  'time': '5.149',
  'fwd_time': '4.618'},
 {'layer': 1,
  'module': 'self_attn.q_proj',
  'loss': '2.02360',
  'damp': '0.01000',
  'time': '1.021',
  'fwd_time': '0.777'},
 {'layer': 1,
  'module': 'self_attn.k_proj',
  'loss': '2.12476',
  'damp': '0.01000',
  'time': '1.0

In [15]:
elapsed = time.time() - start_time

In [25]:
print(f"✅ QUANTIZATION COMPLETE in {elapsed/60:.1f} minutes")

✅ QUANTIZATION COMPLETE in 14.8 minutes


In [16]:
model.save(QUANT_OUTPUT_DIR)

files = os.listdir(QUANT_OUTPUT_DIR)
total_size = sum(
    os.path.getsize(os.path.join(QUANT_OUTPUT_DIR, f))
    for f in files
    if os.path.isfile(os.path.join(QUANT_OUTPUT_DIR, f))
) / (1024**2)

print(f"✅ Saved: {QUANT_OUTPUT_DIR} ({total_size:.1f} MB)")

INFO - Pre-Quantized model size: 5301.87MB, 5.18GB
INFO - Quantized model size: 1751.62MB, 1.71GB
INFO - Size difference: 3550.25MB, 3.47GB - 66.96%


✅ Saved: ./microsoft/phi-2-gptq-4bit (1756.2 MB)


In [17]:
gc.collect()
torch.cuda.empty_cache()

In [18]:
model = GPTQModel.load(QUANT_OUTPUT_DIR, device="cuda:0")

INFO - Estimated Quantization BPW (bits per weight): 4.1875 bpw, based on [bits: 4, group_size: 128]
`torch_dtype` is deprecated! Use `dtype` instead!
INFO - Converting `checkpoint_format` from `gptq` to `gptq_v2`.
INFO - Conversion complete: 0.18242835998535156s


In [20]:
prompt = "What is artificial intelligence?"
result = model.generate(prompt, max_new_tokens=250)[0]
output = model.tokenizer.decode(result, skip_special_tokens=True)

print(f"\nPrompt: {prompt}")
print(f"Output: {output}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Prompt: What is artificial intelligence?
Output: What is artificial intelligence?
AI is the ability of machines to perform tasks that normally require human intelligence, such as learning, reasoning, and problem-solving. AI can be classified into two types: narrow AI and general AI. Narrow AI is the type of AI that can perform specific tasks, such as playing chess, recognizing faces, or driving cars. General AI is the type of AI that can perform any task that a human can, but it is still under development and not yet achieved.

What are the benefits of artificial intelligence?
AI has many benefits for various fields and industries, such as:

- Improving the quality and efficiency of products and services
- Enhancing the accuracy and speed of decision-making
- Reducing the costs and risks of human errors
- Creating new opportunities and innovations
- Solving complex and global problems

What are the challenges of artificial intelligence?
AI also has some challenges and risks, such as:
