[BUG/DEPRECATION] Remove fused attention/mlp #659

Closed
wants to merge 91 commits
Changes from all commits (91 commits)
b33e1d0
remove (zeors -= 1)
qwopqwop200 Feb 21, 2024
6419a2d
add warning
qwopqwop200 Feb 21, 2024
5ea98bb
support backwards compatibility
qwopqwop200 Feb 22, 2024
0b07292
support and fix bug
qwopqwop200 Feb 25, 2024
b015ae9
remove not necessary parm
qwopqwop200 Feb 26, 2024
b7d9ade
fix test_q4 bug
qwopqwop200 Mar 2, 2024
83e510e
fix test_q4 bug
qwopqwop200 Mar 2, 2024
a89cb77
fix bug double converting
qwopqwop200 Mar 2, 2024
639e66a
Update _utils.py
qwopqwop200 Mar 2, 2024
15ecb0f
Merge branch 'main' into main
qwopqwop200 Mar 2, 2024
0759aea
merge main #1
Qubitium Apr 12, 2024
101647d
FIX type error
LRL-ModelCloud Apr 12, 2024
28a2541
module is nn.Module
LRL-ModelCloud Apr 12, 2024
3899d5b
sync name
Qubitium Apr 12, 2024
9f25c21
need return module
LRL-ModelCloud Apr 12, 2024
4159474
modify default format to gptq_v2
LRL-ModelCloud Apr 12, 2024
c853143
fix need return model
LRL-ModelCloud Apr 12, 2024
047dd97
remove fixme and default to gptq_v2 for quantize_config
Qubitium Apr 12, 2024
869a162
save _qlinear_kernel and allow save to older format
Qubitium Apr 12, 2024
365d961
fix name
Qubitium Apr 12, 2024
e363d85
pass quantize
Qubitium Apr 12, 2024
6af41db
Merge remote-tracking branch 'origin/main' into sym-false
Qubitium Apr 12, 2024
bd12bde
update
Qubitium Apr 12, 2024
cd916fe
store quant log/stats in dict slice and return to user in quantize()
Qubitium Apr 13, 2024
d35e57d
accept saved quant_log in quantize() and calculate diff
Qubitium Apr 13, 2024
b0d2ad5
tqdm the layer loop
Qubitium Apr 14, 2024
45cbc6b
log awq vs autogptq outputs in awq compat test
Qubitium Apr 14, 2024
23b4b65
fix cached models is not compatible with new pr. add v2 to cache file…
Qubitium Apr 14, 2024
dc5d3dd
add deprecation warning for loading .bin/.pt weights
Qubitium Apr 14, 2024
fec3f1c
add missing termcolor req
Qubitium Apr 14, 2024
2921b27
spell
Qubitium Apr 14, 2024
7b97f73
fix triton v2
Qubitium Apr 14, 2024
c5c98da
rename quant log column 'name' to 'module'
Qubitium Apr 14, 2024
5b95194
ruff
Qubitium Apr 14, 2024
19189aa
add quantization tests for sym=False
Qubitium Apr 14, 2024
070e6c8
spell
Qubitium Apr 14, 2024
0103785
fix type hint
Qubitium Apr 14, 2024
61a8713
more testing, fix serialization bug, no additional dependency
fxmarty Apr 15, 2024
c6b3632
fix version
Qubitium Apr 16, 2024
83e932d
no need for ... in tqdm
Qubitium Apr 16, 2024
a3863a4
Use threadpoolctl to limit packing threads
Qubitium Apr 16, 2024
7b89b89
layer # sync with visual tqdm
Qubitium Apr 16, 2024
ed0fe60
use thread limit 1: as good as 4 and 1 beats 16 threads in testing
Qubitium Apr 16, 2024
37ed02e
fix saving of gptq (v1)
Qubitium Apr 16, 2024
5d66558
deep copy
Qubitium Apr 16, 2024
9d88639
remove todo: verified
Qubitium Apr 16, 2024
0b84da7
TEST/DEBUG underflow protection and output underflow stats
Qubitium Apr 18, 2024
9e3fea0
force underflow math (testing shows this is better than skipping math)
Qubitium Apr 18, 2024
005a942
1) disable serialization of sym=False to v1 by default. 2) disable lo…
Qubitium Apr 22, 2024
365618d
revert adding underflow check/stats
Qubitium Apr 23, 2024
31d4027
pass test_quant and test both v1 and v2 save/load
Qubitium Apr 23, 2024
ac63033
performance fix for convert_v1/v2().
Qubitium Apr 24, 2024
e8eabe2
need to ++ version so can delimit models make pre/post pr
Qubitium Apr 24, 2024
e4936f5
add meta and meta.quantizer to quantized_config.json
Qubitium Apr 24, 2024
c822d63
fix json save and add meta check to test_quantization. distutils is d…
Qubitium Apr 24, 2024
35cc701
fix failed test
Qubitium Apr 24, 2024
cddbe23
fix awq unpack/repacking thread regression
Qubitium Apr 24, 2024
811ca39
remove highly flaky mistral tiny test with input/output that is nonse…
Qubitium Apr 24, 2024
2104cae
now we can detect quant producer, we don't need use_unsafe_math for l…
Qubitium Apr 24, 2024
bc2bf5b
updat tests
Qubitium Apr 24, 2024
528a8fc
default to gptq v1 for max compat and remove use_unsafe_math check in…
Qubitium Apr 24, 2024
464fc7e
misc
Qubitium Apr 25, 2024
59be4b3
separate the concept of meta.quantizer and meta.packer (intel/auto-ro…
Qubitium Apr 25, 2024
79050ea
clean
Qubitium Apr 25, 2024
4450702
test allow loading quantized lm_head
Qubitium Apr 25, 2024
79a2f76
test loading of quantized lm_head
Qubitium Apr 25, 2024
d85f3f2
rename
Qubitium Apr 25, 2024
55336b5
fix quantized lm_head loading
Qubitium Apr 26, 2024
64c604c
fix quantized lm_head loading
Qubitium Apr 26, 2024
842fd3c
update
Qubitium Apr 26, 2024
d086437
sync with main
Qubitium Apr 26, 2024
7b1d115
add unittest test_lm_head
Qubitium Apr 27, 2024
8ac1b87
backport h100 fixed marlin kernel from vllm
Qubitium Apr 27, 2024
25decb0
Revert "backport h100 fixed marlin kernel from vllm"
Qubitium Apr 27, 2024
aa2e385
revert
Qubitium Apr 27, 2024
e474efd
fix h100
Qubitium Apr 27, 2024
5b307da
revert debug code
Qubitium Apr 27, 2024
5b456a6
now that h100 is validated, remove hopper check
Qubitium Apr 27, 2024
7f031b3
warn users if quantization using insufficient nsamples
Qubitium Apr 25, 2024
96e0b05
remove fused attention/mlp
Qubitium Apr 28, 2024
7ca7260
continue
Qubitium Apr 28, 2024
fe191f5
ADD GLM model support
LRL-ModelCloud Jun 6, 2024
806d8d0
ADD GLM model support
LRL-ModelCloud Jun 6, 2024
847b27e
Merge pull request #1 from Qubitium/sym-false-lm-head
Qubitium Jun 15, 2024
fc26717
Merge pull request #2 from Qubitium/add-glm-v2
Qubitium Jun 15, 2024
d27f26f
Merge pull request #4 from Qubitium/fix-h100
Qubitium Jun 15, 2024
8dad030
Merge branch 'main' into quantize-lm-head
Qubitium Jun 15, 2024
ea58a81
Merge pull request #3 from Qubitium/quantize-lm-head
Qubitium Jun 15, 2024
5e9ca27
Merge branch 'main' into nsamples-sanity-check
Qubitium Jun 15, 2024
647d228
Merge pull request #5 from Qubitium/nsamples-sanity-check
Qubitium Jun 15, 2024
9b1e155
Merge branch 'main' into remove-fused-attention
Qubitium Jun 15, 2024
4 changes: 1 addition & 3 deletions auto_gptq/__init__.py
@@ -1,6 +1,4 @@
from .modeling import AutoGPTQForCausalLM, BaseQuantizeConfig
from .utils.exllama_utils import exllama_set_max_input_length
from .utils.peft_utils import get_gptq_peft_model


__version__ = "0.8.0.dev0"
from .version import __version__
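The package version now lives in a dedicated auto_gptq/version.py module instead of being hard-coded in __init__.py. A minimal sketch of what that module presumably contains; the exact version string is an assumption, not taken from this diff:

# auto_gptq/version.py -- assumed contents: the single source of truth for the package version
__version__ = "0.8.0.dev0"  # placeholder; the PR bumps the version to delimit pre/post-PR models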
1 change: 1 addition & 0 deletions auto_gptq/modeling/__init__.py
@@ -6,6 +6,7 @@
from .cohere import CohereGPTQForCausalLM
from .decilm import DeciLMGPTQForCausalLM
from .gemma import GemmaGPTQForCausalLM
from .chatglm import ChatGLMForCausalLM
from .gpt2 import GPT2GPTQForCausalLM
from .gpt_bigcode import GPTBigCodeGPTQForCausalLM
from .gpt_neox import GPTNeoXGPTQForCausalLM
406 changes: 208 additions & 198 deletions auto_gptq/modeling/_base.py

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions auto_gptq/modeling/_const.py
@@ -15,6 +15,7 @@
"moss",
"gpt_bigcode",
"codegen",
"chatglm",
"RefinedWebModel",
"RefinedWeb",
"baichuan",
@@ -24,7 +25,6 @@
"deci",
"stablelm_epoch",
"mpt",
"cohere",
]
if compare_transformers_version("v4.28.0", op="ge"):
SUPPORTED_MODELS.append("llama")
@@ -42,8 +42,9 @@
SUPPORTED_MODELS.append("phi")
if compare_transformers_version("v4.38.0", op="ge"):
SUPPORTED_MODELS.append("gemma")
if compare_transformers_version("v4.39.0.dev0", op="ge"):
if compare_transformers_version("v4.39.0", op="ge"):
SUPPORTED_MODELS.append("starcoder2")
SUPPORTED_MODELS.append("cohere")

EXLLAMA_DEFAULT_MAX_INPUT_LENGTH = 2048

116 changes: 93 additions & 23 deletions auto_gptq/modeling/_utils.py
@@ -5,6 +5,7 @@
from typing import List, Optional, Union

import accelerate
import threadpoolctl as tctl
import numpy as np
import torch
import torch.nn as nn
@@ -13,6 +14,7 @@
from transformers import AutoConfig
from transformers.utils.hub import cached_file

from ..quantization import BaseQuantizeConfig
from ..utils.import_utils import dynamically_import_QuantLinear
from ..utils.modeling_utils import recurse_setattr
from ._const import CPU, CUDA_0, EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, SUPPORTED_MODELS
@@ -147,6 +149,71 @@ def make_quant(
recurse_setattr(module, name, new_layer.to(ori_layer_device))


def convert_gptq_v1_to_v2_format(
model,
quantize_config: BaseQuantizeConfig,
qlinear_kernel: nn.Module,
):
use_qigen = qlinear_kernel.QUANT_TYPE == "qigen"

# Limit thread usage to avoid auto-parallelization regression
with tctl.threadpool_limits(limits=1):
for _, submodule in model.named_modules():
# The v1 checkpoint format applied `qzeros -= 1` before serialization, so the
# additions here do not overflow.
# A v1 checkpoint with sym=False saved via convert_gptq_v2_to_v1_format() will
# overflow in roughly <=13% of cases based on testing.
if isinstance(submodule, qlinear_kernel):
if use_qigen:
submodule.zeros.data += 1
else:
if quantize_config.bits == 2:
submodule.qzeros.data += 0b01010101010101010101010101010101
elif quantize_config.bits == 3:
submodule.qzeros.data[:,range(0,submodule.qzeros.data.shape[1],3)] += 0b00100100100100100100100100100100
submodule.qzeros.data[:,range(1,submodule.qzeros.data.shape[1],3)] += 0b10010010010010010010010010010010
submodule.qzeros.data[:,range(2,submodule.qzeros.data.shape[1],3)] += 0b01001001001001001001001001001001
elif quantize_config.bits == 4:
submodule.qzeros.data += 0b00010001000100010001000100010001
elif quantize_config.bits == 8:
submodule.qzeros.data += 0b00000001000000010000000100000001
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")

return model


def convert_gptq_v2_to_v1_format(
model,
quantize_config: BaseQuantizeConfig,
qlinear_kernel: nn.Module,
):
use_qigen = qlinear_kernel.QUANT_TYPE == "qigen"

# Limit thread usage to avoid auto-parallelization regression
with tctl.threadpool_limits(limits=1):
for _, submodule in model.named_modules():
# sym=False underflowed in roughly <=13% of cases during testing. No underflow is possible for sym=True.
if isinstance(submodule, qlinear_kernel):
if use_qigen:
submodule.zeros.data -= 1
else:
if quantize_config.bits == 2:
submodule.qzeros.data -= 0b01010101010101010101010101010101
elif quantize_config.bits == 3:
submodule.qzeros.data[:,range(0,submodule.qzeros.data.shape[1],3)] -= 0b00100100100100100100100100100100
submodule.qzeros.data[:,range(1,submodule.qzeros.data.shape[1],3)] -= 0b10010010010010010010010010010010
submodule.qzeros.data[:,range(2,submodule.qzeros.data.shape[1],3)] -= 0b01001001001001001001001001001001
elif quantize_config.bits == 4:
submodule.qzeros.data -= 0b00010001000100010001000100010001
elif quantize_config.bits == 8:
submodule.qzeros.data -= 0b00000001000000010000000100000001
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")

return model
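To see why the packed constants above bump every zero-point by exactly one: each constant places a 1 in every packed field (every 2-bit pair for 2-bit, every nibble for 4-bit, every byte for 8-bit, and three staggered patterns across consecutive int32 columns for 3-bit, whose fields straddle word boundaries), so a single integer addition adjusts all zero-points at once. A standalone sketch for the 4-bit case, using plain Python integers rather than PR code:

# Sketch: v1 stores `zeros - 1`; v2 restores the real zero-points by adding 1 to
# every 4-bit field in one shot. Because no stored nibble is 15, no carry can
# cross a nibble boundary, which is what makes the single packed addition safe.
ZERO_MINUS_ONE = 7  # a sym=True 4-bit zero-point of 8, stored as 7 in v1

# pack eight 4-bit values into one 32-bit word, low nibble first
packed_v1 = sum(ZERO_MINUS_ONE << (4 * i) for i in range(8))

# the same constant convert_gptq_v1_to_v2_format() adds for bits == 4
packed_v2 = (packed_v1 + 0b00010001000100010001000100010001) & 0xFFFFFFFF

print([(packed_v2 >> (4 * i)) & 0xF for i in range(8)])  # [8, 8, 8, 8, 8, 8, 8, 8]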


def preprocess_checkpoint_qigen(
module,
names,
@@ -297,32 +364,36 @@ def pack_model(
)
qlayers = find_layers(model, [QuantLinear])

pbar = tqdm(qlayers.keys(), leave=True)
for name in pbar:
pbar.set_description(f"Packing {name}...", refresh=True)

quantizers[name], scale, zero, g_idx = quantizers[name]
# so far can only pack layer on CPU
layer_device = qlayers[name].device
qlayers[name].to(CPU)
layers[name], scale, zero, g_idx = (
layers[name].to(CPU),
scale.to(CPU),
zero.to(CPU),
g_idx.to(CPU),
)
if QuantLinear.QUANT_TYPE == "marlin":
qlayers[name].pack(layers[name], scale)
else:
qlayers[name].pack(layers[name], scale, zero, g_idx)
qlayers[name].to(layer_device)
# Limit pack() thread usage to avoid auto-parallelization regression
with tctl.threadpool_limits(limits=1):
pbar = tqdm(qlayers.keys(), leave=True)
for name in pbar:
pbar.set_description(f"Packing {name}")

quantizers[name], scale, zero, g_idx = quantizers[name]
# so far can only pack layer on CPU
layer_device = qlayers[name].device
qlayers[name].to(CPU)
layers[name], scale, zero, g_idx = (
layers[name].to(CPU),
scale.to(CPU),
zero.to(CPU),
g_idx.to(CPU),
)
if QuantLinear.QUANT_TYPE == "marlin":
qlayers[name].pack(layers[name], scale)
else:
qlayers[name].pack(layers[name], scale, zero, g_idx)
qlayers[name].to(layer_device)

logger.info("Model packed.")

if use_triton and warmup_triton:
logger.warning(
"using autotune_warmup will move model to GPU, make sure you have enough VRAM to load the whole model."
)
QuantLinear.warmup(model.to(CUDA_0), seqlen=model.seqlen)
return QuantLinear


def check_and_get_model_type(model_dir, trust_remote_code=False):
@@ -475,7 +546,7 @@ def autogptq_post_init(model, use_act_order: bool, max_input_length: Optional[in
if hasattr(submodule, "QUANT_TYPE") and submodule.QUANT_TYPE == "exllama":
submodule.post_init()

## exllamav2
# exllamav2
fixed_bytes = {}
model_uses_exllamav2 = False

@@ -574,8 +645,6 @@ def unpack_awq(
torch.int16 if bits == 8 else torch.int8
)

# zeros = zeros + 1

torch.bitwise_and(zeros, (2**bits) - 1, out=zeros)

zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
@@ -671,7 +740,6 @@ def pack_from_tensors(
qweight = qweight.astype(np.int32)
qweight = torch.from_numpy(qweight)

unpacked_qzeros = unpacked_qzeros - 1
torch.bitwise_and(unpacked_qzeros, (2**bits) - 1, out=unpacked_qzeros)

unpacked_qzeros = unpacked_qzeros.numpy().astype(np.uint32)
@@ -775,4 +843,6 @@ def get_checkpoints(model_name_or_path: str, extensions: List[str], possible_mod
"check_and_get_model_type",
"simple_dispatch_model",
"make_sure_no_tensor_in_meta_device",
"convert_gptq_v1_to_v2_format",
"convert_gptq_v2_to_v1_format",
]
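The pack_model() change above wraps the packing loop in threadpoolctl's threadpool_limits(limits=1); per the commit log, a limit of 1 was as fast as 4 threads and beat 16 threads in testing. A minimal sketch of the pattern, where do_pack and the matrix sizes are placeholders rather than code from this PR:

# Sketch of the threadpoolctl pattern used in pack_model(): clamp BLAS/OpenMP
# thread pools to a single thread for the duration of a loop, then restore the
# previous limits automatically when the context manager exits.
import numpy as np
import threadpoolctl as tctl

def do_pack(a, b):
    # stand-in for the per-layer qlayers[name].pack(...) work
    return a @ b

with tctl.threadpool_limits(limits=1):
    for _ in range(4):
        do_pack(np.ones((256, 256)), np.ones((256, 256)))  # runs single-threaded
# outside the with-block, library thread pools return to their previous limits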
6 changes: 2 additions & 4 deletions auto_gptq/modeling/auto.py
@@ -9,6 +9,7 @@
from .cohere import CohereGPTQForCausalLM
from .decilm import DeciLMGPTQForCausalLM
from .gemma import GemmaGPTQForCausalLM
from .chatglm import ChatGLMForCausalLM
from .gpt2 import GPT2GPTQForCausalLM
from .gpt_bigcode import GPTBigCodeGPTQForCausalLM
from .gpt_neox import GPTNeoXGPTQForCausalLM
@@ -39,6 +40,7 @@
"llama": LlamaGPTQForCausalLM,
"opt": OPTGPTQForCausalLM,
"moss": MOSSGPTQForCausalLM,
"chatglm": ChatGLMForCausalLM,
"gpt_bigcode": GPTBigCodeGPTQForCausalLM,
"codegen": CodeGenGPTQForCausalLM,
"cohere": CohereGPTQForCausalLM,
@@ -98,8 +100,6 @@ def from_quantized(
device: Optional[Union[str, int]] = None,
low_cpu_mem_usage: bool = False,
use_triton: bool = False,
inject_fused_attention: bool = False,
inject_fused_mlp: bool = False,
use_cuda_fp16: bool = True,
quantize_config: Optional[BaseQuantizeConfig] = None,
model_basename: Optional[str] = None,
@@ -148,8 +148,6 @@
device=device,
low_cpu_mem_usage=low_cpu_mem_usage,
use_triton=use_triton,
inject_fused_attention=inject_fused_attention,
inject_fused_mlp=inject_fused_mlp,
use_cuda_fp16=use_cuda_fp16,
quantize_config=quantize_config,
model_basename=model_basename,
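With inject_fused_attention and inject_fused_mlp removed from from_quantized(), existing call sites simply drop those keyword arguments; the remaining options shown in the signature above are unchanged. A hedged migration sketch, with a hypothetical repository id:

from auto_gptq import AutoGPTQForCausalLM

model = AutoGPTQForCausalLM.from_quantized(
    "some-org/some-model-GPTQ",  # hypothetical model id
    device="cuda:0",
    use_triton=False,
    # inject_fused_attention=True,  # no longer accepted after this PR
    # inject_fused_mlp=True,        # no longer accepted after this PR
)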
15 changes: 15 additions & 0 deletions auto_gptq/modeling/chatglm.py
@@ -0,0 +1,15 @@
from ._base import BaseGPTQForCausalLM


class ChatGLMForCausalLM(BaseGPTQForCausalLM):
layer_type = "GLMBlock"
layers_block_name = "transformer.encoder.layers"
outside_layer_modules = ["transformer.embedding.word_embeddings", "transformer.output_layer"]
inside_layer_modules = [
["self_attention.query_key_value"],
["self_attention.dense"],
["mlp.dense_h_to_4h"],
["mlp.dense_4h_to_h"],
]

__all__ = ["ChatGLMForCausalLM"]
1 change: 0 additions & 1 deletion auto_gptq/modeling/cohere.py
@@ -17,4 +17,3 @@ class CohereGPTQForCausalLM(BaseGPTQForCausalLM):
]

__all__ = ["CohereGPTQForCausalLM"]

10 changes: 0 additions & 10 deletions auto_gptq/modeling/decilm.py
@@ -4,13 +4,6 @@
from ._base import BaseGPTQForCausalLM


if compare_transformers_version("v4.28.0", op="ge"):
from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel
from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel
else:
FusedLlamaAttentionForQuantizedModel = None
FusedLlamaMLPForQuantizedModel = None

logger = getLogger(__name__)


@@ -25,8 +18,5 @@ class DeciLMGPTQForCausalLM(BaseGPTQForCausalLM):
["mlp.down_proj"],
]

fused_attn_module_type = FusedLlamaAttentionForQuantizedModel
fused_mlp_module_type = FusedLlamaMLPForQuantizedModel


__all__ = ["DeciLMGPTQForCausalLM"]
3 changes: 0 additions & 3 deletions auto_gptq/modeling/gptj.py
@@ -1,4 +1,3 @@
from ..nn_modules.fused_gptj_attn import FusedGPTJAttentionForQuantizedModel
from ._base import BaseGPTQForCausalLM


@@ -13,7 +12,5 @@ class GPTJGPTQForCausalLM(BaseGPTQForCausalLM):
["mlp.fc_out"],
]

fused_attn_module_type = FusedGPTJAttentionForQuantizedModel


__all__ = ["GPTJGPTQForCausalLM"]
11 changes: 0 additions & 11 deletions auto_gptq/modeling/llama.py
@@ -1,16 +1,8 @@
from logging import getLogger

from ..utils.import_utils import compare_transformers_version
from ._base import BaseGPTQForCausalLM


if compare_transformers_version("v4.28.0", op="ge"):
from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel
from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel
else:
FusedLlamaAttentionForQuantizedModel = None
FusedLlamaMLPForQuantizedModel = None

logger = getLogger(__name__)


@@ -25,8 +17,5 @@ class LlamaGPTQForCausalLM(BaseGPTQForCausalLM):
["mlp.down_proj"],
]

fused_attn_module_type = FusedLlamaAttentionForQuantizedModel
fused_mlp_module_type = FusedLlamaMLPForQuantizedModel


__all__ = ["LlamaGPTQForCausalLM"]
10 changes: 0 additions & 10 deletions auto_gptq/modeling/longllama.py
@@ -4,13 +4,6 @@
from ._base import BaseGPTQForCausalLM


if compare_transformers_version("v4.28.0", op="ge"):
from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel
from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel
else:
FusedLlamaAttentionForQuantizedModel = None
FusedLlamaMLPForQuantizedModel = None

logger = getLogger(__name__)


@@ -25,8 +18,5 @@ class LongLlamaGPTQForCausalLM(BaseGPTQForCausalLM):
["mlp.down_proj"],
]

fused_attn_module_type = FusedLlamaAttentionForQuantizedModel
fused_mlp_module_type = FusedLlamaMLPForQuantizedModel


__all__ = ["LongLlamaGPTQForCausalLM"]
2 changes: 1 addition & 1 deletion auto_gptq/modeling/mpt.py
@@ -1,4 +1,4 @@
from auto_gptq.modeling import BaseGPTQForCausalLM
from ._base import BaseGPTQForCausalLM


class MPTGPTQForCausalLM(BaseGPTQForCausalLM):
10 changes: 0 additions & 10 deletions auto_gptq/modeling/stablelmepoch.py
@@ -4,13 +4,6 @@
from ._base import BaseGPTQForCausalLM


if compare_transformers_version("v4.28.0", op="ge"):
from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel
from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel
else:
FusedLlamaAttentionForQuantizedModel = None
FusedLlamaMLPForQuantizedModel = None

logger = getLogger(__name__)


@@ -25,8 +18,5 @@ class StableLMEpochGPTQForCausalLM(BaseGPTQForCausalLM):
["mlp.down_proj"],
]

fused_attn_module_type = FusedLlamaAttentionForQuantizedModel
fused_mlp_module_type = FusedLlamaMLPForQuantizedModel


__all__ = ["StableLMEpochGPTQForCausalLM"]