Add Qwen2MoE #593

Open · wants to merge 3 commits into base: main
1 change: 1 addition & 0 deletions auto_gptq/modeling/__init__.py
@@ -19,6 +19,7 @@
from .phi import PhiGPTQForCausalLM
from .qwen import QwenGPTQForCausalLM
from .qwen2 import Qwen2GPTQForCausalLM
from .qwen2_moe import Qwen2MoeGPTQForCausalLM
from .rw import RWGPTQForCausalLM
from .stablelmepoch import StableLMEpochGPTQForCausalLM
from .starcoder2 import Starcoder2GPTQForCausalLM
11 changes: 10 additions & 1 deletion auto_gptq/modeling/_base.py
@@ -61,9 +61,9 @@
    preprocess_checkpoint_qigen,
    simple_dispatch_model,
    unpack_awq,
    get_moe_inside_layer_modules,
)


logger = logging.getLogger(__name__)
handler = logging.StreamHandler()
formatter = logging.Formatter("%(levelname)s - %(message)s")
@@ -398,6 +398,11 @@ def store_input_hook(_, args, kwargs):
inside_layer_modules = self.inside_layer_modules
if not self.quantize_config.true_sequential:
    inside_layer_modules = [sum(inside_layer_modules, [])]

if hasattr(self.model.config, "num_experts"):
    inside_layer_modules = get_moe_inside_layer_modules(self.inside_layer_modules,
                                                        self.model.config.num_experts)
Comment on lines +402 to +404
Collaborator

Why is self.inside_layer_modules not enough?

Author

We cannot write out the names of all the expert parameters in our MoE model, because the number of experts is large and may change. Besides, different model sizes may use different numbers of experts.
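For illustration, with a hypothetical num_experts = 4 (real Qwen2-MoE configurations use more experts), a single templated entry expands into one concrete module name per expert:

# Illustration only: num_experts = 4 is an arbitrary example value.
template = "mlp.experts.{expert_idx}.up_proj"
expanded = [template.replace("{expert_idx}", str(i)) for i in range(4)]
print(expanded)
# ['mlp.experts.0.up_proj', 'mlp.experts.1.up_proj',
#  'mlp.experts.2.up_proj', 'mlp.experts.3.up_proj']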

Contributor

I agree it is a bit clunky, but you could just list the highest number of experts you might have, and models with fewer experts would still work.

Author

> I agree it is a bit clunky, but you could just list the highest number of experts you might have, and models with fewer experts would still work.

It would work, but I think the current code looks much better than writing hundreds of parameter names in the modeling file.

Contributor

I don't disagree, but I would prefer a separate PR for something like this that others could rely on. I know that only buys slightly easier git history; I just prefer PRs to stay focused and make the minimum changes required. Feel free to ignore me, it's not that important.


quantizers = {}
for i in range(len(layers)):
    logger.info(f"Start quantizing layer {i + 1}/{len(layers)}")
@@ -1007,6 +1012,10 @@ def skip(*args, **kwargs):
    config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype
)

if hasattr(config, "num_experts"):
    cls.inside_layer_modules = get_moe_inside_layer_modules(cls.inside_layer_modules,
                                                            config.num_experts)

layers = find_layers(model)
ignore_layers = [cls.lm_head_name] + cls.outside_layer_modules
for name in list(layers.keys()):
2 changes: 2 additions & 0 deletions auto_gptq/modeling/_const.py
@@ -40,6 +40,8 @@
    SUPPORTED_MODELS.append("phi")
if compare_transformers_version("v4.38.0", op="ge"):
    SUPPORTED_MODELS.append("gemma")
if compare_transformers_version("v4.39.0.dev0", op="ge"):
    SUPPORTED_MODELS.append("qwen2_moe")
if compare_transformers_version("v4.39.0.dev0", op="ge"):
    SUPPORTED_MODELS.append("starcoder2")

15 changes: 15 additions & 0 deletions auto_gptq/modeling/_utils.py
@@ -760,6 +760,21 @@ def get_checkpoints(model_name_or_path: str, extensions: List[str], possible_mod
    return False, resolved_archive_file, true_model_basename


# generate inside layer modules for MoE models with massive experts
def get_moe_inside_layer_modules(inside_layer_modules, num_experts):
Collaborator

Could you add type hints and a small description here?

    new_inside_layer_modules = []
    for names in inside_layer_modules:
        new_inside_layer_modules.append([])
        for n in names:
            if "{expert_idx}" in n:
                for expert_idx in range(num_experts):
                    new_inside_layer_modules[-1].append(n.replace("{expert_idx}", str(expert_idx)))
            else:
                new_inside_layer_modules[-1].append(n)

    return new_inside_layer_modules


__all__ = [
    "get_device",
    "move_to_device",
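The review comment above asks for type hints and a short description on get_moe_inside_layer_modules. A possible annotated version is sketched below; this is only a suggestion and is not part of the submitted diff:

from typing import List

def get_moe_inside_layer_modules(
    inside_layer_modules: List[List[str]], num_experts: int
) -> List[List[str]]:
    """Expand "{expert_idx}" placeholders into one module name per expert.

    Entries without the placeholder are copied through unchanged, so the same
    template format also covers attention and shared-expert projections.
    """
    expanded: List[List[str]] = []
    for names in inside_layer_modules:
        expanded.append([])
        for name in names:
            if "{expert_idx}" in name:
                expanded[-1].extend(
                    name.replace("{expert_idx}", str(i)) for i in range(num_experts)
                )
            else:
                expanded[-1].append(name)
    return expanded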
2 changes: 2 additions & 0 deletions auto_gptq/modeling/auto.py
@@ -22,6 +22,7 @@
from .phi import PhiGPTQForCausalLM
from .qwen import QwenGPTQForCausalLM
from .qwen2 import Qwen2GPTQForCausalLM
from .qwen2_moe import Qwen2MoeGPTQForCausalLM
from .rw import RWGPTQForCausalLM
from .stablelmepoch import StableLMEpochGPTQForCausalLM
from .starcoder2 import Starcoder2GPTQForCausalLM
@@ -53,6 +54,7 @@
"starcoder2": Starcoder2GPTQForCausalLM,
"mixtral": MixtralGPTQForCausalLM,
"qwen2": Qwen2GPTQForCausalLM,
"qwen2_moe": Qwen2MoeGPTQForCausalLM,
"longllama": LongLlamaGPTQForCausalLM,
"gemma": GemmaGPTQForCausalLM,
"phi": PhiGPTQForCausalLM,
18 changes: 18 additions & 0 deletions auto_gptq/modeling/qwen2_moe.py
@@ -0,0 +1,18 @@
from ._base import BaseGPTQForCausalLM


class Qwen2MoeGPTQForCausalLM(BaseGPTQForCausalLM):
    layer_type = "Qwen2DecoderLayer"
    layers_block_name = "model.layers"
    outside_layer_modules = ["model.embed_tokens", "model.norm"]
    inside_layer_modules = [
        ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
        ["self_attn.o_proj"],
        ["mlp.shared_expert.up_proj", "mlp.shared_expert.gate_proj"],
        ["mlp.shared_expert.down_proj"],
        ["mlp.experts.{expert_idx}.up_proj", "mlp.experts.{expert_idx}.gate_proj"],
        ["mlp.experts.{expert_idx}.down_proj"],
    ]


__all__ = ["Qwen2MoeGPTQForCausalLM"]