
Commit fab667f

cosmetic fixes
1 parent 1a8b746 commit fab667f

File tree

9 files changed, +61 -67 lines changed


jetstream_pt/cli.py

Lines changed: 2 additions & 2 deletions
@@ -59,7 +59,7 @@ def create_engine(devices):
  model = fetch_models.instantiate_model_from_repo_id(FLAGS.model_id, env)
  if quant_config.enable_weight_quantization:
    quantize_model.quantize_model(model, quant_config)
-  print('====== model =======')
+  print("====== model =======")
  print(model)

  weight_shardings = model.get_sharding_annotations()

@@ -225,7 +225,7 @@ def main(argv):
    return
  else:
    print(
-        "Invalid arguments. please specify 'list', 'serve', or 'interactive'."
+        "Invalid arguments. please specify 'list', 'serve', or 'interactive'."
    )

jetstream_pt/engine.py

Lines changed: 0 additions & 2 deletions
@@ -230,7 +230,6 @@ def _call_model_prefill(self, weights, tokens, input_indexes):
    with self._lock:
      with torch_xla2.default_env():
        res = torch.func.functional_call(self.pt_model, paramst, argst)[0]
-    jax.debug.print('Prefill result {}', res._elem)
    caches_res = [c.state() for c in caches]
    return torchjax.from_torch((res, caches_res))

@@ -283,7 +282,6 @@ def prefill(
      self.env.temperature,
    )
    token_out = jnp.reshape(token, (1, 1))
-    jax.debug.print('TOKEN is {}', token_out)
    data = jnp.concatenate(
        [
            token_out, # First token
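
The two deleted lines were jax.debug.print debug statements. For context, a minimal standalone sketch (not from this repo) of how jax.debug.print behaves: unlike a plain Python print, it emits the runtime values on every invocation of a jit-compiled function, which is what made it handy for inspecting prefill results.

import jax
import jax.numpy as jnp


@jax.jit
def prefill_like(tokens):
  logits = jnp.cumsum(tokens, axis=-1)  # stand-in for a real model call
  # Prints the runtime value on every call, not just once at trace time.
  jax.debug.print("Prefill result {}", logits)
  return logits


prefill_like(jnp.ones((1, 4)))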

jetstream_pt/fetch_models.py

Lines changed: 6 additions & 7 deletions
@@ -13,7 +13,7 @@
 )
 from jetstream_pt.third_party.llama import model_exportable as llama_model
 from jetstream_pt.third_party.mixtral import model as mixtral_model
-from jetstream_pt.third_party.gemma import model as gemma_model
+from jetstream_pt.third_party.gemma import model as gemma_model

 FLAGS = flags.FLAGS

@@ -168,7 +168,6 @@ def instantiate_model_from_repo_id(
  weights = _load_weights(model_dir)
  weights = model.convert_hf_weights(weights)

-
  model.load_state_dict(weights, assign=True, strict=False)

  return model

@@ -190,11 +189,11 @@ def _hf_download(
        local_dir=dest_directory,
        local_dir_use_symlinks=False,
        token=hf_token,
-        # allow_patterns=[
-        #     "model-?????-of-?????.safetensors",
-        #     "*.json",
-        #     "*.model",
-        # ],
+        allow_patterns=[
+            "model-?????-of-?????.safetensors",
+            "*.json",
+            "*.model",
+        ],
    )
  except HTTPError as e:
    if e.response.status_code == 401:
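
The previously commented-out allow_patterns filter is re-enabled, so the download is limited to sharded safetensors weights plus JSON and sentencepiece files. A minimal sketch of the same filtering, assuming the helper wraps huggingface_hub's snapshot_download (the keyword arguments shown above match that API); the repo id and token below are placeholders, not values from this repo.

from huggingface_hub import snapshot_download

snapshot_download(
    "meta-llama/Llama-2-7b-hf",  # hypothetical repo id, for illustration only
    local_dir="/tmp/llama-2-7b",
    token="hf_xxx",  # placeholder token
    allow_patterns=[
        "model-?????-of-?????.safetensors",  # sharded safetensors weights only
        "*.json",  # config and tokenizer JSON files
        "*.model",  # sentencepiece tokenizer.model
    ],
)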

jetstream_pt/hf_tokenizer.py

Lines changed: 3 additions & 1 deletion
@@ -18,7 +18,9 @@ def encode(self, s: str, **kwargs):
    if padding is used.
    """
    res = self.tokenizer.encode(s, add_special_tokens=False)
-    return token_utils.pad_tokens(res, self.bos_id, self.pad_id, jax_padding=True)
+    return token_utils.pad_tokens(
+        res, self.bos_id, self.pad_id, jax_padding=True
+    )

  def decode(self, token_ids: list[int], **kwargs) -> str:
    """Processess input token ids to generate a string.

jetstream_pt/layers.py

Lines changed: 1 addition & 1 deletion
@@ -330,7 +330,7 @@ def create_quantized_from_nn_embedding(
  )
  weights, scaler, _ = quantize_tensor(float_embedding.weight, 0)
  obj.weight = weights
-  obj.weight_scaler = scaler
+  obj.weight_scaler = scaler
  return obj

jetstream_pt/model_base.py

Lines changed: 7 additions & 6 deletions
@@ -47,7 +47,7 @@ class AttrProperty:

 class ModuleBase(torch.nn.Module, metaclass=abc.ABCMeta):
   """nn Module that allows attaching properties.
-
+
   This class currently serves 2 goals:
   1. Allow model to specify alternative names for submodules / weights
      this is needed so that it can *also* load HuggingFace checkpoints

@@ -85,7 +85,9 @@ def annotate_sharding(self, name, axis):
    """Set sharding name for a attribute or submodule."""
    self.attr_to_property[name].sharding_axis = axis

-  def convert_hf_weights(self, hf_weights: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+  def convert_hf_weights(
+      self, hf_weights: Dict[str, torch.Tensor]
+  ) -> Dict[str, torch.Tensor]:
    """Load state_dict with hg weights."""
    weights = {}
    updated_keys = self.get_hf_names_to_real_name()

@@ -94,9 +96,8 @@ def convert_hf_weights(self, hf_weights: Dict[str, torch.Tensor]) -> Dict[str, t
      weights[updated] = hf_weights[name]

    for name in list(weights.keys()):
-      if 'inv_freq' in name:
+      if "inv_freq" in name:
        weights.pop(name)
-    if hasattr(self, 'freqs_cis'):
-      weights['freqs_cis'] = self.freqs_cis
+    if hasattr(self, "freqs_cis"):
+      weights["freqs_cis"] = self.freqs_cis
    return weights
-
jetstream_pt/third_party/gemma/model.py

Lines changed: 4 additions & 6 deletions
@@ -277,11 +277,11 @@ def __init__(
    )

    self.annotate_sharding("gate_proj.weight", 0)
-    self.annotate_sharding('up_proj.weight', 0)
-    self.annotate_sharding('down_proj.weight', 1)
+    self.annotate_sharding("up_proj.weight", 0)
+    self.annotate_sharding("down_proj.weight", 1)
    self.annotate_sharding("gate_proj.bias", 0)
-    self.annotate_sharding('up_proj.bias', 0)
-    self.annotate_sharding('down_proj.bias', -1)
+    self.annotate_sharding("up_proj.bias", 0)
+    self.annotate_sharding("down_proj.bias", -1)
    if Linear != torch.nn.Linear:
      self.annotate_sharding("gate_proj.weight_scaler", 0)
      self.annotate_sharding("up_proj.weight_scaler", 0)

@@ -418,7 +418,6 @@ def forward(
    freqs_cis = freqs_cis.reshape(bsz, seqlen, -1)

    hidden_states = self.embedder(tokens)
-    #jax.debug.print('after embedding {}', hidden_states[-1]._elem)
    hidden_states = hidden_states * (self.config.hidden_size**0.5)

    end = None if start is None else (start + input_pos) % self.env.cache_len

@@ -435,7 +434,6 @@ def forward(
      ragged_batch_index=ragged_batch_index,
      ragged_block_index=ragged_block_index,
    )
-    #jax.debug.print('hidden after layer {}: {}', i, hidden_states[-1]._elem)
    hidden_states = self.norm(hidden_states)

    embedder_weight = self.embedder.weight

jetstream_pt/third_party/llama/model_exportable.py

Lines changed: 20 additions & 21 deletions
@@ -2,7 +2,7 @@
 """This version contains modification to make it easier to trace and support batch."""

 from typing import Any, List, Optional
-
+import copy
 import jax
 import torch
 import torch.nn.functional as F

@@ -125,8 +125,6 @@ def __init__(
    self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps, device=args.device)

    self.hf_name("attention", "self_attn")
-    # We dont want to rename q_proj and k_proj; this is done in
-    # _load_attention_hf_weights
    self.attention.hf_name("wq", "q_proj")
    self.attention.hf_name("wk", "k_proj")
    self.attention.hf_name("wv", "v_proj")

@@ -140,20 +138,6 @@ def __init__(
    self.hf_name("feed_forward", "mlp")
    self.hf_name("attention_norm", "input_layernorm")
    self.hf_name("ffn_norm", "post_attention_layernorm")
-    self.attention._register_load_state_dict_pre_hook(
-        self._load_attention_hf_weights)
-
-  def _load_attention_hf_weights(self, state_dict, prefix, *args):
-    def transform(val, n_heads):
-      dim1, dim2 = val.shape
-      return val.reshape(n_heads, 2, dim1 // n_heads // 2, dim2).transpose(1, 2).reshape(dim1, dim2)
-    qname = prefix + "wq.weight"
-    kname = prefix + "wk.weight"
-    if qname in state_dict:
-      state_dict[prefix + 'wq.weight'] = transform(state_dict[qname], self.n_heads)
-    if kname in state_dict:
-      state_dict[prefix + 'wk.weight'] = transform(state_dict[kname], self.args.n_kv_heads or self.n_heads)
-

  def forward(
    self,

@@ -377,8 +361,23 @@ def from_hf_model_id(cls, model_id, env):
  def drop_weight(self, key):
    return key.startswith("model")

-  def shard_weights(self, weights_dict):
-    """Shards the weights
+  def convert_hf_weights(self, hf_weights):

-    Assumes the weights_dict is a list of XLATensor2
-    """
+    def transform(val, n_heads):
+      dim1, dim2 = val.shape
+      return (
+          val.reshape(n_heads, 2, dim1 // n_heads // 2, dim2)
+          .transpose(1, 2)
+          .reshape(dim1, dim2)
+      )
+
+    updated = copy.copy(hf_weights)
+
+    for key, value in hf_weights.items():
+      if "q_proj" in key:
+        updated[key] = transform(value, self.params.n_heads)
+      if "k_proj" in key:
+        updated[key] = transform(
+            value, self.params.n_kv_heads or self.params.n_heads
+        )
+    return super().convert_hf_weights(updated)
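
The load_state_dict pre-hook that permuted wq/wk weights is folded into convert_hf_weights, which now applies the same transform while the tensors still carry their HuggingFace q_proj/k_proj names. A standalone illustration with a dummy tensor (made-up sizes) showing that transform is a shape-preserving row permutation within each attention head, used to line the weights up with the rotary-embedding layout expected here:

import torch

n_heads, head_dim, dim2 = 4, 8, 32
dim1 = n_heads * head_dim
val = torch.arange(dim1 * dim2, dtype=torch.float32).reshape(dim1, dim2)

permuted = (
    val.reshape(n_heads, 2, dim1 // n_heads // 2, dim2)
    .transpose(1, 2)
    .reshape(dim1, dim2)
)

assert permuted.shape == val.shape  # same shape, rows reordered per head
assert set(permuted.flatten().tolist()) == set(val.flatten().tolist())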

jetstream_pt/third_party/mixtral/model.py

Lines changed: 18 additions & 21 deletions
@@ -165,24 +165,35 @@ def from_hf_model_id(cls, model_id, env):
    return model

  def convert_hf_weights(self, hf_weights):
-    updated_weights = super().convert_hf_weights(hf_weights)
-    # key is layer id, weight name
-    groupped_by_experts = collections.defaultdict(lambda: [None] * 8)
-

+    def transform(val, n_heads):
+      dim1, dim2 = val.shape
+      return (
+          val.reshape(n_heads, 2, dim1 // n_heads // 2, dim2)
+          .transpose(1, 2)
+          .reshape(dim1, dim2)
+      )
+
+    groupped_by_experts = collections.defaultdict(lambda: [None] * 8)
    updated = copy.copy(hf_weights)
    for key, value in hf_weights.items():
-      if 'block_sparse_moe.experts' in key:
+      if "block_sparse_moe.experts" in key:
        # 0 1 2 3 4 5 6 7
-        #"model.layers.0.block_sparse_moe.experts.0.w1.weight"
+        # "model.layers.0.block_sparse_moe.experts.0.w1.weight"
        updated.pop(key)
-        name_pieces = key.split('.')
+        name_pieces = key.split(".")
        assert len(name_pieces) == 8
        layer_id = int(name_pieces[2])
        expert_id = int(name_pieces[5])
        weight_name = name_pieces[6]
        groupped_by_experts[(layer_id, weight_name)][expert_id] = value

+      if "q_proj" in key:
+        updated[key] = transform(value, self.config.n_head)
+      if "k_proj" in key:
+        updated[key] = transform(
+            value, self.config.n_local_heads or self.config.n_head
+        )

    for (layer_id, weight_name), ws in groupped_by_experts.items():
      name = f"model.layers.{layer_id}.block_sparse_moe.cond_ffn.{weight_name}"

@@ -222,20 +233,6 @@ def __init__(self, config: ModelArgs, env, layer_id) -> None:

    self.hf_name("attention_norm", "input_layernorm")
    self.hf_name("ffn_norm", "post_attention_layernorm")
-
-    self.attention._register_load_state_dict_pre_hook(
-        self._load_attention_hf_weights)
-
-  def _load_attention_hf_weights(self, state_dict, prefix, *args):
-    def transform(val, n_heads):
-      dim1, dim2 = val.shape
-      return val.reshape(n_heads, 2, dim1 // n_heads // 2, dim2).transpose(1, 2).reshape(dim1, dim2)
-    qname = prefix + "wq.weight"
-    kname = prefix + "wk.weight"
-    if qname in state_dict:
-      state_dict[prefix + 'wq.weight'] = transform(state_dict[qname], self.config.n_head)
-    if kname in state_dict:
-      state_dict[prefix + 'wk.weight'] = transform(state_dict[kname], self.config.n_local_heads or self.config.n_head)

  def forward(
    self,
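
Besides applying the same q_proj/k_proj transform, Mixtral's convert_hf_weights groups per-expert MoE tensors by (layer_id, weight_name) before merging them under a single cond_ffn key. A standalone sketch of the key parsing and grouping with dummy tensors; the final stacking step is an assumption for illustration, since the merge itself is outside the hunk shown above.

import collections

import torch

hf_weights = {
    f"model.layers.0.block_sparse_moe.experts.{e}.w1.weight": torch.zeros(2, 2)
    for e in range(8)
}

grouped = collections.defaultdict(lambda: [None] * 8)
for key, value in hf_weights.items():
  # e.g. "model.layers.0.block_sparse_moe.experts.0.w1.weight"
  name_pieces = key.split(".")
  layer_id = int(name_pieces[2])
  expert_id = int(name_pieces[5])
  weight_name = name_pieces[6]
  grouped[(layer_id, weight_name)][expert_id] = value

for (layer_id, weight_name), ws in grouped.items():
  name = f"model.layers.{layer_id}.block_sparse_moe.cond_ffn.{weight_name}"
  stacked = torch.stack(ws)  # assumed: 8 experts stacked along a new leading dim
  print(name, stacked.shape)  # -> (8, 2, 2) per grouped weight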
