AI-Hypercomputer · wenxindongwork · Mar 10, 2025 · Mar 7, 2025
diff --git a/kithara/dataset/text_completion.py b/kithara/dataset/text_completion.py
@@ -66,10 +66,7 @@ def __init__(
         ), "Either a HF Tokenizer or a HF tokenizer handle must be provided"
 
         self.max_seq_len = max_seq_len
-        self.tokenizer = (
-            initialize_tokenizer(tokenizer_handle) if tokenizer is None else tokenizer
-        )
-        self.tokenizer.pad_token = "<pad>"
+        self.tokenizer = initialize_tokenizer(tokenizer_handle, tokenizer)
         self.column_mapping = {"text": "text"}
         self._model_type = model_type
         self.custom_formatting_fn = custom_formatting_fn

diff --git a/kithara/dataset/utils.py b/kithara/dataset/utils.py
@@ -20,14 +20,53 @@
 from functools import lru_cache
 
 @lru_cache(maxsize=5) 
-def initialize_tokenizer(tokenizer_handle):
-    """Creates an HuggingFace AutoTokenizer with the tokenizer_handle."""
-    if tokenizer_handle.startswith("hf://"):
-        tokenizer_handle = tokenizer_handle.removeprefix("hf://")
-    try:
-        tokenizer = HFTokenizer.from_pretrained(tokenizer_handle)
-    except ValueError as e:
-        print("Tokenizer handle is not a valid HuggingFace tokenizer handle.")
+def initialize_tokenizer(tokenizer_handle, tokenizer=None):
+    """Return a HuggingFace tokenizer, assigning a pad_token if it has none.
+
+    Args:
+        tokenizer_handle: HuggingFace tokenizer handle, optionally prefixed
+            with "hf://". Only used when `tokenizer` is None.
+        tokenizer: Optional ready-made tokenizer. When given it is used
+            instead of loading one, and its pad_token may be set in place.
+
+    Returns:
+        The tokenizer. If no known padding candidate is in its vocabulary,
+        a warning is printed and pad_token stays None.
+
+    Raises:
+        ValueError: If neither argument is provided, or the handle cannot
+            be loaded as a HuggingFace tokenizer.
+
+    NOTE(review): the enclosing lru_cache keys on both arguments, so a
+    passed-in tokenizer must be hashable and stays referenced by the cache.
+    """
+    if tokenizer is None:
+        if tokenizer_handle is None:
+            raise ValueError(
+                "Either a HF Tokenizer or a HF tokenizer handle must be provided"
+            )
+        tokenizer_handle = tokenizer_handle.removeprefix("hf://")
+        try:
+            tokenizer = HFTokenizer.from_pretrained(tokenizer_handle)
+        except ValueError as e:
+            # Fail fast: carrying on with tokenizer=None would only crash
+            # later on the pad_token lookup with an unrelated AttributeError.
+            raise ValueError(
+                f"Tokenizer handle {tokenizer_handle} is not a valid HuggingFace tokenizer handle."
+            ) from e
+
+    # Llama tokenizers don't have a default pad_token, we must add it here
+    if tokenizer.pad_token is None:
+        vocab = tokenizer.get_vocab()
+        # Tried in order: Gemma 2 ("<pad>"), Llama 3
+        # ("<|finetune_right_pad_id|>"), then "<unk>" as the Llama 2
+        # fallback since it has no padding token.
+        for candidate in ("<pad>", "<|finetune_right_pad_id|>", "<unk>"):
+            if vocab.get(candidate) is not None:
+                tokenizer.add_special_tokens({"pad_token": candidate})
+                break
+        else:
+            print("WARNING: Tokenizer doesn't have the attribute pad_token")
     return tokenizer
 
 

diff --git a/kithara/distributed/sharding/_layout.py b/kithara/distributed/sharding/_layout.py
@@ -14,7 +14,7 @@
  limitations under the License.
  """
 
-from kithara.distributed.sharding.models import GEMMA_LAYOUT
+from kithara.distributed.sharding.models import GEMMA_LAYOUT, LLAMA_LAYOUT
 from dataclasses import dataclass
 from typing import ClassVar
 
@@ -33,6 +33,11 @@ def initialize_mesh_types(cls):
             supported_models.GEMMA2_2B: lambda: Layout.gemma(),
             supported_models.GEMMA2_9B: lambda: Layout.gemma(),
             supported_models.GEMMA2_27B: lambda: Layout.gemma(),
+            supported_models.LLAMA31_8B: lambda: Layout.llama(),
+            supported_models.LLAMA31_70B: lambda: Layout.llama(),
+            supported_models.LLAMA31_405B: lambda: Layout.llama(),
+            supported_models.LLAMA32_1B: lambda: Layout.llama(),
+            supported_models.LLAMA32_3B: lambda: Layout.llama(),
         }
 
     def __class_getitem__(cls, key: str):
@@ -45,3 +50,7 @@ def __class_getitem__(cls, key: str):
     @classmethod
     def gemma(cls):
         return GEMMA_LAYOUT
+
+    @classmethod
+    def llama(cls):
+        return LLAMA_LAYOUT  # imported from kithara.distributed.sharding.models; used for every LLAMA31_*/LLAMA32_* entry
diff --git a/kithara/distributed/sharding/models/__init__.py b/kithara/distributed/sharding/models/__init__.py
@@ -1 +1,2 @@
 from kithara.distributed.sharding.models.gemma import GEMMA_LAYOUT
+from kithara.distributed.sharding.models.llama import LLAMA_LAYOUT
diff --git a/kithara/distributed/sharding/models/llama.py b/kithara/distributed/sharding/models/llama.py
@@ -1,15 +1,45 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 from kithara.distributed.sharding._mesh import Axis
 
+# Sharding rules for Llama weights: each key is a regex pattern and each value
+# holds one entry per weight dimension (Axis.FSDP or None).
+# NOTE(review): presumably matched against variable paths, with None meaning
+# "not sharded along this dimension" -- confirm against the layout consumer.
 LLAMA_FSDP = {
-    "*token_embedding.embeddings.*": (None, Axis.FSDP),
-    "*transformer_layer.*attention.*(query|key|value).kernel*": (
+    ".*token_embedding.embeddings.*": (None, Axis.FSDP),
+    ".*token_embedding.reverse_embeddings.*": (Axis.FSDP, None),
+    ".*transformer_layer.*self_attention.*(query|key|value).kernel.*": (
+        None,
+        Axis.FSDP,
+    ),
+    ".*transformer_layer.*self_attention.attention_output.kernel.*": (
+        None,
+        None,
+        Axis.FSDP,
+    ),
+    ".*transformer_layer.*feedforward_gate_dense.kernel.*": (None, Axis.FSDP),
+    ".*transformer_layer.*feedforward_intermediate_dense.kernel.*": (None, Axis.FSDP),
+    ".*transformer_layer.*feedforward_output_dense.kernel.*": (None, Axis.FSDP),
+    # LoRA kernels of the attention query/key/value projections
+    ".*transformer_layer.*self_attention.*(query|key|value).lora_kernel.*": (
         None,
         Axis.FSDP,
     ),
-    "*transformer_layer.*attention_output.kernel*": (None, None, Axis.FSDP),
-    "*transformer_layer.*feedforward_gate_dense.kernel*": (None, Axis.FSDP),
-    "*transformer_layer.*feedforward_intermediate_dense.kernel*": (None, Axis.FSDP),
-    ".*decoder_block.*ffw_linear.kernel.*": (Axis.FSDP, None),
 }
 
 LLAMA_LAYOUT = {

diff --git a/kithara/model/hf_compatibility/model_configs.py b/kithara/model/hf_compatibility/model_configs.py
@@ -1,18 +1,18 @@
 """
- Copyright 2025 Google LLC
+Copyright 2025 Google LLC
 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-      https://www.apache.org/licenses/LICENSE-2.0
+     https://www.apache.org/licenses/LICENSE-2.0
 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- """
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
 
 import transformers
 from keras_hub.src.utils.preset_utils import load_json
@@ -64,8 +64,6 @@
     rms_norm_eps=1e-5,
     bos_token_id=128000,
     eos_token_id=128001,
-
-    # Additional attributes from your JSON:
     attention_bias=False,
     attention_dropout=0.0,
     hidden_act="silu",
@@ -96,7 +94,16 @@
     max_position_embeddings=131072,
     rms_norm_eps=1e-05,
     bos_token_id=128000,
-    eos_token_id=128001,
+    eos_token_id=[128001, 128008, 128009],
+    rope_scaling={
+        "factor": 8.0,
+        "high_freq_factor": 4.0,
+        "low_freq_factor": 1.0,
+        "original_max_position_embeddings": 8192,
+        "rope_type": "llama3",
+    },
+    rope_theta=500000.0,
+    tie_word_embeddings=False,
 )
 
 llama31_405b_config = transformers.LlamaConfig(
@@ -124,8 +131,6 @@
     rms_norm_eps=1e-5,
     bos_token_id=128000,
     eos_token_id=128001,
-
-    # Additional attributes from your JSON:
     attention_bias=False,
     attention_dropout=0.0,
     hidden_act="silu",
@@ -138,11 +143,10 @@
         "low_freq_factor": 1.0,
         "high_freq_factor": 4.0,
         "original_max_position_embeddings": 8192,
-        "rope_type": "llama3"
+        "rope_type": "llama3",
     },
     rope_theta=500000.0,
     tie_word_embeddings=True,
-    use_cache=True,
 )
 
 llama32_3b_config = transformers.LlamaConfig(
@@ -156,8 +160,6 @@
     rms_norm_eps=1e-5,
     bos_token_id=128000,
     eos_token_id=128001,
-
-    # Additional attributes from your JSON:
     attention_bias=False,
     attention_dropout=0.0,
     hidden_act="silu",
@@ -170,7 +172,7 @@
         "low_freq_factor": 1.0,
         "high_freq_factor": 4.0,
         "original_max_position_embeddings": 8192,
-        "rope_type": "llama3"
+        "rope_type": "llama3",
     },
     rope_theta=500000.0,
     tie_word_embeddings=True,
@@ -198,14 +200,21 @@ def get_model_name_from_preset_handle(preset_handle):
             return supported_models.LLAMA31_70B
         elif n_layers == 126:
             return supported_models.LLAMA31_405B
+        elif n_layers == 16:
+            return supported_models.LLAMA32_1B
+        elif n_layers == 28:
+            return supported_models.LLAMA32_3B
     print(f"model type {model_type} is currently unsupported.")
     return None
 
+
 MODEL_CONFIGS = {
     supported_models.GEMMA2_2B: gemma2_2b_config,
     supported_models.GEMMA2_9B: gemma2_9b_config,
     supported_models.GEMMA2_27B: gemma2_27b_config,
     supported_models.LLAMA31_8B: llama31_8b_config,
     supported_models.LLAMA31_70B: llama31_70b_config,
     supported_models.LLAMA31_405B: llama31_405b_config,
+    supported_models.LLAMA32_1B: llama32_1b_config,
+    supported_models.LLAMA32_3B: llama32_3b_config,
 }
Original file line number	Diff line number	Diff line change
		@@ -1 +1,2 @@
		from kithara.distributed.sharding.models.gemma import GEMMA_LAYOUT
		from kithara.distributed.sharding.models.llama import LLAMA_LAYOUT