# Step 1: Simple Testing of LLaMA encoder
- We will be using the following tech stack:

   - HuggingFace Transformers
 
   - Modal GPU credits, so that code written in this local notebook can run on a remote GPU
 
   - EmojiLM dataset

In [1]:
from dotenv import load_dotenv
import huggingface_hub
import os
# Read key/value pairs from a local .env file (if one exists) into the process environment.
load_dotenv()
# Render the Hugging Face login widget so this kernel can authenticate with the Hub.
# NOTE(review): presumably needed because the Llama checkpoint used below is gated — confirm.
huggingface_hub.notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
# Firstly, grab the LLaMA encoder and the dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import modal
# Hugging Face checkpoint that the remote function will load and inspect.
model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Modal secret that must provide HF_TOKEN to the remote container.
huggingface_secret = modal.Secret.from_name("huggingface-secret", required_keys=["HF_TOKEN"])

# Mount point of the Modal volume; the remote function passes it as the model cache_dir.
MODEL_DIR = "/model"
volume = modal.Volume.from_name("llama_embeddings", create_if_missing=True)

# Modal objects are created at module level so the decorated function below can reference them.
app = modal.App(name="llama-embeddings")

# Container image: Debian slim + Python 3.10 with the libraries the remote function imports.
image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(["transformers", "torch", "accelerate", "hf_transfer"])
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # turn on faster downloads from HF
)

@app.function(image=image, gpu="any", secrets=[huggingface_secret], timeout=600, volumes={MODEL_DIR: volume}, memory=16384)
def get_model_function(model_name: str) -> dict:
    """Load a causal LM inside the Modal container and summarize its decoder layers.

    The model object itself is never returned; only plain Python values
    (ints, strings, lists, dicts) are sent back to the caller.

    Args:
        model_name: Hugging Face model id handed to
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        A dict with the keys:
            ``num_layers``  -- number of entries in ``model.model.layers``.
            ``hidden_size`` -- ``model.config.hidden_size``, or ``"Unknown"``
                               when the config has no such attribute.
            ``all_layers``  -- one dict per layer with ``index``, ``type``,
                               ``parameter_count`` and, for layers that have a
                               ``self_attn`` attribute, a ``modules`` dict
                               holding the same type/count summary for each
                               sub-module that is present.
    """
    # Imported inside the body so the names resolve in the remote container.
    from transformers import AutoModelForCausalLM
    import os

    def _summarize(module):
        """Return the class name and total parameter count of ``module``."""
        return {
            "type": module.__class__.__name__,
            "parameter_count": sum(p.numel() for p in module.parameters()),
        }

    # (key used in the returned summary, attribute name on the decoder layer)
    submodule_attrs = (
        ("self_attention", "self_attn"),
        ("mlp", "mlp"),
        ("input_layernorm", "input_layernorm"),
        ("post_attention_layernorm", "post_attention_layernorm"),
    )

    print(f"Starting to load model: {model_name}")
    os.makedirs(MODEL_DIR, exist_ok=True)

    # Log the model being loaded
    print("Loading model... (this may take a few minutes)")
    model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=MODEL_DIR, device_map="cuda")
    print("Model loaded successfully!")

    decoder_layers = model.model.layers

    # Print model architecture details
    print(f"Model architecture: {model.__class__.__name__}")
    print(f"Number of layers: {len(decoder_layers)}")

    # Only plain Python values go in here so the result can be returned to the caller
    layers_info = {
        "num_layers": len(decoder_layers),
        "hidden_size": getattr(model.config, "hidden_size", "Unknown"),
        "all_layers": [],
    }

    for i, layer in enumerate(decoder_layers):
        layer_info = {"index": i, **_summarize(layer)}

        # Sub-module details are reported only for layers that carry self-attention.
        # Every sub-module (including mlp) is looked up defensively so a layer
        # missing one of them is summarized without it instead of raising.
        if hasattr(layer, "self_attn"):
            modules = {}
            for label, attr in submodule_attrs:
                submodule = getattr(layer, attr, None)
                if submodule is not None:
                    modules[label] = _summarize(submodule)
            layer_info["modules"] = modules

        layers_info["all_layers"].append(layer_info)

        # Print details for the first few layers
        if i < 3:
            print(f"Layer {i}: {layer_info['type']} with {layer_info['parameter_count']:,} parameters")
            for module_name, module_info in layer_info.get("modules", {}).items():
                print(f"  - {module_name}: {module_info['type']} with {module_info['parameter_count']:,} parameters")

    return layers_info


/tmp/ipykernel_14289/467177460.py:24: DeprecationError: 2025-02-03: Modal will stop implicitly adding local Python modules to the Image ("automounting") in a future update. The following modules need to be explicitly added for future compatibility:
* _remote_module_non_scriptable

e.g.:
image_with_source = my_image.add_local_python_source("_remote_module_non_scriptable")

For more information, see https://modal.com/docs/guide/modal-1-0-migration
  def get_model_function(model_name):


In [12]:
# Run the Modal app from the notebook and print a short summary of the returned layer info.
try:
    # The remote call is made while the `app.run()` context is open.
    with app.run():
        print("LLaMA Model initialized")
        result = get_model_function.remote(model_name)

        print("\nResult summary:")
        layer_count = result['num_layers']
        hidden_width = result['hidden_size']
        print(f"Model has {layer_count} layers with hidden size {hidden_width}")

        # Show only the first three layers to keep the cell output short
        print("\nLayer details:")
        first_layers = result["all_layers"][:3]
        for entry in first_layers:
            print(f"  Layer {entry['index']}: {entry['type']} with {entry['parameter_count']:,} parameters")
            # Layers without a "modules" entry simply print no sub-module lines
            for part_name, part in entry.get("modules", {}).items():
                print(f"    - {part_name}: {part['type']} with {part['parameter_count']:,} parameters")
finally:
    # This branch only logs a message; it performs no cleanup of its own.
    print("Ensuring app is shut down...")


LLaMA Model initialized

Result summary:
Model has 16 layers with hidden size 2048

Layer details:
  Layer 0: LlamaDecoderLayer with 60,821,504 parameters
    - self_attention: LlamaAttention with 10,485,760 parameters
    - mlp: LlamaMLP with 50,331,648 parameters
    - input_layernorm: LlamaRMSNorm with 2,048 parameters
    - post_attention_layernorm: LlamaRMSNorm with 2,048 parameters
  Layer 1: LlamaDecoderLayer with 60,821,504 parameters
    - self_attention: LlamaAttention with 10,485,760 parameters
    - mlp: LlamaMLP with 50,331,648 parameters
    - input_layernorm: LlamaRMSNorm with 2,048 parameters
    - post_attention_layernorm: LlamaRMSNorm with 2,048 parameters
  Layer 2: LlamaDecoderLayer with 60,821,504 parameters
    - self_attention: LlamaAttention with 10,485,760 parameters
    - mlp: LlamaMLP with 50,331,648 parameters
    - input_layernorm: LlamaRMSNorm with 2,048 parameters
    - post_attention_layernorm: LlamaRMSNorm with 2,048 parameters
Ensuring app is shut down