# Step 1: Simple Testing of LLaMA encoder
- We will be using the following tech stack:

   - HuggingFace Transformers

   - Modal GPU credits, so that functions defined in this local notebook can run on remote GPUs

   - EmojiLM dataset

In [1]:
from dotenv import load_dotenv
import huggingface_hub
import os
# Load variables from a local .env file (if present) into the process environment
load_dotenv()
# Show the Hugging Face login widget so this notebook session can authenticate with the Hub
huggingface_hub.notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [33]:
# First, import the Hugging Face classes and Modal, then configure the model name, secret, volume, app, and image
from transformers import AutoTokenizer, AutoModelForCausalLM
import modal
# Hugging Face model id passed to every remote function in this notebook
model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Modal secret that must expose HF_TOKEN to the remote containers
huggingface_secret = modal.Secret.from_name("huggingface-secret", required_keys=["HF_TOKEN"])

# Named Modal volume, mounted at MODEL_DIR and used as the Hugging Face cache_dir
MODEL_DIR = "/model"
volume = modal.Volume.from_name("llama_embeddings", create_if_missing=True)

# App object whose functions are declared below with @app.function
app = modal.App(name="llama-embeddings")

# Container image: slim Debian + Python 3.10 with the packages the remote functions import
image = (
    modal.Image.debian_slim(python_version='3.10')
    .pip_install(["transformers", "torch", "accelerate", "hf_transfer"])
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # turn on faster downloads from HF
)

@app.function(image=image, gpu="any", secrets=[huggingface_secret], timeout=600, volumes={MODEL_DIR: volume}, memory=16384)
def get_model_function(model_name):
    """Load a causal LM on the remote GPU and summarise its decoder layers.

    Args:
        model_name: Hugging Face model id passed to ``from_pretrained``.

    Returns:
        A dict of plain Python values with ``num_layers``, ``hidden_size``
        (the string ``"Unknown"`` when the config has no such attribute) and
        ``all_layers``: one entry per layer holding its index, class name,
        parameter count and, when the layer has a ``self_attn`` attribute, a
        ``modules`` breakdown of its sub-modules.
    """
    from transformers import AutoModelForCausalLM
    import os

    def describe(module):
        # Class name and total parameter count of a module.
        return {
            "type": module.__class__.__name__,
            "parameter_count": sum(p.numel() for p in module.parameters()),
        }

    print(f"Starting to load model: {model_name}")
    os.makedirs(MODEL_DIR, exist_ok=True)

    # Weights are cached under MODEL_DIR, which is where the volume is mounted
    print("Loading model... (this may take a few minutes)")
    model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=MODEL_DIR, device_map="cuda")
    print(f"Model loaded successfully!")

    decoder_layers = model.model.layers
    print(f"Model architecture: {model.__class__.__name__}")
    print(f"Number of layers: {len(decoder_layers)}")

    # Only ints and strings go into the result so it can be sent back to the caller
    layers_info = {
        "num_layers": len(decoder_layers),
        "hidden_size": getattr(model.config, "hidden_size", "Unknown"),
        "all_layers": [],
    }

    for index, layer in enumerate(decoder_layers):
        layer_info = {"index": index, **describe(layer)}

        # Break the layer down into attention, MLP and (when present) its two norms
        if hasattr(layer, "self_attn"):
            modules = {
                "self_attention": describe(layer.self_attn),
                "mlp": describe(layer.mlp),
            }
            for norm_name in ("input_layernorm", "post_attention_layernorm"):
                if hasattr(layer, norm_name):
                    modules[norm_name] = describe(getattr(layer, norm_name))
            layer_info["modules"] = modules

        layers_info["all_layers"].append(layer_info)

        # Echo the first three layers to the remote log
        if index < 3:
            print(f"Layer {index}: {layer_info['type']} with {layer_info['parameter_count']:,} parameters")
            for module_name, module_info in layer_info.get("modules", {}).items():
                print(f"  - {module_name}: {module_info['type']} with {module_info['parameter_count']:,} parameters")

    return layers_info


@app.function(image=image, gpu="any", secrets=[huggingface_secret], timeout=600, volumes={MODEL_DIR: volume}, memory=16384)
def get_embeddings(text, model_name):
    """Look up the input-token embeddings of ``text`` on the remote GPU.

    Only the model's ``embed_tokens`` layer is applied; no decoder layers run.

    Args:
        text: Input passed to the tokenizer.
        model_name: Hugging Face model id for both the model and the tokenizer.

    Returns:
        A dict with ``embeddings`` and ``tokens`` as NumPy arrays,
        ``token_strings`` (the tokens of the first sequence) and
        ``embedding_dim`` (size of the last embedding axis).
    """
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    print(f"Loading model and tokenizer: {model_name}")
    model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=MODEL_DIR, device_map="cuda")
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=MODEL_DIR)

    # Encode the text and move the tensors next to the model
    encoded = tokenizer(text, return_tensors="pt").to("cuda")
    input_ids = encoded.input_ids

    # Pure table lookup, so no gradients are needed
    with torch.no_grad():
        embeddings = model.model.embed_tokens(input_ids)

    # Tensors are converted to NumPy on the CPU before being returned to the caller
    return {
        "embeddings": embeddings.cpu().numpy(),
        "tokens": input_ids.cpu().numpy(),
        "token_strings": tokenizer.convert_ids_to_tokens(input_ids[0]),
        "embedding_dim": embeddings.shape[-1],
    }

## Do inference on any text:
@app.function(image=image, gpu="any", secrets=[huggingface_secret], timeout=600, volumes={MODEL_DIR: volume}, memory=16384)
def do_inference(text, model_name):
    """Generate a continuation of ``text`` with the model on the remote GPU.

    Args:
        text: Prompt passed to the tokenizer.
        model_name: Hugging Face model id for both the model and the tokenizer.

    Returns:
        A dict with a single key ``output``: the decoded first sequence
        returned by ``generate`` (the prompt followed by up to 1024 new
        tokens), with special tokens removed.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer

    print(f"Loading model and tokenizer: {model_name}")
    model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=MODEL_DIR, device_map="cuda")
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=MODEL_DIR)

    # Tokenize the input text
    tokens = tokenizer(text, return_tensors="pt").to("cuda")

    # Unpack the whole encoding so generate() receives the attention mask along
    # with the input ids, instead of having to infer the mask itself
    out = model.generate(**tokens, max_new_tokens=1024)
    return {'output': tokenizer.decode(out[0], skip_special_tokens=True)}
    
@app.function(image=image, gpu="any", secrets=[huggingface_secret], timeout=600, volumes={MODEL_DIR: volume}, memory=16384)
def get_intermediate_outputs(text, model_name, layer_num=4):
    """Return one intermediate hidden state for ``text`` plus a short generation.

    Args:
        text: Input passed to the tokenizer.
        model_name: Hugging Face model id for both the model and the tokenizer.
        layer_num: Index into the model's ``hidden_states`` tuple. Per the
            Transformers documentation, index 0 is the embedding output and
            index ``i`` is the output of the i-th layer. Negative indices
            count from the end.

    Returns:
        A dict with ``layer_num``, ``layer_output`` (NumPy array),
        ``layer_output_shape``, ``token_strings`` (tokens of the first
        sequence) and ``generated_text`` (the prompt followed by up to 100
        new tokens, special tokens removed).

    Raises:
        IndexError: If ``layer_num`` does not index an available hidden state.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer
    import torch

    print(f"Loading model and tokenizer: {model_name}")
    model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=MODEL_DIR, device_map="cuda")
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=MODEL_DIR)

    # Tokenize the input text
    tokens = tokenizer(text, return_tensors="pt").to("cuda")

    # Forward pass that also returns every intermediate hidden state; the whole
    # encoding is unpacked so the attention mask is passed along with the ids
    with torch.no_grad():
        outputs = model(**tokens, output_hidden_states=True)

    hidden_states = outputs.hidden_states

    # Print shapes of intermediate states for debugging
    print(f"Number of hidden states: {len(hidden_states)}")
    for i, hidden_state in enumerate(hidden_states):
        print(f"Layer {i} hidden state shape: {hidden_state.shape}")

    # Fail with an explanatory message (same exception type as plain tuple
    # indexing) before spending time on generation
    if not -len(hidden_states) <= layer_num < len(hidden_states):
        raise IndexError(
            f"layer_num {layer_num} is out of range: the model returned "
            f"{len(hidden_states)} hidden states (index 0 is the embedding output)"
        )

    # Extract the requested layer's output and convert to numpy for serialization
    requested_layer_output = hidden_states[layer_num].cpu().numpy()

    # Short continuation of the same prompt, again with the attention mask supplied
    generated_ids = model.generate(**tokens, max_new_tokens=100)

    # Return a serializable dictionary with the requested layer's output
    return {
        'layer_num': layer_num,
        'layer_output_shape': requested_layer_output.shape,
        'layer_output': requested_layer_output,
        'token_strings': tokenizer.convert_ids_to_tokens(tokens.input_ids[0]),
        'generated_text': tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    }


/tmp/ipykernel_31971/90382873.py:24: DeprecationError: 2025-02-03: Modal will stop implicitly adding local Python modules to the Image ("automounting") in a future update. The following modules need to be explicitly added for future compatibility:
* _remote_module_non_scriptable

e.g.:
image_with_source = my_image.add_local_python_source("_remote_module_non_scriptable")

For more information, see https://modal.com/docs/guide/modal-1-0-migration
  def get_model_function(model_name):
/tmp/ipykernel_31971/90382873.py:92: DeprecationError: 2025-02-03: Modal will stop implicitly adding local Python modules to the Image ("automounting") in a future update. The following modules need to be explicitly added for future compatibility:
* _remote_module_non_scriptable

e.g.:
image_with_source = my_image.add_local_python_source("_remote_module_non_scriptable")

For more information, see https://modal.com/docs/guide/modal-1-0-migration
  def get_embeddings(text, model_name):
/tmp/ipykernel_31971/903

In [13]:
# Run get_model_function on Modal and print a summary of the returned dict
try:
    # The remote call is made while the app's run context is open
    with app.run():
        print("LLaMA Model initialized")
        result = get_model_function.remote(model_name)

        print("\nResult summary:")
        print(f"Model has {result['num_layers']} layers with hidden size {result['hidden_size']}")

        # Only the first three layers are listed
        print("\nLayer details:")
        for summary in result["all_layers"][:3]:
            print(f"  Layer {summary['index']}: {summary['type']} with {summary['parameter_count']:,} parameters")
            for part_name, part in summary.get("modules", {}).items():
                print(f"    - {part_name}: {part['type']} with {part['parameter_count']:,} parameters")
finally:
    # Printed whether or not the remote call succeeded; this branch performs no cleanup itself
    print("Ensuring app is shut down...")


LLaMA Model initialized

Result summary:
Model has 16 layers with hidden size 2048

Layer details:
  Layer 0: LlamaDecoderLayer with 60,821,504 parameters
    - self_attention: LlamaAttention with 10,485,760 parameters
    - mlp: LlamaMLP with 50,331,648 parameters
    - input_layernorm: LlamaRMSNorm with 2,048 parameters
    - post_attention_layernorm: LlamaRMSNorm with 2,048 parameters
  Layer 1: LlamaDecoderLayer with 60,821,504 parameters
    - self_attention: LlamaAttention with 10,485,760 parameters
    - mlp: LlamaMLP with 50,331,648 parameters
    - input_layernorm: LlamaRMSNorm with 2,048 parameters
    - post_attention_layernorm: LlamaRMSNorm with 2,048 parameters
  Layer 2: LlamaDecoderLayer with 60,821,504 parameters
    - self_attention: LlamaAttention with 10,485,760 parameters
    - mlp: LlamaMLP with 50,331,648 parameters
    - input_layernorm: LlamaRMSNorm with 2,048 parameters
    - post_attention_layernorm: LlamaRMSNorm with 2,048 parameters
Ensuring app is shut down

In [21]:
# Fetch token embeddings and a generated answer for one emoji prompt.
# Both results live in notebook-level names so later cells can inspect them.
embedding_result = None
inference_result = None
try:
    with app.run():
        sample_text = "What sentence does the sequence of emoji represents:👩‍⚕️😷😢😄👥🏥🚑💔💉📋"

        # Keep the embeddings under both the outer name and the local shorthand
        embedding_result = result = get_embeddings.remote(sample_text, model_name)

        print("\nEmbedding Results:")
        print(f"Input text: '{sample_text}'")
        print(f"Embedding dimension: {result['embedding_dim']}")
        print(f"Number of tokens: {len(result['token_strings'])}")

        # One entry per token of the first (only) sequence
        print("\nTokens:")
        for token, vector in zip(result['token_strings'], result['embeddings'][0]):
            print(f"Token: {token}")
            print(f"Embedding shape: {vector.shape}")
            print(f"First few values: {vector[:5]}\n")

        # Same prompt, this time through text generation
        inference_result = do_inference.remote(sample_text, model_name)
        print(f"Inference result: {inference_result['output']}")
finally:
    print("App is closing")
    
            


Embedding Results:
Input text: 'What sentence does the sequence of emoji represents:👩‍⚕️😷😢😄👥🏥🚑💔💉📋'
Embedding dimension: 2048
Number of tokens: 40

Tokens:
Token: <|begin_of_text|>
Embedding shape: (2048,)
First few values: [ 0.00268555  0.00308228 -0.00680542  0.04199219 -0.00265503]

Token: What
Embedding shape: (2048,)
First few values: [ 0.02307129 -0.01190186 -0.00234985  0.02587891 -0.00546265]

Token: Ġsentence
Embedding shape: (2048,)
First few values: [ 0.0135498  -0.01525879  0.03222656  0.00601196 -0.01696777]

Token: Ġdoes
Embedding shape: (2048,)
First few values: [0.01965332 0.01507568 0.00350952 0.00927734 0.02246094]

Token: Ġthe
Embedding shape: (2048,)
First few values: [-0.00805664 -0.01501465  0.02600098 -0.03808594 -0.00891113]

Token: Ġsequence
Embedding shape: (2048,)
First few values: [-0.02563477  0.00588989 -0.0189209   0.08496094 -0.01672363]

Token: Ġof
Embedding shape: (2048,)
First few values: [-0.00193787 -0.01403809  0.00527954 -0.02783203  0.00915527]



In [15]:
# Token strings returned by get_embeddings for the prompt.
# NOTE(review): the saved output under this cell shows tokens for a different prompt
# than the current sample_text — it appears to come from an earlier run; re-run to refresh.
print(embedding_result['token_strings'])

['<|begin_of_text|>', 'What', 'Ġdoes', 'Ġthis', 'Ġemoji', 'Ġmean', '?', 'ðŁ', 'ĳ', '©', 'âĢį', 'â', 'ļ', 'ķ', 'ï¸ı', 'ðŁĺ', '·', 'ðŁĺ', '¢', 'ðŁĺ', 'Ħ', 'ðŁ', 'ĳ', '¥', 'ðŁ', 'ı', '¥', 'ðŁ', 'ļ', 'ĳ', 'ðŁĴ', 'Ķ', 'ðŁĴ', 'ī', 'ðŁ', 'ĵ', 'ĭ']


In [22]:
# Decoded text returned by do_inference: the prompt followed by the model's continuation
print(inference_result['output'])

What sentence does the sequence of emoji represents:👩‍⚕️😷😢😄👥🏥🚑💔💉📋

The sequence of emojis represents a patient's journey through a hospital. Here's a breakdown of each emoji:

1. 👩‍⚕️ - A female doctor (representing the medical staff)
2. 😷 - A face with tears (representing the patient's emotional state)
3. 😢 - A crying face (representing the patient's sadness)
4. 😄 - A smiling face (representing the patient's recovery)
5. 👥 - A group of people (representing the hospital staff and the patient's support system)
6. 🏥 - A hospital (representing the physical environment)
7. 🚑 - A ambulance (representing the emergency services)
8. 💔 - A broken heart (representing the emotional impact of the illness)
9. 💉 - A syringe (representing the medical treatment)
10. 📋 - A doctor's notebook (representing the patient's recovery process)

The sequence of emojis represents a patient's journey through the hospital, from initial illness to recovery and back to normal life. The doctor and hospital staff are 

In [34]:
# Fetch one intermediate hidden state for a short prompt and print it per token.
# The result is kept in a notebook-level name so later cells can inspect it.
intermediate_result = None
try:
    with app.run():
        sample_text = "Hello, world!"
        layer_num = 4  # index of the hidden state to fetch (output after the 4th layer)

        # Keep the result under both the outer name and the local shorthand
        intermediate_result = result = get_intermediate_outputs.remote(sample_text, model_name, layer_num)

        print("\nIntermediate Layer Results:")
        print(f"Input text: '{sample_text}'")
        print(f"Layer number: {result['layer_num']}")
        print(f"Layer output shape: {result['layer_output_shape']}")
        print(f"Number of tokens: {len(result['token_strings'])}")

        # Activations of the first (only) sequence, one row per token
        print("\nTokens:")
        sequence_vectors = result['layer_output'][0]
        for i, token in enumerate(result['token_strings']):
            token_vector = sequence_vectors[i]
            print(f"\nToken: {token}")
            print(f"Vector shape: {token_vector.shape}")
            print(f"First few values: {token_vector[:5]}")

        print(f"\nGenerated text: {result['generated_text']}")

finally:
    print("Ensuring app is shut down...")


Intermediate Layer Results:
Input text: 'Hello, world!'
Layer number: 4
Layer output shape: (1, 5, 2048)
Number of tokens: 5

Tokens:

Token: <|begin_of_text|>
Vector shape: (2048,)
First few values: [ 0.12739815 -0.501549    1.5829394  -0.34711745  0.48629034]

Token: Hello
Vector shape: (2048,)
First few values: [0.05333915 0.01220436 0.02671797 0.01382664 0.16754065]

Token: ,
Vector shape: (2048,)
First few values: [ 0.07111846 -0.01006559  0.1203342   0.04470684  0.10219292]

Token: Ġworld
Vector shape: (2048,)
First few values: [ 0.01662472 -0.02124677 -0.03644049 -0.02957273  0.06016726]

Token: !
Vector shape: (2048,)
First few values: [-0.04820976 -0.03211864 -0.02830466  0.08335431 -0.02354628]

Generated text: Hello, world! I'm excited to be here. My name is Max, and I'm a software engineer with a passion for building scalable and efficient systems.

As a seasoned developer, I've had the privilege of working on various projects, from mobile apps to web applications, and I'm

In [36]:
# Shape of the stored layer activations; for this run it reads as (batch, tokens, hidden size)
intermediate_result["layer_output"].shape

(1, 5, 2048)