In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

def create_llama_prompt(system_prompt: str, user_prompt: str, eos_token: str="<|endoftext|>"):
    """
    Build a single prompt string holding a system section followed by a user section.

    Each section is introduced by a tag ("[SYSTEM]" / "[USER]") immediately followed by
    the EOS token, then the whitespace-stripped section text, then the EOS token again.
    All pieces are separated by single spaces.

    Parameters:
    system_prompt (str): System-level instructions; leading/trailing whitespace is removed.
    user_prompt (str): The user's query; leading/trailing whitespace is removed.
    eos_token (str): Token text used to delimit the sections.

    Returns:
    str: The assembled prompt.
    """
    segments = [
        f"[SYSTEM]{eos_token}",
        system_prompt.strip(),
        eos_token,
        f"[USER]{eos_token}",
        user_prompt.strip(),
        eos_token,
    ]
    # Joining with a single space reproduces the fixed layout of tag, text and delimiter.
    return " ".join(segments)

def generate_response(prompt: str, model_name: str = "llama_model", max_new_tokens: int = 200):
    """
    Generates a response from a causal language model for the given prompt.

    The tokenizer and model are loaded from ``model_name`` on every call, so
    callers that generate repeatedly pay the loading cost each time.

    Parameters:
    prompt (str): The formatted prompt to pass to the model.
    model_name (str): The name or path of the pre-trained model to load.
    max_new_tokens (int): The maximum number of tokens to generate for the response.

    Returns:
    str: The decoded text of the newly generated tokens only (the prompt is
        excluded), with special tokens removed and surrounding whitespace stripped.
    """
    # Use the GPU when one is available; otherwise fall back to CPU. Half
    # precision is only requested on the GPU, and full precision is used on CPU.
    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    dtype = torch.float16 if use_cuda else torch.float32

    # Load the tokenizer and model, and place the model on the same device
    # the inputs will be moved to (a mismatch makes generate() fail).
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
    model.to(device)

    # Tokenize the prompt and remember how many tokens it occupies.
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[-1]

    # Some tokenizers define no pad token; reuse EOS so generate() has one.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=pad_token_id,
    )

    # Drop the prompt by token count rather than by character count: the text
    # decoded with skip_special_tokens=True is not guaranteed to start with the
    # exact prompt string, so slicing by len(prompt) can cut at the wrong place.
    generated_tokens = output[0][prompt_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return response.strip()

if __name__ == "__main__":
    # Demo: build a two-part prompt and ask the model to answer it.
    system_prompt = "You are an assistant who answers questions helpfully and politely."
    user_prompt = "What is the difference between machine learning and deep learning?"

    # Placeholder model path/name
    model_name = "decapoda-research/llama-7b-hf"

    # Assemble the prompt with the default EOS delimiter.
    formatted_prompt = create_llama_prompt(
        system_prompt=system_prompt,
        user_prompt=user_prompt,
    )

    # Run generation with the default token budget.
    response = generate_response(prompt=formatted_prompt, model_name=model_name)

    # Emit the header and the response on separate lines.
    print("Response:", response, sep="\n")
