In [None]:
def prompt_gemma(system_prompt, user_prompt, max_length=50, temperature=0.7, top_p=0.9):
    """
    Sends a system prompt and user prompt to the Gemma model and retrieves the response.

    Relies on the module-level ``tokenizer``, ``model`` and ``device`` objects
    being defined before this function is called.

    Args:
        system_prompt (str): The context or system message for the model.
        user_prompt (str): The input query from the user.
        max_length (int): Maximum number of tokens to generate for the response
            (default is 50). The prompt's own tokens do not count against it.
        temperature (float): Sampling temperature for randomness in generation
            (default is 0.7). A value of 0 or less (or None) selects greedy decoding.
        top_p (float): Top-p (nucleus) sampling threshold (default is 0.9).
            Only used when sampling is active.

    Returns:
        str: The generated response from the model, without the prompt text.
    """
    # Combine the system prompt and user prompt, clearly separating them
    prompt_text = f"System: {system_prompt}\n\nUser: {user_prompt}\n\nAssistant:"

    # Tokenize the input and remember how many tokens belong to the prompt
    inputs = tokenizer(prompt_text, return_tensors="pt").to(device)
    input_ids = inputs["input_ids"]
    prompt_token_count = input_ids.shape[-1]

    generation_kwargs = {
        # Budget applies to the reply only; a total-length cap would be eaten
        # up by a long prompt and leave nothing for the answer.
        "max_new_tokens": max_length,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature and temperature > 0:
        # temperature / top_p only take effect when sampling is switched on
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Sampling with a non-positive temperature is invalid; decode greedily
        generation_kwargs["do_sample"] = False

    # Generate the response, forwarding the attention mask from the tokenizer
    outputs = model.generate(
        input_ids,
        attention_mask=inputs.get("attention_mask"),
        **generation_kwargs,
    )

    # Drop the prompt tokens instead of splitting on "Assistant:", so a reply
    # that itself contains that marker is returned intact.
    new_tokens = outputs[0][prompt_token_count:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


In [None]:
# System-level instruction that frames the model as a technical summarizer
system_prompt = (
    "You are an expert in summarizing technical documents. "
    "Your goal is to paraphrase input texts for clarity and conciseness "
    "without losing technical details."
)

# Text the model is asked to paraphrase (one logical string, wrapped for readability)
user_prompt = (
    "Central Analytics Support & Enablement (CASE) is our enterprise support "
    "platform for over 20,000 data scientists, data analysts, MLOps engineers, "
    "BI developers, etc. at the bank. CASE includes multiple features such as "
    "'CASEy' the chatbot, intent-driven (i.e., non-GAN/Auto-LLM) chatbot "
    "microservices built on MS Copilot Studio. Additionally, CASE includes the "
    "Collective Intelligence (CI) framework, conditional ticketing system, and "
    "its Graph-enabled search. All of this enables product and support teams to "
    "efficiently address customer needs, manage their product knowledge "
    "documentation, and handle product inquiries within a unified platform."
)

# Run the prompt through Gemma: larger length budget, slightly lower
# temperature than the default, nucleus sampling threshold of 0.9.
response = prompt_gemma(
    system_prompt,
    user_prompt,
    max_length=150,
    temperature=0.6,
    top_p=0.9,
)

# Show what the model produced
print("Gemma's Response:", response)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

class GemmaChat:
    """Single-turn chat helper around a causal language model and its tokenizer.

    The tokenizer and model are loaded from ``model_name`` and the model is
    moved to CUDA when available, otherwise it stays on the CPU.
    """

    # Class-level default so generate_response works (with no system context)
    # even if set_system_prompt was never called.
    system_prompt = ""

    def __init__(self, model_name="gemma-local-model-path"):
        """Load the tokenizer and model from ``model_name`` and pick a device."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def set_system_prompt(self, system_prompt):
        """Sets the system prompt as a base context for the conversation."""
        self.system_prompt = system_prompt

    def _build_prompt(self, user_prompt):
        """Return the text sent to the model for a single user turn."""
        user_turn = f"User: {user_prompt}\n\nAssistant:"
        if not self.system_prompt:
            # No system context: avoid starting the prompt with blank lines
            return user_turn
        return f"{self.system_prompt}\n\n{user_turn}"

    def generate_response(self, user_prompt, max_length=200, temperature=0.7, top_p=0.9):
        """Generates a response using the system and user prompts.

        Args:
            user_prompt (str): The user's message.
            max_length (int): Maximum number of tokens to generate for the
                reply; the prompt's tokens do not count against it.
            temperature (float): Sampling temperature. A value of 0 or less
                (or None) selects greedy decoding.
            top_p (float): Nucleus sampling threshold, used only when sampling.

        Returns:
            str: The assistant's reply, without the prompt text.
        """
        combined_prompt = self._build_prompt(user_prompt)

        # Tokenize and remember the prompt length so it can be sliced off later
        inputs = self.tokenizer(combined_prompt, return_tensors="pt").to(self.device)
        input_ids = inputs["input_ids"]
        prompt_token_count = input_ids.shape[-1]

        generation_kwargs = {
            # Budget applies to the reply only, so a long prompt cannot
            # consume the whole allowance.
            "max_new_tokens": max_length,
            "pad_token_id": self.tokenizer.eos_token_id,
        }
        if temperature and temperature > 0:
            # temperature / top_p only take effect when sampling is enabled
            generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
        else:
            generation_kwargs["do_sample"] = False

        outputs = self.model.generate(
            input_ids,
            attention_mask=inputs.get("attention_mask"),
            **generation_kwargs,
        )

        # Decode only the newly generated tokens rather than splitting the
        # full text on "Assistant:", which breaks if the reply contains it.
        new_tokens = outputs[0][prompt_token_count:]
        return self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Example Usage
if __name__ == "__main__":
    # Build the chat wrapper from a local model directory (placeholder path)
    gemma = GemmaChat("path_to_your_local_gemma_model")

    # Install the base context used for every reply
    system_prompt = "You are a helpful assistant skilled in answering technical and creative queries."
    gemma.set_system_prompt(system_prompt)

    # Ask one question and show the answer
    user_prompt = "Can you explain how transformers work in machine learning?"
    response = gemma.generate_response(user_prompt)
    print(f"Assistant: {response}")
