In [None]:
def prompt_gemma(system_prompt, user_prompt, max_length=50, temperature=0.7, top_p=0.9):
    """
    Sends a system prompt and user prompt to the model and retrieves the response.

    Relies on the module-level ``tokenizer``, ``model`` and ``device`` objects
    being defined before the function is called.

    Args:
        system_prompt (str): The context or system message for the model.
        user_prompt (str): The input query from the user.
        max_length (int): Maximum number of tokens to generate for the response,
            not counting the prompt tokens (default is 50).
        temperature (float): Sampling temperature for randomness in generation
            (default is 0.7). A value of 0 or None selects greedy decoding.
        top_p (float): Top-p (nucleus) sampling for controlling token probabilities
            (default is 0.9). Only used when sampling is enabled.

    Returns:
        str: The generated response from the model, without the prompt text.
    """
    # Combine the system prompt and user prompt, clearly separating them
    prompt_text = f"System: {system_prompt}\n\nUser: {user_prompt}\n\nAssistant:"

    # Tokenize the input
    inputs = tokenizer(prompt_text, return_tensors="pt").to(device)
    prompt_token_count = inputs.input_ids.shape[-1]

    generation_kwargs = {
        # The pad token is set to EOS below, so the mask cannot be inferred
        # from the ids; pass it explicitly.
        "attention_mask": inputs.attention_mask,
        # Budget applies to the response only. `max_length` in `generate`
        # would also count the prompt tokens, leaving little or no room for
        # a reply when the prompt is long.
        "max_new_tokens": max_length,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature > 0:
        # temperature/top_p are ignored unless sampling is switched on.
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        generation_kwargs["do_sample"] = False

    # Generate the response with tunable parameters
    outputs = model.generate(inputs.input_ids, **generation_kwargs)

    # Decode only the newly generated tokens. Slicing by token count is more
    # reliable than splitting the decoded text on "Assistant:", which picks the
    # wrong segment if the model emits another "Assistant:" turn.
    generated_tokens = outputs[0][prompt_token_count:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()


In [None]:
# System prompt: tells the model which role to play and what the task is
system_prompt = (
    "You are an expert in summarizing technical documents. "
    "Your goal is to paraphrase input texts for clarity and conciseness "
    "without losing technical details."
)

# User prompt: the passage to be paraphrased
user_prompt = (
    "Central Analytics Support & Enablement (CASE) is our enterprise support platform for "
    "over 20,000 data scientists, data analysts, MLOps engineers, BI developers, etc. at the bank. "
    "CASE includes multiple features such as 'CASEy' the chatbot, intent-driven (i.e., "
    "non-GAN/Auto-LLM) chatbot microservices built on MS Copilot Studio. "
    "Additionally, CASE includes the Collective Intelligence (CI) framework, conditional "
    "ticketing system, and its Graph-enabled search. "
    "All of this enables product and support teams to efficiently address customer needs, "
    "manage their product knowledge documentation, and handle product inquiries within a "
    "unified platform."
)

# Ask the model for a paraphrase: a larger length budget than the default,
# slightly lower temperature, and nucleus sampling at 0.9
response = prompt_gemma(system_prompt, user_prompt, max_length=150, temperature=0.6, top_p=0.9)

# Show the result
print(f"Gemma's Response: {response}")


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the tokenizer (replace 'gpt2' with your preferred model)
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Fall back to the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model and move it to the selected device so it matches the inputs
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)

# Build the combined prompt and generate a response
def generate_response(system_prompt, user_prompt, max_length=200, temperature=0.7, top_p=0.9):
    """
    Generate a response using the model with a system prompt and user prompt.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        system_prompt (str): The system instruction to guide the conversation.
        user_prompt (str): The user's input for the conversation.
        max_length (int): The maximum number of tokens to generate for the
            response, not counting the prompt tokens.
        temperature (float): Sampling temperature to control randomness.
            A value of 0 or None selects greedy decoding.
        top_p (float): Nucleus sampling top-p value. Only used when sampling
            is enabled.

    Returns:
        str: The generated response from the model, without the prompt text.
    """
    # Combine system and user prompts
    combined_prompt = f"System: {system_prompt}\nUser: {user_prompt}\nAssistant:"

    # Tokenize the input
    inputs = tokenizer(combined_prompt, return_tensors="pt", padding=True).to(device)
    prompt_token_count = inputs["input_ids"].shape[-1]

    generation_kwargs = {
        "attention_mask": inputs["attention_mask"],
        # Budget applies to the response only. `max_length` in `generate`
        # would also count the prompt tokens and shrink the reply accordingly.
        "max_new_tokens": max_length,
        "num_return_sequences": 1,
        "pad_token_id": tokenizer.pad_token_id,
    }
    if temperature is not None and temperature > 0:
        # temperature/top_p are ignored unless sampling is switched on.
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        generation_kwargs["do_sample"] = False

    # Generate a response
    outputs = model.generate(inputs["input_ids"], **generation_kwargs)

    # Decode only the tokens produced after the prompt. Slicing by token count
    # is more reliable than splitting the decoded text on "Assistant:", which
    # picks the wrong segment if the model emits another "Assistant:" turn.
    generated_tokens = outputs[0][prompt_token_count:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# Demo: runs only when this file is executed as the main module
if __name__ == "__main__":
    # Prompts for a construction-themed question
    system_prompt = "You are a helpful assistant specialized in construction-related topics."
    user_prompt = "Why is proper site preparation important in construction?"

    # Query the model with a higher temperature and a wider nucleus than the defaults
    response = generate_response(
        system_prompt,
        user_prompt,
        max_length=200,
        temperature=0.9,
        top_p=0.95,
    )

    # Header and reply on separate lines
    print("Assistant Response:", response, sep="\n")
