In [1]:
import json
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/dariomazzola/Library/Application Support/sagemaker/config.yaml


In [6]:
try:
	role = sagemaker.get_execution_role()
except ValueError:
	iam = boto3.client('iam')
	role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

In [3]:
image_uri = get_huggingface_llm_image_uri("huggingface",version="3.2.3")

print(image_uri)

763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.6.0-tgi3.2.3-gpu-py311-cu124-ubuntu22.04


In [7]:
# Crea l'oggetto Model
model = HuggingFaceModel(
    image_uri=image_uri,
    role=role,
    env={
        "HF_TASK": "text-generation",
        "HF_MODEL_ID": "dario-mazzola/gemma-3-finetuned-merged",
        "SM_NUM_GPUS": "1"
    }
)


# Deploy
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.2xlarge"
)

-------------!

In [8]:
endpoint_name = predictor.endpoint_name
endpoint_name

'huggingface-pytorch-tgi-inference-2025-10-26-18-46-52-371'

In [13]:
def build_gemma3_prompt(system_prompt: str, user_input: str) -> str:
    return (
        "<bos><start_of_turn>user\n"
        f"{system_prompt}\n\n"
        f"Input: {user_input}<end_of_turn>\n"
        "<start_of_turn>model"
    )

system_prompt = """For the following tasks, make plans that can solve the problem step-by-step. For each plan, indicate which external tool together with tool input to retrieve evidence. You can store the evidence into a variable #E that can be called by later tools. (Plan, #E1, Plan, #E2, Plan, ...)

Tools can be one of the following:
Wikipedia[input]: Worker that search for similar page contents from Wikipedia. Useful when you need to get holistic knowledge about people, places, companies, historical events, or other subjects. The response are long and might contain some irrelevant information. Input should be a search query.
LLM[input]: A pretrained LLM like yourself. Useful when you need to act with general world knowledge and common sense. Prioritize it when you are confident in solving the problem yourself. Input can be any instruction."""

user_question = "What should I visit in Barcelona?"

prompt = build_gemma3_prompt(system_prompt, user_question)

print(prompt)

<bos><start_of_turn>user
For the following tasks, make plans that can solve the problem step-by-step. For each plan, indicate which external tool together with tool input to retrieve evidence. You can store the evidence into a variable #E that can be called by later tools. (Plan, #E1, Plan, #E2, Plan, ...)

Tools can be one of the following:
Wikipedia[input]: Worker that search for similar page contents from Wikipedia. Useful when you need to get holistic knowledge about people, places, companies, historical events, or other subjects. The response are long and might contain some irrelevant information. Input should be a search query.
LLM[input]: A pretrained LLM like yourself. Useful when you need to act with general world knowledge and common sense. Prioritize it when you are confident in solving the problem yourself. Input can be any instruction.

Input: What should I visit in Barcelona?<end_of_turn>
<start_of_turn>model


In [14]:
import boto3
import json

runtime = boto3.client("sagemaker-runtime")

response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Body=json.dumps({
        "inputs": prompt
    })
)

In [15]:
response_body = response['Body'].read().decode('utf-8')
response_body

'[{"generated_text":"<bos><start_of_turn>user\\nFor the following tasks, make plans that can solve the problem step-by-step. For each plan, indicate which external tool together with tool input to retrieve evidence. You can store the evidence into a variable #E that can be called by later tools. (Plan, #E1, Plan, #E2, Plan, ...)\\n\\nTools can be one of the following:\\nWikipedia[input]: Worker that search for similar page contents from Wikipedia. Useful when you need to get holistic knowledge about people, places, companies, historical events, or other subjects. The response are long and might contain some irrelevant information. Input should be a search query.\\nLLM[input]: A pretrained LLM like yourself. Useful when you need to act with general world knowledge and common sense. Prioritize it when you are confident in solving the problem yourself. Input can be any instruction.\\n\\nInput: What should I visit in Barcelona?<end_of_turn>\\n<start_of_turn>model\\nPlan: Search for tourist

In [16]:
import ast

In [17]:
print(ast.literal_eval(response_body)[0]['generated_text'])

<bos><start_of_turn>user
For the following tasks, make plans that can solve the problem step-by-step. For each plan, indicate which external tool together with tool input to retrieve evidence. You can store the evidence into a variable #E that can be called by later tools. (Plan, #E1, Plan, #E2, Plan, ...)

Tools can be one of the following:
Wikipedia[input]: Worker that search for similar page contents from Wikipedia. Useful when you need to get holistic knowledge about people, places, companies, historical events, or other subjects. The response are long and might contain some irrelevant information. Input should be a search query.
LLM[input]: A pretrained LLM like yourself. Useful when you need to act with general world knowledge and common sense. Prioritize it when you are confident in solving the problem yourself. Input can be any instruction.

Input: What should I visit in Barcelona?<end_of_turn>
<start_of_turn>model
Plan: Search for tourist attractions in Barcelona.
#E1 = Wikipe