In [7]:
## Use one model at a time -- I am not looping through models given colab GPU can handle only ~15G RAM

In [4]:
## Creating and hosting a fake API / Function that I want my LLM to call

from flask import Flask, request, jsonify
from threading import Thread

app = Flask(__name__)

# Full moon gem mapping
moon_gems = {
    "Aries": "Diamond", "Taurus": "Emerald", "Gemini": "Agate",
    "Cancer": "Pearl", "Leo": "Ruby", "Virgo": "Sapphire",
    "Libra": "Opal", "Scorpio": "Topaz", "Sagittarius": "Turquoise",
    "Capricorn": "Garnet", "Aquarius": "Amethyst", "Pisces": "Aquamarine"
}

@app.route('/get_gem', methods=['POST'])
def get_gem():
    data = request.get_json()
    sign = data.get("moon_sign", "").capitalize()
    gem = moon_gems.get(sign, "Unknown")
    return jsonify({"moon_sign": sign, "gem": gem})

# Run Flask in background thread
def run_flask():
    app.run(port=5000)

Thread(target=run_flask).start()



In [5]:
# Prompts
function_prompts = [
    "What is the lucky gem if my moon sign is Aries?",
    "Tell me the gemstone for Libra.",
    "Which gem suits Cancer moon?"
]

normal_prompts = [
    "Who authored the novel Hobbit?"
]

prompts = function_prompts + normal_prompts
print(prompts)


['What is the lucky gem if my moon sign is Aries?', 'Tell me the gemstone for Libra.', 'Which gem suits Cancer moon?', 'Who authored the novel Hobbit?']


In [None]:
#Model 1--- Tiny LLAMA- 1.1 B params and not function calling trained

!pip install -q transformers accelerate torch

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,        # use FP16 to save GPU RAM
    device_map="auto",                # automatically place on GPU
    low_cpu_mem_usage=True)



Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
#Model 2- Qwen 2.5 w/ 7B params and post trained on generating structured outputs especially JSON


!pip install -q transformers accelerate torch

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# --- CHANGE ONLY THIS MODEL NAME ---
model_name = "Qwen/Qwen2.5-7B-Instruct"
# -----------------------------------

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,        # fp16 for Colab GPU
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True
)


Using device: cuda


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]



In [1]:
#Model 3- Mistral instruct-v0.2 w/ 7B params and not explicity post trained on JSON out/FC


!pip install -q transformers accelerate torch

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# --- CHANGE ONLY THIS MODEL NAME ---
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
# -----------------------------------

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,      # fp16 for Colab GPU
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True
)

print("Model loaded successfully!")



Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]



Model loaded successfully!


In [2]:
system_instruction = """
You are an API-wrapper Assistant. You MUST obey the rules below with zero exceptions.

YOUR ONLY TASK:
When the user asks anything related to moon signs, gemstones, lucky stone, zodiac gems, astrological stones, or which gem suits a sign → you MUST output:

{"function":"get_gem","args":{"moon_sign":"<value>"}}

STRICT EXAMPLES (MUST COPY FORMAT EXACTLY):

User: Which gem suits Cancer moon?
Assistant: {"function":"get_gem","args":{"moon_sign":"cancer"}}

User: Lucky stone for Pisces?
Assistant: {"function":"get_gem","args":{"moon_sign":"pisces"}}

User: What stone for SCORPIO moon sign?
Assistant: {"function":"get_gem","args":{"moon_sign":"scorpio"}}

FAILURE MODE:
If you cannot comply 100% with the JSON schema → output NOTHING.

"""


In [6]:
import json
import re
from google.colab import data_table
import pandas as pd
import requests  # make sure requests is imported

rows = []

for user_prompt in prompts:
    full_prompt = f"{system_instruction}\nUser: {user_prompt}\nAssistant:"

    # Generate initial response
    inputs = tokenizer(full_prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=50)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # 🔥 Print initial raw model output
    print("\n--- Initial Model Output ---")
    print(response)
    print("--------------------------------\n")

    # 🧠 Extract ONLY the first Assistant JSON block (ignore trailing User:)
    assistant_text = response.split("Assistant:")[-1].strip() if "Assistant:" in response else response.strip()

    # Regex: match JSON containing "function":"get_gem" before any next User: or end-of-string
    json_match = re.search(r'(\{.*?"function"\s*:\s*"get_gem".*?\})(?=\s*User:|$)', assistant_text, re.DOTALL | re.IGNORECASE)

    if not json_match:
        print("No JSON found:", response)
        rows.append([user_prompt, response.strip()])
        continue

    json_str = json_match.group(0)

    # Attempt JSON parsing
    try:
        data = json.loads(json_str)
        if data.get("function") == "get_gem":
            args = data.get("args", {})

            # ✅ Call the Flask API
            api_url = "http://127.0.0.1:5000/get_gem"
            api_response = requests.post(api_url, json=args).json()

            # Feed API response back to LLM for final output (original prompt only)
            followup_input = f"User: {user_prompt}\nAPI Response: {api_response}\nAssistant:"
            followup_inputs = tokenizer(followup_input, return_tensors="pt").to(device)
            final_output = model.generate(**followup_inputs, max_new_tokens=30)
            final_response = tokenizer.decode(final_output[0], skip_special_tokens=True)

            # 🔹 Clean final response
            print(final_response.strip())
            rows.append([user_prompt, final_response.strip()])
        else:
            print("Function not recognized, printing raw response")
            rows.append([user_prompt, response.strip()])
    except json.JSONDecodeError:
        print("JSON decode error, printing raw response")
        rows.append([user_prompt, response.strip()])

# Display as interactive Google Colab table
df = pd.DataFrame(rows, columns=["Prompt", "Final Response"])
data_table.enable_dataframe_formatter()
df


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
INFO:werkzeug:127.0.0.1 - - [18/Nov/2025 05:12:14] "POST /get_gem HTTP/1.1" 200 -
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



--- Initial Model Output ---

You are an API-wrapper Assistant. You MUST obey the rules below with zero exceptions.

YOUR ONLY TASK:
When the user asks anything related to moon signs, gemstones, lucky stone, zodiac gems, astrological stones, or which gem suits a sign → you MUST output:

{"function":"get_gem","args":{"moon_sign":"<value>"}}

STRICT EXAMPLES (MUST COPY FORMAT EXACTLY):

User: Which gem suits Cancer moon?
Assistant: {"function":"get_gem","args":{"moon_sign":"cancer"}}

User: Lucky stone for Pisces?
Assistant: {"function":"get_gem","args":{"moon_sign":"pisces"}}

User: What stone for SCORPIO moon sign?
Assistant: {"function":"get_gem","args":{"moon_sign":"scorpio"}}

FAILURE MODE:
If you cannot comply 100% with the JSON schema → output NOTHING.


User: What is the lucky gem if my moon sign is Aries?
Assistant: {"function":"get_gem","args":{"moon_sign":"aries"}}
--------------------------------



Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


User: What is the lucky gem if my moon sign is Aries?
API Response: {'gem': 'Diamond', 'moon_sign': 'Aries'}
Assistant: Based on astrological beliefs, if your moon sign is Aries, the lucky gem for you is said to be a Diamond. This gemstone


INFO:werkzeug:127.0.0.1 - - [18/Nov/2025 05:12:48] "POST /get_gem HTTP/1.1" 200 -
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



--- Initial Model Output ---

You are an API-wrapper Assistant. You MUST obey the rules below with zero exceptions.

YOUR ONLY TASK:
When the user asks anything related to moon signs, gemstones, lucky stone, zodiac gems, astrological stones, or which gem suits a sign → you MUST output:

{"function":"get_gem","args":{"moon_sign":"<value>"}}

STRICT EXAMPLES (MUST COPY FORMAT EXACTLY):

User: Which gem suits Cancer moon?
Assistant: {"function":"get_gem","args":{"moon_sign":"cancer"}}

User: Lucky stone for Pisces?
Assistant: {"function":"get_gem","args":{"moon_sign":"pisces"}}

User: What stone for SCORPIO moon sign?
Assistant: {"function":"get_gem","args":{"moon_sign":"scorpio"}}

FAILURE MODE:
If you cannot comply 100% with the JSON schema → output NOTHING.


User: Tell me the gemstone for Libra.
Assistant: {"function":"get_gem","args":{"moon_sign":"libra"}}
--------------------------------



Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


User: Tell me the gemstone for Libra.
API Response: {'gem': 'Opal', 'moon_sign': 'Libra'}
Assistant: The birthstone for people born under the Libra zodiac sign is Opal. However, it's important to note that there are different


INFO:werkzeug:127.0.0.1 - - [18/Nov/2025 05:13:05] "POST /get_gem HTTP/1.1" 200 -
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



--- Initial Model Output ---

You are an API-wrapper Assistant. You MUST obey the rules below with zero exceptions.

YOUR ONLY TASK:
When the user asks anything related to moon signs, gemstones, lucky stone, zodiac gems, astrological stones, or which gem suits a sign → you MUST output:

{"function":"get_gem","args":{"moon_sign":"<value>"}}

STRICT EXAMPLES (MUST COPY FORMAT EXACTLY):

User: Which gem suits Cancer moon?
Assistant: {"function":"get_gem","args":{"moon_sign":"cancer"}}

User: Lucky stone for Pisces?
Assistant: {"function":"get_gem","args":{"moon_sign":"pisces"}}

User: What stone for SCORPIO moon sign?
Assistant: {"function":"get_gem","args":{"moon_sign":"scorpio"}}

FAILURE MODE:
If you cannot comply 100% with the JSON schema → output NOTHING.


User: Which gem suits Cancer moon?
Assistant: {"function":"get_gem","args":{"moon_sign":"cancer"}}
--------------------------------



Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


User: Which gem suits Cancer moon?
API Response: {'gem': 'Pearl', 'moon_sign': 'Cancer'}
Assistant: I'd be happy to help answer your question! According to astrology, a Cancer moon is said to be sensitive, emotional, and n

--- Initial Model Output ---

You are an API-wrapper Assistant. You MUST obey the rules below with zero exceptions.

YOUR ONLY TASK:
When the user asks anything related to moon signs, gemstones, lucky stone, zodiac gems, astrological stones, or which gem suits a sign → you MUST output:

{"function":"get_gem","args":{"moon_sign":"<value>"}}

STRICT EXAMPLES (MUST COPY FORMAT EXACTLY):

User: Which gem suits Cancer moon?
Assistant: {"function":"get_gem","args":{"moon_sign":"cancer"}}

User: Lucky stone for Pisces?
Assistant: {"function":"get_gem","args":{"moon_sign":"pisces"}}

User: What stone for SCORPIO moon sign?
Assistant: {"function":"get_gem","args":{"moon_sign":"scorpio"}}

FAILURE MODE:
If you cannot comply 100% with the JSON schema → output NOTHING.


User: 

Unnamed: 0,Prompt,Final Response
0,What is the lucky gem if my moon sign is Aries?,User: What is the lucky gem if my moon sign is...
1,Tell me the gemstone for Libra.,User: Tell me the gemstone for Libra.\nAPI Res...
2,Which gem suits Cancer moon?,User: Which gem suits Cancer moon?\nAPI Respon...
3,Who authored the novel Hobbit?,You are an API-wrapper Assistant. You MUST obe...
