In [2]:
#Imported libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import torch
import re

# Mount Google Drive at /content/drive so the project files stored under
# MyDrive are reachable from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

# Put the project's src/ directory on the module search path.
import sys
sys.path.append('/content/drive/MyDrive/boston-rideshare-agent/src')

Mounted at /content/drive


In [3]:
# Checkpoint to load and the numeric precision to load it in.
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
LOAD_8BIT = False

# bfloat16 when a CUDA device is available, float32 otherwise.
if torch.cuda.is_available():
    DTYPE = torch.bfloat16
else:
    DTYPE = torch.float32

In [4]:
# Tokenizer and weights both come from the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)

# Load the causal LM in DTYPE and switch it to inference mode right away
# (nn.Module.eval() returns the module itself, so the chained call is safe).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    attn_implementation="eager",
    torch_dtype=DTYPE,
    device_map="auto",
).eval()

# Notebook-wide sampling defaults: low-temperature nucleus sampling,
# at most 160 new tokens unless a call overrides it.
gen_cfg = GenerationConfig(
    do_sample=True,
    temperature=0.1,
    top_p=0.9,
    max_new_tokens=160,
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [5]:
def generate_text(prompt: str, max_new_tokens: int = 80) -> str:
    """
    Sample a continuation of ``prompt`` from the notebook-level model.

    Uses the module-level ``tokenizer``, ``model`` and ``gen_cfg``.

    Args:
        prompt: Full prompt text to condition on.
        max_new_tokens: Cap on the number of generated tokens (default 80),
            passed to ``generate`` alongside ``gen_cfg``.

    Returns:
        Only the newly generated text (the prompt tokens are sliced off),
        decoded with special tokens skipped.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = encoded["input_ids"].shape[1]

    with torch.no_grad():
        generated = model.generate(
            **encoded,
            generation_config=gen_cfg,
            max_new_tokens=max_new_tokens,
            # Reuse the EOS id as the padding id.
            pad_token_id=tokenizer.eos_token_id,
        )

    # Everything past the prompt length is the model's own output.
    new_token_ids = generated[0][prompt_length:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True)

In [6]:
def _postprocess_to_two_lines(completion: str) -> str:
    """
    Extract exactly Thought and Action lines.
    Handle cases where model generates extra content.
    """
    lines = completion.strip().split('\n')

    thought_line = None
    action_line = None

    for line in lines:
        line = line.strip()

        # Get first Thought line
        if line.startswith('Thought:') and thought_line is None:
            thought_line = line
            # Only take content up to any "or" or extra punctuation
            if ' or ' in thought_line:
                thought_line = thought_line.split(' or ')[0]

        # Get first Action line
        elif line.startswith('Action:') and action_line is None:
            action_line = line
            # Remove any " or ..." suffixes
            if ' or ' in action_line:
                action_line = action_line.split(' or ')[0]
            # If action ends weirdly, clean it up
            if action_line.endswith(' or'):
                action_line = action_line[:-3]

    # If we have both, return them
    if thought_line and action_line:
        return f"{thought_line}\n{action_line}"

    # If only Action, add a generic Thought
    if action_line and not thought_line:
        return f"Thought: I will take the following action.\n{action_line}"

    # Fallback
    return completion[:200]  # Just return first 200 chars

In [7]:
#Enhancing the generation
def generate_action(prompt: str) -> str:
    """
    Run one generation step and reduce it to a Thought/Action pair.

    Generates up to 160 new tokens with ``generate_text`` and passes the raw
    completion through ``_postprocess_to_two_lines``.
    """
    return _postprocess_to_two_lines(generate_text(prompt, max_new_tokens=160))

In [8]:
import importlib.util
import sys

# Import prompting.py straight from its location on Drive and register it
# in sys.modules under the name "prompting".
spec = importlib.util.spec_from_file_location(
    name="prompting",
    location="/content/drive/MyDrive/boston-rideshare-agent/src/prompting.py",
)
prompting = importlib.util.module_from_spec(spec)
sys.modules["prompting"] = prompting
spec.loader.exec_module(prompting)

# Re-export the helpers this notebook uses as plain names.
SYSTEM_PREAMBLE = prompting.SYSTEM_PREAMBLE
parse_action = prompting.parse_action
make_prompt = prompting.make_prompt


In [9]:
# Sample rideshare question; the empty trajectory means no prior steps.
user_query = "I need to go from Back Bay to Financial District on Monday morning. Should I use Uber or Lyft?"
trajectory = []

# Assemble the prompt for the first step
prompt = make_prompt(user_query, trajectory)

print("User Query: " + user_query + "\n")
print("Prompt (last 300 chars):")
print("..." + prompt[-300:])

# Banner: rule, status line, rule (each on its own line)
print("\n" + "-" * 80, "Generating LLM response...", "-" * 80 + "\n", sep="\n")

# One generation step, already reduced to Thought + Action
llm_output = generate_action(prompt)

print("LLM Output:", llm_output, sep="\n")

# The action is expected on the second line of the cleaned output
lines = llm_output.split('\n')
if len(lines) < 2:
    print("\n⚠️  Output doesn't have expected format")
else:
    action_parsed = parse_action(lines[1])
    print(f"\n✓ Parsed Action: {action_parsed}")

User Query: I need to go from Back Bay to Financial District on Monday morning. Should I use Uber or Lyft?

Prompt (last 300 chars):
...

Example 3:
Thought: I should check Friday evening patterns for surge pricing.
Action: search[query="Friday evening surge", k=5]

Now respond for the user's question.

User Question: I need to go from Back Bay to Financial District on Monday morning. Should I use Uber or Lyft?


Next step:
Thought:

--------------------------------------------------------------------------------
Generating LLM response...
--------------------------------------------------------------------------------

LLM Output:
Thought: I will take the following action.
Action: search[query="Back Bay Financial District", k=3]

✓ Parsed Action: ('search', {'query': 'Back Bay Financial District', 'k': 3})


In [10]:
# One completed search step, so the model now sees an observation to answer from.
trajectory_with_search = [
    dict(
        thought="I should search for trips from Back Bay to Financial District on Monday mornings.",
        action='search[query="Back Bay Financial District Monday morning", k=3]',
        observation='{"results": [{"title": "Back Bay → Financial District", "price": 12.50, "cab_type": "Uber", "surge": 1.0, "distance": 1.8}]}',
    )
]

# Same question as before, this time with history in the prompt
prompt_2 = make_prompt(user_query, trajectory_with_search)

print("Prompt includes search results.\n")
print("Generating LLM response...")

llm_output_2 = generate_action(prompt_2)

print("\nLLM Output:", llm_output_2, sep="\n")

lines_2 = llm_output_2.split('\n')
if len(lines_2) >= 2:
    action_parsed_2 = parse_action(lines_2[1])
    print(f"\n✓ Parsed Action: {action_parsed_2}")

    # A finish action carries the final answer in its 'answer' argument.
    # NOTE(review): in the recorded run the quoted answer was split at its
    # comma, so only the first clause printed — parse_action (prompting.py)
    # appears not to respect quotes; confirm and fix there.
    is_finish = bool(action_parsed_2) and action_parsed_2[0] == 'finish'
    if is_finish:
        final_answer = action_parsed_2[1].get('answer', 'N/A')
        print(f"\n✅ Final Answer: {final_answer}")

Prompt includes search results.

Generating LLM response...

LLM Output:
Thought: I will take the following action.
Action: finish[answer="Uber averages $15, Lyft $18. Recommend Uber."]

✓ Parsed Action: ('finish', {'answer': 'Uber averages $15', 'Lyft $18. Recommend Uber."': True})

✅ Final Answer: Uber averages $15


In [11]:
# Second scenario: fresh question, no history.
user_query_3 = "Going from Haymarket to Fenway Park on Saturday night for a concert. Which rideshare is better?"
trajectory_3 = []

prompt_3 = make_prompt(user_query_3, trajectory_3)
llm_output_3 = generate_action(prompt_3)

print("User Query: " + user_query_3 + "\n")
print("LLM Output:", llm_output_3, sep="\n")

# The action sits on the second line when the output is well-formed
lines_3 = llm_output_3.split('\n')
if not len(lines_3) < 2:
    action_parsed_3 = parse_action(lines_3[1])
    print(f"\n✓ Parsed Action: {action_parsed_3}")

User Query: Going from Haymarket to Fenway Park on Saturday night for a concert. Which rideshare is better?

LLM Output:
Thought: I will take the following action.
Action: search[query="Haymarket Fenway Park Saturday night", k=3]

✓ Parsed Action: ('search', {'query': 'Haymarket Fenway Park Saturday night', 'k': 3})


In [12]:
# Creating the LLM Class
class HF_LLM:
    """
    Wrapper class for Hugging Face language model.

    Handles model loading, prompt formatting, and generation.
    Following the professor's example structure.

    Calling an instance with a prompt string returns the model's reply reduced
    to a Thought line and an Action line (see ``__call__``).
    """

    def __init__(self, model_name: str = "Qwen/Qwen2.5-0.5B-Instruct",
                 load_8bit: bool = False,
                 dtype=None,
                 max_new_tokens: int = 160,
                 generation_kwargs: dict = None):
        """
        Initialize the language model.

        Args:
            model_name: Hugging Face model identifier
            load_8bit: Whether to load in 8-bit mode (saves memory)
            dtype: Data type for model weights; defaults to bfloat16 when CUDA
                is available, float32 otherwise
            max_new_tokens: Maximum tokens to generate
            generation_kwargs: Additional generation parameters. Every key is
                forwarded to ``GenerationConfig``; ``temperature`` (0.3),
                ``top_p`` (0.9) and ``do_sample`` (True) get defaults when
                absent. A ``max_new_tokens`` key here is ignored in favour of
                the ``max_new_tokens`` argument.
        """
        self.model_name = model_name
        self.load_8bit = load_8bit
        self.dtype = dtype if dtype else (torch.bfloat16 if torch.cuda.is_available() else torch.float32)
        self.max_new_tokens = max_new_tokens
        self.generation_kwargs = generation_kwargs or {}

        # Load tokenizer
        print(f"Loading tokenizer from {model_name}...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            trust_remote_code=True
        )

        # Load model
        print(f"Loading model from {model_name}...")
        load_kwargs = {
            "device_map": "auto",
            # NOTE: newer transformers releases warn that `torch_dtype` is
            # deprecated in favour of `dtype`; kept for older releases.
            "torch_dtype": self.dtype,
            "trust_remote_code": True,
            "attn_implementation": "eager",
        }
        if self.load_8bit:
            # Quantization is requested through a config object; the bare
            # `load_in_8bit=` keyword on from_pretrained is deprecated, so it
            # is no longer passed at all when quantization is off.
            from transformers import BitsAndBytesConfig
            load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
        self.model = AutoModelForCausalLM.from_pretrained(self.model_name, **load_kwargs)
        self.model.eval()

        # Configure generation
        settings = self._generation_settings(self.max_new_tokens, self.generation_kwargs)
        # Same padding choice as generate_text(): reuse the EOS id unless the
        # caller supplied their own pad_token_id.
        settings.setdefault("pad_token_id", self.tokenizer.eos_token_id)
        self.gen_cfg = GenerationConfig(**settings)

        print("✓ Model loaded successfully!")
        print(f"✓ Device: {self.model.device}")
        print(f"✓ Parameters: {sum(p.numel() for p in self.model.parameters()):,}")

    @staticmethod
    def _generation_settings(max_new_tokens: int, overrides: dict = None) -> dict:
        """Merge caller overrides over the sampling defaults.

        Returns a new dict (``overrides`` is not modified). ``max_new_tokens``
        always comes from the explicit argument.
        """
        settings = {"temperature": 0.3, "top_p": 0.9, "do_sample": True}
        settings.update(overrides or {})
        settings["max_new_tokens"] = max_new_tokens
        return settings

    def __call__(self, prompt: str) -> str:
        """
        Generate response from prompt.

        Args:
            prompt: Input prompt string

        Returns:
            Generated text (Thought + Action lines)
        """
        # Tokenize
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        # Generate
        with torch.no_grad():
            output_ids = self.model.generate(
                **inputs,
                generation_config=self.gen_cfg
            )

        # Decode (remove input prompt)
        completion = self.tokenizer.decode(
            output_ids[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=True
        )

        # Post-process to extract two lines
        return _postprocess_to_two_lines(completion)

In [13]:
# Build the wrapper around the same checkpoint used above.
print("\nInitializing LLM...")
llm = HF_LLM(
    max_new_tokens=160,
    model_name="Qwen/Qwen2.5-0.5B-Instruct",
    generation_kwargs=dict(temperature=0.3, do_sample=True),
)


Initializing LLM...
Loading tokenizer from Qwen/Qwen2.5-0.5B-Instruct...
Loading model from Qwen/Qwen2.5-0.5B-Instruct...
✓ Model loaded successfully!
✓ Device: cuda:0
✓ Parameters: 494,032,768


In [14]:
# End-to-end check of the HF_LLM wrapper on a realistic question.

test_user_query = "Northeastern to Logan Airport on Friday at 3pm. Uber or Lyft?"
test_trajectory = []

test_prompt = make_prompt(test_user_query, test_trajectory)

print("User Query: " + test_user_query + "\n")
print("Calling LLM...")

# The wrapper returns the cleaned Thought + Action text
llm_response = llm(test_prompt)

print("\nLLM Generated:", llm_response, sep="\n")

# Second line is expected to hold the action
response_lines = llm_response.split('\n')
if not len(response_lines) < 2:
    parsed_action = parse_action(response_lines[1])
    print(f"\n✓ Parsed Action: {parsed_action}")

    # Unpack into name + arguments only when parsing produced something
    if parsed_action:
        action_name, action_args = parsed_action
        print(f"   • Action: {action_name}", f"   • Arguments: {action_args}", sep="\n")

User Query: Northeastern to Logan Airport on Friday at 3pm. Uber or Lyft?

Calling LLM...

LLM Generated:
Thought: I will take the following action.
Action: search[query="Northeastern Logan Airport 3pm", k=3]

✓ Parsed Action: ('search', {'query': 'Northeastern Logan Airport 3pm', 'k': 3})
   • Action: search
   • Arguments: {'query': 'Northeastern Logan Airport 3pm', 'k': 3}


In [15]:
# Three varied first-step questions to smoke-test the wrapper.
test_queries = [
    "Back Bay to Financial District Monday 9am - price comparison?",
    "Theatre District to Fenway during Red Sox game - surge?",
    "Haymarket to North Station late night weekend - which is safer?",
]

for i, query in enumerate(test_queries, start=1):
    print(f"\n--- Query {i} ---", f"User: {query}", sep="\n")

    prompt = make_prompt(query, [])
    response = llm(prompt)

    print("\nLLM Response:", response, sep="\n")

    # Skip reporting when the output lacks an action line or it fails to parse
    lines = response.split('\n')
    if len(lines) < 2:
        continue
    action = parse_action(lines[1])
    if not action:
        continue
    print(f"✓ Action: {action[0]} with args {action[1]}")



--- Query 1 ---
User: Back Bay to Financial District Monday 9am - price comparison?

LLM Response:
Thought: I will take the following action.
Action: search[query="Back Bay Financial District", k=3]
✓ Action: search with args {'query': 'Back Bay Financial District', 'k': 3}

--- Query 2 ---
User: Theatre District to Fenway during Red Sox game - surge?

LLM Response:
Thought: I will take the following action.
Action: search[query="Theatre District Fenway", k=3]
✓ Action: search with args {'query': 'Theatre District Fenway', 'k': 3}

--- Query 3 ---
User: Haymarket to North Station late night weekend - which is safer?

LLM Response:
Thought: I will take the following action.
Action: search[query="Haymarket North Station late night", k=3]
✓ Action: search with args {'query': 'Haymarket North Station late night', 'k': 3}


In [16]:
# Create src/llm.py
#
# The exported module has to guarantee the two-line "Thought / Action" shape
# that the parsing cells above read (they take the action from the second
# line). The previous export returned a single line whenever the model
# produced only an "Action:" line, and it ignored most generation_kwargs.
#
# The literal is a raw string: the text below is written to disk exactly as
# shown, so backslashes need no extra escaping.
llm_code = r'''"""
llm.py - Language Model Interface for Boston Rideshare Agent
Handles loading and generation from Hugging Face models.
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from typing import Dict


def _strip_unquoted_alternative(line: str) -> str:
    """Cut line at the first " or " that lies outside double quotes."""
    in_quotes = False
    skip_next = False
    for idx, ch in enumerate(line):
        if skip_next:
            skip_next = False
        elif ch == '\\':
            skip_next = True
        elif ch == '"':
            in_quotes = not in_quotes
        elif not in_quotes and line.startswith(' or ', idx):
            return line[:idx]
    if not in_quotes and line.endswith(' or'):
        return line[:-3]
    return line


def _postprocess_to_two_lines(completion: str) -> str:
    """Extract exactly a Thought line and an Action line from LLM output.

    The action is the first "Action:" line, with any unquoted " or ..."
    alternative removed. The thought is the first "Thought:" line; failing
    that, the opening line of the completion (the prompt already ends with
    "Thought:"); failing that, a generic thought. Without an action line the
    first 200 characters of the completion are returned unchanged.
    """
    lines = [raw.strip() for raw in completion.strip().split('\n')]

    thought_line = None
    action_line = None

    for line in lines:
        if line.startswith('Thought:') and thought_line is None:
            thought_line = line
        elif line.startswith('Action:') and action_line is None:
            action_line = _strip_unquoted_alternative(line)

    if thought_line is None and lines[0] and not lines[0].startswith(('Thought:', 'Action:')):
        thought_line = f"Thought: {lines[0]}"

    if action_line is not None:
        if thought_line is None:
            thought_line = "Thought: I will take the following action."
        return f"{thought_line}\n{action_line}"

    return completion[:200]


class HF_LLM:
    """Hugging Face Language Model wrapper for agent."""

    def __init__(self, model_name: str = "Qwen/Qwen2.5-0.5B-Instruct",
                 load_8bit: bool = False,
                 dtype=None,
                 max_new_tokens: int = 160,
                 generation_kwargs: Dict = None):
        """Initialize language model.

        Every key in generation_kwargs is forwarded to GenerationConfig;
        temperature (0.3), top_p (0.9) and do_sample (True) get defaults when
        absent. max_new_tokens always comes from the explicit argument.
        """
        self.model_name = model_name
        self.load_8bit = load_8bit
        self.dtype = dtype if dtype else (torch.bfloat16 if torch.cuda.is_available() else torch.float32)
        self.max_new_tokens = max_new_tokens
        self.generation_kwargs = generation_kwargs or {}

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            trust_remote_code=True
        )

        # Load model
        load_kwargs = {
            "device_map": "auto",
            "torch_dtype": self.dtype,
            "trust_remote_code": True,
            "attn_implementation": "eager",
        }
        if self.load_8bit:
            # Request 8-bit weights through a quantization config rather than
            # the deprecated load_in_8bit keyword.
            from transformers import BitsAndBytesConfig
            load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
        self.model = AutoModelForCausalLM.from_pretrained(self.model_name, **load_kwargs)
        self.model.eval()

        # Generation config
        settings = {"temperature": 0.3, "top_p": 0.9, "do_sample": True}
        settings.update(self.generation_kwargs)
        settings["max_new_tokens"] = self.max_new_tokens
        settings.setdefault("pad_token_id", self.tokenizer.eos_token_id)
        self.gen_cfg = GenerationConfig(**settings)

    def __call__(self, prompt: str) -> str:
        """Generate response from prompt."""
        # Tokenize
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        # Generate
        with torch.no_grad():
            output_ids = self.model.generate(
                **inputs,
                generation_config=self.gen_cfg
            )

        # Decode only the newly generated tokens
        completion = self.tokenizer.decode(
            output_ids[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=True
        )

        return _postprocess_to_two_lines(completion)
'''

# Save to file
with open('/content/drive/MyDrive/boston-rideshare-agent/src/llm.py', 'w', encoding='utf-8') as f:
    f.write(llm_code)