In [1]:
!pip install langchain beautifulsoup4 langchain_community chromadb

Collecting langchain_community
  Downloading langchain_community-0.3.31-py3-none-any.whl.metadata (3.0 kB)
Collecting chromadb
  Downloading chromadb-1.1.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting requests<3,>=2 (from langchain)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from 

In [2]:
from google.colab import userdata
import huggingface_hub

hf_token = userdata.get('HF_TOKEN')

huggingface_hub.login(hf_token)

In [3]:
!mkdir web_pages

In [4]:
import os
import re
import requests
from bs4 import BeautifulSoup

urls = [
    "https://canonical.com/solutions/ai",
    "https://canonical.com/data"
]

headers = {
    "Accept": "application/json",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

def sanitize_filename(name: str) -> str:
    """Remove invalid characters and format for saving."""
    name = re.sub(r"[\\/*?\"<>|:]", "", name)
    name = name.replace(" ", "_")
    return name.strip() or "untitled_page"

root_data_path = "web_pages"

for url in urls:
    print(f"Fetching {url}...")
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    # Get title
    title_tag = soup.find("title")
    title = title_tag.get_text(strip=True) if title_tag else "Untitled Page"
    safe_title = sanitize_filename(title)
    filename = f"{root_data_path}/{safe_title}.txt"

    # Find main content
    soup_main = soup.find("div", id="main-content")

    with open(filename, "w", encoding="utf-8") as f:
        f.write(f"URL: {url}\n")
        f.write(f"Title: {title}\n\n")

        if not soup_main:
            f.write("No main-content div found.\n")
            print(f"⚠️  No main-content found for {url}")
            continue

        sections = soup_main.find_all("section", class_="p-section")
        if not sections:
            f.write("No p-section sections found.\n")
            print(f"⚠️  No p-section found for {url}")
            continue

        # Write all text sections
        for idx, section in enumerate(sections, start=1):
            text = section.get_text(" ", strip=True)
            f.write(f"\n{text}\n\n")

    print(f"✅ Saved: {filename}")

Fetching https://canonical.com/solutions/ai...
✅ Saved: web_pages/Open_source_AI_for_the_enterprise.txt
Fetching https://canonical.com/data...
✅ Saved: web_pages/Data_Solutions.txt


In [5]:
import os
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

DATA_DIR = "./web_pages"
CHROMA_DIR = "./chroma_store"

# Initialize embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Collect all text files
documents = []

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    separators=["\n\n", "\n", ". ", " ", ""]
)
chunk_count = 0

print(f"Loading documents from {DATA_DIR}...")

for file_name in os.listdir(DATA_DIR):
    if file_name.endswith(".txt"):
        file_path = os.path.join(DATA_DIR, file_name)

        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read().strip()

            if content:
                # Split the content into chunks
                chunks = text_splitter.split_text(content)

                # Create Document objects with metadata
                for i, chunk in enumerate(chunks):
                    doc = Document(
                        page_content=chunk,
                        metadata={
                            "source": file_name,
                            "chunk_id": i,
                            "total_chunks": len(chunks)
                        }
                    )
                    documents.append(doc)
                    chunk_count += 1

                print(f"{file_name}: {len(chunks)} chunks")

        except Exception as e:
            print(f"Error loading {file_name}: {e}")

# Create or load Chroma index
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    persist_directory=CHROMA_DIR
)

vectorstore.persist()
print(f"✅ Chroma vector store created and persisted at: {CHROMA_DIR}")

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading documents from ./web_pages...
Data_Solutions.txt: 5 chunks
Open_source_AI_for_the_enterprise.txt: 14 chunks
✅ Chroma vector store created and persisted at: ./chroma_store


  vectorstore.persist()


In [6]:
import os
import torch
import requests
import transformers
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# ========================
# CONFIGURATION
# ========================
CHROMA_DIR = "./chroma_store"
TOMORROW_API_KEY = "your key here"
WEATHER_URL = "https://api.openweathermap.org/data/2.5/weather"

# ========================
# LOAD VECTOR DB
# ========================
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vectorstore = Chroma(
    persist_directory=CHROMA_DIR,
    embedding_function=embedding_model
)

# ========================
# MODEL + TOKENIZER
# ========================
model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

  vectorstore = Chroma(


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Device set to use cuda:0


In [12]:
from typing import Optional

def get_weather(city: str):
    """Fetch live weather information using Tomorrow.io Realtime API."""
    try:
        url = f"https://api.tomorrow.io/v4/weather/realtime?location={city}&apikey={TOMORROW_API_KEY}"

        headers = {
            "accept": "application/json",
            "accept-encoding": "deflate, gzip, br"
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        data = response.json()
        values = data["data"]["values"]
        temp = values.get("temperature")
        weather_code = values.get("weatherCode")

        # Map weather code to description
        description = get_weather_description(weather_code) if weather_code else "unknown conditions"

        print(f"Weather fetched for {city}: {temp}°C")
        return f"The weather in {city} is currently {description} with a temperature of {temp}°C."

    except requests.RequestException as e:
        return f"Error fetching weather data: {e}"
    except KeyError as e:
        return f"Error: unexpected response format from weather API. Missing: {e}"
    except Exception as e:
        return f"Unexpected error: {e}"


def get_weather_description(code: int) -> str:
    """Map Tomorrow.io weather codes to human-readable descriptions."""
    weather_codes = {
        0: "Unknown",
        1000: "Clear, Sunny",
        1100: "Mostly Clear",
        1101: "Partly Cloudy",
        1102: "Mostly Cloudy",
        1001: "Cloudy",
        2000: "Fog",
        2100: "Light Fog",
        4000: "Drizzle",
        4001: "Rain",
        4200: "Light Rain",
        4201: "Heavy Rain",
        5000: "Snow",
        5001: "Flurries",
        5100: "Light Snow",
        5101: "Heavy Snow",
        6000: "Freezing Drizzle",
        6001: "Freezing Rain",
        6200: "Light Freezing Rain",
        6201: "Heavy Freezing Rain",
        7000: "Ice Pellets",
        7101: "Heavy Ice Pellets",
        7102: "Light Ice Pellets",
        8000: "Thunderstorm"
    }
    return weather_codes.get(code, f"Weather code {code}")

def agent_respond(user_query: str, vectorstore: Optional[object] = None) -> str:
    """
    Combines vector DB and live data context, then generates response.
    """
    context = ""

    # Retrieve relevant docs from vector store
    if vectorstore:
        try:
            results = vectorstore.similarity_search(user_query, k=5)
            if results:
                context += "\n\n--- Retrieved Knowledge ---\n"
                for doc in results:
                    context += f"{doc.page_content}\n"
        except Exception as e:
            print(f"⚠️ Vector store error: {e}")
    print(f"context >>>>>> {context}")
    # Check for weather-related intent
    weather_info = ""
    if "weather" in user_query.lower():
        detected_city = None
        cities = ["Bangalore", "Delhi", "London", "New York", "Tokyo", "Toronto", "Mumbai", "Paris"]

        for city in cities:
            if city.lower() in user_query.lower():
                detected_city = city
                break

        if detected_city:
            print(f"Fetching weather for {detected_city}...")
            weather_info = get_weather(detected_city)
        else:
            weather_info = "You asked about the weather, but I couldn't detect a specific city name."

    # Build the prompt components
    system_prompt = """You are a helpful assistant. Follow these rules:
    1. Judge from the prompt if question can be answered with information from knowledge base or weather information or both. Below are some examples for your reference.
    Example:
    Context: "Canonical offers data solutions including PostgreSQL, MySQL, and MongoDB."
    Question: "What databases does Canonical support and how is the weather in london?"
    Answer: "Based on the information provided, Canonical offers support for several databases including PostgreSQL, MySQL, and MongoDB as part of their data solutions. The Weather fetched for Delhi is 28.6°C"

    Now answer the user's question using ONLY the context provided below. If no relevant context exists, say so."""

    # user_content = f"""Question: {user_query}
    # {f"Context from knowledge base:\n{context}" if context else ""}
    # {f"Current weather information:\n{weather_info}" if weather_info else ""}
    # IMPORTANT: Do not make up information or use context from somwhere but the provided context."""
    user_content = f"""
    Question: {user_query}

    Below information contains retrieved knoweldge and weather information, summarize from this only:
    {f"Context from knowledge base:\n{context}" if context else ""}
    {f"Current weather information:\n{weather_info}" if weather_info else ""}

    If relevant information is missing, respond: "Not found in retrieved knowledge."
    """


    prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{user_content}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

    print(f"\nGenerating response...")

    # Generate response
    output = pipeline(
      prompt,
      max_new_tokens=300,
      do_sample=False,
      temperature=None,
      top_p=None,
      repetition_penalty=1.1,
      pad_token_id=tokenizer.eos_token_id,
  )

    # DEBUG: Print raw output
    # print("RAW OUTPUT LENGTH:", len(output[0]["generated_text"]))
    # print("PROMPT LENGTH:", len(prompt))
    # print("NEW TOKENS GENERATED:", len(output[0]["generated_text"]) - len(prompt))

    # Extract only the new generated text
    full_output = output[0]["generated_text"]
    response_text = full_output[len(prompt):].strip()

    # Remove any end tokens and cleanup
    response_text = response_text.split("<|eot_id|>")[0].strip()
    response_text = response_text.split("<|end_of_text|>")[0].strip()

    return response_text

In [None]:
if __name__ == "__main__":
    print("\n" + "=" * 60)
    print("🤖 LLAMA-3.2 AGENT WITH WEATHER INTEGRATION")
    print("=" * 60)
    while True:
      user_query = input("You: ").strip()
      if user_query.lower() in ["exit", "quit"]:
          print("👋 Exiting chat. Goodbye!")
          break
      response = agent_respond(user_query, vectorstore)
      print(f"\n🧠 Agent response: {response}")

In [8]:
agent_respond("tell me abut canonical offerings", vectorstore)

context >>>>>> 

--- Retrieved Knowledge ---
URL: https://canonical.com/data
Title: Data Solutions
URL: https://canonical.com/solutions/ai
Title: Open source AI for the enterprise


Why Canonical for enterprise AI? Run your entire AI/ML lifecycle on a single integrated stack Develop at all scales with the same software provider Control your TCO with predictable costs Get maintained and supported open source AI software
Use cases Canonical offers you the building blocks so you can innovate at your own pace.
          From getting started with data science on Ubuntu workstations to scaling your big data analytics with supported database and MLOps software. All on open source. Confidential AI Data science AI infrastructure Data management MLOps GenAI with RAG Big data analytics Edge AI
. Check out how we enable open source in the world's leading silicon Explore how Canonical partners with silicon vendors to optimize our solutions with certified hardware. MicroK8s with Charmed Kubeflow on 

'Based on the retrieved knowledge, Canonical offers data solutions including:\n\n* PostgreSQL\n* MySQL\n* MongoDB'

In [13]:
agent_respond("tell me abut canonical offerings and how is the weather in delhi", vectorstore)

context >>>>>> 

--- Retrieved Knowledge ---
URL: https://canonical.com/data
Title: Data Solutions
A complete solution for modern data Up to 10 years of support Including security maintenance for critical and high severity issues across the full stack. Get up to 10 years of break-fix support on selected releases with weekday or 24/7 coverage. Fully integrated system One vendor for all your data needs. A complete data solutions portfolio founded on enterprise-grade Ubuntu Server 22.04 LTS, with Ubuntu Pro's compliance and security features
AI resources What are large language models (LLMs)? LLMs and generative AI are dominating much of the current AI/ML discourse, and their potential goes far beyond chatbots. Our blog breaks down LLM use cases, challenges and best practices. GenAI with vector databases and RAG Go deeper on GenAI in this webinar explaining how to enhance your model outputs with RAG
Use cases Canonical offers you the building blocks so you can innovate at your own pace.
 

"Based on the provided context, here's the summary:\n\nCanonical offers data solutions including PostgreSQL, MySQL, and MongoDB. Not found in retrieved knowledge. \nThe weather in Delhi is currently Clear, Sunny with a temperature of 27.9°C."