### Install the required libraries

In [None]:
%pip install -q aixplain faiss-cpu pypdf pypdf2 pdfplumber PyPDF2 scikit-learn

In [None]:
%pip install -q aixplain

### Add Your API key

In [None]:
import os

# Set your aiXplain API key (replace the placeholder before running further cells)
os.environ["AIXPLAIN_API_KEY"] = "please replace_with_your_actual_api_key"

# Confirm the variable is set WITHOUT echoing the secret: cell output is stored
# inside the notebook file and is easily shared or committed by accident.
print(f"AIXPLAIN_API_KEY is set ({len(os.environ['AIXPLAIN_API_KEY'])} characters)")

### Verify that the API key is working correctly

In [None]:
from aixplain.factories import ModelFactory

# Smoke-test the API key with one small model call.
# NOTE(review): an invalid key is expected to surface here as an exception or an
# error response — the exact failure mode depends on the aiXplain SDK; confirm.
model = ModelFactory.get("673248d66eb563b2b00f75d1")
res = model.run("hello hello")

# Show the response: an assignment produces no cell output, so without this
# print the reader has nothing to check the "verification" against.
print(res)

In [None]:
import os
import pandas as pd
import PyPDF2
import faiss
import numpy as np
from datetime import datetime

In [None]:
from aixplain.factories import IndexFactory
from aixplain.modules.model.index_model import Splitter, IndexFilter, IndexFilterOperator
from aixplain.modules.model.record import Record
from aixplain.enums.splitting_options import SplittingOptions
from pypdf import PdfReader

# Location of the policy PDF to ingest
pdf_path = "data/pdfs/Careers-and-Educational-Guidance-Policy-2025-26.pdf"

# Read the PDF and gather the text of every page that yields any text.
# Each page's text is prefixed with a 1-based "[PAGE n]" marker line.
reader = PdfReader(pdf_path)
total_pages = len(reader.pages)

page_blocks = []
for page_index, pdf_page in enumerate(reader.pages, start=1):
    extracted = pdf_page.extract_text()
    if not extracted:
        continue  # nothing extractable on this page
    page_blocks.append(f"[PAGE {page_index}]\n{extracted}\n")
full_text = "".join(page_blocks)

print(f"âœ“ Extracted text from {total_pages} pages")
print(f"âœ“ Total characters: {len(full_text):,}")

Configure the text splitter and process the PDF documents. This prepares the data for the RAG index.

In [None]:
# Sentence-based chunking for policy documents. The two sizes are named once so
# the Splitter configuration and the summary printed below always agree.
sentences_per_chunk = 10   # roughly paragraph-level chunks
sentence_overlap = 2       # sentences shared between neighbouring chunks

splitter = Splitter(
    split=True,
    split_by=SplittingOptions.SENTENCE,
    split_length=sentences_per_chunk,
    split_overlap=sentence_overlap,
)

print("âœ“ Splitter configured for sentence-based chunking")
print("  - Split method: SENTENCE")
print(f"  - Chunk size: {sentences_per_chunk} sentences")
print(f"  - Overlap: {sentence_overlap} sentences")

In [None]:
# Create the index for the education policy document, or reuse the one that
# already exists under the same name.

# ID of a previously created index; used only as a fallback when creation
# reports that the index already exists.
EXISTING_INDEX_ID = "694285b39dcf6413b67dd5fb"

try:
    index = IndexFactory.create(
        name="Education Guidance Policy Index",
        description=(
            "Comprehensive educational guidance and policy document "
            "containing federal requirements, guidelines, and best practices "
            "for educational agencies and institutions."
        ),
        embedding_model="678a4f8547f687504744960a"  # Snowflake Arctic
    )
    print(f"âœ“ Index created! ID: {index.id}")
except Exception as e:
    # Anything other than a name clash is a real failure: let it propagate.
    if "already exists" not in str(e):
        raise
    try:
        index = IndexFactory.get(EXISTING_INDEX_ID)
        print(f"âœ“ Using existing index! ID: {index.id}")
    except Exception as fetch_error:
        print(f"âœ— Failed to retrieve existing index: {fetch_error}")
        raise
    # The former trailing "Index created with ID" message was removed: it ran
    # only on this reuse path, where nothing had been created.

In [None]:
import re

# Create records with rich metadata for filtering and citations
records = []

# Metadata shared by every record
pdf_filename = os.path.basename(pdf_path)
upload_date = datetime.now().strftime("%Y-%m-%d")

# The extraction step embedded "[PAGE n]" markers in the text. Track them while
# walking the chunks so each record carries the page it starts on, instead of a
# guess derived from the chunk index.
page_marker = re.compile(r"\[PAGE (\d+)\]")
current_page = 1

for i, chunk_text in enumerate(full_text.split("\n\n")[:50]):  # Limit for demo
    stripped = chunk_text.strip()

    # A chunk that opens with a marker starts on that page; otherwise it
    # continues the page reached by the previous chunk.
    leading_marker = page_marker.match(stripped)
    page_num = int(leading_marker.group(1)) if leading_marker else current_page

    # Advance the running page even for chunks that are skipped below, so
    # later chunks are still attributed correctly.
    markers_in_chunk = page_marker.findall(chunk_text)
    if markers_in_chunk:
        current_page = int(markers_in_chunk[-1])

    # Skip empty and very short fragments (headers, stray lines)
    if len(stripped) < 100:
        continue

    record = Record(
        id=f"chunk_{i}",
        value=stripped,
        value_type="text",
        attributes={
            # Source attribution
            "source_title": "Careers and Educational Guidance Policy 2025-26",
            "source_filename": pdf_filename,
            "source_url": "internal://education-policy",

            # Organization
            "doc_type": "policy",
            "category": "education_guidance",
            "section": "policy",

            # Tracking
            "chunk_id": i,
            "page_number": page_num,
            "upload_date": upload_date,
            "last_updated": upload_date,

            # Searchability
            "priority": "high",
            "tags": ["education", "guidance", "policy", "federal", "career"]
        }
    )
    records.append(record)

print(f"âœ“ Created {len(records)} enriched records with metadata")

In [None]:
# Send the records to the index in small batches; a failing batch is recorded
# and reported, and the remaining batches are still attempted.
batch_size = 5
successful_uploads = 0
failed_batches = []

batch_starts = range(0, len(records), batch_size)
for batch_number, start in enumerate(batch_starts, start=1):
    batch = records[start:start + batch_size]

    try:
        # The splitter is passed along with each upsert call
        index.upsert(batch, splitter=splitter)
        successful_uploads += len(batch)
        progress = f"(Total: {successful_uploads}/{len(records)})"
        print(f"âœ“ Batch {batch_number}: Uploaded {len(batch)} records {progress}")
    except Exception as e:
        failed_batches.append((batch_number, str(e)))
        print(f"âœ— Batch {batch_number} failed: {e}")

print(f"\nâœ“ Upload complete: {successful_uploads} records indexed successfully")
if failed_batches:
    print(f"âš  {len(failed_batches)} batches failed")

## Knowledge Base Statistics & Verification

In [None]:
# Report the index's document count and ID, plus the time this check ran
# (the "Upload timestamp" label shows the current time, not a stored value).
doc_count = index.count()
summary_lines = (
    f"Total documents in index: {doc_count}",
    f"Index ID: {index.id}",
    f"Upload timestamp: {datetime.now().isoformat()}",
)
for summary_line in summary_lines:
    print(summary_line)

## Advanced Search with Filtering & Citations

The search system now supports:
- **Semantic search**: Find documents by meaning, not just keywords
- **Metadata filtering**: Filter by category, priority, tags, dates
- **Source citations**: All results include source attribution
- **Relevance scoring**: See similarity scores (0-1 scale)

In [None]:
# Example 1: plain semantic search; every hit is printed with its citation
rule = "=" * 70
print(rule)
print("SEARCH EXAMPLE 1: Basic Query with Citations")
print(rule)

response = index.search("federal requirements for educational guidance policies", top_k=3)

print(f"\nFound {len(response.details)} results:\n")
for rank, hit in enumerate(response.details, start=1):
    # Citation fields come from the hit's metadata, with fallbacks when absent
    hit_meta = hit.get('metadata', {})
    title = hit_meta.get('source_title', 'Unknown')
    updated_on = hit_meta.get('last_updated', 'N/A')
    relevance = hit['score']

    print(f"{rank}. Relevance Score: {relevance:.1%}")
    print(f"   Content: {hit['data'][:120]}...")
    print(f"   ðŸ“š Source: {title}")
    print(f"   ðŸ“… Updated: {updated_on}\n")

## Advanced Filtered Search

Filter searches by category, priority, tags, and date ranges for precise results.

In [None]:
# Example 2: the same kind of search, restricted to records whose "priority"
# attribute equals "high"
rule = "=" * 70
print(rule)
print("SEARCH EXAMPLE 2: Filtered by Priority")
print(rule)

priority_filter = IndexFilter(field="priority", value="high", operator=IndexFilterOperator.EQUALS)

filtered_response = index.search(
    "student guidance and career pathways",
    filters=[priority_filter],
    top_k=3,
)

print("\nHigh-priority results about guidance and career:\n")
for rank, hit in enumerate(filtered_response.details, start=1):
    hit_meta = hit.get('metadata', {})
    hit_tags = hit_meta.get('tags', [])
    tag_text = ', '.join(hit_tags) if hit_tags else 'None'

    print(f"{rank}. Score: {hit['score']:.1%}")
    print(f"   Category: {hit_meta.get('category', 'N/A')}")
    print(f"   Tags: {tag_text}")
    print(f"   Content: {hit['data'][:100]}...\n")

## SQL Tool

### Interact with SQLite databases and CSV files, create and manage tables, execute read/write queries, and print formatted output.

### Key Features
- Automatic CSV-to-SQLite conversion
- Schema inference and validation
- Column name cleaning for SQLite compatibility
- Support for both read-only and write operations
- Comprehensive error handling and validation

In [None]:
from aixplain.modules.agent.output_format import OutputFormat
from aixplain.modules.agent.tool.sql_tool import SQLTool

# Build the SQL tool the agent uses for the ESEA report card data.
# NOTE(review): `database` points at a ".db" file while `source_type` is "csv";
# confirm which of the two is intended.
sql_tool_settings = {
    "name": "ESEA Report Card Analyzer",
    "description": "OPPORTUNITIES AND RESPONSIBILITIES FOR STATE AND LOCAL REPORT CARDS U.S. Department of Education Under the Elementary and Secondary Education Act of 1965",
    "database": "data/ESEA_Report_Card_Guidelines.db",  # database file
    "source_type": "csv",
    "enable_commit": False,  # commits disabled
}
sql_tool = SQLTool(**sql_tool_settings)


## Education Policy Website Scraper

- Scrapes predefined landing pages or selected subpages on ed.gov

- Limits output to first 15 paragraphs per page


In [None]:
def policy_scraper(url: str, max_paragraphs: int = 15, follow_links: bool = True) -> str:
    """
    Education Policy Scraper (1-Level Links)

    Downloads `url`, extracts its main text, and optionally appends the text
    of up to two distinct same-domain pages linked from it.

    Args:
        url: Address of the page to scrape.
        max_paragraphs: Maximum number of text blocks (paragraphs / list items)
            kept per page.
        follow_links: When True, also scrape the first two distinct internal
            links found on the page.

    Returns:
        The extracted text. Linked pages are appended after a
        "---LINKED CONTENT---" separator. Failures are reported inside the
        returned string ("Error fetching ...", "Could not extract text ...")
        instead of being raised.
    """
    # Imports and the install step live inside the function body.
    # NOTE(review): presumably so the function is self-contained when it is
    # passed as `code=` to a utility model — confirm before moving them.
    import os
    os.system("pip install -q requests beautifulsoup4 2>/dev/null")
    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urldefrag, urljoin, urlparse

    # Define headers to mimic a browser and avoid 403 Forbidden errors
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    # Only a couple of linked pages are followed, for efficiency
    MAX_LINKED_PAGES = 2

    def fetch_soup(page_url: str):
        # Download one page and parse it; raises on network or HTTP errors.
        print(f"[DEBUG] Scraping: {page_url}")
        response = requests.get(page_url, headers=HEADERS, timeout=20)
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")

    def extract_text(soup, page_url: str, paragraphs: int) -> str:
        # Pull up to `paragraphs` text blocks out of an already parsed page.
        # Prefer the main content area to avoid nav/footer noise.
        search_area = (
            soup.find("main")
            or soup.find("article")
            or soup.find("div", role="main")
            or soup.find("div", id="content")
            or soup.body
            or soup
        )

        # Paragraphs first, then list items (often used for policy points)
        text_elements = []
        text_elements.extend(search_area.find_all("p"))
        text_elements.extend(search_area.find_all("li"))

        # Fall back to content-like divs only when neither was found
        if not text_elements:
            text_elements.extend(search_area.find_all("div", class_=["content", "field-item", "body"]))

        # Drop short navigation items and empty strings
        cleaned_texts = []
        for el in text_elements:
            text = el.get_text(strip=True)
            if len(text) > 30:
                cleaned_texts.append(text)

        final_text = "\n\n".join(cleaned_texts[:paragraphs])
        print(f"[DEBUG] Found {len(cleaned_texts)} text blocks, {len(final_text)} characters")
        return final_text if final_text else f"Could not extract text from {page_url}"

    def scrape_page(page_url: str, paragraphs: int) -> str:
        # Fetch and extract one page, turning any failure into an error string.
        try:
            return extract_text(fetch_soup(page_url), page_url, paragraphs)
        except Exception as e:
            error_msg = f"Error fetching {page_url}: {str(e)}"
            print(f"[DEBUG] {error_msg}")
            return error_msg

    print(f"[DEBUG] Starting scraper for: {url}")

    # Fetch the root page once and keep its parsed form for link discovery.
    # `main_soup` stays None when the page could not be scraped, which is the
    # explicit failure signal (rather than searching the text for "Error",
    # which also matched pages that merely contain that word).
    main_soup = None
    try:
        parsed = fetch_soup(url)
        content = extract_text(parsed, url, max_paragraphs)
        main_soup = parsed
    except Exception as e:
        content = f"Error fetching {url}: {str(e)}"
        print(f"[DEBUG] {content}")

    if follow_links and main_soup is not None:
        try:
            # Collect distinct same-domain links. Fragments are stripped so
            # "#section" anchors of the root page (or of one another) do not
            # count as separate pages.
            base_domain = urlparse(url).netloc
            seen = {urldefrag(url).url}
            internal_links = []

            # Search in main content if possible
            link_area = main_soup.find("main") or main_soup.find("article") or main_soup.body

            if link_area:
                for a in link_area.find_all("a", href=True):
                    link = urldefrag(urljoin(url, a['href'])).url
                    if urlparse(link).netloc == base_domain and link not in seen:
                        seen.add(link)
                        internal_links.append(link)

            print(f"[DEBUG] Found {len(internal_links)} internal links")

            for i, link in enumerate(internal_links[:MAX_LINKED_PAGES]):
                print(f"[DEBUG] Scraping linked page {i+1}: {link}")
                content += "\n\n---LINKED CONTENT---\n\n" + scrape_page(link, max_paragraphs)

        except Exception as e:
            print(f"[DEBUG] Error following links: {str(e)}")
            content += f"\n\nError following links: {str(e)}"

    return content if content else "No text found on the page."

In [None]:
# Local smoke test of the scraper on one ed.gov page

test_url = "https://www.ed.gov/laws-and-policy/laws-preschool-grade-12-education/esea/what-is-the-every-student-succeeds-act"
print(f"Testing scraper on: {test_url}\n")

# Direct call (bypassing the agent) so the [DEBUG] output is visible here;
# link following is switched off and only 5 text blocks are kept.
result = policy_scraper(test_url, max_paragraphs=5, follow_links=False)

separator = "=" * 50
print("\n" + separator)
print("FINAL EXTRACTED CONTENT:")
print(separator)
print(result)

In [None]:
from aixplain.factories import ModelFactory

# Register the scraper as a utility tool. The description is what the agent
# reads when choosing a tool, so it must match the function's real behaviour:
# ed.gov pages, at most two linked pages, and 15 text blocks per page by default.
scraper_tool = ModelFactory.create_utility_model(
    name="Education Policy Scraper",
    description=(
        "Scrapes key policy text from ed.gov pages. "
        "Follows up to two first-level internal links to include linked articles. "
        "Limits output to 15 paragraphs per page by default to save resources."
    ),
    code=policy_scraper
)

These tools allow the agent to save, list, and read notes.

In [None]:
def save_note(content: str, topic: str = "General") -> str:
    """
    Saves a summary note or key information to a local file for the user.
    Use this when the user asks to "save this", "remind me", "create a note", or "keep this information".

    Args:
        content: Text of the note.
        topic: Short label written into the note's header.

    Returns:
        "Successfully saved note to <path>" on success, otherwise
        "Error saving note: <reason>". No exception is raised to the caller.
    """
    import os
    from datetime import datetime

    # Read the clock once so the file name and the DATE header always agree
    now = datetime.now()
    stem = f"user_notes/note_{now.strftime('%Y-%m-%d_%H-%M-%S')}"
    note_content = f"TOPIC: {topic}\nDATE: {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n{content}\n"

    try:
        # Create notes directory if it doesn't exist
        os.makedirs("user_notes", exist_ok=True)

        # File names only have one-second resolution. Open in exclusive mode
        # and add a numeric suffix on collision, so a second note saved within
        # the same second never overwrites the first one.
        attempt = 0
        while True:
            suffix = f"_{attempt}" if attempt else ""
            filename = f"{stem}{suffix}.txt"
            try:
                with open(filename, "x", encoding="utf-8") as f:
                    f.write(note_content)
                break
            except FileExistsError:
                attempt += 1
        return f"Successfully saved note to {filename}"
    except Exception as e:
        return f"Error saving note: {str(e)}"

# Register save_note as a utility tool and report the ID assigned to it
note_saver_description = (
    "Saves text content to a local file. "
    "Use when user wants to save a summary, reminder, or specific information."
)
note_saver_tool = ModelFactory.create_utility_model(
    name="Note Saver", description=note_saver_description, code=save_note
)
print(f"âœ“ Note Saver tool created with ID: {note_saver_tool.id}")

In [None]:
def list_notes(query: str = "all") -> str:
    """
    Lists all saved notes in the user_notes directory.
    Use this when the user asks to "see my notes", "list notes", "show my reminders", or "what have I saved".

    `query` is accepted but not used. Returns a bullet list of file names in
    reverse name order, or a "No notes found" message when there are none.
    """
    import os

    notes_dir = "user_notes"

    # A missing directory is created on the spot and reported as "no notes"
    # rather than as an error.
    if not os.path.exists(notes_dir):
        os.makedirs(notes_dir, exist_ok=True)
        return "No notes found (Directory was empty)."

    # Reverse name order puts the newest timestamped file names first
    names = sorted(os.listdir(notes_dir), reverse=True)
    if not names:
        return "No notes found."

    bullet_lines = "\n".join(f"- {name}" for name in names)
    return f"Found the following notes (Copy the exact filename to read one):\n{bullet_lines}"

def read_note(filename: str) -> str:
    """
    Reads the content of a specific note file.
    Use this when the user asks to "read note X".
    IMPORTANT: The filename must be EXACTLY as shown in the list_notes output.

    Args:
        filename: Bare file name of a note inside the user_notes directory.

    Returns:
        The note's text, or a message starting with "Error" / "STOP" when the
        name is invalid, the note does not exist, or it cannot be read.
    """
    import os

    # Clean filename
    filename = filename.strip()

    # Security check: accept bare file names only. An empty name would resolve
    # to the notes directory itself, and either kind of path separator or ".."
    # could point outside of it.
    if not filename or ".." in filename or "/" in filename or "\\" in filename:
        return "Error: Invalid filename. Do not use paths."

    filepath = f"user_notes/{filename}"

    # isfile (not exists) so that a sub-directory is reported as "not found"
    # instead of failing inside open()
    if not os.path.isfile(filepath):
        return f"STOP: Note '{filename}' not found. Please ask the user to list notes first to get the correct name."

    try:
        with open(filepath, "r", encoding="utf-8") as f:
            content = f.read()
        return content
    except Exception as e:
        return f"Error reading note: {str(e)}"

# Register the two read-side note helpers as utility tools
list_notes_tool = ModelFactory.create_utility_model(
    name="List Notes", description="Lists all files in the user_notes directory.", code=list_notes
)
read_note_tool = ModelFactory.create_utility_model(
    name="Read Note", description="Reads the content of a specific note file.", code=read_note
)

# Report the ID assigned to each tool
for tool_label, created_tool in (("List Notes", list_notes_tool), ("Read Note", read_note_tool)):
    print(f"âœ“ {tool_label} tool created with ID: {created_tool.id}")

In [None]:
# Show the scraper and index IDs as ready-to-paste assignments for app.py
banner = "=" * 70
print(banner)
print("TOOL IDs FOR STREAMLIT APP")
print(banner)
print(f'SCRAPER_TOOL_ID = "{scraper_tool.id}"')
print(f'INDEX_TOOL_ID = "{index.id}"')
# print(f"SQL_TOOL_ID = \"{sql_tool.id}\"")
print(banner)
print("\nCopy these IDs into your app.py file!")

## Creating a knowledge_assistant Agent
Uses the PDF index tool and the SQL tool, together with the web scraper and the note tools.

Here we assemble the agent. We define its persona, instructions, and give it access to all the tools we created (Index, SQL, Scraper, Notes).

In [None]:
from aixplain.factories import AgentFactory

# Create production-ready RAG agent with citation support

# Wrap the index used earlier in this notebook as a model tool. Using
# `index.id` (instead of a hard-coded ID) keeps the agent pointed at the index
# that was actually created or retrieved, including a freshly created one.
index_tool = AgentFactory.create_model_tool(index.id)

knowledge_assistant = AgentFactory.create(
    name="Education Policy Advisor",
    description=(
        "Education Policy Advisor is an AI agent specialized in providing authoritative "
        "guidance on education policies and federal education requirements. "
        "It retrieves information from PDF Index, ESEA Report Card database, and Web scraping. "
        "It can also save, list, and read notes for the user upon request. "
    ),
    llm_id="669a63646eb56306647e1091",  # GPT 4o mini
    instructions="""
**MISSION STATEMENT**
You are an Education Policy Advisor. Your goal is to provide helpful, accurate information about education policies, federal requirements, and guidance. You should prioritize information from your provided tools (PDF Index, SQL Database, Web Scraper, Notes).

**OPERATIONAL GUIDELINES**

**1. Source Usage**
- **Primary Sources:** Always check your tools (PDF, SQL, Scraper) first for specific policy details.
- **Context:** You may use your general knowledge to explain terms, provide background context, or summarize findings to make the answer more helpful.
- **Synthesis:** You are encouraged to synthesize information from multiple tools if needed to answer a complex question.
- **URL Constraint:** If a URL is provided in the user query, you must ONLY provide information derived from that URL using the Web Scraper Tool. Do not include information from other sources unless explicitly asked.

**2. Tool Selection Strategy**
- **URL Provided:** Use the **Web Scraper Tool**.
- **Cambridge/CSVPA/General Policy:** Start with the **PDF Index Tool**.
- **ESEA/Federal Requirements:** Start with the **SQL Database Tool**.
- **Notes:** Use **Note Saver**, **List Notes**, or **Read Note** as requested.
- **Fallback:** If your first choice tool doesn't yield results, **please try other relevant tools** before giving up. Do not restrict yourself to just one source if the answer might be elsewhere.

**3. Handling "Not Found"**
- If you cannot find the information in any tool, state clearly: "I couldn't find specific details in the provided documents."
- You may then offer general information based on your training if it helps the user, but clearly distinguish it from the official policy documents.

**RESPONSE FORMATTING**

Please structure your responses clearly:
### [Heading]
[Content]
- Bullet points for lists

#### Sources
- [List the tools or documents used]

**SCOPE**
- Education policies, guidance, and requirements.
- Managing user notes.

**TONE**
- Professional, helpful, and direct.
""",
    tools=[sql_tool, scraper_tool, note_saver_tool, list_notes_tool, read_note_tool, index_tool]
)

print("âœ“ Production agent created with updated instructions")

# Deployment is best-effort here: a failure is reported but does not stop the
# notebook, and the agent object remains available to the cells that follow.
try:
    knowledge_assistant.deploy()
    print(f"âœ“ Agent deployed successfully with ID: {knowledge_assistant.id}")
except Exception as e:
    print(f"âš  Agent deployment skipped: {e}")

In [None]:
# Show the agent's ID (as the last expression in the cell, the notebook displays its value)
knowledge_assistant.id

## Query the database

In [None]:
# Ask a report-card question and print the agent's markdown answer
sea_lea_question = "What are the responsibilities of an SEA and an LEA for preparing a report card?"
response = knowledge_assistant.run(query=sea_lea_question, output_format=OutputFormat.MARKDOWN)
print(response.data.output)

# List the input of every tool step the agent recorded, if it recorded any
intermediate_steps = getattr(response.data, "intermediate_steps", None)
if intermediate_steps:
    print("\n[Search Details]")
for agent_step in intermediate_steps or []:
    for tool_call in agent_step.get("tool_steps") or []:
        print(f"- Query: {tool_call.get('input', 'N/A')}")

## Agent Testing PDF Tool

In [None]:
# Ask a guidance-policy question and print the agent's markdown answer
guidance_question = "What are the federal requirements for educational guidance programs?"
response = knowledge_assistant.run(query=guidance_question, output_format=OutputFormat.MARKDOWN)
print(response.data.output)

# List the input of every tool step the agent recorded, if it recorded any
intermediate_steps = getattr(response.data, "intermediate_steps", None)
if intermediate_steps:
    print("\n[Search Details]")
for agent_step in intermediate_steps or []:
    for tool_call in agent_step.get("tool_steps") or []:
        print(f"- Query: {tool_call.get('input', 'N/A')}")

### Testing the Scraping Tool

In [None]:
# Ask a question that carries a URL and print the agent's markdown answer
url_question = "According to this URL What is the Every Student Succeeds Act?: https://www.ed.gov/laws-and-policy/laws-preschool-grade-12-education/esea/what-is-the-every-student-succeeds-act "
response = knowledge_assistant.run(query=url_question, output_format=OutputFormat.MARKDOWN)
print(response.data.output)

# List the input of every tool step the agent recorded, if it recorded any
intermediate_steps = getattr(response.data, "intermediate_steps", None)
if intermediate_steps:
    print("\n[Search Details]")
for agent_step in intermediate_steps or []:
    for tool_call in agent_step.get("tool_steps") or []:
        print(f"- Query: {tool_call.get('input', 'N/A')}")

## Agent Deployment

In [None]:
# Deploy the agent; a failure is reported instead of raised.
# NOTE(review): the agent-creation cell already calls deploy() — confirm a
# second call is intended.
try:
    deployment = knowledge_assistant.deploy()
    deployed_id = knowledge_assistant.id
    print(f"knowledge_assistant deployed successfully with ID: {deployed_id}")
except Exception as deploy_error:
    print(f"Deployment failed: {deploy_error}")

In [None]:
# AUTOMATICALLY UPDATE CONFIG.PY
# Run this cell to update your backend configuration with the new Agent ID
def update_config_file(new_agent_id):
    """
    Write `new_agent_id` into config.py as the value of AGENT_ID.

    Every top-level `AGENT_ID = ...` assignment (with or without spaces around
    the equals sign) is replaced. When the file has no such line, one is
    appended, so a reported success always means the file holds the new ID.

    Args:
        new_agent_id: Agent ID to store.

    Returns:
        None. The outcome is printed; errors such as a missing config.py are
        reported, not raised.
    """
    config_path = "config.py"
    new_line = f'AGENT_ID = "{new_agent_id}"\n'
    try:
        with open(config_path, "r", encoding="utf-8") as f:
            lines = f.readlines()

        # Build the new content in memory before touching the file
        updated = []
        replaced = False
        for line in lines:
            name, sep, rest = line.partition("=")
            # Column-0 assignment to AGENT_ID; "==" comparisons are excluded
            if sep and name.rstrip() == "AGENT_ID" and not rest.startswith("="):
                updated.append(new_line)
                replaced = True
            else:
                updated.append(line)

        if not replaced:
            # No existing assignment: append one, first terminating the last
            # line if it lacks a newline
            if updated and not updated[-1].endswith("\n"):
                updated[-1] += "\n"
            updated.append(new_line)

        with open(config_path, "w", encoding="utf-8") as f:
            f.writelines(updated)
        print(f"âœ“ Successfully updated config.py with Agent ID: {new_agent_id}")
    except Exception as e:
        print(f"âœ— Failed to update config.py: {e}")

# Only touch config.py when the agent object exists in this session
if 'knowledge_assistant' not in locals():
    print("âš  Agent 'knowledge_assistant' not found. Did you run the agent creation cell?")
else:
    update_config_file(knowledge_assistant.id)