### Install the required libraries

In [None]:
%pip install -q aixplain faiss-cpu pypdf pypdf2 pdfplumber PyPDF2 scikit-learn

In [None]:
%pip install -q aixplain

### Add Your API key

In [None]:
import os

# Set your aiXplain API key (replace the placeholder before running).
os.environ["AIXPLAIN_API_KEY"] = "Add you api key here"

# Confirm the variable is set without echoing the secret itself: printing the
# raw key would store it in the notebook's saved cell output.
print(f"AIXPLAIN_API_KEY is set ({len(os.environ['AIXPLAIN_API_KEY'])} characters)")

### Verify that the API key is working correctly

In [None]:
from aixplain.factories import ModelFactory

# Smoke test for the API key: look up a model by its ID and run a short prompt.
# NOTE(review): `res` is never inspected or displayed in this cell, so a
# successful run is only visible as the absence of an exception — confirm
# whether the response should be checked explicitly.
model = ModelFactory.get("673248d66eb563b2b00f75d1")
res = model.run("hello hello")

In [None]:
import os
import pandas as pd
import PyPDF2
import faiss
import numpy as np
from datetime import datetime

In [None]:
from aixplain.factories import IndexFactory
from aixplain.modules.model.index_model import Splitter, IndexFilter, IndexFilterOperator
from aixplain.modules.model.record import Record
from aixplain.enums.splitting_options import SplittingOptions
from pypdf import PdfReader

# PDF file path
pdf_path = "data/pdfs/Careers-and-Educational-Guidance-Policy-2025-26.pdf"

# Extract text from the PDF, one "[PAGE n]"-tagged section per page.
reader = PdfReader(pdf_path)
total_pages = len(reader.pages)

# Collect the per-page sections in a list and join once at the end instead of
# growing a string with `+=` inside the loop.
page_sections = []
for page_num, page in enumerate(reader.pages):
    page_text = page.extract_text()
    # Pages that yield no text are skipped entirely (no marker is emitted).
    if page_text:
        page_sections.append(f"[PAGE {page_num + 1}]\n{page_text}\n")

full_text = "".join(page_sections)

# Report how many pages actually produced text, not just the page count.
print(f"✓ Extracted text from {len(page_sections)} of {total_pages} pages")
print(f"✓ Total characters: {len(full_text):,}")

In [None]:
# Advanced chunking with Splitter - optimized for policy documents.
# The sizes are named once so the summary printed below cannot drift from the
# values actually passed to the Splitter.
SENTENCES_PER_CHUNK = 10  # 10 sentences per chunk (paragraph-level)
SENTENCE_OVERLAP = 2      # 2 sentence overlap for context continuity

splitter = Splitter(
    split=True,
    split_by=SplittingOptions.SENTENCE,
    split_length=SENTENCES_PER_CHUNK,
    split_overlap=SENTENCE_OVERLAP,
)

print("✓ Splitter configured for sentence-based chunking")
print("  - Split method: SENTENCE")
print(f"  - Chunk size: {SENTENCES_PER_CHUNK} sentences")
print(f"  - Overlap: {SENTENCE_OVERLAP} sentences")

In [None]:
# Create the index for the education policy document, or reuse the existing
# one when creation fails because an index with this name already exists.

# ID of the previously created index, used only on the "already exists" path.
EXISTING_INDEX_ID = "694285b39dcf6413b67dd5fb"

try:
    index = IndexFactory.create(
        name="Education Guidance Policy Index",
        description=(
            "Comprehensive educational guidance and policy document "
            "containing federal requirements, guidelines, and best practices "
            "for educational agencies and institutions."
        ),
        embedding_model="678a4f8547f687504744960a",  # Snowflake Arctic
    )
    print(f"✓ Index created! ID: {index.id}")
except Exception as e:
    # Only a name clash is recoverable; every other failure propagates.
    if "already exists" not in str(e):
        raise
    try:
        index = IndexFactory.get(EXISTING_INDEX_ID)
        # No "Index created" message here: on this path the index is reused.
        print(f"✓ Using existing index! ID: {index.id}")
    except Exception as fetch_error:
        print(f"✗ Failed to retrieve existing index: {fetch_error}")
        raise

In [None]:
# Create records with rich metadata for filtering and citations
import re

records = []

# Extract metadata from PDF
pdf_filename = os.path.basename(pdf_path)
upload_date = datetime.now().strftime("%Y-%m-%d")

# `full_text` carries a "[PAGE n]" marker in front of each page's text, so the
# page of a chunk can be read from those markers instead of being estimated
# from the chunk index.
page_marker = re.compile(r"\[PAGE (\d+)\]")
current_page = 1  # page on which the next chunk starts, carried across chunks

for i, chunk_text in enumerate(full_text.split("\n\n")[:50]):  # Limit for demo
    stripped_text = chunk_text.strip()

    # A chunk that opens with a marker starts on that page; otherwise it
    # continues the page reached by the end of the previous chunk.
    leading_marker = page_marker.match(stripped_text)
    page_num = int(leading_marker.group(1)) if leading_marker else current_page

    # Advance the running page even for chunks that are skipped below, so
    # later chunks are still attributed correctly.
    pages_in_chunk = page_marker.findall(chunk_text)
    if pages_in_chunk:
        current_page = int(pages_in_chunk[-1])

    # Skip empty chunks and fragments too short to be useful.
    if len(stripped_text) < 100:
        continue

    record = Record(
        id=f"chunk_{i}",
        value=stripped_text,
        value_type="text",
        attributes={
            # Source attribution
            "source_title": "Careers and Educational Guidance Policy 2025-26",
            "source_filename": pdf_filename,
            "source_url": "internal://education-policy",

            # Organization
            "doc_type": "policy",
            "category": "education_guidance",
            "section": "policy",

            # Tracking
            "chunk_id": i,
            "page_number": page_num,
            "upload_date": upload_date,
            "last_updated": upload_date,

            # Searchability
            "priority": "high",
            "tags": ["education", "guidance", "policy", "federal", "career"],
        },
    )
    records.append(record)

print(f"✓ Created {len(records)} enriched records with metadata")

In [None]:
# Upload records in small batches; a failing batch is recorded and skipped so
# the remaining batches are still attempted.
batch_size = 5
successful_uploads = 0
failed_batches = []  # (batch_number, error message) for every failed batch

for i in range(0, len(records), batch_size):
    batch_number = (i // batch_size) + 1
    batch = records[i:i + batch_size]

    try:
        # Upsert with splitter for automatic chunking
        index.upsert(batch, splitter=splitter)
        successful_uploads += len(batch)
        print(
            f"✓ Batch {batch_number}: Uploaded {len(batch)} records "
            f"(Total: {successful_uploads}/{len(records)})"
        )
    except Exception as e:
        # Best-effort upload: keep going, but remember what went wrong.
        failed_batches.append((batch_number, str(e)))
        print(f"✗ Batch {batch_number} failed: {e}")

print(f"\n✓ Upload complete: {successful_uploads} records indexed successfully")
if failed_batches:
    print(f"⚠ {len(failed_batches)} batches failed")
    # Repeat the collected reasons in the summary so they are not lost in the
    # per-batch output above.
    for failed_number, reason in failed_batches:
        print(f"  - Batch {failed_number}: {reason}")

## Knowledge Base Statistics & Verification

In [None]:
# Report the index size, its ID and the time of this check.
doc_count = index.count()
print(
    f"Total documents in index: {doc_count}",
    f"Index ID: {index.id}",
    f"Upload timestamp: {datetime.now().isoformat()}",
    sep="\n",
)

## Advanced Search with Filtering & Citations

The search system now supports:
- **Semantic search**: Find documents by meaning, not just keywords
- **Metadata filtering**: Filter by category, priority, tags, dates
- **Source citations**: All results include source attribution
- **Relevance scoring**: See similarity scores (0-1 scale)

In [None]:
# Example 1: Basic semantic search with citations
print("=" * 70)
print("SEARCH EXAMPLE 1: Basic Query with Citations")
print("=" * 70)

response = index.search(
    "federal requirements for educational guidance policies",
    top_k=3
)

print(f"\nFound {len(response.details)} results:\n")
for i, result in enumerate(response.details, 1):
    # Citation fields fall back to placeholders when a hit carries no metadata.
    metadata = result.get('metadata', {})
    source_title = metadata.get('source_title', 'Unknown')
    last_updated = metadata.get('last_updated', 'N/A')
    score = result['score']

    print(f"{i}. Relevance Score: {score:.1%}")
    # Only the first 120 characters of the hit are shown as a preview.
    print(f"   Content: {result['data'][:120]}...")
    print(f"   📚 Source: {source_title}")
    print(f"   📅 Updated: {last_updated}\n")

## Advanced Filtered Search

Filter searches by category, priority, tags, and date ranges for precise results.

In [None]:
# Example 2: semantic search restricted to records whose "priority" is "high".
banner = "=" * 70
print(banner)
print("SEARCH EXAMPLE 2: Filtered by Priority")
print(banner)

priority_filter = IndexFilter(
    field="priority",
    value="high",
    operator=IndexFilterOperator.EQUALS,
)

filtered_response = index.search(
    "student guidance and career pathways",
    filters=[priority_filter],
    top_k=3,
)

print("\nHigh-priority results about guidance and career:\n")
for rank, hit in enumerate(filtered_response.details, start=1):
    hit_metadata = hit.get("metadata", {})
    hit_tags = hit_metadata.get("tags", [])

    print(f"{rank}. Score: {hit['score']:.1%}")
    print(f"   Category: {hit_metadata.get('category', 'N/A')}")
    # An empty or missing tag list is shown as the literal "None".
    tag_summary = ", ".join(hit_tags) if hit_tags else "None"
    print(f"   Tags: {tag_summary}")
    # Preview is limited to the first 100 characters of the hit.
    print(f"   Content: {hit['data'][:100]}...\n")

## SQL Tool

### Interact with SQLite databases and CSV files, create and manage tables, execute read/write queries, and print formatted output.

### Key Features
- Automatic CSV-to-SQLite conversion
- Schema inference and validation
- Column name cleaning for SQLite compatibility
- Support for both read-only and write operations
- Comprehensive error handling and validation

In [None]:
from aixplain.modules.agent.output_format import OutputFormat
from aixplain.modules.agent.tool.sql_tool import SQLTool

# Create a SQL tool that works with a CSV
# NOTE(review): `database` points at a `.db` file while `source_type` is "csv" —
# confirm which of the two matches the actual data file.
sql_tool = SQLTool(
    name="ESEA Report Card Analyzer",
    description="OPPORTUNITIES AND RESPONSIBILITIES FOR STATE AND LOCAL REPORT CARDS U.S. Department of Education Under the Elementary and Secondary Education Act of 1965",
    database="data/ESEA_Report_Card_Guidelines.db",  # database file
    source_type="csv",                 # declared type of the data source
    enable_commit=False               # presumably keeps the tool read-only — TODO confirm
)


## Education Policy Website Scraper

- Scrapes predefined landing pages or selected subpages on ed.gov

- Limits output to first 15 paragraphs per page


In [None]:
def policy_scraper(url: str, max_paragraphs: int = 15, follow_links: bool = True) -> str:
    """
    Education Policy Scraper (1-Level Links)

    Fetches `url`, extracts the text of its first `max_paragraphs` paragraph
    elements and, optionally, appends the text of the first two distinct
    same-domain pages linked from it.

    Args:
        url: Address of the page to scrape.
        max_paragraphs: Maximum number of paragraph elements read per page.
        follow_links: When True, first-level internal links are scraped too.

    Returns:
        The concatenated text of the page and any linked pages. Failures are
        not raised; they are reported inline as "Error fetching ..." or
        "Error following links: ..." text.
    """
    # Dependencies are installed and imported inside the function body —
    # presumably so the function is self-contained when it is registered as a
    # utility model (TODO confirm against the hosting environment).
    import os
    os.system("pip install -q requests beautifulsoup4 2>/dev/null")
    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urldefrag, urljoin, urlparse

    # Number of linked pages scraped per call; kept small for efficiency.
    max_linked_pages = 2

    def scrape_page(page_url: str, paragraphs: int):
        """Fetch one page and return (text, soup); soup is None on failure."""
        try:
            print(f"[DEBUG] Scraping: {page_url}")
            response = requests.get(page_url, timeout=20)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # Prefer <p> elements; fall back to common content <div>s.
            paras = soup.find_all("p")[:paragraphs]
            if not paras:
                paras = soup.find_all("div", class_=["content", "main", "body"])[:paragraphs]

            texts = (p.get_text(strip=True) for p in paras)
            text_content = "\n\n".join(text for text in texts if text)
            print(f"[DEBUG] Found {len(paras)} elements, {len(text_content)} characters")
            return (text_content or f"Could not extract text from {page_url}"), soup
        except Exception as e:
            # Best-effort tool: report the failure as text instead of raising.
            error_msg = f"Error fetching {page_url}: {str(e)}"
            print(f"[DEBUG] {error_msg}")
            return error_msg, None

    print(f"[DEBUG] Starting scraper for: {url}")
    content, soup = scrape_page(url, max_paragraphs)

    # Follow links only when the start page was actually fetched. The parsed
    # page is reused (no second request), and success is tracked explicitly
    # rather than by searching the scraped text for the word "Error".
    if follow_links and soup is not None:
        try:
            base_domain = urlparse(url).netloc
            # Compare links without their #fragment so in-page anchors of the
            # start page and repeated links are not scraped again.
            seen = {urldefrag(url).url}
            internal_links = []
            for a in soup.find_all("a", href=True):
                link = urldefrag(urljoin(url, a["href"])).url
                if urlparse(link).netloc != base_domain or link in seen:
                    continue
                seen.add(link)
                internal_links.append(link)

            print(f"[DEBUG] Found {len(internal_links)} internal links")

            for i, link in enumerate(internal_links[:max_linked_pages]):
                print(f"[DEBUG] Scraping linked page {i+1}: {link}")
                linked_text, _ = scrape_page(link, max_paragraphs)
                content += "\n\n---LINKED CONTENT---\n\n" + linked_text

        except Exception as e:
            print(f"[DEBUG] Error following links: {str(e)}")
            content += f"\n\nError following links: {str(e)}"

    return content if content else "No text found on the page."

In [None]:
from aixplain.factories import ModelFactory

# Register `policy_scraper` as a utility model so the agent can call it.
# The description states the scraper's actual default limit (15 paragraphs,
# matching `max_paragraphs` in `policy_scraper`).
scraper_tool = ModelFactory.create_utility_model(
    name="Education Policy Scraper",
    description=(
        "Scrapes key policy text from education.gov pages. "
        "Follows first-level internal links to include full articles. "
        "Limits to 15 paragraphs per page to save resources."
    ),
    code=policy_scraper
)

In [None]:
# Show the scraper and index tool IDs so they can be copied into app.py.
divider = "=" * 70
print(divider)
print("TOOL IDs FOR STREAMLIT APP")
print(divider)
print(f'SCRAPER_TOOL_ID = "{scraper_tool.id}"')
print(f'INDEX_TOOL_ID = "{index.id}"')
# print(f"SQL_TOOL_ID = \"{sql_tool.id}\"")
print(divider)
print("\nCopy these IDs into your app.py file!")

## Creating a knowledge_assistant Agent
Uses the PDF index tool, the SQL tool, and the web scraper tool.

In [None]:
from aixplain.factories import AgentFactory

# Create production-ready RAG agent with citation support.
# The agent is given three tools: the SQL tool, the scraper utility model and
# a model tool wrapping the index built earlier in this notebook. The index
# tool uses `index.id` so it always targets the index that was just created or
# reused, rather than a hard-coded ID.
knowledge_assistant = AgentFactory.create(
    name="Education Policy Advisor",
    description=(
        "Education Policy Advisor is an AI agent specialized in providing authoritative "
        "guidance on education policies and federal education requirements. "
        "It strictly retrieves information from three sources: "
        "1) PDF Index: CAMBRIDGE SCHOOL OF VISUAL & PERFORMING ARTS - Careers and Higher Education Guidance Policy, "
        "2) ESEA Report Card database with federal education requirements and guidelines, and "
        "3) Web scraping of education.gov and related policy websites. "
        "All responses are structured with headings, bullet points, and paragraphs, and include clear citations. "
        "The agent does not use general knowledge or external sources beyond what is retrieved."
    ),
    llm_id="669a63646eb56306647e1091",  # GPT 4o mini
    instructions="""
**MISSION STATEMENT**
You are an Education Policy Advisor that provides accurate, citation-backed information from approved sources ONLY. Your role is to retrieve, reformat, and present information clearly—never to add, interpret beyond the source material, or fabricate information.

**CORE OPERATIONAL RULES (NON-NEGOTIABLE)**

**Rule 1: Source Fidelity**
- ONLY use information retrieved from the three approved tools
- NEVER add information from general knowledge, training data, or external sources
- If a tool returns information, use ONLY that information
- DO NOT supplement retrieved information with additional context unless it comes from the tools
- DO NOT make inferences or draw conclusions not explicitly stated in the sources

**Rule 2: Information Accuracy**
- Present retrieved information exactly as found in the source
- You may reformat for clarity (headings, bullets, paragraphs) but NEVER change:
  * Facts, figures, dates, names, or statistics
  * Policy requirements or guidelines
  * Definitions or terminology
  * Procedures or processes
- Paraphrase for readability ONLY when it maintains exact meaning
- When in doubt, quote directly

**Rule 3: Common Knowledge Exception**
- Basic definitions (e.g., "SEA stands for State Education Agency") are acceptable
- Standard formatting explanations (e.g., "This policy applies to...") are acceptable
- DO NOT use this exception to add substantive policy information

**TOOL SELECTION PROTOCOL**

**Step 1: Analyze the Query**
Identify the query type before selecting tools:

**Type A: URL Provided**
- User includes a specific URL in their question
- Action: Use ONLY Web Scraper Tool
- DO NOT use PDF Index or SQL Database

**Type B: Cambridge School / CSVPA Specific**
- Keywords: Cambridge School, CSVPA, careers guidance, higher education guidance, academic transcripts, sixth form
- Action: Use ONLY PDF Index Tool first
- If PDF returns NO relevant results, state this clearly and stop (do not search other sources)

**Type C: ESEA / Federal Database Specific**
- Keywords: ESEA, report card, SEA, LEA, state education agency, local education agency, federal requirements
- Action: Use ONLY SQL Database Tool first
- If SQL returns NO relevant results, state this clearly and stop (do not search other sources)

**Type D: General Education Policy**
- No specific school or database mentioned
- Action: Use PDF Index Tool first
- If no results: Try SQL Database Tool
- If still no results: Inform user information is not available

**Step 2: Execute Tool Call**
- Call ONE tool based on query type
- Wait for results
- If results found: Proceed to formatting response
- If NO results found: Follow "Information Not Found Protocol"

**Step 3: Decision Point**
- Information found in first tool: STOP searching, format response
- No information found: Try next relevant tool OR inform user (see Type B/C rules)

**INFORMATION NOT FOUND PROTOCOL**

**When a tool returns NO relevant information:**

1. State clearly: "I could not find information about [specific topic] in [tool name: e.g., the Cambridge School policy document / the ESEA database / the provided URL]."

2. Check if another tool is appropriate:
   - For Type B (Cambridge) queries: DO NOT search other sources
   - For Type C (ESEA) queries: DO NOT search other sources
   - For Type D (General) queries: Try next relevant tool

3. If no tools have relevant information:
   "I could not find information about [topic] in my available sources:
   - Cambridge School policy documents
   - ESEA Report Card database
   - Education policy websites
   
   I can only provide information available in these specific sources."

4. NEVER say: "Based on general knowledge..." or "Typically..." or "In general education policy..."

**RESPONSE FORMATTING REQUIREMENTS**

**Structure Every Response:**

### [Clear Descriptive Heading]

[Introduction sentence contextualizing the information]

**[Subheading if needed]**
- Bullet point for distinct items
- Bullet point for lists
- Bullet point for requirements

**[Another Subheading if needed]**
1. Numbered list for sequential steps
2. Numbered list for processes
3. Numbered list for prioritized items

[Paragraph format for explanatory text, policy descriptions, or detailed guidance that flows better in prose form.]

#### Sources
- [Exact source citation as specified in Citation Rules]

**Formatting Standards:**
- Use ### for main headings
- Use ** for subheadings
- Use bullet points (-) for non-sequential lists
- Use numbered lists (1., 2., 3.) for sequential steps or processes
- Use **bold** for emphasis on key terms or requirements
- Use paragraphs for flowing explanatory text
- Keep paragraphs to 3-5 sentences maximum for readability
- Use blank lines between sections for visual separation

**Response Length:**
- Provide complete information—do not truncate relevant details
- If source material is extensive, organize into clear sections with headings
- Prioritize most relevant information first

**CITATION RULES (MANDATORY)**

**Every response MUST end with a Sources section:**

#### Sources

**For PDF Index Tool:**
- Cambridge School of Visual & Performing Arts - Careers and Higher Education Guidance Policy

**For SQL Database Tool:**
- ESEA Report Card Database - [Specify table name or section if available]

**For Web Scraper Tool:**
- [Full URL exactly as provided]

**Multiple Sources:**
If you use multiple tools (rare), list all sources:
#### Sources
- Cambridge School of Visual & Performing Arts - Careers and Higher Education Guidance Policy
- ESEA Report Card Database - Student Performance Data
- https://www.education.gov/policy/example

**SCOPE & BOUNDARIES**

**IN SCOPE - Answer These:**
- Education policies (federal, state, institutional)
- Cambridge School guidance policies
- ESEA requirements and report cards
- Higher education guidance
- Careers guidance in educational settings
- Academic transcript policies
- Federal education regulations
- Content from provided education policy URLs

**OUT OF SCOPE - Politely Decline:**
- Medical advice
- Legal advice (note: you can provide policy information, but not legal interpretation)
- Financial planning or advice
- Personal counseling
- Non-education topics (entertainment, sports, politics unrelated to education)
- Requests to generate creative content (poems, stories, scripts)
- Requests to role-play or pretend to be someone else
- Technical troubleshooting unrelated to education systems

**HANDLING INAPPROPRIATE OR OUT-OF-SCOPE QUERIES**

**For Inappropriate Content:**
"I'm designed to provide information about education policies and guidance. I cannot assist with [topic]. 

I can help you with:
- Education policy questions
- Cambridge School guidance policies
- Federal education requirements (ESEA)
- Higher education and careers guidance information

Please feel free to ask a question within these areas."

**For Off-Topic Questions:**
"That question is outside my area of expertise. I specialize in education policy and guidance, specifically:
- Cambridge School of Visual & Performing Arts policies
- Federal education requirements (ESEA)
- Education policy from official sources

Is there an education policy question I can help you with?"

**For Requests to Add Personal Opinions or Interpretations:**
"I provide factual information from official education policy sources only. I cannot offer personal interpretations or opinions.

I can share what the official policy states on this topic. Would you like me to retrieve that information?"

**For Requests to Make Recommendations:**
"I can provide information about the policies and requirements, but I cannot make personal recommendations about what you should do.

I can help you understand:
- What the policy states
- What the requirements are
- What options are available according to the source documents

Would you like information on any of these aspects?"

**QUALITY ASSURANCE CHECKLIST**

Before sending each response, verify:

□ Information comes ONLY from retrieved tool results
□ No general knowledge or assumptions added
□ Facts, figures, and requirements are exactly as in source
□ Response is well-formatted with appropriate headings and structure
□ Citations are included in Sources section
□ If no information found, user is clearly informed
□ Response is within scope of education policy guidance
□ Tone is professional, neutral, and factual

**TONE & STYLE**

- **Professional**: Use formal, clear language appropriate for policy guidance
- **Neutral**: Present information objectively without bias or opinion
- **Factual**: Stick to verifiable information from sources
- **Helpful**: Organize information to be easily understood
- **Direct**: Get to the point without unnecessary preamble
- **Respectful**: Treat all queries with professionalism

**Avoid:**
- Casual language or slang
- Emojis or excessive punctuation
- Personal pronouns referring to yourself excessively (minimize "I think", "I believe")
- Hedging when information is clear in the source
- Over-apologizing


**FINAL REMINDERS**

1. You are a RETRIEVAL and FORMATTING agent, not a knowledge generation agent
2. When in doubt, retrieve from sources—never assume or add information
3. One well-selected tool is better than using all tools unnecessarily
4. Clear communication about limitations builds trust
5. Citations are mandatory—never skip the Sources section
6. Professional boundaries protect both you and the user

Your success is measured by:
- Accuracy of information
- Proper source attribution
- Clear, organized presentation
- Staying within scope
- Never adding unsourced information
""",
    tools=[sql_tool, scraper_tool, AgentFactory.create_model_tool(index.id)]
)

print("✓ Production agent created with bulletproof instructions")
# Deployment is best-effort here: a failure is reported but does not stop the
# notebook, so the agent can still be used in the cells that follow.
try:
    knowledge_assistant.deploy()
    print(f"✓ Agent deployed successfully with ID: {knowledge_assistant.id}")
except Exception as e:
    print(f"⚠ Agent deployment skipped: {e}")

In [None]:
# Bare expression: shows the agent's ID as this cell's output.
knowledge_assistant.id

## Query the database

In [None]:
# Ask an ESEA report-card question and print the agent's Markdown answer.
response = knowledge_assistant.run(
    query="What are the responsibilities of an SEA and an LEA for preparing a report card?",
    output_format=OutputFormat.MARKDOWN,
)
print(response.data.output)

# List the input of every tool step recorded for this run, if any.
intermediate_steps = getattr(response.data, "intermediate_steps", None)
if intermediate_steps:
    print("\n[Search Details]")
    for step in intermediate_steps:
        # Steps without tool activity contribute nothing to the listing.
        for tool_call in step.get("tool_steps") or ():
            print(f"- Query: {tool_call.get('input', 'N/A')}")

## Agent Testing PDF Tool

In [None]:
# Ask a general guidance-policy question and print the agent's Markdown answer.
response = knowledge_assistant.run(
    query="What are the federal requirements for educational guidance programs?",
    output_format=OutputFormat.MARKDOWN,
)
print(response.data.output)

# Show the tool inputs the agent used, when the response recorded its steps.
intermediate_steps = getattr(response.data, "intermediate_steps", None)
if intermediate_steps:
    print("\n[Search Details]")
    # Flatten all tool steps across the intermediate steps, preserving order.
    recorded_calls = (
        call
        for step in intermediate_steps
        for call in (step.get("tool_steps") or ())
    )
    for call in recorded_calls:
        print(f"- Query: {call.get('input', 'N/A')}")

### Testing the Scraping Tool

In [None]:
# Ask a question that includes a URL and print the agent's Markdown answer.
response = knowledge_assistant.run(
    query="According to this URL What is the Every Student Succeeds Act?: https://www.ed.gov/laws-and-policy/laws-preschool-grade-12-education/esea/what-is-the-every-student-succeeds-act ",
    output_format=OutputFormat.MARKDOWN,
)
print(response.data.output)

# Print the input passed to each tool step, if the run exposes its steps.
intermediate_steps = getattr(response.data, "intermediate_steps", None)
if intermediate_steps:
    print("\n[Search Details]")
    for step in intermediate_steps:
        step_tool_calls = step.get("tool_steps")
        if not step_tool_calls:
            continue
        for tool_call in step_tool_calls:
            print(f"- Query: {tool_call.get('input', 'N/A')}")

## Agent Deployment

In [None]:
# Deploy the agent and report the outcome; a failure is printed, not raised.
try:
    deployment = knowledge_assistant.deploy()
    print(f"knowledge_assistant deployed successfully with ID: {knowledge_assistant.id}")
except Exception as deploy_error:
    print(f"Deployment failed: {deploy_error}")