In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import logging
import os

from dotenv import load_dotenv
from openai import OpenAI
from rich.console import Console
from rich.markdown import Markdown

from src.config import OPENAIVARS
from src.web_tasks.page_summary import WebsiteContentSummarizer

In [3]:
# Load variables from a local .env file. override=True lets values from .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)
api_key = os.getenv("OPENAI_API_KEY")

# Fail fast: the OpenAI client in this notebook is constructed without an
# explicit key, so a missing environment variable would otherwise only surface
# later as a less obvious client error.
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [4]:
# The model identifier is taken from the project configuration object.
openai_model = OPENAIVARS.open_ai_model

# Client is created with no explicit arguments, i.e. with the library defaults.
openai = OpenAI()

### Suppress Logs

In [None]:
# Raise the root logger's threshold to CRITICAL so that lower-severity records
# do not bury the rendered summaries. This affects loggers that inherit their
# level from the root; a logger with its own explicit level is not changed.
logging.getLogger().setLevel(logging.CRITICAL)

# To see log output again, lower the threshold in a separate cell, e.g.:
# logging.getLogger().setLevel(logging.WARNING)  # Python's default root level
# logging.getLogger().setLevel(logging.DEBUG)    # full verbosity

### Prompts

In [42]:
# System message for the report-writing call. It is assembled line by line:
# an empty entry produces a blank line between sections, and the final empty
# entry keeps the trailing newline at the end of the prompt.
system_prompt = "\n".join(
    [
        # Role
        "You are a highly skilled **business analyst assistant**, specializing in analyzing website content and generating structured, detailed, and insightful business reports.",
        "",
        # What the assistant is expected to do
        "**🔹 Your Responsibilities:**",
        "- Provide an in-depth **business analysis report** based strictly on the provided website content.",
        "- **Extract insights**, rather than simply summarizing the text.",
        "- Ignore **navigation elements** (menus, footers, sidebars, repetitive content).",
        "- Focus on **core business information** (services, products, mission, vision, customer engagement strategies, competitive positioning, upcoming events, etc.).",
        "- Highlight **unique business aspects** and **opportunities for growth**.",
        "- Identify **target demographics** who would benefit from the website’s offerings.",
        "",
        # Output layout
        "**🔹 Response Format (STRICT Markdown Structure Required):**",
        "- The report **must be in Markdown** for readability.",
        "- Use **clear headings, bullet points, and structured content**.",
        "- If the website provides event details, include a structured table with event names, dates, and target audiences.",
        "- Incorporate **quotes or testimonials** from the website (if available) for added credibility.",
        "",
        # Guard rails
        "**🔹 Important Notes:**",
        "- Do not fabricate information—only analyze and interpret what is available.",
        "- Use structured Markdown formatting for better readability.",
        "",
    ]
)



# User message for the report-writing call. Written as one literal block so it
# reads the way it is sent; the leading backslash suppresses the first newline
# and the text deliberately ends without a trailing newline.
user_prompt = """\
**WEBSITE CONTENT PROVIDED BELOW:**

**🔹 TASK:**
Generate a **comprehensive and structured business analysis report** based strictly on the provided website content.

**🔹 RESPONSE FORMAT REQUIREMENT:**
- The report **must be written in Markdown format**.
- Use **structured sections, tables (if applicable), and bullet points**.
- Ensure clarity, depth, and proper segmentation of insights.

**🔹 FINAL TOUCH:**
Provide **a summary with key takeaways**, ensuring all insights are well-organized and actionable.

**WEBSITE CONTENT STARTS BELOW:**"""




links_filter_system_prompt = (
    "You are an **intelligent web assistant**, responsible for **analyzing a list of hyperlinks** extracted from a website.\n"
    "Your task is to filter and retain only the **most relevant and informative links**, ensuring that the extracted content aligns with the business analysis process.\n\n"

    "**🔹 Purpose & Context:**\n"
    "The selected links will be used by the **business analysis assistant** to generate a comprehensive business report.\n"
    "- `system_prompt`: Guides the assistant in summarizing a company's key offerings based on website content.\n"
    "- `user_prompt`: Instructs the assistant to generate an insightful business report focused on services, products, and competitive positioning.\n\n"

    "**🔹 Link Selection Criteria:**\n"
    "Only keep links that provide information crucial to understanding the **business, its offerings, and unique value proposition**"
    "This includes, but is NOT limited to. :\n"
    "- **About Us** (mission, history, brand identity)\n"
    "- **Products & Services** (detailed breakdown of offerings)\n"
    "- **Blog & News** (business insights, industry trends, company updates)\n"
    "- **Case Studies & Testimonials** (customer success stories, credibility indicators)\n"
    "- **Careers** (team culture, hiring insights, employer brand perception)\n\n"

    "**🔹 Exclude the following links:**\n"
    "- **Navigation or duplicate content** (e.g., menus, footers, repeated elements)\n"
    "- **Social media pages** (Facebook, Instagram, Twitter, LinkedIn, YouTube, etc.)\n"
    "- **Legal & policy pages** (Terms of Service, Privacy Policy, Cookie Policy)\n"
    "- **Contact pages that only contain an email form**\n"
    "- **External third-party websites**, unless they are integral to the business (e.g., official partner integrations)\n"
    "- **Generic FAQ pages that do not provide unique insights about the business**\n"
    "- **Sitemap, login pages, or any technical pages unrelated to business insights**\n\n"

    "**Response Format:**\n"
    "Provide a structured JSON response containing only the retained links with their type classification.\n"
    "Example output:\n"
    "{\n"
    "    'links': [\n"
    "        {'type': 'about', 'url': 'https://company.com/about'},\n"
    "        {'type': 'careers', 'url': 'https://company.com/careers'},\n"
    "        {'type': 'blog', 'url': 'https://company.com/blog'}\n"
    "    ]\n"
    "}"
)


# Opening of the user message for the link-filtering call. "{url}" is left as a
# literal placeholder here (no formatting is applied in this cell); presumably
# the summarizer fills it in and appends the link list - TODO confirm.
links_filter_user_prompt = "Website: {url}\n\nHere is a list of links found on the website:\n\n"

### Single Page Summary

In [43]:
# Single-page run: link exploration is switched off, so only the report
# prompts (system, user) are handed to summarize().
console = Console()

summariser = WebsiteContentSummarizer(
    "https://dirttrackriders.co.uk/about",
    openai,
    openai_model,
    explore_multiple_links=False,
    verbosity=True,  # presumably enables progress output - confirm in the class
)

analysis_prompts = (system_prompt, user_prompt)
summary = summariser.summarize(analysis_prompts)

# The result is passed through rich's Markdown renderer for display.
console.print(Markdown(summary))

### Multiple Page Summary

In [44]:
# Multi-page run: link exploration is switched on, so summarize() receives the
# report prompts plus a second (system, user) pair for the link-filtering step.
console = Console()

summariser = WebsiteContentSummarizer(
    "https://dirttrackriders.co.uk",
    openai,
    openai_model,
    explore_multiple_links=True,
    verbosity=True,  # presumably enables progress output - confirm in the class
)

analysis_prompts = (system_prompt, user_prompt)
link_filter_prompts = (links_filter_system_prompt, links_filter_user_prompt)
summary = summariser.summarize(analysis_prompts, link_filter_prompts)

# The result is passed through rich's Markdown renderer for display.
console.print(Markdown(summary))