# Extracting the links from website

In [11]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse


HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

# excluded extension types
EXCLUDED_EXTENSIONS = {".pdf", ".jpg", ".jpeg", ".png", ".gif", ".svg", ".webp", ".zip", ".exe", ".mp4", ".mp3"}


class WebScraper:
    """
    A utility class to scrape a website and extract only internal valid links.
    """

    def __init__(self, url):
        self.url = url
        self.base_domain = urlparse(url).netloc  
        self.links = []
        self.scrape_website()

    def scrape_website(self):
        """
        Fetches the webpage, parses it, and extracts only internal links.
        """
        try:
            response = requests.get(self.url, headers=HEADERS, verify=False)
            response.raise_for_status()  
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract all <a> tags with href attribute
            raw_links = [link.get('href') for link in soup.find_all('a', href=True)]
            self.links = self.clean_links(raw_links)

        except requests.exceptions.RequestException as e:
            print(f"Error fetching {self.url}: {e}")

    def clean_links(self, links):
        """
        Cleans and filters extracted links:
        - Converts relative links to absolute
        - Removes fragments (#) and unwanted patterns
        - Filters out links that don't belong to the same base domain
        - Excludes links ending with undesired extensions
        """
        valid_links = []
        for link in links:
            # relative URL to absolute
            absolute_url = urljoin(self.url, link)
            
            parsed_url = urlparse(absolute_url)

            # Ensures link belongs to base domain
            if parsed_url.netloc and parsed_url.netloc != self.base_domain:
                continue

            if absolute_url in ("#", "/", ".") or link.startswith(("#", "/", ".")):
                continue

            if any(parsed_url.path.lower().endswith(ext) for ext in EXCLUDED_EXTENSIONS):
                continue

            if not absolute_url.startswith(("http://", "https://")):
                continue 

            valid_links.append(absolute_url)

        return list(set(valid_links))  # Remove duplicates

    def get_links(self):
        'Returns clean, valid, internal links.'
        return self.links


# Selecting relevant Links with the help of LLAMA 3.2 

In [12]:
import json
import ollama
import re

In [13]:
def filter_links_with_llama(links):
    "relevant link selection"

    prompt = f"""
    You are an AI assistant helping to create a company brochure. 
    Here is a list of web links:

    {json.dumps(links, indent=2)}

    Select only the most relevant links for the brochure, such as:
    - About Us
    - Vision & Mission
    - Leadership (Director, Officers, etc.)
    - Contact Information
    - Investor Relations (if available)
    - Careers (if available)
    - Services/Products
    Generate a **well-structured, engaging, and informative brochure** using this information.  
    The brochure **should be formatted in Markdown** for improved readability, including:
    - **Headings** (`#`, `##`, `###`)
    - **Bullet points** (`-` or `*`)
    - **Bold & Italics** for emphasis  
    - **Horizontal dividers (`---`)** between different sections for better readability wherever needed.
    - **Links** (if relevant)
    Do NOT include placeholders like "[Cover Page: ...]. Ensure the content flows naturally, as if designed for customers, investors, or stakeholders.
    """

    response = ollama.chat(model="llama3.2", messages=[{"role": "user", "content": prompt}])

    # print("Raw Response:", response.message.content)

    links = re.findall(r'https?://\S+', response.message.content)
    
    return links

In [14]:
url = "https://abc.xyz/"
scraper = WebScraper(url)
extracted_links = scraper.links

filtered_links = filter_links_with_llama(extracted_links)
print("Filtered Brochure Links:", filtered_links)



Filtered Brochure Links: ['http://abc.xyz/investor)', 'http://abc.xyz/investor)', 'http://abc.xyz/investor)', 'http://abc.xyz/careers)']


# Fetching content from selected links

In [15]:
def fetch_page_content(url, max_chars=50000):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        if soup.body:
            for tag in soup.body(["script", "style", "img", "input"]):
                tag.decompose()
            text = soup.body.get_text(separator="\n", strip=True)
            return text[:max_chars]  # Truncate if needed
        return ""
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return ""


# Generating Brochure

In [20]:
def generate_brochure(company_name, links):
    all_content = []
    for link in links:
        print(f"Fetching content from: {link}")
        page_text = fetch_page_content(link)
        if page_text:
            all_content.append(f"### Source: {link}\n{page_text}\n")

    # context length for llama 3.2 : 131072 
    combined_text = "\n".join(all_content)[:500000]  # ~500K characters (safe margin)

    prompt = f"""
    You are an AI assistant tasked with generating a **concise and professional company brochure** for "{company_name}".
    Below are extracts from various web pages related to the company.

    Please generate a well-structured, engaging brochure suitable for customers, investors, and stakeholders. Ensure it should be professionally 
    written.
    Here is the extracted content:
    {combined_text}
    Generate a **well-structured, engaging, and informative brochure** using this information.The brochure **should be formatted in Markdown** for improved readability.
    **Do NOT include placeholders like "[Cover Page: ...]".**  **Ensure the content flows naturally, as if designed for customers, investors, or stakeholders.**  
    """
    
    response = ollama.chat(model="llama3.2", messages=[{"role": "user", "content": prompt}])
    
    return response['message']['content']


In [21]:
company_name = "Google"
links=filtered_links
brochure_text = generate_brochure(company_name, links)

Fetching content from: http://abc.xyz/investor)
Error fetching http://abc.xyz/investor): 404 Client Error: Not Found for url: https://abc.xyz/investor)
Fetching content from: http://abc.xyz/investor)
Error fetching http://abc.xyz/investor): 404 Client Error: Not Found for url: https://abc.xyz/investor)
Fetching content from: http://abc.xyz/investor)
Error fetching http://abc.xyz/investor): 404 Client Error: Not Found for url: https://abc.xyz/investor)
Fetching content from: http://abc.xyz/careers)
Error fetching http://abc.xyz/careers): 404 Client Error: Not Found for url: https://abc.xyz/careers)


# BROCHURE

In [22]:
from IPython.display import display, Markdown

In [23]:
display(Markdown(brochure_text))  

# Google
## A Leader in Innovation and Technology

At Google, our mission is to organize the world's information and make it universally accessible and useful. We achieve this through our cutting-edge technology, innovative products, and collaborative approach.

### Our Story

Google was founded in 1998 by Larry Page and Sergey Brin while they were Ph.D. students at Stanford University. Initially, the company focused on developing a search engine that could retrieve relevant information from the web. Over time, Google evolved into a multinational technology company with a diverse range of products and services.

### Our Products and Services

*   **Google Search**: The world's most popular search engine, providing accurate and up-to-date results.
*   **Google Ads**: A platform for businesses to reach their target audience through paid advertising.
*   **Google Cloud**: A suite of cloud computing services designed to help organizations build, deploy, and manage applications.
*   **Google Drive**: A cloud storage service allowing users to store and access files from anywhere.
*   **Google Pixel**: A series of smartphones offering exceptional camera performance and timely software updates.

### Our Values

*   **Innovation**: We empower our employees to think creatively and develop innovative solutions.
*   **Collaboration**: We foster a culture of teamwork and open communication.
*   **Quality**: We strive for excellence in everything we do, from product development to customer support.
*   **Integrity**: We operate with transparency, honesty, and accountability.

### Our Impact

At Google, we believe that technology can be a powerful force for good. We've made significant contributions to various industries, including:

*   **Artificial Intelligence (AI)**: We're investing heavily in AI research and development, with the goal of creating products that improve people's lives.
*   **Education**: Our initiatives focus on making quality education accessible to everyone, regardless of location or background.
*   **Environmental Sustainability**: We're committed to reducing our carbon footprint and promoting environmentally friendly practices.

### Join the Google Team

If you share our passion for innovation and collaboration, we invite you to join our team. With opportunities in various fields, including engineering, product management, and sales, there's something for everyone at Google.

### Stay Connected

Want to stay up-to-date on the latest news and developments from Google? Follow us on social media or visit our official website to learn more about our products, services, and initiatives.

*   [Twitter](https://twitter.com/google)
*   [Facebook](https://www.facebook.com/google)
*   [YouTube](https://www.youtube.com/google)

We look forward to working with you.

# Note
* Note that we have used LLAMA 3.2 trained on 3.2 Billion parameters.
* So it may not be as powerful as GPT-4 model by open AI which is trained on 1.8 trillion parameters
* When we use open AI api based model it gives results in different format with proper markdowns and headings. 