# Extracting the links from a website

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse


HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

# File extensions that point at downloads/media rather than HTML pages; a link
# whose path ends with one of these is never returned by WebScraper.
EXCLUDED_EXTENSIONS = {".pdf", ".jpg", ".jpeg", ".png", ".gif", ".svg", ".webp", ".zip", ".exe", ".mp4", ".mp3"}


class WebScraper:
    """
    A utility class to scrape a single web page and extract its internal links.

    The page is fetched as soon as the object is constructed; the cleaned
    result is available afterwards as ``self.links`` (or via ``get_links()``).
    A failed fetch is reported with ``print`` and leaves ``self.links`` empty.
    """

    def __init__(self, url, timeout=10, verify=True):
        """
        Args:
            url: Address of the page to scrape.
            timeout: Seconds to wait for the server before giving up.
            verify: Whether to verify the server's TLS certificate. Pass
                ``False`` only for hosts known to serve a broken certificate.
        """
        self.url = url
        self.base_domain = urlparse(url).netloc  # Extract base domain (e.g., www.iiitnr.ac.in)
        self.timeout = timeout
        self.verify = verify
        self.links = []
        self.scrape_website()

    def scrape_website(self):
        """
        Fetches the webpage, parses it, and stores its internal links in
        ``self.links``. Request errors are printed, not raised.
        """
        try:
            response = requests.get(
                self.url,
                headers=HEADERS,
                timeout=self.timeout,
                verify=self.verify,
            )
            response.raise_for_status()  # Raises an error for HTTP issues
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {self.url}: {e}")
            return

        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract all <a> tags with href attribute
        raw_links = [anchor.get('href') for anchor in soup.find_all('a', href=True)]
        self.links = self.clean_links(raw_links)

    def clean_links(self, links):
        """
        Cleans and filters extracted links:
        - Converts relative links (``/about``, ``./team``, ``page.html``) to absolute
        - Removes fragments (#...) so ``page`` and ``page#x`` count as one link
        - Skips empty links, fragment-only links and links back to the scraped page
        - Skips non-HTTP(S) links (mailto, tel, javascript, ...)
        - Filters out links that don't belong to the same base domain
        - Excludes links ending with undesired extensions

        Args:
            links: Iterable of raw ``href`` values.

        Returns:
            A list of unique absolute URLs in order of first appearance.
        """
        excluded_suffixes = tuple(EXCLUDED_EXTENSIONS)
        base_domain = self.base_domain.lower()
        # Used to recognise links that merely point back at the scraped page.
        page_url = self.url.split("#", 1)[0].rstrip("/")

        valid_links = []
        seen = set()
        for link in links:
            link = link.strip() if link else ""
            if not link or link.startswith("#"):
                continue

            try:
                # Convert relative URL to absolute, then split into components
                parsed_url = urlparse(urljoin(self.url, link))
            except ValueError:
                # Malformed href (e.g. unbalanced IPv6 brackets): skip this
                # one link instead of aborting the whole scrape.
                continue

            if parsed_url.scheme not in ("http", "https"):
                continue  # Skip mailto, tel, javascript links

            # Host names are case-insensitive, so compare them lowercased.
            if parsed_url.netloc.lower() != base_domain:
                continue

            if parsed_url.path.lower().endswith(excluded_suffixes):
                continue

            absolute_url = parsed_url._replace(fragment="").geturl()
            if absolute_url.rstrip("/") == page_url or absolute_url in seen:
                continue

            seen.add(absolute_url)
            valid_links.append(absolute_url)

        return valid_links

    def get_links(self):
        """
        Returns the list of clean, valid, and internal links.
        """
        return self.links


# Selecting relevant links with the help of Llama 3.2

In [2]:
import json
import ollama
import re

In [3]:
def _extract_known_links(text, candidates):
    """Return the candidate links mentioned in ``text``, in order of first mention."""
    # Models often wrap URLs in markdown, quotes or sentence punctuation, and
    # may add or drop a trailing slash; compare on a form with those removed.
    trailing_noise = ".,;:!?)]}>\"'*`/"
    by_key = {}
    for candidate in candidates:
        by_key.setdefault(candidate.rstrip("/"), candidate)

    selected = []
    seen = set()
    for raw in re.findall(r'https?://\S+', text):
        match = by_key.get(raw.rstrip(trailing_noise))
        # Unknown URLs are dropped: the model must choose from the input,
        # not invent new addresses.
        if match is not None and match not in seen:
            seen.add(match)
            selected.append(match)
    return selected


def filter_links_with_llama(links):
    """
    Sends extracted links to LLaMA 3.2 (via Ollama) and asks it to select the most relevant ones.

    Args:
        links: List of URL strings to choose from.

    Returns:
        The subset of ``links`` named in the model's reply, without
        duplicates, in the order the model listed them. An empty input
        returns an empty list without calling the model.
    """
    if not links:
        return []

    prompt = f"""
    You are an AI assistant helping to create a company brochure. 
    Here is a list of web links:

    {json.dumps(links, indent=2)}

    Select only the most relevant links for the brochure, such as:
    - About Us
    - Vision & Mission
    - Leadership (Director, Officers, etc.)
    - Contact Information
    - Investor Relations (if available)
    - Careers (if available)
    - Services/Products

    Do NOT include links related to privacy policies, login pages, or irrelevant content.

    Return ONLY a **plain list of links** (one per line) with NO extra text.
    """

    # Query LLaMA 3.2
    response = ollama.chat(model="llama3.2", messages=[{"role": "user", "content": prompt}])

    # Subscript access matches how generate_brochure reads the reply.
    return _extract_known_links(response['message']['content'], links)

In [4]:
# Scrape the landing page, then let the model pick the brochure-worthy links.
url = "https://abc.xyz/"
scraper = WebScraper(url)
extracted_links = scraper.get_links()

filtered_links = filter_links_with_llama(extracted_links)

# Show which links were selected
print("Filtered Brochure Links:", filtered_links)



Filtered Brochure Links: ['http://abc.xyz/investor']


# Fetching content from selected links

In [5]:
def fetch_page_content(url, max_chars=50000):
    """
    Download ``url`` and return the visible text of its ``<body>``.

    Script, style, image and input elements are removed before the text is
    extracted, and the result is cut to at most ``max_chars`` characters.
    Returns an empty string when the page has no body or the request fails
    (the request error is printed).
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
    }
    try:
        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()
        body = BeautifulSoup(page.content, 'html.parser').body

        if not body:
            return ""

        # Drop elements that contribute no readable text.
        for unwanted in body.find_all(["script", "style", "img", "input"]):
            unwanted.decompose()

        visible_text = body.get_text(separator="\n", strip=True)
        return visible_text[:max_chars]
    except requests.RequestException as err:
        print(f"Error fetching {url}: {err}")
        return ""


# Generating Brochure

In [6]:
def generate_brochure(company_name, links):
    """
    Build a brochure for ``company_name`` from the text of the given pages.

    Each link is fetched with ``fetch_page_content``; pages that yield no text
    are left out. The collected text is capped at 500,000 characters, embedded
    in a prompt, and sent to the ``llama3.2`` model through Ollama.

    Returns the model's reply text.
    """
    sections = []
    for page_url in links:
        print(f"Fetching content from: {page_url}")
        body_text = fetch_page_content(page_url)
        if not body_text:
            continue
        sections.append(f"### Source: {page_url}\n{body_text}\n")

    # Join the per-page sections and cap the overall prompt payload.
    combined_text = "\n".join(sections)
    combined_text = combined_text[:500000]

    prompt = f"""
    You are an AI assistant tasked with creating a professional brochure for "{company_name}".
    Below are extracts from various web pages related to the company.

    Please generate a well-structured, engaging brochure suitable for customers, investors, and stakeholders.
    Ensure it includes:
    - A brief company introduction
    - Key offerings/services
    - Achievements or unique points
    - Contact details (if available)
    - Any other relevant sections

    Here is the extracted content:
    {combined_text}
    """

    chat_messages = [{"role": "user", "content": prompt}]
    reply = ollama.chat(model="llama3.2", messages=chat_messages)

    return reply['message']['content']


In [10]:
# Build the brochure text from the links chosen by the model.
company_name = "Google"
links = filtered_links
brochure_text = generate_brochure(company_name, links)

Fetching content from: http://abc.xyz/investor


# BROCHURE

In [8]:
from IPython.display import display, Markdown

In [9]:
# Render the generated brochure as Markdown in the notebook output.
display(Markdown(brochure_text))

Here is a professionally designed brochure for Google:

[Cover Page: A stylized logo of Google]

**Unlocking Innovation and Connection**

Welcome to Google, the world's most innovative technology company. Our mission is to organize the world's information and make it universally accessible and useful.

**Our Story**

Google was founded in 1998 by Larry Page and Sergey Brin while they were Ph.D. students at Stanford University. Our early success was fueled by our search engine, which quickly became the go-to platform for finding answers online. Today, we're a global leader in technology, with a diverse range of products and services that connect people, businesses, and ideas.

**Our Offerings**

We offer a wide range of products and services that aim to make the world a better place:

* **Search**: Our search engine is still the most popular platform for finding answers online.
* **Advertising**: We provide effective advertising solutions for businesses of all sizes.
* **Cloud Computing**: Our Google Cloud Platform offers a comprehensive suite of cloud-based services for businesses and developers.
* **Artificial Intelligence**: Our AI-powered products and services are transforming industries and revolutionizing the way we live and work.
* **Google Play**: Our app store provides access to millions of apps, games, and entertainment content.

**Achievements**

We've achieved numerous milestones and awards throughout our history, including:

* **Google's Market Value**: Over $1 trillion
* **Number of Employees**: Over 150,000 worldwide
* **Awards**: We've won over 200 patents and have been recognized as one of the world's most innovative companies by Forbes.

**Unique Points**

We're committed to making a positive impact on society. Here are some unique points about Google:

* **Sustainable Energy**: We aim to power 100% of our data centers with renewable energy.
* **Education**: We provide free online courses and resources for people around the world.
* **Accessibility**: We strive to make our products and services accessible to everyone, regardless of ability or disability.

**Stay Connected**

Want to stay up-to-date on the latest Google news and announcements? Subscribe to our email alerts and follow us on social media:

* **Email Alerts**: [Subscribe]
* **Social Media**: Follow us on Twitter, Facebook, Instagram, and LinkedIn.

[Back Cover: A stylized logo of Google]

**About Us**

Google is a subsidiary of Alphabet Inc., a holding company that owns multiple companies, including Google, YouTube, and Waymo. We're headquartered in Mountain View, California, with offices all around the world.

**Contact Us**

If you have any questions or feedback about Google, please don't hesitate to reach out:

* **Phone**: +1 (650) 253-0000
* **Email**: [info@google.com](mailto:info@google.com)
* **Address**: 1600 Amphitheatre Parkway, Mountain View, CA 94043

# Note
* Note that we used the Llama 3.2 model, which has about 3.2 billion parameters.
* So it may not be as powerful as OpenAI's GPT-4, which is reported (though not officially confirmed) to have about 1.8 trillion parameters.
* When we use an OpenAI API-based model, it returns results in a different format, with proper Markdown and headings.