In [37]:
import requests
import google.generativeai as genai
from bs4 import BeautifulSoup
import requests.compat
import time
from selenium import webdriver
import html
from dotenv import load_dotenv
import os
from urllib.parse import urlparse, urljoin

In [38]:
load_dotenv()

api_key = os.getenv("GEMINI_API_KEY")
driver = webdriver.Chrome()
genai.configure(api_key=api_key)

In [39]:
def crawl_docs(base_url, allowed_prefix, visited=None, max_depth=None, depth=0, max_links=None):
    """
    Recursively crawls pages starting at base_url.

    For each page:
      1. Loads the page using a driver and gets the rendered HTML.
      2. Extracts text from h1, h2, h3, and p tags.
      3. Finds and processes links starting with allowed_prefix, limiting them to max_links.
      4. First recurses into the first valid link before processing the others.
    """
    if visited is None:
        visited = set()
    if base_url in visited or (max_depth is not None and depth > max_depth):
        return ""

    visited.add(base_url)
    print(f"Crawling: {base_url}")

    content_parts = []
    try:
        driver.get(base_url)
        time.sleep(0.5)
        current_html = driver.page_source
    except Exception as e:
        print(f"Error loading {base_url}: {e}")
        return ""

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Referer": "https://www.google.com/",
    }
    try:
        response = requests.get(base_url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch {base_url}")
            return ""
    except Exception as e:
        print(f"Error fetching {base_url}: {e}")
        return ""

    decoded_html = html.unescape(current_html)
    soup = BeautifulSoup(decoded_html, "html.parser")

    page_data = [tag.get_text(strip=True) for tag in soup.find_all(["h1", "h2", "h3", "p"])]
    content_parts.append("\n".join(page_data))

    valid_links = []
    for link in soup.find_all("a", href=True):
        href = urljoin(base_url, link["href"])
        href = urlparse(href)._replace(fragment="").geturl()
        if not href.startswith("http"):
            href = requests.compat.urljoin(base_url, href)
        if href.startswith(allowed_prefix) and href not in visited:
            valid_links.append(href)
            if max_links is not None and len(valid_links) >= max_links:
                break

    if valid_links:
        content_parts.append(crawl_docs(valid_links[0], allowed_prefix, visited, max_depth, depth + 1, max_links))
        for link in valid_links[1:]:
            content_parts.append(crawl_docs(link, allowed_prefix, visited, max_depth, depth + 1, max_links))

    return "\n".join(content_parts)


In [16]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Referer": "https://www.google.com/",
}
def fetch_html(url):
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        return [tag.get_text(strip=True) for tag in soup.find_all(["h1", "h2", "h3", "p"])]
    else:
        return f"Failed to fetch data. Status Code: {response.status_code}"
    
url = "https://segment.com/docs/getting-started/"

In [41]:
segment_base = "https://segment.com/docs/"
mparticle_base = "https://docs.mparticle.com/guides/"
lytics_base = "https://docs.lytics.com/docs/"
zeotap_base = "https://docs.zeotap.com/home/en-us/"
segment_docs = crawl_docs(segment_base, segment_base, max_depth=None, max_links=None)
# mparticle_docs = crawl_docs(mparticle_base, mparticle_base, max_depth=2, max_links=5)
# lytics_docs = crawl_docs("https://docs.lytics.com/docs/developer-quickstart", lytics_base, max_depth=None, max_links=None)
# zeotap_docs = crawl_docs(zeotap_base, "https://docs.zeotap.com/", max_depth=None, max_links=None)
print("Crawling done")

Crawling: https://segment.com/docs/
Crawling: https://segment.com/docs/getting-started/
Crawling: https://segment.com/docs/getting-started/01-what-is-segment/
Crawling: https://segment.com/docs/getting-started/implementation-guide/
Crawling: https://segment.com/docs/getting-started/02-simple-install/
Crawling: https://segment.com/docs/getting-started/03-planning-full-install/
Crawling: https://segment.com/docs/getting-started/04-full-install/
Crawling: https://segment.com/docs/getting-started/05-data-to-destinations/
Crawling: https://segment.com/docs/getting-started/06-testing-debugging/
Crawling: https://segment.com/docs/getting-started/whats-next/
Crawling: https://segment.com/docs/getting-started/use-cases//
Crawling: https://segment.com/docs/getting-started/use-cases/
Crawling: https://segment.com/docs/getting-started/use-cases/guide//
Crawling: https://segment.com/docs/getting-started/use-cases/guide/
Crawling: https://segment.com/docs/getting-started/use-cases/setup//
Crawling: 

In [42]:
def save_data_to_file(data, filename):
    if data:
        with open(filename, "w", encoding="utf-8") as file:
            file.write(data)
            print(f"Data saved to {filename}")
    else:
        print(f"No data to save for {filename}")

save_data_to_file(segment_docs, "segment_data.txt")
# save_data_to_file(mparticle_docs, "mparticle_data.txt")
# save_data_to_file(lytics_docs, "lytics_data.txt")
# save_data_to_file(zeotap_docs, "zeotap_data.txt")

Data saved to segment_data.txt


In [6]:

scraped_data = sdata 

if scraped_data:
    formatted_data = "\n".join(scraped_data)
else:
    print("Failed to scrape data.")


In [9]:
def ask_chatbot(question):
    model = genai.GenerativeModel(model_name='gemini-1.5-flash')

    prompt = f"""
    You are a support chatbot that strictly answers based on the provided documentation.
    If the question is unrelated or the answer is not in the data, respond with "I can only provide information from the Segment documentation."

    Documentation:
    {formatted_data}

    Question: {question}
    """

    response = model.generate_content(prompt)
    return response.text.strip()

In [None]:
# Example query
question = "How audiences are forwarded? Info of its patterns?"
answer = ask_chatbot(question)
print(answer)