In [2]:
import os
import time
from urllib.parse import urldefrag, urljoin

import requests
import requests.compat
from bs4 import BeautifulSoup
from google import genai

In [4]:
# Configure Gemini API.
# The key is read from the environment so it is never stored in the notebook.
# Any key that was previously committed here must be treated as leaked and revoked.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError(
        "Set the GEMINI_API_KEY environment variable before running this notebook."
    )
client = genai.Client(api_key=GEMINI_API_KEY)

In [9]:
def crawl_docs(base_url, allowed_prefix, visited=None, max_depth=2, depth=0, max_links=5,
               headers=None, timeout=10):
    """
    Recursively crawl pages starting at base_url and return their extracted text.

    For each page:
      1. Fetches the page and appends the text of its h1/h2/h3/p tags to the result,
         one tag per line.
      2. Collects the links whose resolved URL (fragment removed) starts with
         allowed_prefix and has not been visited, without duplicates.
      3. Keeps at most max_links of those links.
      4. Recurses into them depth-first, in document order.

    Args:
        base_url: URL of the page to crawl.
        allowed_prefix: Only links starting with this prefix are followed.
        visited: Set of URLs already requested; shared and mutated across the recursion.
        max_depth: Deepest recursion level that is still fetched.
        depth: Recursion level of this call (0 for the start page).
        max_links: Maximum number of links followed from a single page.
        headers: Optional HTTP headers passed to every request.
        timeout: Per-request timeout in seconds.

    Returns:
        The concatenated text of every page crawled from this call. Returns "" when the
        page is skipped (too deep, already visited) or cannot be fetched.
    """
    if visited is None:
        visited = set()
    if depth > max_depth:
        return ""

    # "page#section" and "page" are the same document; crawl it only once.
    base_url = urldefrag(base_url).url
    if base_url in visited:
        return ""
    # Mark before requesting so a failing URL is not retried from every page that links to it.
    visited.add(base_url)

    content = ""
    try:
        print(f"Crawling: {base_url}")
        response = requests.get(base_url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch {base_url}")
            return ""

        # Parse the raw bytes so BeautifulSoup detects the document encoding itself;
        # response.text relies on the charset guessed from the HTTP headers.
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract relevant content
        page_data = [tag.get_text(strip=True) for tag in soup.find_all(["h1", "h2", "h3", "p"])]
        content += "\n".join(page_data) + "\n"

        # Children of a page at max_depth would be rejected by the depth guard anyway.
        if depth >= max_depth:
            return content

        valid_links = []
        for link in soup.find_all("a", href=True):
            # urljoin resolves relative hrefs and leaves absolute URLs untouched.
            href = urldefrag(urljoin(base_url, link["href"])).url
            if (
                href.startswith(allowed_prefix)
                and href not in visited
                and href not in valid_links
            ):
                valid_links.append(href)

        valid_links = valid_links[:max_links]

        first_request = True
        for link in valid_links:
            # An earlier sibling's subtree may already have crawled this link.
            if link in visited:
                continue
            if not first_request:
                time.sleep(0.5)  # be polite to the server
            first_request = False
            content += crawl_docs(
                link, allowed_prefix, visited, max_depth, depth + 1, max_links, headers, timeout
            )
    except Exception as e:
        # Best-effort crawl: report the failure and keep whatever was collected so far.
        print(f"Error crawling {base_url}: {e}")
    return content

In [None]:
# Browser-like request headers sent with every fetch_html request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Referer": "https://www.google.com/",
}


def fetch_html(url, timeout=10):
    """Fetch a single page and return the text of its h1/h2/h3/p tags.

    Args:
        url: Page to download.
        timeout: Request timeout in seconds.

    Returns:
        A list with one stripped string per matching tag. An empty list is returned
        when the request fails or the status is not 200, so callers always get a list
        and can test it for truthiness.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch data from {url}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch data. Status Code: {response.status_code}")
        return []

    # Parse the raw bytes so BeautifulSoup detects the document encoding itself.
    soup = BeautifulSoup(response.content, "html.parser")
    return [tag.get_text(strip=True) for tag in soup.find_all(["h1", "h2", "h3", "p"])]


url = "https://segment.com/docs/getting-started/"

In [10]:
# Each *_base is the URL prefix a link must start with to be followed by the crawler.
segment_base = "https://segment.com/docs/"
segment_docs = crawl_docs(
    base_url=segment_base, allowed_prefix=segment_base, max_depth=2, max_links=5
)

mparticle_base = "https://docs.mparticle.com/guides/"
mparticle_docs = crawl_docs(
    base_url=mparticle_base, allowed_prefix=mparticle_base, max_depth=2, max_links=5
)

# The Lytics crawl starts at the developer quickstart page instead of the docs root.
lytics_base = "https://docs.lytics.com/docs/"
lytics_docs = crawl_docs(
    base_url="https://docs.lytics.com/docs/developer-quickstart",
    allowed_prefix=lytics_base,
    max_depth=2,
    max_links=5,
)

# The Zeotap crawl uses a shallower depth and fewer links per page.
zeotap_base = "https://docs.zeotap.com/home/en-us/"
zeotap_docs = crawl_docs(
    base_url=zeotap_base, allowed_prefix=zeotap_base, max_depth=1, max_links=2
)
print("Crawling done")

Crawling: https://segment.com/docs/
Failed to fetch https://segment.com/docs/
Crawling: https://docs.mparticle.com/guides/
Crawling: https://docs.mparticle.com/guides/platform-guide/
Crawling: https://docs.mparticle.com/guides/glossary/
Crawling: https://docs.mparticle.com/guides/getting-started/create-an-input/
Crawling: https://docs.mparticle.com/guides/getting-started/start-capturing-data/
Crawling: https://docs.mparticle.com/guides/getting-started/connect-an-event-output/
Crawling: https://docs.mparticle.com/guides/getting-started/create-an-audience/
Crawling: https://docs.mparticle.com/guides/platform-guide/activity/
Crawling: https://docs.mparticle.com/guides/getting-started/connect-an-audience-output/
Crawling: https://docs.mparticle.com/guides/getting-started/transform/
Crawling: https://docs.mparticle.com/guides/personalization/introduction/
Crawling: https://docs.mparticle.com/guides/personalization/profiles/
Crawling: https://docs.mparticle.com/guides/personalization/audienc

In [5]:
# soup = BeautifulSoup(mparticle_docs, "html.parser")
# sdata = [tag.get_text(strip=True) for tag in soup.find_all(["h1", "h2", "h3", "p"])]

In [11]:
def save_data_to_file(data, filename):
    """Write `data` to `filename` as UTF-8 text, replacing any existing file.

    When `data` is empty (falsy) nothing is written and a notice is printed instead.
    """
    if not data:
        print(f"No data to save for {filename}")
        return

    with open(filename, "w", encoding="utf-8") as out:
        out.write(data)
        print(f"Data saved to {filename}")

# Save each website's data to its own text file; relative names resolve against
# the current working directory.
for site_docs, out_name in (
    (segment_docs, "segment_data.txt"),
    (mparticle_docs, "mparticle_data.txt"),
    (lytics_docs, "lytics_data.txt"),
    (zeotap_docs, "zeotap_data.txt"),
):
    save_data_to_file(site_docs, out_name)

No data to save for segment_data.txt
Data saved to mparticle_data.txt
Data saved to lytics_data.txt
Data saved to zeotap_data.txt


In [6]:

# Build the documentation text handed to the chatbot from the crawled mParticle docs.
# crawl_docs already returns extracted text (one heading/paragraph per line), so the
# lines are used directly instead of the never-assigned `sdata`.
scraped_data = [line for line in mparticle_docs.splitlines() if line.strip()]

if scraped_data:
    formatted_data = "\n".join(scraped_data)  # Convert list to string
else:
    # Keep formatted_data defined so later cells do not fail with a NameError.
    formatted_data = ""
    print("Failed to scrape data.")


In [9]:
# Function to ask Gemini with strict limitations
def ask_chatbot(question, documentation=None):
    """Answer `question` with Gemini, restricted to the supplied documentation.

    Args:
        question: The user's question, inserted verbatim into the prompt.
        documentation: Documentation text the answer must be based on. When omitted,
            the notebook-level `formatted_data` string is used.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty string
        when the response carries no text.
    """
    if documentation is None:
        documentation = formatted_data

    prompt = f"""
    You are a support chatbot that strictly answers based on the provided documentation.
    If the question is unrelated or the answer is not in the data, respond with "I can only provide information from the provided documentation."

    Documentation:
    {documentation}

    Question: {question}
    """

    # `from google import genai` is the google-genai SDK, which has no
    # `genai.GenerativeModel`; requests go through the notebook-level `client`.
    response = client.models.generate_content(model="gemini-2.0-flash", contents=prompt)
    # Guard against a response without text so .strip() cannot fail on None.
    return (response.text or "").strip()

In [11]:
# Example query: send one sample question to the chatbot and show its reply.
question = "How audiences are forwarded? Info of its patterns?"
answer = ask_chatbot(question=question)
print(answer)

In mParticle, an audience is a set of users who match a given set of criteria. When mParticle prepares to forward an audience, it is broken down into a series of messages about audience membership. Each message contains:

mParticle then translates these messages into a format that can be read by each audience output partner, and forwards them via HTTP API. Each output deals with audience information a little differently, depending on their data structure, but there are two main patterns.

Direct
Some audience output partners allow mParticle to either directly create an audience (some call them “lists”, or “segments”) via their API, or at least to manage the membership of an existing audience. The end result will be an “audience” in the partner system, containing as many identities from the original mParticle audience as the output can accept. mParticle will continue to update the membership of the audience in the partner system as users are added and removed. Email marke