## Install Packages

In [1]:
!pip install -qU langchain langchain-openai langchain-community beautifulsoup4 faiss-cpu

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.6/70.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h

## Config. Environment

In [2]:
import os
import requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from google.colab import userdata

try:
    os.environ["AZURE_OPENAI_API_KEY"] = userdata.get('AZURE_OPENAI_API_KEY')
    os.environ["AZURE_OPENAI_ENDPOINT"] = userdata.get('AZURE_OPENAI_ENDPOINT')
    os.environ["AZURE_OPENAI_API_VERSION"] = userdata.get('OPENAI_API_VERSION')
    os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"] = userdata.get('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME')
    os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] = userdata.get('AZURE_OPENAI_CHAT_DEPLOYMENT_NAME')
    print("Azure credentials loaded successfully from Colab Secrets.")
except Exception as e:
    print(f"Could not load secrets. Please ensure you have added all required keys to the Colab Secrets manager. Error: {e}")

Azure credentials loaded successfully from Colab Secrets.


## Data Scraping

In [3]:
# URLs for the release notes
URLS = {
    "simplidots": "https://fitur-sap.simplidots.id/",
    "langflow": "https://api.github.com/repos/langflow-ai/langflow/releases",
    "anthropic": "https://docs.anthropic.com/en/release-notes/api",
    "chatgpt": "https://help.openai.com/en/articles/6825453-chatgpt-release-notes"
}

In [12]:
!apt-get update
!apt install -y chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,152 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:8 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,269 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,160 kB]
Get:13 http://archive.ubuntu.com/ubuntu jammy-backports InRe

### Headless Chrome Setup

In [13]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import requests
from bs4 import BeautifulSoup
from typing import List

# Langchain-like Document stub
class Document:
    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata

### Web Scraper

In [34]:
def scrape_web_content(url, content_selector):
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        content_div = soup.select_one(content_selector)
        if content_div:
            return content_div.get_text(separator='\n', strip=True)
        print(f"Content selector '{content_selector}' not found on {url}")
        return None
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

### GitHub Scraper

In [35]:
def scrape_github_releases(api_url):
    all_release_notes = ""
    try:
        response = requests.get(f"{api_url}?per_page=15", timeout=15)
        response.raise_for_status()
        releases = response.json()
        for release in releases:
            all_release_notes += f"## {release.get('name', 'Untitled Release')}\n\n{release.get('body', 'No description.')}\n\n---\n\n"
        return all_release_notes
    except requests.RequestException as e:
        print(f"Error fetching GitHub releases from {api_url}: {e}")
        return None

### SimpliDots Selenium + Link Crawler Scraper

In [36]:
from urllib.parse import urljoin, urlparse

def scrape_simplidots_with_selenium(base_url, max_depth=3):
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    visited = set()
    result_texts = []

    def crawl(url, depth):
        if depth > max_depth or url in visited:
            return
        visited.add(url)
        try:
            driver.get(url)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            try:
                content = driver.find_element(By.TAG_NAME, "main").text.strip()
            except:
                content = driver.find_element(By.TAG_NAME, "body").text.strip()
            result_texts.append(f"URL: {url}\n\n{content}")

            # find internal links to crawl deeper
            anchors = driver.find_elements(By.TAG_NAME, "a")
            for a in anchors:
                href = a.get_attribute("href")
                if href and href.startswith(base_url):
                    crawl(href, depth + 1)
        except Exception as e:
            print(f"Failed to crawl {url}: {e}")

    try:
        crawl(base_url, 1)
    finally:
        driver.quit()

    return "\n\n---\n\n".join(result_texts)

### Anthropic Selenium Scraper

In [37]:
import time

def scrape_anthropic_with_selenium(url):
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)
        driver.implicitly_wait(10)
        time.sleep(15)  # wait for full load

        body = driver.find_element(By.TAG_NAME, "body")
        return body.text
    except Exception as e:
        print(f"Error using Selenium for Anthropic release notes: {e}")
        return None
    finally:
        driver.quit()

### ChatGPT Selenium Scraper

In [38]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrape_chatgpt_with_selenium(url):
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.TAG_NAME, "h2"))
        )

        content_div = driver.find_element(By.CSS_SELECTOR, "div.article-body")

        h2_elements = content_div.find_elements(By.TAG_NAME, "h2")
        all_notes = ""
        for h2 in h2_elements:
            section_title = h2.text.strip()
            all_notes += f"\n## {section_title}\n"
            siblings = []
            next_element = h2
            while True:
                try:
                    next_element = next_element.find_element(By.XPATH, 'following-sibling::*[1]')
                    if next_element.tag_name == "h2":
                        break
                    siblings.append(next_element)
                except:
                    break
            for sib in siblings:
                try:
                    if sib.tag_name in ["p", "ul", "ol"]:
                        all_notes += sib.text.strip() + "\n"
                except:
                    continue
        return all_notes.strip()
    except Exception as e:
        print(f"Error using Selenium for ChatGPT release notes: {e}")
        return None
    finally:
        driver.quit()

### Execute Scraping

In [39]:
print("Starting data scraping...")
all_documents: List[Document] = []

# SimpliDOTS via Selenium
simplidots_text = scrape_simplidots_with_selenium(URLS["simplidots"], max_depth=3)
if simplidots_text:
    all_documents.append(Document(page_content=simplidots_text, metadata={"source": URLS["simplidots"]}))
    print(f"Scraped SimpliDOTS: {len(simplidots_text)} characters")

# Langflow
langflow_text = scrape_github_releases(URLS["langflow"])
if langflow_text:
    all_documents.append(Document(page_content=langflow_text, metadata={"source": "https://github.com/langflow-ai/langflow/releases"}))
    print(f"Scraped Langflow: {len(langflow_text)} characters")

# Anthropic via Selenium
anthropic_text = scrape_anthropic_with_selenium(URLS["anthropic"])
if anthropic_text:
    all_documents.append(Document(page_content=anthropic_text, metadata={"source": URLS["anthropic"]}))
    print(f"Scraped Anthropic: {len(anthropic_text)} characters")

# ChatGPT via Selenium
chatgpt_text = scrape_chatgpt_with_selenium(URLS["chatgpt"])
if chatgpt_text:
    all_documents.append(Document(page_content=chatgpt_text, metadata={"source": URLS["chatgpt"]}))
    print(f"Scraped ChatGPT: {len(chatgpt_text)} characters")

Starting data scraping...
Failed to crawl https://fitur-sap.simplidots.id/smh/fitur-pada-smh-sales-management-hub/2025: Message: stale element reference: stale element not found
  (Session info: chrome=138.0.7204.168); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#staleelementreferenceexception
Stacktrace:
#0 0x5c462cbd487a <unknown>
#1 0x5c462c6792e0 <unknown>
#2 0x5c462c68c95b <unknown>
#3 0x5c462c68b712 <unknown>
#4 0x5c462c680839 <unknown>
#5 0x5c462c680965 <unknown>
#6 0x5c462c67ea7f <unknown>
#7 0x5c462c682a6a <unknown>
#8 0x5c462c71775e <unknown>
#9 0x5c462c6f08b2 <unknown>
#10 0x5c462c71671c <unknown>
#11 0x5c462c6f0683 <unknown>
#12 0x5c462c6bcb5b <unknown>
#13 0x5c462c6bdf31 <unknown>
#14 0x5c462cb997cb <unknown>
#15 0x5c462cb9d5d4 <unknown>
#16 0x5c462cb802c9 <unknown>
#17 0x5c462cb9e178 <unknown>
#18 0x5c462cb646bf <unknown>
#19 0x5c462cbc1e78 <unknown>
#20 0x5c462cbc2056 <unknown>
#21 0x5c462cbd3b96 <

In [40]:
if simplidots_text:
    words = simplidots_text.split()
    display(" ".join(words[:100]))
else:
    display("simplidots text was not scraped successfully.")

'URL: https://fitur-sap.simplidots.id/ Copy SMH Fitur pada SMH (Sales Management Hub) Temukan penjelasan mengenai fitur-fitur terbaru di 2024 2023 2025 Next 2025 Last updated 2 months ago Was this helpful? --- URL: https://fitur-sap.simplidots.id/smh/fitur-pada-smh-sales-management-hub/2025 Copy SMH FITUR PADA SMH (SALES MANAGEMENT HUB) 2025 🔜 Live Mode Kini Dilengkapi Opsi Reset atau Tidak Reset Data - 31 Juli 2025 🔥 [Beta] - Integrasi Sales Invoice SimpliDOTS x Accurate Online - [17 Juli 2025] 🔥 Perbaikan Pemilihan Gudang pada Buat Sales Invoice - [11 Juli 2025] 🔥 Penambahan Fitur Collection - [03 July 2025] New Feature: Tanya AI 🚀 Penambahan Fitur Customer Limit -'

In [33]:
if langflow_text:
    words = langflow_text.split()
    display(" ".join(words[:100]))
else:
    display("Langflow text was not scraped successfully.")

"## 1.5.0.post1 <!-- Release notes generated using configuration in .github/release.yml at 1.5.0.post1 --> ## What's Changed ### ✨ New Features * feat: Add dynamic theming support to WatsonxAI icon by @Cristhianzl in https://github.com/langflow-ai/langflow/pull/8935 * feat: jigsawstack bundle integration by @Khurdhula-Harshavardhan in https://github.com/langflow-ai/langflow/pull/8832 * feat: enhance DataFrame Operations component with contains filter and modern UI by @rodrigosnader in https://github.com/langflow-ai/langflow/pull/8838 * feat: add DataFrame output to Structured Output component by @rodrigosnader in https://github.com/langflow-ai/langflow/pull/8842 ### 🐛 Bug Fixes * fix: Improve modal layout responsiveness and overflow handling by @Cristhianzl in https://github.com/langflow-ai/langflow/pull/8936 * fix: Improve flow export error handling and validation by @Cristhianzl in"

In [31]:
if anthropic_text:
    words = anthropic_text.split()
    display(" ".join(words[:100]))
else:
    display("Anthropic text was not scraped successfully.")

'Anthropic home page English Search... Navigation Release Notes API RELEASE NOTES API Copy page Follow along with updates across Anthropic’s API and Developer Console. July 28, 2025 We’ve released text_editor_20250728, an updated text editor tool that fixes some issues from the previous versions and adds an optional max_characters parameter that allows you to control the truncation length when viewing large files. July 24, 2025 We’ve increased rate limits for Claude Opus 4 on the Anthropic API to give you more capacity to build and scale with Claude. For customers with usage tier 1-4 rate limits, these changes apply immediately to'

## Data Ingestion

### Chunking Process

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
    length_function=len,
)

print("Chunking documents...")
chunked_docs = text_splitter.split_documents(all_documents)
print(f"Documents chunked successfully. Total chunks: {len(chunked_docs)}")

Chunking documents...
Documents chunked successfully. Total chunks: 212


### Data Embedding

In [28]:
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import FAISS
import math
import time

print("Initializing Azure OpenAI Embeddings model...")
azure_embeddings = AzureOpenAIEmbeddings(
    azure_deployment=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)
print("Embedding model initialized.")

# Batch processing and embedding
batch_size = 1000
total_chunks = len(chunked_docs)
vector_store = None # Initialize vector_store to None

if total_chunks > 0:
    num_batches = math.ceil(total_chunks / batch_size)
    print(f"\nStarting embedding process in {num_batches} batches of size {batch_size}...")

    for i in range(0, total_chunks, batch_size):
        batch_number = (i // batch_size) + 1
        start_time = time.time()

        # Get the current batch of documents
        batch_docs = chunked_docs[i:i + batch_size]
        print(f"  - Processing Batch {batch_number}/{num_batches} ({len(batch_docs)} chunks)...")

        if vector_store is None:
            # For the first batch, create the FAISS index
            vector_store = FAISS.from_documents(batch_docs, azure_embeddings)
            print("    - Initial FAISS index created.")
        else:
            # For subsequent batches, add them to the existing index
            vector_store.add_documents(batch_docs)
            print("    - Batch added to existing FAISS index.")

        end_time = time.time()
        print(f"  - Batch {batch_number} finished in {end_time - start_time:.2f} seconds.")

    print("\nAll batches have been processed and embedded.")

    # Save the completed vector store
    vector_store.save_local("faiss_index_release_notes")
    print("Vector store saved to Colab's local directory: 'faiss_index_release_notes'")
else:
    print("No documents were chunked. Skipping embedding process.")

Initializing Azure OpenAI Embeddings model...
Embedding model initialized.

Starting embedding process in 1 batches of size 1000...
  - Processing Batch 1/1 (212 chunks)...
    - Initial FAISS index created.
  - Batch 1 finished in 4.77 seconds.

All batches have been processed and embedded.
Vector store saved to Colab's local directory: 'faiss_index_release_notes'


## Initialize RAG System

### Initialize LLM

In [None]:
from langchain_openai import AzureChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Explicitly get and set the variables
azure_endpoint = userdata.get('AZURE_OPENAI_ENDPOINT')
azure_deployment = userdata.get('AZURE_OPENAI_CHAT_DEPLOYMENT_NAME')
api_key = userdata.get('AZURE_OPENAI_API_KEY')
api_version = userdata.get('OPENAI_API_VERSION')

llm = AzureChatOpenAI(
    azure_endpoint=azure_endpoint,
    azure_deployment=azure_deployment,
    api_key=api_key,
    api_version=api_version,
    model=f"azure/{userdata.get('AZURE_OPENAI_CHAT_DEPLOYMENT_NAME')}"
)

print("LLM initialized.")

# Create a retriever from the vector store
retriever = vector_store.as_retriever(search_kwargs={'k': 4})
print("Retriever created.")

### RAG Chain

In [None]:
prompt_template = """
You are an intelligent assistant for querying software release notes.
Use only the following retrieved context to answer the user's question.
If you don't know the answer from the context provided, just say "I don't have enough information from the release notes to answer that."
Do not make up information. Be concise.

Context:
{context}

Question:
{question}

Answer:
"""
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

def format_docs(docs):
    return "\n\n".join(f"Source: {doc.metadata.get('source', 'N/A')}\nContent: {doc.page_content}" for doc in docs)

# Create the RAG chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print("RAG chain created. Ready to answer questions.")

## LLM Testing

In [None]:
query = "What were some of the bug fixes in Langflow recently?"
print(f"Question: {query}")

answer = rag_chain.invoke(query)

print("\nAnswer:")
print(answer)