## Install Packages

In [16]:
!pip install -qU langchain langchain-openai langchain-community beautifulsoup4 faiss-cpu selenium selenium-wire undetected-chromedriver blinker==1.4

## Library

In [17]:
import os
import re
import math
import time
import requests
from bs4 import BeautifulSoup
from dateutil.parser import parse as parse_date
from langchain.docstore.document import Document
from google.colab import userdata
from seleniumwire import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

## Config. Environment

In [18]:
import os
import requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from google.colab import userdata

try:
    os.environ["AZURE_OPENAI_API_KEY"] = userdata.get('AZURE_OPENAI_API_KEY')
    os.environ["AZURE_OPENAI_ENDPOINT"] = userdata.get('AZURE_OPENAI_ENDPOINT')
    os.environ["AZURE_OPENAI_API_VERSION"] = userdata.get('OPENAI_API_VERSION')
    os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"] = userdata.get('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME')
    os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] = userdata.get('AZURE_OPENAI_CHAT_DEPLOYMENT_NAME')
    print("Azure credentials loaded successfully from Colab Secrets.")
except Exception as e:
    print(f"Could not load secrets. Please ensure you have added all required keys to the Colab Secrets manager. Error: {e}")

Azure credentials loaded successfully from Colab Secrets.


## Data Scraping

In [19]:
# URLs for the release notes
URLS = {
    "simplidots": "https://fitur-sap.simplidots.id/",
    "langflow": "https://api.github.com/repos/langflow-ai/langflow/releases",
    "anthropic": "https://docs.anthropic.com/en/release-notes/api"
}

In [20]:
!pip install -q selenium undetected-chromedriver
!sudo apt-get update -y
!sudo apt-get install -y chromium-chromedriver
!sudo cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium

0% [Working]            Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (3.171.85.15)] [Conn                                                                               Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
                                                                               Hit:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
                                                                               Hit:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
0% [Connected to cloud.r-project.org (3.171.85.15)] [Connected to r2u.stat.illi                                                                               Hit:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://p

### Headless Chrome Setup

In [21]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import requests
from bs4 import BeautifulSoup
from typing import List

# Langchain-like Document stub
class Document:
    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata

### Web Scraper

### GitHub Scraper

In [22]:
def scrape_github_releases(api_url):
    """
    Scrapes GitHub releases and returns a list of Document objects,
    each with its content and release date in the metadata.
    """
    documents = []
    try:
        response = requests.get(f"{api_url}?per_page=15", timeout=15)
        response.raise_for_status()
        releases = response.json()
        for release in releases:
            content = f"## {release.get('name', 'Untitled Release')}\n\n{release.get('body', 'No description.')}"

            # Get the release date directly from the API response
            release_date = release.get('published_at', '')

            # Create a Document for each release
            doc = Document(
                page_content=content,
                metadata={
                    "source": "https://github.com/langflow-ai/langflow/releases",
                    "release_date": release_date.split('T')[0] if release_date else 'unknown' # Format as YYYY-MM-DD
                }
            )
            documents.append(doc)
        return documents
    except requests.RequestException as e:
        print(f"Error fetching GitHub releases from {api_url}: {e}")
        return []

### SimpliDots Selenium + Link Crawler Scraper

In [23]:
from urllib.parse import urljoin, urldefrag
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException, NoSuchElementException
from dateutil.parser import parse as parse_date
import re

def scrape_simplidots_with_selenium(base_url, max_depth=2):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    # Gather all unique links robustly ---
    links_to_crawl = {base_url}
    crawled_links = set()

    for depth in range(max_depth):
        current_links = list(links_to_crawl - crawled_links)
        if not current_links:
            break
        for link in current_links:
            print(f"Finding links on (depth {depth+1}): {link}")
            crawled_links.add(link)
            try:
                driver.get(link)
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//a[@href]")))
                # Re-find elements within a loop to handle stale references
                for _ in range(3): # Retry mechanism
                    try:
                        hrefs = [a.get_attribute("href") for a in driver.find_elements(By.XPATH, "//a[@href]")]
                        for href in hrefs:
                            if href:
                                full_url = urldefrag(urljoin(link, href))[0]
                                if full_url.startswith(base_url):
                                    links_to_crawl.add(full_url)
                        break # Success, exit retry loop
                    except StaleElementReferenceException:
                        time.sleep(0.5)
            except Exception as e:
                print(f"Could not process links on {link}: {e}")

    print(f"\nFound {len(links_to_crawl)} unique links. Starting content extraction...")

    # Extract content from each link ---
    all_documents = []
    month_map = { 'januari': 'january', 'februari': 'february', 'maret': 'march', 'april': 'april', 'mei': 'may', 'juni': 'june', 'juli': 'july', 'agustus': 'august', 'september': 'september', 'oktober': 'october', 'november': 'november', 'desember': 'december' }

    for url in links_to_crawl:
        try:
            driver.get(url)
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "main")))

            full_title = driver.title
            release_date = 'unknown'

            # More flexible date regex, handles optional brackets
            date_pattern = r"\[?\s*(\d{1,2}\s+[A-Za-z]+\s+\d{4})\s*\]?"
            match = re.search(date_pattern, full_title, re.IGNORECASE)

            if match:
                date_str = match.group(1).lower()
                for indo, eng in month_map.items(): date_str = date_str.replace(indo, eng)
                release_date = parse_date(date_str).strftime('%Y-%m-%d')

            title = re.sub(date_pattern, '', full_title, flags=re.IGNORECASE).split('|')[0].replace(' -', '').strip()

            index_titles = ["Fitur pada SMH (Sales Management Hub)", "Fitur pada Canvass", "Fitur pada RO (Route Optimization)", "Feature Updates Sales Automation Platform"]
            if title in index_titles or len(title) < 10: continue

            content = driver.find_element(By.TAG_NAME, "main").text.strip()

            doc = Document(page_content=content, metadata={"source": url, "title": title, "release_date": release_date})
            all_documents.append(doc)
            print(f"  -> Processed: {title} (Date: {release_date})")
        except Exception:
            continue

    driver.quit()
    return all_documents

### Anthropic Selenium Scraper

In [24]:

def scrape_anthropic_with_selenium(url):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "h4")))
        body = driver.find_element(By.TAG_NAME, "body")
        return body.text
    except Exception as e:
        print(f"Error using Selenium for Anthropic release notes: {e}")
        return None
    finally:
        driver.quit()

### Execute Scraping

In [25]:
print("Memulai data scraping...")
all_documents = []

simplidots_docs = scrape_simplidots_with_selenium(URLS["simplidots"])
if simplidots_docs:
    all_documents.extend(simplidots_docs)
    print(f"Berhasil scrape SimpliDOTS: {len(simplidots_docs)} dokumen")
else:
    print("Gagal scrape SimpliDOTS.")

Memulai data scraping...
Finding links on (depth 1): https://fitur-sap.simplidots.id/
Finding links on (depth 2): https://fitur-sap.simplidots.id/sfa/fitur-pada-sfa-sales-force-automation
Finding links on (depth 2): https://fitur-sap.simplidots.id/sfa/fitur-pada-sfa-sales-force-automation/pembaharuan-sfa-versi-3.1.1-31-oct-2024
Finding links on (depth 2): https://fitur-sap.simplidots.id/smh/fitur-pada-smh-sales-management-hub/2024/penambahan-fitur-brand-pada-produk-19-feb-2024
Finding links on (depth 2): https://fitur-sap.simplidots.id/smh/fitur-pada-smh-sales-management-hub/2025/perbaikan-pemilihan-gudang-pada-buat-sales-invoice-11-juli-2025
Finding links on (depth 2): https://fitur-sap.simplidots.id/smh/fitur-pada-smh-sales-management-hub/2025/update-mengenai-coretax-format-xml-ppn-11-phase-1
Finding links on (depth 2): https://fitur-sap.simplidots.id/smh/fitur-pada-smh-sales-management-hub/2023/adjustment-pada-daily-visit-map-geografis-kunjungan-22-sept-2023
Finding links on (depth 

In [26]:
langflow_docs = scrape_github_releases(URLS["langflow"])
if langflow_docs:
    all_documents.extend(langflow_docs)
    print(f"Berhasil scrape Langflow: {len(langflow_docs)} dokumen rilis")
else:
    print("Gagal scrape Langflow.")

Berhasil scrape Langflow: 15 dokumen rilis


In [27]:
anthropic_html = scrape_anthropic_with_selenium(URLS["anthropic"])
if anthropic_html:
    all_documents.append(Document(page_content=anthropic_html, metadata={"source": URLS["anthropic"]}))
    print(f"Scraped Anthropic: {len(anthropic_html)} characters of HTML")
else:
    print("Failed to scrape Anthropic.")

print(f"\nTotal dokumen yang berhasil di-scrape: {len(all_documents)}")

Scraped Anthropic: 12752 characters of HTML

Total dokumen yang berhasil di-scrape: 95


In [28]:
simplidots_docs_found = [doc for doc in all_documents if "simplidots" in doc.metadata.get("source", "")]

if simplidots_docs_found:
    print(f"Found {len(simplidots_docs_found)} SimpliDOTS documents.")
    # Display the content of the first few SimpliDOTS documents as an example
    for i, doc in enumerate(simplidots_docs_found[:3]): # Display content of first 3 docs
        print(f"\n--- SimpliDOTS Document {i+1} ---")
        print(f"Title: {doc.metadata.get('title', 'N/A')}")
        print(f"Date: {doc.metadata.get('release_date', 'N/A')}")
        words = doc.page_content.split()
        display(" ".join(words[:100])) # Display first 100 words of each doc
else:
    display("No SimpliDOTS documents were scraped successfully.")

Found 79 SimpliDOTS documents.

--- SimpliDOTS Document 1 ---
Title: Update Mengenai Coretax Format XML PPN 11% [Phase 1]
Date: unknown


"Copy SMH FITUR PADA SMH (SALES MANAGEMENT HUB) 2025 🚀 Update Mengenai Coretax Format XML PPN 11% - [Phase 1] What's new?! Dalam rangka mengikuti kebijakan pemerintah yang telah mewajibkan penggunaan Coretax, SimpliDOTS kini mendukung format file XML dan Excel untuk mempermudah pengguna dalam melakukan penyesuaian data pajak. Berikut detailnya: Dukungan XML dan Excel dengan Format PPN 11% Saat ini, sesuai dengan PER-1/PJ/2025, Pengusaha Kena Pajak (PKP) masih diperbolehkan menerbitkan faktur dengan PPN 11% hingga Maret 2025. Oleh karena itu, SimpliDOTS menyediakan file XML dan Excel dengan format 11% untuk membantu proses upload faktur, sembari kami mempersiapkan sistem untuk mendukung"


--- SimpliDOTS Document 2 ---
Title: Penambahan Fitur Brand pada Produk
Date: 2024-02-19


"Copy SMH FITUR PADA SMH (SALES MANAGEMENT HUB) 2024 🚀 Penambahan Fitur Brand pada Produk - [19 Feb 2024] Halo, Kawan Simpli! Ada kabar yang perlu kamu tahu, nih! Kini, terdapat penambahan fitur baru yaitu Brand agar dapat memudahkan proses manajamen dan kategorisasi master data produk yang lebih baik pada Sales Management Hub (SMH). What's New? Terdapat penambahan fitur Brand agar dapat mengisi data produk yang lebih lengkap dan terdefinisi dengan baik sehingga memudahkan proses manajemen dan kategorisasi produk. Menu Brand terdapat pada submenu Master Data> Produk> Brand. Fitur ini diharapkan dapat membuat user lebih mudah dalam mengelompokkan data produk"


--- SimpliDOTS Document 3 ---
Title: Update Mengenai Coretax Format XML PPN 12%- [Phase 2]
Date: unknown


"Copy SMH FITUR PADA SMH (SALES MANAGEMENT HUB) 2025 🚀 Update Mengenai Coretax Format XML PPN 12%- [Phase 2] Halo, Kawan Simpli! Ada kabar bahagia yang perlu kamu tahu, nih! Kini, terdapat penambahan dan pembaharuan fitur berkaitan dengan coretax agar dapat mempermudah user dalam menggunakan sistem SimpliDOTS. What's new? Pada release sebelumnya, SimpliDOTS telah mendukung export XML dan Excel dengan Format PPN 11% (baca lebih lanjut Phase 1 di sini). Pada release kali ini, SimpliDOTS sudah mendukung kebutuhan PPN 12% serta pembuatan file XML dan Excel dengan format 12% untuk membantu proses upload faktur pengeluaran. Yuk, baca lebih lanjut updatenya!"

In [29]:
langflow_docs_found = [doc for doc in all_documents if "github.com/langflow-ai/langflow" in doc.metadata.get("source", "")]

if langflow_docs_found:
    print(f"Found {len(langflow_docs_found)} Langflow documents.")
    # Display the content of the first few Langflow documents as an example
    for i, doc in enumerate(langflow_docs_found[:3]):
        print(f"\n--- Langflow Document {i+1} ---")
        words = doc.page_content.split()
        display(" ".join(words[:500]))
else:
    display("No Langflow documents were scraped successfully.")

Found 15 Langflow documents.

--- Langflow Document 1 ---


"## 1.5.0.post1 <!-- Release notes generated using configuration in .github/release.yml at 1.5.0.post1 --> ## What's Changed ### ✨ New Features * feat: Add dynamic theming support to WatsonxAI icon by @Cristhianzl in https://github.com/langflow-ai/langflow/pull/8935 * feat: jigsawstack bundle integration by @Khurdhula-Harshavardhan in https://github.com/langflow-ai/langflow/pull/8832 * feat: enhance DataFrame Operations component with contains filter and modern UI by @rodrigosnader in https://github.com/langflow-ai/langflow/pull/8838 * feat: add DataFrame output to Structured Output component by @rodrigosnader in https://github.com/langflow-ai/langflow/pull/8842 ### 🐛 Bug Fixes * fix: Improve modal layout responsiveness and overflow handling by @Cristhianzl in https://github.com/langflow-ai/langflow/pull/8936 * fix: Improve flow export error handling and validation by @Cristhianzl in https://github.com/langflow-ai/langflow/pull/8943 * fix: make deletion of single file commit to DB, cre


--- Langflow Document 2 ---


"## 1.5.0 <!-- Release notes generated using configuration in .github/release.yml at refs/heads/release-1.5.0 --> ## What's Changed ### ✨ New Features * feat: deprecate processing components by @edwinjosechittilappilly in https://github.com/langflow-ai/langflow/pull/7254 * feat: add rss component by @edwinjosechittilappilly in https://github.com/langflow-ai/langflow/pull/8134 * feat: New Web search component by @edwinjosechittilappilly in https://github.com/langflow-ai/langflow/pull/8135 * feat: Adds our first Cursor rules by @mfortman11 in https://github.com/langflow-ai/langflow/pull/7973 * feat: adds new Edit Details popover, removes flow menu, fixes nav alignment, adds new Flow Status overlay by @lucaseduoli in https://github.com/langflow-ai/langflow/pull/8087 * feat: Enhance API request component by @edwinjosechittilappilly in https://github.com/langflow-ai/langflow/pull/8070 * feat: add datastax components bundle by @erichare in https://github.com/langflow-ai/langflow/pull/8184 * 


--- Langflow Document 3 ---


'## 1.4.3 <!-- Release notes generated using configuration in .github/release.yml at refs/heads/1.4.3-release --> **Full Changelog**: https://github.com/langflow-ai/langflow/compare/1.4.2...1.4.3'

In [31]:
# Find the Anthropic document in the all_documents list
anthropic_doc = next((doc for doc in all_documents if doc.metadata.get("source") == URLS["anthropic"]), None)

if anthropic_doc:
    words = anthropic_doc.page_content.split()
    display(" ".join(words[:100]))
else:
    display("Anthropic document was not found in all_documents.")

'Anthropic home page English Search... Navigation Release Notes API RELEASE NOTES API Copy page Follow along with updates across Anthropic’s API and Developer Console. July 28, 2025 We’ve released text_editor_20250728, an updated text editor tool that fixes some issues from the previous versions and adds an optional max_characters parameter that allows you to control the truncation length when viewing large files. July 24, 2025 We’ve increased rate limits for Claude Opus 4 on the Anthropic API to give you more capacity to build and scale with Claude. For customers with usage tier 1-4 rate limits, these changes apply immediately to'

## Data Preprocessing

In [33]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dateutil.parser import parse as parse_date # Ensure parse_date is imported
import re # Ensure re is imported

def clean_text(text):
    """A simple function to clean text artifacts."""
    text = re.sub(r'\n\s*\n', '\n\n', text)
    artifacts = ["Was this helpful?", "Powered by GitBook", "Copy", "Next", "Previous", "Last updated"]
    for artifact in artifacts:
        text = text.replace(artifact, "")
    return text.strip()

def extract_and_format_date(text):
    """
    Finds a date, translates Indonesian months, and returns a datetime object.
    Returns None if no date is found or parsing fails.
    """
    # Mapping for Indonesian to English months
    month_map = {
        'januari': 'january', 'februari': 'february', 'maret': 'march', 'april': 'april',
        'mei': 'may', 'juni': 'june', 'juli': 'july', 'agustus': 'august',
        'september': 'september', 'oktober': 'october', 'november': 'november', 'desember': 'december'
    }

    # Regex to find dates with either English or Indonesian month names
    # This pattern is designed to find dates in various formats within the text
    date_pattern = r"(?i)(\d{1,2}\s+(?:Jan(?:uari)?|Feb(?:ruari)?|Mar(?:et)?|Apr(?:il)?|Mei|Jun(?:i)?|Jul(?:i)?|Agu(?:stus)?|Sep(?:tember)?|Okt(?:ober)?|Nov(?:ember)?|Des(?:ember)?)\s+\d{4}|(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?\s+\d{4})"

    match = re.search(date_pattern, text)
    if match:
        try:
            date_str = match.group(0).lower()
            # Translate month if it's Indonesian
            for indo, eng in month_map.items():
                date_str = date_str.replace(indo, eng)

            # Parse the cleaned date string
            parsed_date = parse_date(date_str)
            return parsed_date
        except (ValueError, TypeError):
            return None
    return None


print("Starting simplified chunking and cleaning...")

# Langsung lakukan chunking pada semua dokumen yang berhasil di-scrape
# Use chunked_docs which already contains documents from all sources
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
final_chunks = text_splitter.split_documents(all_documents)


# Lakukan pembersihan sederhana pada setiap chunk
for chunk in final_chunks:
    chunk.page_content = clean_text(chunk.page_content)
    # Coba ekstrak date from content and add to metadata if not already present
    if 'release_date' not in chunk.metadata or chunk.metadata['release_date'] == 'unknown':
        extracted_date = extract_and_format_date(chunk.page_content)
        if extracted_date:
             chunk.metadata['release_date'] = extracted_date.strftime('%Y-%m-%d')
        else:
             chunk.metadata['release_date'] = 'unknown' # Ensure date is always present


# Filter chunk yang sangat pendek
processed_docs = [chunk for chunk in final_chunks if len(chunk.page_content) > 50]

print(f"Preprocessing complete. Total chunks ready for embedding: {len(processed_docs)}")

Starting simplified chunking and cleaning...
Preprocessing complete. Total chunks ready for embedding: 621


In [35]:
print("\nExample of a PROCESSED SimpliDots chunk metadata:")
for doc in processed_docs:
    if "simplidots" in doc.metadata["source"]:
        print(doc.metadata)
        break

print("\nExample of a PROCESSED Anthropic chunk metadata:")
for doc in processed_docs:
    if "anthropic" in doc.metadata["source"]:
        print(doc.metadata)
        break


Example of a PROCESSED SimpliDots chunk metadata:
{'source': 'https://fitur-sap.simplidots.id/smh/fitur-pada-smh-sales-management-hub/2025/update-mengenai-coretax-format-xml', 'title': 'Update Mengenai Coretax Format XML PPN 11% [Phase 1]', 'release_date': 'unknown'}

Example of a PROCESSED Anthropic chunk metadata:
{'source': 'https://docs.anthropic.com/en/release-notes/api', 'release_date': '2025-07-28'}


## Data Ingestion

### Chunking Process

In [36]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
    length_function=len,
)

print("Chunking documents...")
chunked_docs = text_splitter.split_documents(all_documents)
print(f"Documents chunked successfully. Total chunks: {len(chunked_docs)}")

Chunking documents...
Documents chunked successfully. Total chunks: 635


## Data Preprocessing

### Format Release Date

In [38]:
def extract_and_format_date(text):
    """
    Finds a date, translates Indonesian months, and returns a formatted string.
    """
    # Mapping for Indonesian to English months
    month_map = {
        'januari': 'january', 'februari': 'february', 'maret': 'march', 'april': 'april',
        'mei': 'may', 'juni': 'june', 'juli': 'july', 'agustus': 'august',
        'september': 'september', 'oktober': 'october', 'november': 'november', 'desember': 'december'
    }

    # Regex to find dates with either English or Indonesian month names
    date_pattern = r"(?i)(\d{1,2}\s+(?:Jan(?:uari)?|Feb(?:ruari)?|Mar(?:et)?|Apr(?:il)?|Mei|Jun(?:i)?|Jul(?:i)?|Agu(?:stus)?|Sep(?:tember)?|Okt(?:ober)?|Nov(?:ember)?|Des(?:ember)?)\s+\d{4}|(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?\s+\d{4})"

    match = re.search(date_pattern, text)
    if match:
        try:
            date_str = match.group(0).lower()
            # Translate month if it's Indonesian
            for indo, eng in month_map.items():
                date_str = date_str.replace(indo, eng)

            # Parse the cleaned date string and format it
            parsed_date = parse_date(date_str)
            return parsed_date
        except (ValueError, TypeError):
            return None
    return None

### Data Embedding

In [39]:
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import FAISS
import math
import time

print("Initializing Azure OpenAI Embeddings model...")
azure_embeddings = AzureOpenAIEmbeddings(
    azure_deployment=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)
print("Embedding model initialized.")

batch_size = 1000
total_chunks = len(processed_docs)
vector_store = None

if total_chunks > 0:
    num_batches = math.ceil(total_chunks / batch_size)
    print(f"\nStarting embedding process for {total_chunks} PROCESSED chunks in {num_batches} batches...")

    for i in range(0, total_chunks, batch_size):
        batch_number = (i // batch_size) + 1
        start_time = time.time()

        # Get the current batch from the PROCESSED documents list
        batch_docs = processed_docs[i:i + batch_size]
        print(f"  - Processing Batch {batch_number}/{num_batches} ({len(batch_docs)} chunks)...")

        if vector_store is None:
            vector_store = FAISS.from_documents(batch_docs, azure_embeddings)
            print("    - Initial FAISS index created.")
        else:
            vector_store.add_documents(batch_docs)
            print("    - Batch added to existing FAISS index.")

        end_time = time.time()
        print(f"  - Batch {batch_number} finished in {end_time - start_time:.2f} seconds.")

    print("\nAll batches have been processed and embedded.")
    vector_store.save_local("faiss_index_release_notes")
    print("Vector store saved to Colab's local directory: 'faiss_index_release_notes'")
else:
    print("No documents were processed. Skipping embedding process.")

Initializing Azure OpenAI Embeddings model...
Embedding model initialized.

Starting embedding process for 621 PROCESSED chunks in 1 batches...
  - Processing Batch 1/1 (621 chunks)...
    - Initial FAISS index created.
  - Batch 1 finished in 4.52 seconds.

All batches have been processed and embedded.
Vector store saved to Colab's local directory: 'faiss_index_release_notes'


## Initialize RAG System

### Initialize LLM

In [40]:
from langchain_openai import AzureChatOpenAI

# Ensure the vector_store was created before proceeding
if 'vector_store' in locals() and vector_store is not None:
    print("Initializing Azure Chat LLM...")
    llm = AzureChatOpenAI(
        azure_deployment=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"),
        api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
        temperature=0.0, # Set to 0 for more factual, deterministic answers
        max_tokens=500
    )
    print("LLM initialized.")

    # Increase k from 4 to 8 to retrieve a wider range of documents.
    retriever = vector_store.as_retriever(search_kwargs={'k': 8})
    print("Retriever created (k=8).")
else:
    print("Vector store not available. Cannot initialize retriever and LLM.")

Initializing Azure Chat LLM...
LLM initialized.
Retriever created (k=8).


### RAG Chain

In [45]:
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# This prompt remains the same.
prompt_template = """
You are an intelligent assistant for querying software release notes.
Use only the following retrieved context to answer the user's question. If the question is in Indonesian, please answer in Indonesian.
If you don't have enough information from the context for a specific topic, state that clearly for that topic and answer the rest.
Do not make up information. Be concise and helpful.

Context:
{context}

Question:
{question}

Answer:
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# Helper function to format the retrieved documents
def format_docs(docs):
    return "\n\n".join(f"Source: {doc.metadata.get('source', 'N/A')}\nDate: {doc.metadata.get('release_date', 'N/A')}\nContent: {doc.page_content}" for doc in docs)

rag_chain = (
    {
        "context": query_rewriter | retriever | format_docs,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)

print("Final RAG chain with constrained rewriting created. Ready to answer questions.")

Final RAG chain with constrained rewriting created. Ready to answer questions.


## LLM Testing

In [46]:
if 'rag_chain' in locals() and 'prompt_template' in locals() and 'query_rewriter' in locals():
    print("RAG System initialized. You can now ask questions about the release notes.")
    print("Type 'quit' to exit.")

    while True:
        # --- Get user input for the query ---
        query = input("Enter your query: ")

        if query.lower() == 'quit':
            print("Exiting the query loop.")
            break

        print(f"Question: {query}")

        try:
            # Invoke the full chain
            answer = rag_chain.invoke(query)

            print("\nAnswer:")
            print(answer)
            print("-" * 50) # Separator for next query
        except Exception as e:
            print(f"An error occurred during processing: {e}")
            print("-" * 50) # Separator even on error

else:
    print("Cannot run tests because the RAG chain or required components were not created.")

RAG System initialized. You can now ask questions about the release notes.
Type 'quit' to exit.
Enter your query: Hi, what is the latest releases from SimpliDots and LangFlow?
Question: Hi, what is the latest releases from SimpliDots and LangFlow?

Answer:
The latest releases are:

**SimpliDOTS:**
- Pembaharuan SFA versi 3.2.7 - [26 March 2025]

**LangFlow:**
- Release on 2025-07-08, including multiple fixes such as bug fixes for simple flows with Loop, error width limitation, and updates to the Youtube Analysis template error message.
--------------------------------------------------
Enter your query: Can you explain what is the releases for?
Question: Can you explain what is the releases for?

Answer:
The releases mentioned in the context are updates or new versions of software tools and applications. Here is a summary of the releases:

1. **Langflow Releases**:
   - Versions such as 1.1.2, 1.1.3, 1.1.4, and 1.3.4 are updates to the Langflow software. However, the specific details o

In [47]:
if 'rag_chain' in locals() and 'prompt_template' in locals() and 'query_rewriter' in locals():
    print("RAG System initialized. You can now ask questions about the release notes.")
    print("Type 'quit' to exit.")

    while True:
        # --- Get user input for the query ---
        query = input("Enter your query: ")

        if query.lower() == 'quit':
            print("Exiting the query loop.")
            break

        print(f"Question: {query}")

        try:
            # Invoke the full chain
            answer = rag_chain.invoke(query)

            print("\nAnswer:")
            print(answer)
            print("-" * 50) # Separator for next query
        except Exception as e:
            print(f"An error occurred during processing: {e}")
            print("-" * 50) # Separator even on error

else:
    print("Cannot run tests because the RAG chain or required components were not created.")


RAG System initialized. You can now ask questions about the release notes.
Type 'quit' to exit.
Enter your query: Apa release terbaru dari Anthropic?
Question: Apa release terbaru dari Anthropic?

Answer:
Release terbaru dari Anthropic adalah pada tanggal 28 Juli 2025, di mana mereka merilis **text_editor_20250728**, sebuah alat editor teks yang diperbarui. Alat ini memperbaiki beberapa masalah dari versi sebelumnya dan menambahkan parameter opsional **max_characters** yang memungkinkan pengguna mengontrol panjang pemotongan saat melihat file besar.
--------------------------------------------------
Enter your query: Lalu apa aja release note mereka pada tahun 2025 ini?
Question: Lalu apa aja release note mereka pada tahun 2025 ini?

Answer:
Berikut adalah ringkasan release note dari tahun 2025:

**Anthropic API:**
1. **Juli 2025:**
   - **28 Juli:** Rilis *text_editor_20250728* dengan parameter opsional *max_characters* untuk mengontrol panjang pemotongan file besar.
   - **24 Juli:**