## Install Packages

In [1]:
# Install the scraping (beautifulsoup4, selenium), date-parsing (python-dateutil), vector-store
# (faiss-cpu) and LangChain packages imported by the cells below. -q keeps pip quiet, -U upgrades.
!pip install -qU langchain langchain-openai langchain-community beautifulsoup4 faiss-cpu selenium python-dateutil

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.6/70.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Refresh the apt package index, install the Chromium chromedriver package, then copy the
# driver binary into /usr/bin (presumably so Selenium can find it on PATH -- confirm on the
# target runtime).
!sudo apt-get update -y
!sudo apt-get install -y chromium-chromedriver
!sudo cp /usr/lib/chromium-browser/chromedriver /usr/bin

Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:6 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,775 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [3,514 kB]
Get:13 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packa

## Import Libraries

In [3]:
import os
import re
import math
import time
import requests
from bs4 import BeautifulSoup
from dateutil.parser import parse as parse_date
from datetime import datetime
from dateutil.relativedelta import relativedelta
from langchain.docstore.document import Document
from google.colab import userdata
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException, NoSuchElementException
from urllib.parse import urljoin, urldefrag
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import AzureChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

## Environment Configuration

In [25]:
# Load the Azure OpenAI credentials from Colab Secrets and export them as environment variables.
#
# Every secret is published under each name that a later cell reads:
#   * the short AZURE_* names are read by the chat-model cell through os.environ[...];
#   * the AZURE_OPENAI_* / OPENAI_API_VERSION names are read by the embedding cells through
#     os.getenv(...) and are the names langchain-openai falls back to when no explicit
#     argument is passed.
# Exporting only one of the two families leaves the other cells with None values on a fresh kernel.
_SECRET_TO_ENV_NAMES = {
    'AZURE_OPENAI_API_KEY': ("AZURE_API_KEY", "AZURE_OPENAI_API_KEY"),
    'AZURE_OPENAI_ENDPOINT': ("AZURE_API_BASE", "AZURE_OPENAI_ENDPOINT"),
    'OPENAI_API_VERSION': ("AZURE_API_VERSION", "AZURE_OPENAI_API_VERSION", "OPENAI_API_VERSION"),
    'AZURE_OPENAI_CHAT_DEPLOYMENT_NAME': ("AZURE_DEPLOYMENT_ID",),
    'AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME': ("AZURE_EMBEDDING_DEPLOYMENT_NAME", "AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"),
}

try:
    for _secret_name, _env_names in _SECRET_TO_ENV_NAMES.items():
        _secret_value = userdata.get(_secret_name)
        if not _secret_value:
            # An empty secret would otherwise surface much later as an opaque authentication error.
            raise ValueError(f"Colab secret '{_secret_name}' is missing or empty.")
        for _env_name in _env_names:
            os.environ[_env_name] = _secret_value
    os.environ["OPENAI_API_TYPE"] = 'azure' # Keep this to explicitly set the provider type for LiteLLM
    print("Azure credentials loaded successfully from Colab Secrets.")
except Exception as e:
    # Best-effort by design: report the problem and let the reader fix the Secrets panel.
    print(f"Could not load secrets. Please ensure you have added all required keys to the Colab Secrets manager. Error: {e}")

Azure credentials loaded successfully from Colab Secrets.


## Tools and Helpers

In [5]:
class Document:
    """Minimal text-plus-metadata record produced by the scrapers in this notebook.

    NOTE(review): this definition shadows the ``Document`` imported from
    ``langchain.docstore.document`` in the imports cell; the scrapers below therefore build
    instances of this lightweight class, which exposes the same two attributes.

    Args:
        page_content: The scraped text.
        metadata: Optional dict describing the text (the scrapers store ``source`` and
            ``release_date``). ``None`` is normalised to an empty dict so that later
            ``doc.metadata.get(...)`` calls never fail.
    """

    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        self.metadata = metadata if metadata is not None else {}

    def __repr__(self):
        # Only metadata is shown so that printing a list of documents stays short.
        return f"Document(metadata={self.metadata})"

In [6]:
# Month spellings accepted in "day month year" order (Indonesian names and abbreviations).
_ID_MONTH_NAMES = (
    r"Jan(?:uari)?|Feb(?:ruari)?|Mar(?:et)?|Apr(?:il)?|Mei|Jun(?:i)?|Jul(?:i)?|Agu(?:stus)?"
    r"|Sep(?:tember)?|Okt(?:ober)?|Nov(?:ember)?|Des(?:ember)?"
)
# Month spellings accepted in "month day, year" order (English names and abbreviations).
_EN_MONTH_NAMES = (
    r"Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?"
    r"|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?"
)
# Group names differ per branch because Python's `re` forbids reusing a group name.
_DATE_RE = re.compile(
    rf"(?P<d1>\d{{1,2}})\s+(?P<m1>{_ID_MONTH_NAMES})\s+(?P<y1>\d{{4}})"
    rf"|(?P<m2>{_EN_MONTH_NAMES})\s+(?P<d2>\d{{1,2}})(?:st|nd|rd|th)?,?\s+(?P<y2>\d{{4}})",
    re.IGNORECASE,
)
# Every accepted spelling is identified by its first three letters.
_MONTH_NUMBER_BY_PREFIX = {
    "jan": 1, "feb": 2, "mar": 3, "apr": 4, "mei": 5, "may": 5, "jun": 6, "jul": 7,
    "agu": 8, "aug": 8, "sep": 9, "okt": 10, "oct": 10, "nov": 11, "des": 12, "dec": 12,
}


def extract_and_format_date(text):
    """Find the first Indonesian or English calendar date in ``text``.

    Two layouts are recognised, case-insensitively:
    ``17 Juli 2025`` (day, Indonesian month, year) and
    ``July 17th, 2025`` (English month, day with optional ordinal suffix and comma, year).

    The date is built directly from the captured day/month/year. This makes abbreviated
    Indonesian months ("Agu", "Okt", "Des") work and lets "September" match, which the
    previous ``Sep(?:ember)?`` alternative could not.

    Args:
        text: Text to search; ``None`` or an empty string is accepted.

    Returns:
        A naive ``datetime`` at midnight of the matched date, or ``None`` when no date is
        found or the matched numbers do not form a real calendar date (e.g. 31 February).
    """
    if not text:
        return None
    match = _DATE_RE.search(text)
    if match is None:
        return None

    # Exactly one of the two branches matched; read whichever set of groups is populated.
    if match.group("m1") is not None:
        day, month_name, year = match.group("d1", "m1", "y1")
    else:
        day, month_name, year = match.group("d2", "m2", "y2")

    month = _MONTH_NUMBER_BY_PREFIX.get(month_name.lower()[:3])
    if month is None:
        return None
    try:
        return datetime(int(year), month, int(day))
    except ValueError:
        # Out-of-range day or year (e.g. "31 Februari 2025", "0000").
        return None

In [7]:
def parse_last_updated(update_text, now=None):
    """Turn a relative "N <unit>(s) ago" phrase into an approximate absolute datetime.

    The original implementation recognised only months; days, weeks and years are now
    handled as well, and matching is case-insensitive.

    Args:
        update_text: Text such as ``"Last updated 3 months ago"``; ``None`` or an empty
            string is accepted.
        now: Optional reference datetime to subtract from. Defaults to ``datetime.now()``;
            pass a fixed value for reproducible results.

    Returns:
        ``now`` minus the stated interval, or ``None`` when no such phrase is present.
    """
    if not update_text:
        return None
    match = re.search(r'(\d+)\s+(day|week|month|year)s?\s+ago', update_text, re.IGNORECASE)
    if match is None:
        return None

    amount = int(match.group(1))
    # relativedelta takes plural keyword names for relative offsets: days/weeks/months/years.
    unit_keyword = match.group(2).lower() + "s"
    reference = now if now is not None else datetime.now()
    return reference - relativedelta(**{unit_keyword: amount})

In [8]:
# UI strings that the documentation site renders around the article body.
_UI_ARTIFACTS = ["Was this helpful?", "Powered by GitBook", "Copy", "Next", "Previous", "Last updated"]
# Match an artifact only when it is not glued to a word character on either side, so that
# ordinary words containing one of them (e.g. "Copyright") are left intact.
_UI_ARTIFACT_RE = re.compile(
    r"(?<!\w)(?:" + "|".join(re.escape(artifact) for artifact in _UI_ARTIFACTS) + r")(?!\w)"
)


def clean_text(text):
    """Normalise blank lines and strip site-chrome strings from scraped text.

    Steps, in order:
      1. Collapse any run of blank / whitespace-only lines into a single empty line.
         (The previous pattern was double-escaped and matched a literal backslash
         followed by ``n``, so real newlines were never collapsed.)
      2. Remove the UI artifacts listed in ``_UI_ARTIFACTS`` where they stand alone.
      3. Strip leading and trailing whitespace.

    Args:
        text: Raw text of a document or chunk.

    Returns:
        The cleaned text.
    """
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = _UI_ARTIFACT_RE.sub("", text)
    return text.strip()

## Data Scraping

### GitHub Scraper

In [9]:
def scrape_github_releases(api_url, per_page=15):
    """Fetch the most recent releases from a GitHub "list releases" API endpoint.

    Args:
        api_url: URL of the repository's releases endpoint.
        per_page: Number of releases to request (defaults to the original 15).

    Returns:
        A list of ``Document`` objects, one per release. ``page_content`` is a markdown
        heading with the release title followed by the release body; ``metadata`` holds
        ``source`` and ``release_date`` (``YYYY-MM-DD`` or ``'unknown'``).
        An empty list is returned when the request or JSON decoding fails.
    """
    try:
        # Let requests build the query string so an api_url that already has one still works.
        response = requests.get(api_url, params={"per_page": per_page}, timeout=15)
        response.raise_for_status()
        releases = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Error fetching GitHub releases from {api_url}: {e}")
        return []

    documents = []
    for release in releases:
        # The API sends explicit nulls for empty fields, which dict.get(key, default) does
        # not replace; `or` covers both the missing and the null case.
        title = release.get('name') or release.get('tag_name') or 'Untitled Release'
        body = release.get('body') or 'No description.'
        published_at = release.get('published_at') or ''
        metadata = {
            # Prefer the release's own page; fall back to the project's release list.
            "source": release.get('html_url') or "https://github.com/langflow-ai/langflow/releases",
            "release_date": published_at.split('T')[0] if published_at else 'unknown',
        }
        documents.append(Document(page_content=f"## {title}\n\n{body}", metadata=metadata))

    print(f"Scraped {len(documents)} documents from: Langflow")
    return documents

### SimpliDots Selenium + Link Crawler Scraper

In [10]:
def scrape_simplidots_with_selenium(base_url):
    """Crawl the SimpliDots release-note site with headless Chrome.

    The index page at ``base_url`` is scanned for links whose href contains ``/202``;
    each distinct page is then opened and the text of its ``<main>`` element is stored.

    The release date of a page is resolved in this order: a date in the page title, a date
    in the body text, then a relative "Last updated ..." footer. If none is found the
    date is ``'unknown'``.

    Args:
        base_url: URL of the release-note index page.

    Returns:
        A list of ``Document`` objects (``source`` = page URL, ``release_date`` =
        ``YYYY-MM-DD`` or ``'unknown'``) for every page with more than 100 characters of
        text. Pages that fail to load are reported and skipped; if the index page itself
        times out, an empty list is returned.
    """
    options = webdriver.ChromeOptions()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        options.add_argument(flag)
    driver = webdriver.Chrome(options=options)
    all_documents = []
    try:
        print("Finding all unique article links on SimpliDots...")
        driver.get(base_url)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "a")))

        urls_to_visit = set()
        for link in driver.find_elements(By.XPATH, "//a[contains(@href, '/202')]"):
            try:
                href = link.get_attribute("href")
            except StaleElementReferenceException:
                # The page re-rendered this anchor after it was located; skip it.
                continue
            if href:
                # Drop "#section" fragments so one page is not scraped once per anchor.
                urls_to_visit.add(urldefrag(href).url)
        print(f"Found {len(urls_to_visit)} potential article links. Now extracting content...")

        # Sorted so repeated runs visit pages (and emit documents) in the same order.
        for url in sorted(urls_to_visit):
            try:
                driver.get(url)
                WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, "main")))
                title = driver.title
                content_text = driver.find_element(By.TAG_NAME, "main").text.strip()

                date_obj = extract_and_format_date(title) or extract_and_format_date(content_text)
                if not date_obj and "Last updated" in driver.page_source:
                    footer_elements = driver.find_elements(By.XPATH, "//*[contains(text(), 'Last updated')]")
                    if footer_elements:
                        date_obj = parse_last_updated(footer_elements[0].text)
                release_date_str = date_obj.strftime('%Y-%m-%d') if date_obj else 'unknown'

                if len(content_text) > 100:
                    all_documents.append(
                        Document(page_content=content_text, metadata={"source": url, "release_date": release_date_str})
                    )
            except Exception as e:
                # Best-effort crawl: one broken page must not abort the remaining pages.
                print(f"Warning: Could not process SimpliDots page {url}. Error: {e}")
    except TimeoutException:
        print(f"Error: Timed out while loading the SimpliDots index page {base_url}.")
    finally:
        driver.quit()
    print(f"Scraped {len(all_documents)} documents from: SimpliDots")
    return all_documents

### Anthropic Selenium Scraper

In [11]:
from bs4 import BeautifulSoup
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def _list_following_heading(heading):
    """Return the first <ul> sibling after ``heading`` that precedes the next <h4>, else None."""
    for sibling in heading.find_next_siblings():
        if sibling.name == 'h4':
            # Reached the next dated entry without finding a list for this one.
            return None
        if sibling.name == 'ul':
            return sibling
    return None


def scrape_anthropic_with_selenium(url):
    """Scrape dated release-note entries from the Anthropic release-notes page.

    The page is loaded in headless Chrome, given a fixed 5 second pause after ``<body>``
    appears, and then parsed with BeautifulSoup. Every ``<h4>`` whose text contains a
    recognisable date becomes one document, built from the bullet list that follows it.

    Args:
        url: URL of the release-notes page.

    Returns:
        A list of ``Document`` objects whose ``page_content`` is the list text with one
        bullet per line and whose metadata holds ``source`` (``url``) and
        ``release_date`` (``YYYY-MM-DD``). An empty list is returned on timeout.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

    driver = webdriver.Chrome(options=options)
    documents = []

    try:
        driver.get(url)
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        print("Page has loaded, pausing for 5 seconds to let content settle...")
        time.sleep(5)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        for heading in soup.find_all('h4'):
            date_obj = extract_and_format_date(heading.text)
            if not date_obj:
                continue
            # Stop at the next heading so an entry without a list does not borrow the
            # list that belongs to a later date.
            content_node = _list_following_heading(heading)
            if content_node is None:
                continue
            # A real newline separator; the previous '\\n' inserted a literal backslash-n.
            content_text = content_node.get_text(separator='\n', strip=True)
            documents.append(
                Document(
                    page_content=content_text,
                    metadata={"source": url, "release_date": date_obj.strftime('%Y-%m-%d')},
                )
            )

        print(f"Scraped {len(documents)} dated entries from: Anthropic")
        return documents

    except TimeoutException:
        print("Error: Timed out after 30 seconds. The site may be blocking automated access or is currently down.")
        return []
    finally:
        driver.quit()

### Execute Scraping

In [12]:
# Entry points for the three release-note sources.
URLS = {
    "simplidots": "https://fitur-sap.simplidots.id/",
    "langflow": "https://api.github.com/repos/langflow-ai/langflow/releases",
    "anthropic": "https://docs.anthropic.com/en/release-notes/api"
}

# Run every scraper and pool the results; each scraper returns a (possibly empty) list.
print("Starting data scraping...")
all_documents = []
all_documents.extend(scrape_simplidots_with_selenium(URLS["simplidots"]))
all_documents.extend(scrape_github_releases(URLS["langflow"]))
all_documents.extend(scrape_anthropic_with_selenium(URLS["anthropic"]))
# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
print(f"\nScraping complete. Total documents found: {len(all_documents)}")

Starting data scraping...
Finding all unique article links on SimpliDots...
Found 57 potential article links. Now extracting content...
Scraped 57 documents from: SimpliDots
Scraped 15 documents from: Langflow
Page has loaded, pausing for 5 seconds to let content settle...
Scraped 40 dated entries from: Anthropic
\nScraping complete. Total documents found: 112


## Data Preprocessing

In [13]:
# Split the scraped documents into overlapping chunks, clean each chunk in place, and
# keep only the chunks that still hold more than 50 characters after cleaning.
print("\nStarting data preprocessing (cleaning and chunking)...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
chunks = text_splitter.split_documents(all_documents)

processed_docs = []
for chunk in chunks:
    cleaned_content = clean_text(chunk.page_content)
    chunk.page_content = cleaned_content
    if len(cleaned_content) <= 50:
        # Too little text left to be a useful retrieval unit.
        continue
    processed_docs.append(chunk)

print(f"Preprocessing complete. Total processed chunks: {len(processed_docs)}")


Starting data preprocessing (cleaning and chunking)...
Preprocessing complete. Total processed chunks: 542


## Data Ingestion

### Chunking Process

In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Re-split `all_documents` with a 1500-character chunk size and a 200-character overlap,
# measuring length with len().
# NOTE(review): this rebinds `text_splitter` from the preprocessing cell, and the resulting
# `chunked_docs` is not referenced by any later cell visible here (the embedding cells use
# `processed_docs`) -- confirm whether this cell is still needed.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
    length_function=len,
)

print("Chunking documents...")
chunked_docs = text_splitter.split_documents(all_documents)
print(f"Documents chunked successfully. Total chunks: {len(chunked_docs)}")

Chunking documents...
Documents chunked successfully. Total chunks: 559


### Data Embedding

In [15]:
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import FAISS
import math
import time

def _first_env(*names):
    """Return the first non-empty value among the named environment variables, else None."""
    for name in names:
        value = os.getenv(name)
        if value:
            return value
    return None


print("Initializing Azure OpenAI Embeddings model...")
# The configuration cell exports the short AZURE_* names, while this cell originally read
# AZURE_OPENAI_* names that nothing in the notebook sets. Accept either family, and pass the
# endpoint and key explicitly instead of relying on implicit environment lookup.
_embedding_settings = {
    "azure_deployment": _first_env("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME", "AZURE_EMBEDDING_DEPLOYMENT_NAME"),
    "api_version": _first_env("AZURE_OPENAI_API_VERSION", "AZURE_API_VERSION", "OPENAI_API_VERSION"),
    "azure_endpoint": _first_env("AZURE_OPENAI_ENDPOINT", "AZURE_API_BASE"),
    "api_key": _first_env("AZURE_OPENAI_API_KEY", "AZURE_API_KEY"),
}
# Unset values are omitted so the library's own defaults still apply to them.
azure_embeddings = AzureOpenAIEmbeddings(
    **{key: value for key, value in _embedding_settings.items() if value}
)
print("Embedding model initialized.")

# Number of chunks sent to FAISS per call.
batch_size = 1000
total_chunks = len(processed_docs)
vector_store = None

if total_chunks > 0:
    num_batches = math.ceil(total_chunks / batch_size)
    print(f"\nStarting embedding process for {total_chunks} PROCESSED chunks in {num_batches} batches...")

    for i in range(0, total_chunks, batch_size):
        batch_number = (i // batch_size) + 1
        start_time = time.time()

        # Get the current batch from the PROCESSED documents list
        batch_docs = processed_docs[i:i + batch_size]
        print(f"  - Processing Batch {batch_number}/{num_batches} ({len(batch_docs)} chunks)...")

        if vector_store is None:
            # The first batch creates the index; later batches are appended to it.
            vector_store = FAISS.from_documents(batch_docs, azure_embeddings)
            print("    - Initial FAISS index created.")
        else:
            vector_store.add_documents(batch_docs)
            print("    - Batch added to existing FAISS index.")

        end_time = time.time()
        print(f"  - Batch {batch_number} finished in {end_time - start_time:.2f} seconds.")

    print("\nAll batches have been processed and embedded.")
    vector_store.save_local("faiss_index_release_notes")
    print("Vector store saved to Colab's local directory: 'faiss_index_release_notes'")
else:
    print("No documents were processed. Skipping embedding process.")

Initializing Azure OpenAI Embeddings model...
Embedding model initialized.

Starting embedding process for 542 PROCESSED chunks in 1 batches...
  - Processing Batch 1/1 (542 chunks)...
    - Initial FAISS index created.
  - Batch 1 finished in 19.58 seconds.

All batches have been processed and embedded.
Vector store saved to Colab's local directory: 'faiss_index_release_notes'


## Initialize RAG System

### Initialize Vector Store and Retriever

In [21]:
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import FAISS

def _first_env(*names):
    """Return the first non-empty value among the named environment variables, else None."""
    for name in names:
        value = os.getenv(name)
        if value:
            return value
    return None


print("Initializing Azure OpenAI Embeddings model...")
# The configuration cell exports the short AZURE_* names, while this cell originally read
# AZURE_OPENAI_* names that nothing in the notebook sets. Accept either family, and pass the
# endpoint and key explicitly instead of relying on implicit environment lookup.
_embedding_settings = {
    "azure_deployment": _first_env("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME", "AZURE_EMBEDDING_DEPLOYMENT_NAME"),
    "api_version": _first_env("AZURE_OPENAI_API_VERSION", "AZURE_API_VERSION", "OPENAI_API_VERSION"),
    "azure_endpoint": _first_env("AZURE_OPENAI_ENDPOINT", "AZURE_API_BASE"),
    "api_key": _first_env("AZURE_OPENAI_API_KEY", "AZURE_API_KEY"),
}
# Unset values are omitted so the library's own defaults still apply to them.
azure_embeddings = AzureOpenAIEmbeddings(
    **{key: value for key, value in _embedding_settings.items() if value}
)

print("Creating FAISS vector store from processed documents...")
# NOTE(review): the embedding cell above already indexes the same `processed_docs`; this
# rebuild repeats the embedding calls. Confirm whether both cells are needed.
if 'processed_docs' in locals() and processed_docs:
    vector_store = FAISS.from_documents(processed_docs, azure_embeddings)
    # Return the 12 nearest chunks per query.
    retriever = vector_store.as_retriever(search_kwargs={'k': 12})

    print("Vector store and retriever created successfully with improved settings.")
else:
    # `retriever` stays undefined on this path, so the RAG chain cell cannot be built.
    print("No documents were processed. The Q&A bot will not have any knowledge.")

Initializing Azure OpenAI Embeddings model...
Creating FAISS vector store from processed documents...
Vector store and retriever created successfully with improved settings.


### RAG Chain

In [26]:
from langchain_openai import AzureChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Chat model used to write the final answer. Connection settings come from the environment
# variables exported by the configuration cell.
# NOTE(review): `model` is set to "azure/<deployment>", which looks like a LiteLLM-style
# identifier, and it re-reads the Colab secret instead of AZURE_DEPLOYMENT_ID -- confirm this
# is what AzureChatOpenAI should receive.
llm = AzureChatOpenAI(
    azure_endpoint=os.environ["AZURE_API_BASE"],
    azure_deployment=os.environ["AZURE_DEPLOYMENT_ID"],
    api_key=os.environ["AZURE_API_KEY"],
    api_version=os.environ["AZURE_API_VERSION"],
    model=f"azure/{userdata.get('AZURE_OPENAI_CHAT_DEPLOYMENT_NAME')}",
    temperature=0.2
)

# Prompt with two placeholders: {context} receives the formatted retrieved chunks and
# {question} the user's query. It tells the model to answer only from the context, to say
# so explicitly when the context has no answer, and to reply in the question's language.
prompt_template = """
You are a highly intelligent and helpful product support specialist for SimpliDots, Langflow, and Anthropic.
Your task is to answer user questions based ONLY on the provided release note context below.

**Instructions:**
1.  **Synthesize Information:** Do not just list features. Combine information from the context to provide a clear, detailed, and easy-to-understand summary. Answer in full sentences and paragraphs.
2.  **Be Specific:** If the user asks for details about a specific feature or date, provide all the relevant information you can find in the context for that item.
3.  **Handle Missing Information:** If the context does not contain the answer to the question, you MUST explicitly say: "I could not find information on that topic in the provided release notes." Do not make up answers.
4.  **Language:** Always answer in the same language as the user's question.

**Context from Release Notes:**
---
{context}
---

**User's Question:**
{question}

**Your Detailed Answer:**
"""

# Wrap the template so the chain can fill in both variables.
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

def format_docs(docs):
    """Render retrieved documents as one context string for the prompt.

    Each document becomes a block with its source, its release date and its text
    ('N/A' is used for a missing metadata key). Blocks are separated by a line
    containing '---'.

    Args:
        docs: Iterable of objects exposing ``metadata`` (a dict) and ``page_content``.

    Returns:
        The joined context string; an empty string when ``docs`` is empty.
    """
    rendered_blocks = []
    for doc in docs:
        source = doc.metadata.get('source', 'N/A')
        release_date = doc.metadata.get('release_date', 'N/A')
        header = f"Document Source: {source}\nRelease Date: {release_date}\n\n"
        rendered_blocks.append(header + f"{doc.page_content}")
    return "\n\n---\n\n".join(rendered_blocks)

# Assemble the RAG pipeline. The incoming query is sent to the retriever, whose results are
# rendered by format_docs into {context}; the same query is passed through unchanged as
# {question}. The filled prompt goes to the chat model and the reply is reduced to a string.
# Requires `retriever` from the vector-store cell above.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print("Improved RAG Q&A Chain is ready.")

Improved RAG Q&A Chain is ready.


## LLM Testing

In [28]:
# Interactive question/answer loop over the RAG chain. It runs until the user types 'quit',
# or until input is closed or interrupted.
if 'rag_chain' in locals():
    print("RAG System initialized. You can now ask questions about the release notes.")
    print("Type 'quit' to exit.")

    while True:
        try:
            # Strip so that " quit " and accidental surrounding spaces behave as expected.
            query = input("Enter your query: ").strip()
        except (EOFError, KeyboardInterrupt):
            # No stdin (non-interactive run) or the cell was interrupted: leave cleanly.
            print("Exiting the query loop.")
            break
        if query.lower() == 'quit':
            print("Exiting the query loop.")
            break
        if not query:
            # Do not spend a retrieval and an LLM call on an empty question.
            continue
        print(f"Question: {query}")
        try:
            answer = rag_chain.invoke(query)
            print("\nAnswer:")
            print(answer)
            print("-" * 50)
        except Exception as e:
            # Keep the session alive after a failed call so the user can try again.
            print(f"An error occurred during processing: {e}")
            print("-" * 50)
else:
    print("Cannot run tests because the RAG chain was not created.")

RAG System initialized. You can now ask questions about the release notes.
Type 'quit' to exit.
Enter your query: Hello! Boleh tau ngga apa aja sih release terbaru dari SimpliDots?
Question: Hello! Boleh tau ngga apa aja sih release terbaru dari SimpliDots?

Answer:
Tentu! Berikut adalah ringkasan fitur terbaru yang dirilis oleh SimpliDOTS berdasarkan catatan yang tersedia:

1. **[Beta] Integrasi Sales Invoice SimpliDOTS x Accurate Online (17 Juli 2025)**  
   Fitur ini memungkinkan integrasi antara Sales Invoice di SimpliDOTS dengan Accurate Online, memberikan kemudahan bagi pengguna untuk mengelola faktur penjualan secara lebih efisien. Statusnya masih dalam tahap beta.

2. **Penambahan Fitur Collection (03 Juli 2025)**  
   Fitur ini ditambahkan untuk membantu pengguna dalam mengelola proses penagihan atau koleksi pembayaran, memberikan kontrol yang lebih baik terhadap transaksi keuangan.

3. **Live Mode dengan Opsi Reset atau Tidak Reset Data (31 Juli 2025)**  
   Fitur ini memberi