## Install Packages

In [1]:
!pip install -qU crewai crewai-tools langchain-openai langchain-community beautifulsoup4 faiss-cpu selenium undetected-chromedriver

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/65.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.4/65.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### Selenium

In [2]:
!sudo apt-get update -y
!sudo apt-get install -y chromium-chromedriver
!sudo cp /usr/lib/chromium-browser/chromedriver /usr/bin

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,161 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,775 kB]
Get:13 http://security.ubuntu.com/ubuntu jammy-security

## Config. Environment

In [35]:
import os
from google.colab import userdata

try:
    os.environ["AZURE_API_KEY"] = userdata.get('AZURE_OPENAI_API_KEY')
    os.environ["AZURE_API_BASE"] = userdata.get('AZURE_OPENAI_ENDPOINT')
    os.environ["AZURE_API_VERSION"] = userdata.get('OPENAI_API_VERSION')
    os.environ["AZURE_DEPLOYMENT_ID"] = userdata.get('AZURE_OPENAI_CHAT_DEPLOYMENT_NAME')
    os.environ["AZURE_EMBEDDING_DEPLOYMENT_NAME"] = userdata.get('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME') # Add this line
    os.environ["OPENAI_API_TYPE"] = 'azure' # Keep this to explicitly set the provider type for LiteLLM
    EMAIL_ADDRESS = userdata.get('EMAIL_ADDRESS')
    EMAIL_PASSWORD = userdata.get('EMAIL_PASSWORD')

    if not all([os.environ.get("AZURE_OPENAI_API_KEY"), os.environ.get("AZURE_OPENAI_ENDPOINT"), os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"), EMAIL_ADDRESS, EMAIL_PASSWORD]):
        raise ValueError("One or more secrets are missing.")

    print("All secrets loaded successfully!")

except Exception as e:
    print(f"Error loading secrets: {e}. Please check the 'Secrets'.")

All secrets loaded successfully!


In [44]:
%%writefile recipients.csv
name,email
Aditya Bayhaqie,adityabayhaqie@gmail.com
Umar Bayhaqie,thisismebayhaqie@gmail.com
Jack Waltz,jackwaltz001@gmail.com

Overwriting recipients.csv


## Web Scraping

In [14]:
import requests
import re
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException, NoSuchElementException
from urllib.parse import urljoin, urldefrag
from dateutil.parser import parse as parse_date

In [15]:
# Document class to hold content and metadata
class Document:
    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata
    def __repr__(self):
        return f"Document(metadata={self.metadata})"

In [16]:
def scrape_github_releases(api_url):
    documents = []
    try:
        response = requests.get(f"{api_url}?per_page=15", timeout=15)
        response.raise_for_status()
        releases = response.json()
        for release in releases:
            content = f"## {release.get('name', 'Untitled Release')}\\n\\n{release.get('body', 'No description.')}"
            release_date = release.get('published_at', '')
            doc = Document(
                page_content=content,
                metadata={
                    "source": "https://github.com/langflow-ai/langflow/releases",
                    "release_date": release_date.split('T')[0] if release_date else 'unknown'
                }
            )
            documents.append(doc)
        return documents
    except requests.RequestException as e:
        print(f"Error fetching GitHub releases from {api_url}: {e}")
        return []

In [17]:
def scrape_simplidots_with_selenium(base_url, max_depth=2):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)
    links_to_crawl, crawled_links = {base_url}, set()

    for depth in range(max_depth):
        current_links = list(links_to_crawl - crawled_links)
        if not current_links: break
        for link in current_links:
            print(f"Finding links on (depth {depth+1}): {link}")
            crawled_links.add(link)
            try:
                driver.get(link)
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//a[@href]")))
                for _ in range(3): # Retry mechanism
                    try:
                        hrefs = [a.get_attribute("href") for a in driver.find_elements(By.XPATH, "//a[@href]")]
                        for href in hrefs:
                            if href:
                                full_url = urldefrag(urljoin(link, href))[0]
                                if full_url.startswith(base_url): links_to_crawl.add(full_url)
                        break
                    except StaleElementReferenceException: time.sleep(0.5)
            except Exception as e: print(f"Could not process links on {link}: {e}")
    print(f"\\nFound {len(links_to_crawl)} unique links. Starting content extraction...")

    all_documents = []
    month_map = {'januari':'january', 'februari':'february', 'maret':'march', 'april':'april', 'mei':'may', 'juni':'june', 'juli':'july', 'agustus':'august', 'september':'september', 'oktober':'october', 'november':'november', 'desember':'december'}
    for url in links_to_crawl:
        try:
            driver.get(url)
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "main")))
            full_title, release_date = driver.title, 'unknown'
            date_pattern = r"\\[?\\s*(\\d{1,2}\\s+[A-Za-z]+\\s+\\d{4})\\s*\\]?"
            match = re.search(date_pattern, full_title, re.IGNORECASE)
            if match:
                date_str = match.group(1).lower()
                for indo, eng in month_map.items(): date_str = date_str.replace(indo, eng)
                release_date = parse_date(date_str).strftime('%Y-%m-%d')
            title = re.sub(date_pattern, '', full_title, flags=re.IGNORECASE).split('|')[0].replace(' -', '').strip()
            if title in ["Fitur pada SMH (Sales Management Hub)", "Fitur pada Canvass", "Fitur pada RO (Route Optimization)", "Feature Updates Sales Automation Platform"] or len(title) < 10: continue
            content = driver.find_element(By.TAG_NAME, "main").text.strip()
            all_documents.append(Document(page_content=content, metadata={"source": url, "title": title, "release_date": release_date}))
            print(f"  -> Processed: {title} (Date: {release_date})")
        except Exception: continue
    driver.quit()
    return all_documents

In [18]:
def scrape_anthropic_with_selenium(url):
    options = webdriver.ChromeOptions(); options.add_argument('--headless'); options.add_argument('--no-sandbox'); options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "h4")))
        return [Document(page_content=driver.find_element(By.TAG_NAME, "body").text, metadata={"source": url})]
    except Exception as e:
        print(f"Error using Selenium for Anthropic: {e}")
        return []
    finally:
        driver.quit()

In [19]:
URLS = {
    "simplidots": "https://fitur-sap.simplidots.id/",
    "langflow": "https://api.github.com/repos/langflow-ai/langflow/releases",
    "anthropic": "https://docs.anthropic.com/en/release-notes/api"
}
print("Starting data scraping...")
all_documents = []
all_documents.extend(scrape_simplidots_with_selenium(URLS["simplidots"]))
all_documents.extend(scrape_github_releases(URLS["langflow"]))
all_documents.extend(scrape_anthropic_with_selenium(URLS["anthropic"]))
print(f"\\nScraping complete. Total documents found: {len(all_documents)}")

Starting data scraping...
Finding links on (depth 1): https://fitur-sap.simplidots.id/
Finding links on (depth 2): https://fitur-sap.simplidots.id/smh/fitur-pada-smh-sales-management-hub/2024/penambahan-fitur-generate-invoice-pada-sales-order-dan-fitur-cancel-si-24-july-2024
Finding links on (depth 2): https://fitur-sap.simplidots.id/smh/fitur-pada-smh-sales-management-hub/2024/penambahan-open-api-sap-simplidots-31-july-2024
Finding links on (depth 2): https://fitur-sap.simplidots.id/smh/fitur-pada-smh-sales-management-hub/2024/penambahan-fitur-print-dokumen-27-aug-2024
Finding links on (depth 2): https://fitur-sap.simplidots.id/sfa/fitur-pada-sfa-sales-force-automation
Finding links on (depth 2): https://fitur-sap.simplidots.id/canvass/fitur-pada-canvass/penambahan-dan-perbaikan-fitur-pada-canvass-versi-1.2.1-03-july-2023
Finding links on (depth 2): https://fitur-sap.simplidots.id/smh/fitur-pada-smh-sales-management-hub/2024/penambahan-dan-pembaharuan-pada-menu-sales-invoice-2-sept-20

## Advanced Preprocessing

In [20]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def clean_text(text):
    text = re.sub(r'\\n\\s*\\n', '\\n\\n', text)
    artifacts = ["Was this helpful?", "Powered by GitBook", "Copy", "Next", "Previous", "Last updated"]
    for artifact in artifacts:
        text = text.replace(artifact, "")
    return text.strip()

In [21]:
def extract_and_format_date(text):
    month_map = {'januari': 'january', 'februari': 'february', 'maret': 'march', 'april': 'april', 'mei': 'may', 'juni': 'june', 'juli': 'july', 'agustus': 'august', 'september': 'september', 'oktober': 'october', 'november': 'november', 'desember': 'december'}
    date_pattern = r"(?i)(\\d{1,2}\\s+(?:Jan(?:uari)?|Feb(?:ruari)?|Mar(?:et)?|Apr(?:il)?|Mei|Jun(?:i)?|Jul(?:i)?|Agu(?:stus)?|Sep(?:tember)?|Okt(?:ober)?|Nov(?:ember)?|Des(?:ember)?)\\s+\\d{4}|(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\\s+\\d{1,2}(?:st|nd|rd|th)?(?:,)?\\s+\\d{4})"
    match = re.search(date_pattern, text)
    if match:
        try:
            date_str = match.group(0).lower()
            for indo, eng in month_map.items():
                date_str = date_str.replace(indo, eng)
            return parse_date(date_str)
        except (ValueError, TypeError): return None
    return None

In [22]:
# Execute Preprocessing
print(" Starting advanced data preprocessing...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
final_chunks = text_splitter.split_documents(all_documents)

for chunk in final_chunks:
    chunk.page_content = clean_text(chunk.page_content)
    if 'release_date' not in chunk.metadata or chunk.metadata['release_date'] == 'unknown':
        extracted_date = extract_and_format_date(chunk.page_content)
        chunk.metadata['release_date'] = extracted_date.strftime('%Y-%m-%d') if extracted_date else 'unknown'

processed_docs = [chunk for chunk in final_chunks if len(chunk.page_content) > 50]
print(f" Preprocessing complete. Total processed chunks: {len(processed_docs)}")

 Starting advanced data preprocessing...
 Preprocessing complete. Total processed chunks: 217


## Filter for Recent Updates

In [33]:
from datetime import datetime, timedelta

seven_days_ago = datetime.now() - timedelta(days=21)
recent_docs = []
for doc in processed_docs:
    release_date_str = doc.metadata.get('release_date')
    if release_date_str and release_date_str != 'unknown':
        try:
            if datetime.strptime(release_date_str, '%Y-%m-%d') >= seven_days_ago:
                recent_docs.append(doc)
        except ValueError:
            continue

print(f"Found {len(recent_docs)} documents from the last 21 days.")

if recent_docs:
    newsletter_context = "\\n\\n---\\n\\n".join(
        f"Source: {doc.metadata.get('source', 'N/A')}\\n"
        f"Date: {doc.metadata.get('release_date', 'N/A')}\\n\\n"
        f"{doc.page_content}"
        for doc in recent_docs
    )
else:
    newsletter_context = "No new release notes found in the last 7 days."

Found 4 documents from the last 21 days.


## Configure LLM and Tools

In [36]:
import csv
import smtplib
import time
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from langchain_openai import AzureChatOpenAI
from crewai.tools import BaseTool

try:
    llm = AzureChatOpenAI(
        azure_endpoint=os.environ["AZURE_API_BASE"],
        azure_deployment=os.environ["AZURE_DEPLOYMENT_ID"],
        api_key=os.environ["AZURE_API_KEY"],
        api_version=os.environ["AZURE_API_VERSION"],
        model=f"azure/{userdata.get('AZURE_OPENAI_CHAT_DEPLOYMENT_NAME')}"
    )
    print("Azure LLM initialized successfully.")
except Exception as e:
    print(f"Error initializing Azure LLM: {e}")
    print("Please ensure your Azure OpenAI credentials are set correctly in the cell above.")

Azure LLM initialized successfully.


In [46]:
import csv
import smtplib
import time
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from langchain_openai import AzureChatOpenAI
from crewai.tools import BaseTool

# Updated Email Tool with Full HTML Template
class CSVEmailTool(BaseTool):
    name: str = "Personalized HTML Email Dispatcher"
    description: str = "Reads 'recipients.csv' and sends a fully formatted, personalized HTML email to everyone on the list. The input is the core HTML content for the newsletter body."

    def _run(self, newsletter_body_html: str) -> str:
        sender_email = EMAIL_ADDRESS
        sender_password = EMAIL_PASSWORD
        sent_count = 0
        recipient_list = []

        try:
            with open('recipients.csv', mode='r', encoding='utf-8') as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    recipient_list.append(row)
            if not recipient_list:
                return "Error: recipients.csv is empty or not found."
        except FileNotFoundError:
            return "Error: recipients.csv not found. Please create it first."
        except Exception as e:
            return f"Error reading CSV file: {e}"

        for recipient in recipient_list:
            recipient_name = recipient.get("name", "there")
            recipient_email = recipient.get("email")

            if not recipient_email:
                continue

            # Create the full HTML document
            full_html_content = f"""
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <meta charset="UTF-8">
                <meta name="viewport" content="width=device-width, initial-scale=1.0">
                <style>
                    body {{ font-family: sans-serif; line-height: 1.6; color: #333; }}
                    .container {{ max-width: 600px; margin: 20px auto; padding: 20px; border: 1px solid #ddd; border-radius: 8px; }}
                    h2 {{ color: #0056b3; }}
                    ul {{ padding-left: 20px; }}
                    li {{ margin-bottom: 10px; }}
                    .footer {{ margin-top: 20px; font-size: 0.8em; color: #888; text-align: center; }}
                </style>
            </head>
            <body>
                <div class="container">
                    <p>Hi {recipient_name},</p>
                    <p>Here are the latest updates for this week:</p>
                    {newsletter_body_html}
                    <hr>
                    <p class="footer">To unsubscribe, please reply to this email.</p>
                </div>
            </body>
            </html>
            """

            message = MIMEMultipart()
            message['From'] = f"SimpliDOTS Tech Updates <{sender_email}>"
            message['To'] = recipient_email
            message['Subject'] = "Weekly Tech Release Notes Digest"

            # Attach the body with the correct subtype
            message.attach(MIMEText(full_html_content, 'html'))

            try:
                print(f"Sending email to {recipient_name} at {recipient_email}...")
                server = smtplib.SMTP('smtp.gmail.com', 587)
                server.starttls()
                server.login(sender_email, sender_password)
                server.sendmail(sender_email, recipient_email, message.as_string())
                server.quit()
                sent_count += 1
                time.sleep(2)
            except Exception as e:
                print(f"❌ Failed to send email to {recipient_email}. Error: {e}")

        return f"Successfully sent personalized HTML emails to {sent_count}/{len(recipient_list)} recipients."

# Instantiate the final tool
email_tool = CSVEmailTool()

## Define Agents

In [47]:
from crewai import Agent, Task

analyst_agent = Agent(role="Principal Technology Analyst", goal="Analyze provided release notes to identify critical updates.", backstory="You are an expert analyst who extracts impactful information.", llm=llm, verbose=True)

expert_agent = Agent(role="Expert Tech Newsletter Writer", goal="Craft an engaging newsletter from an analyst's report.", backstory="You are a famous tech writer known for making complex topics exciting.", llm=llm, verbose=True)

dispatcher_agent = Agent(
    role="Communications Dispatch Officer",
    goal="Use the email tool to send the newsletter to all recipients defined in the system's data file.",
    backstory="You are a reliable specialist ensuring important updates are dispatched correctly.",
    tools=[email_tool],
    llm=llm,
    verbose=True
)

analysis_task = Task(
    description=(
        "Analyze the following text which contains software release notes from the last week. "
        "Identify and list all significant new features and announcements. "
        "Focus on what would be most relevant to a software development team.\n\n"
        "HERE IS THE TEXT TO ANALYZE:\n"
        "---"
        "{context}"
        "---"
    ),
    expected_output="A bullet-point list summarizing key updates, categorized by product. This will be the main body of the newsletter.",
    agent=analyst_agent
)

summarization_task = Task(
    description=(
        "Take the analyst's report and transform it into a polished, professional HTML newsletter body. "
        "Use <h2> tags for main sections (e.g., '🚀 New Features', '🐞 Bug Fixes'), "
        "and an unordered list (<ul> with <li> items) for the details in each section. "
        "Make the title of each list item bold using <strong> tags. "
        "Do NOT include the <html>, <head>, or <body> tags, only the content that goes inside the body."
    ),
    expected_output=(
        "A string containing the well-formatted HTML for the newsletter body, starting with an <h2> tag."
    ),
    agent=expert_agent,
    context=[analysis_task]
)

email_task = Task(
    description="Take the composed newsletter body and use the Personalized Email Dispatcher tool. The tool will automatically find the recipients in the CSV file and send the emails.",
    expected_output="A confirmation message stating how many emails were successfully sent.",
    agent=dispatcher_agent,
    context=[summarization_task]
)

## Assemble the Crew

In [48]:
from crewai import Crew, Process

release_notes_crew = Crew(
    agents=[analyst_agent, expert_agent, dispatcher_agent],
    tasks=[analysis_task, summarization_task, email_task],
    process=Process.sequential,
    verbose=True
)

# Kick off the crew's work
print("Kicking off the CSV-Powered Release Notes Crew...")
if newsletter_context != "No new release notes found in the last 21 days.":
    try:
        result = release_notes_crew.kickoff(inputs={'context': newsletter_context})
        print("\\n\\nCrew execution finished successfully!")
        print("\\nFinal Result:")
        print(result)
    except Exception as e:
        print(f"\\n\\nAn error occurred during crew execution: {e}")
else:
    print("No recent documents found to process. The crew will not run.")

Kicking off the CSV-Powered Release Notes Crew...


Output()

Output()

Output()

Output()

\n\nCrew execution finished successfully!
\nFinal Result:
Successfully sent personalized HTML emails to 3/3 recipients.
