# Summarizer Proof of Concept

In [1]:
%load_ext autoreload

%autoreload 2

In [2]:
import hashlib
import httpx
import json
import time
from datetime import datetime, timedelta
from pathlib import Path

import cleanurl
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tqdm.auto import tqdm
from bs4 import BeautifulSoup


In [3]:
started_at = time.time()

In [4]:
nb_path = Path()

In [5]:
load_dotenv(nb_path / "../.env", verbose=True)

True

In [6]:
# document identifier is needed for reference to vector DB
def generate_document_id(url: str | None, content: str | None) -> str:
    """Generate unique document identifier based on URL or cleaned content"""
    MAX_CHARS = 240
    
    # if that's a URL, clean it from trackers
    # if no, make lowercase
    if url:
        cleaned = cleanurl.cleanurl(url.strip())
        # strip schema
        cleaned = cleaned.schemeless_url
    elif content:
        cleaned = content.strip().lower()
    else:
        raise ValueError("URL and Content is empty")
    
    truncated = cleaned[:MAX_CHARS]
    hash_object = hashlib.sha1(truncated.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:10]
    return document_id

## Load data

In [7]:
from readability import Document

class ContentIsVideoError(Exception):
    pass

def fetch_url_content(client, url, truncate_words: int = 500) -> str:
    if "youtube.com" in url or "youtu.be" in url:
        raise ContentIsVideoError("YouTube video content is not available.")

    response = client.get(url, follow_redirects=True)
    # soup = BeautifulSoup(response.text, 'html.parser')
    
    doc = Document(response.content)
    soup = BeautifulSoup(doc.summary(), 'html.parser')
    
    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()
    
    # Get text
    text = soup.get_text()
    
    # Break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    
    # Break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    
    # Drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    
    # Truncate to first 500 words
    if truncate_words:
        words = text.split()[:truncate_words]

        return ' '.join(words)
    
    return text

In [8]:
def fetch_meta_description(client, url):
    try:
        response = client.get(url, follow_redirects=True, timeout=10.0)
        soup = BeautifulSoup(response.text, 'html.parser')
        meta_desc = soup.find('meta', attrs={'name': 'description'}) or soup.find('meta', attrs={'property': 'og:description'})
        return meta_desc['content'] if meta_desc else "No description available."
    except Exception as e:
        return f"Error fetching meta description: {str(e)}"

In [9]:
def fetch_wayback_content(client, url, truncate_words: int = 500):
    try:
        if "youtube.com" in url or "youtu.be" in url:
            raise ContentIsVideoError("YouTube video content is not available.")
        
        # First, we need to get the latest snapshot from Wayback Machine
        wb_url = f"http://archive.org/wayback/available?url={url}"
        response = client.get(wb_url, follow_redirects=True)
        data = response.json()
        
        if "archived_snapshots" in data and "closest" in data["archived_snapshots"]:
            snapshot_url = data["archived_snapshots"]["closest"]["url"]
            
            # Now fetch the content from the snapshot
            response = client.get(snapshot_url)
            # print(response.status_code)
            # print(response.is_error)
            # print(response.text)
            
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Remove script and style elements
            for script in soup(["script", "style"]):
                script.decompose()
            
            # Get text
            text = soup.get_text()
            
            # Break into lines and remove leading and trailing space on each
            lines = (line.strip() for line in text.splitlines())
            
            # Break multi-headlines into a line each
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            
            # Drop blank lines
            text = '\n'.join(chunk for chunk in chunks if chunk)
            
            # Truncate to N words
            words = text.split()[:truncate_words]
            return ' '.join(words)
        else:
            print("No Wayback Machine snapshot available.")
        
            return None
    except Exception as e:
        print(f"Error fetching Wayback content: {str(e)}")
    
        return None


In [10]:
def fetch_hn_comments(client, item_id, max_comments=10):
    url = f"https://hacker-news.firebaseio.com/v0/item/{item_id}.json"
    response = client.get(url)
    item_data = response.json()
    
    comments = []
    if "kids" in item_data:
        for comment_id in item_data["kids"][:max_comments]:
            comment_url = f"https://hacker-news.firebaseio.com/v0/item/{comment_id}.json"
            comment_response = client.get(comment_url)
            comment_data = comment_response.json()
            if comment_data.get("text"):
                comments.append({
                    "author": comment_data.get("by", "Anonymous"),
                    "text": comment_data["text"],
                    "time": datetime.fromtimestamp(comment_data["time"]).isoformat()
                })
    
    return comments

def fetch_lobsters_comments(client, short_id, max_comments=10):
    url = f"https://lobste.rs/s/{short_id}.json"
    response = client.get(url)
    story_data = response.json()
    
    comments = []
    for comment in story_data.get("comments", [])[:max_comments]:
        comments.append({
            "author": comment.get("commenting_user", {}),
            "text": comment["comment"],
            "time": comment["created_at"]
        })
    
    return comments

In [11]:
hn_dump_file = "hn_news.json"
lr_dump_file = "lr_news.json"

In [12]:
def get_stories_to_download(stories: list, source: str) -> list:
    """Get list of stories for a source that need to be downloaded
    
    Absolutely dumb implementation.
    """
    stories_to_download = []
    
    if source == "Hacker News":
        story_ids = [str(story_id) for story_id in stories]
        try:
            with open(hn_dump_file, "r") as fp:
                stored_news = json.load(fp)
                    
        except (FileNotFoundError, json.JSONDecodeError):
            stored_news = []
            
    elif source == "Lobsters":
        story_ids = [str(story["short_id"]) for story in stories]
        try:
            with open(lr_dump_file, "r") as fp:
                stored_news = json.load(fp)
        except (FileNotFoundError, json.JSONDecodeError):
            stored_news = []
        
    else:
        raise ValueError("Unknown source")
    
    stored_ids = [news_item["original_id"] for news_item in stored_news]
    
    for story_id in story_ids:
        if story_id not in stored_ids:
            stories_to_download.append(story_id)
            
    return stories_to_download
    

In [13]:
def fetch_hacker_news(
        scope: str, 
        count: int = 10,
        max_comments: int = 10,
        truncate_words: int = 1000,) -> list[dict]:
    with httpx.Client() as client:
        if scope == "hottest":
            url = "https://hacker-news.firebaseio.com/v0/topstories.json"
        elif scope == "newest":
            url = "https://hacker-news.firebaseio.com/v0/newstories.json"
        else:
            raise ValueError(f"Unknown scope: {scope}")

        response = client.get(url)
        stories = response.json()[:count]  # Get top n stories
        
        stories_to_download = get_stories_to_download(stories, source="Hacker News") 
        
        news_items = []
        for story_id in stories_to_download:
            story_url = f"https://hacker-news.firebaseio.com/v0/item/{story_id}.json"
            story_response = client.get(story_url)
            story_data = story_response.json()
            
            item_url = story_data.get("url", f"https://news.ycombinator.com/item?id={story_id}")
            
            meta_description = fetch_meta_description(client, item_url)
            
            content = story_data.get("text")
            if not content and item_url:
                try:
                    content = fetch_wayback_content(client, item_url, truncate_words=truncate_words)
                    if not content:
                        content = fetch_url_content(client, item_url, truncate_words=truncate_words)
                except ContentIsVideoError:
                    pass
                except Exception as exc:
                    # print(f"Error fetching content for {item_url}: {str(exc)}, fallback to Wayback Machine.")
                    pass 
                
                if not content:
                    content = story_data.get("title")
            
            comments = fetch_hn_comments(client, story_id, max_comments)
            
            news_items.append({
                "original_id": story_id,
                "title": story_data["title"],
                "url": item_url,
                "score": story_data["score"],
                "timestamp": datetime.fromtimestamp(story_data["time"]).isoformat(),
                "source": "Hacker News",
                "content": content,
                "comments": comments,
                "description": meta_description,
                "document_uid": generate_document_id(item_url, content),
            })
        
        return news_items


In [14]:
def fetch_lobsters_comments_and_content(client, short_id, max_comments=10):
    url = f"https://lobste.rs/s/{short_id}.json"
    response = client.get(url)
    story_data = response.json()
    
    comments = []
    for comment in story_data.get("comments", [])[:max_comments]:
        comments.append({
            "author": comment.get("commenting_user", {}),
            "text": comment["comment"],
            "time": comment["created_at"]
        })
    
    # Check if it's a text post and extract the content
    content = None
    if story_data.get("url") == f"https://lobste.rs/s/{short_id}":
        content = story_data.get("description", "")
        # Convert HTML to plain text
        soup = BeautifulSoup(content, 'html.parser')
        content = soup.get_text()
    
    return comments, content

In [15]:
def fetch_lobsters_news(
        scope: str, 
        count: int = 10,
        max_comments: int = 10,
        truncate_words: int = 1000) -> list[dict]:
    with httpx.Client() as client:
        if scope == "hottest":
            url = "https://lobste.rs/hottest.json"
        elif scope == "newest":
            url = "https://lobste.rs/newest.json"
        else:
            raise ValueError(f"Unknown scope: {scope}")
        
        response = client.get(url)
        stories = response.json()[:count]  # Get top `count` stories
        
        stories_to_download = get_stories_to_download(stories, source="Lobsters") 
        
        news_items = []
        for story in stories:
            if str(story["short_id"]) not in stories_to_download:
                continue
            
            content = None
            meta_description = story.get("description", "")
            
            if meta_description:
                # this is not an URL, but a text post
                content = meta_description
            else:
                meta_description = fetch_meta_description(client, story["url"])
            
            # comments, content = fetch_lobsters_comments_and_content(client, story["short_id"])
            
            comments = fetch_lobsters_comments(client, story["short_id"], max_comments)
            
            if not content and story["url"]:
                try:
                    content = fetch_wayback_content(client, story["url"], truncate_words=truncate_words)
                    if not content:
                        content = fetch_url_content(client, story["url"], truncate_words=truncate_words)
                except ContentIsVideoError:
                    pass
                except Exception as exc:
                    print(f"Error fetching content for {story['url']}: {str(exc)}, fallback to Wayback Machine.")
            
            if not content:
                content = story["title"]
            # comments = fetch_lobsters_comments(client, story["short_id"])
            
            news_items.append({
                "original_id": story["short_id"],
                "title": story["title"],
                "url": story["url"],
                "score": story["score"],
                "timestamp": story["created_at"],
                "source": "Lobsters",
                "content": content,
                "comments": comments,
                "description": meta_description,
                "document_uid": generate_document_id(story["url"], content),
            })
        
        return news_items

If files with data do not exist, fetch initial big date dump.
If they exist, get smaller chunks.

In [16]:
INIT_CHUNK_SIZE = 100
ONGOING_CHUNK_SIZE = 20

In [17]:
def load_stored(file_path: str) -> list:
    """Load stored dumps"""
    stored = []
    try:
        with open(file_path, "r") as fp:
            stored = json.load(fp)
    except (FileNotFoundError, json.JSONDecodeError):
        pass
    
    return stored

stored_hn = load_stored(hn_dump_file)
stored_lr = load_stored(lr_dump_file)

In [18]:
def append_news(news: list[dict], dump_file: str) -> int:
    """Append news to the dumped news in a file
    
    Returns:
        int number of news items stored
    """
    stored = load_stored(dump_file)

    if isinstance(stored, list):
        stored.extend(news)
    else:
        stored = news[:]

    with open(dump_file, "w") as fp:
        # add timestamp if not exists
        for doc in stored:
            if "ingest_utctime" not in doc:
                doc["ingest_utctime"] = int(time.time())

        json.dump(stored, fp, indent=2)
        
    return len(stored)

In [19]:
is_first_run = True
if stored_hn and stored_lr:
    is_first_run = False

In [20]:
hn_news = fetch_hacker_news(
    scope="newest", 
    count=INIT_CHUNK_SIZE if not stored_hn else ONGOING_CHUNK_SIZE
)

print(f"Number of downloaded HN news: {len(hn_news)}")

append_news(hn_news, hn_dump_file)

Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Error fetching Wayback content: YouTube video content is not available.
Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Error fetching Wayback content: Expectin

220

In [21]:
lr_news = fetch_lobsters_news(
    scope="newest", 
    count=INIT_CHUNK_SIZE if not stored_lr else ONGOING_CHUNK_SIZE
)

print(f"Number of downloaded LR news: {len(lr_news)}")

append_news(lr_news, lr_dump_file)


Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Error fetching Wayback content: YouTube video content is not available.
Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Error fetching Wayback content: Expecting value: line 1 column 1 (char 0)
Number of downloaded LR news: 8


70

## Prepare data

In [22]:
if is_first_run:
    stored_hn = load_stored(hn_dump_file)
    stored_lr = load_stored(lr_dump_file)
    
    documents = stored_hn + stored_lr
    
else:
    documents = hn_news + lr_news

In [23]:
len(hn_news), len(lr_news), len(documents)

(20, 8, 28)

## Derive embeddings

In [24]:
import embeddings



In [25]:
embedding_dim = embeddings.get_dimensions()
embedding_dim

768

In [26]:
len(documents)

28

In [27]:
# create embeddings using the pre-trained model
operations = []
for doc in tqdm(documents, desc="Creating embeddings"):
    # Transforming text into an embedding using the model
    doc["vector"] = embeddings.get_embeddings(doc["content"])
    operations.append(doc)
    
operations[0]

Creating embeddings:   0%|          | 0/28 [00:00<?, ?it/s]

{'original_id': '41983217',
 'title': "Internet.nl's Website Connection Checker",
 'url': 'https://internet.nl/',
 'score': 1,
 'timestamp': '2024-10-29T13:32:51',
 'source': 'Hacker News',
 'content': '//matomo.internet.nl/ 1 *.internet.nl Home Modern Internet Standards provide for more reliability and further growth of the Internet. Are you using them? Test your website Modern address? Signed domain? Secure connection? Route authorisation? About the test Test your email Modern address? Anti-phishing? Secure transport? Route authorisation? About the test Test your connection Modern addresses reachable? Domain signatures validated? About the test',
 'comments': [],
 'description': 'Test for modern Internet Standards IPv6, DNSSEC, HTTPS, HSTS, DMARC, DKIM, SPF, STARTTLS, DANE, RPKI and security.txt',
 'document_uid': '5d02412e26',
 'ingest_utctime': 1730205416,
 'vector': [0.024734877049922943,
  -0.07642491906881332,
  0.0172008965164423,
  -0.018484903499484062,
  0.02280009537935257,

In [28]:
len(operations)

28

## Store embeddings

In [29]:
MAX_CONTENT_VECTORIZED = 1000 * 5 # take approx all the saved text
collection_name = "llm_summarizer_poc"
collection_db_path = "./milvus_summarizer.db"

In [30]:
from vectordb import MilvusClientFix, create_schema, create_index_params

milvus_client = MilvusClientFix.get_instance(collection_db_path)


In [31]:
if is_first_run and milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

In [32]:

milvus_client.create_collection(
    collection_name=collection_name,
    dimension=embedding_dim,
    schema=create_schema("LLM Summarizer PoC", embedding_dim, MAX_CONTENT_VECTORIZED),
    index_params=create_index_params(milvus_client),
    metric_type="IP",  # Inner product distance
    consistency_level="Strong",  # Strong consistency level
)

In [33]:
# refactor operations to match the schema
operations = [
    {
        "document_uid": doc["document_uid"],
        "text": doc["content"],
        "ingest_utctime": doc["ingest_utctime"],
        "vector": doc["vector"]
    }
    for doc in operations
]

In [34]:
milvus_client.insert(collection_name=collection_name, data=operations)

{'insert_count': 28, 'ids': ['5d02412e26', '0f55c8f9d5', 'c1e854bf8b', '2a53d15402', 'dfda474c05', '07bfb36abf', 'a12f183a95', 'f68851a1a3', '7009f88d74', '05f9ee8312', '25a5677797', '46157be0e5', 'ec824f74e6', '2864ecf2bc', '54c8c460ec', '258f43d72b', 'c69ecb53ea', '5b9c8a5415', '3f43861a6e', '371a65e5b3', '983e766483', 'eabc60d283', '65c0f20f8e', '9af020bd53', '5b864236bb', '29bbc787c6', '971589a11c', '6e93c4728c'], 'cost': 0}

In [35]:
milvus_client.close()

## Complete the ingestion and provide stats

In [36]:
stored_hn = load_stored(hn_dump_file)
stored_lr = load_stored(lr_dump_file)

In [37]:
completed_at = time.time()

time_spent = (completed_at - started_at)

print(f"Completed at {datetime.now()}, execution took ~{int(time_spent / 60)} min")
print(f"Number of stored HN entries: {len(stored_hn)}")
print(f"Number of stored Lobste.rs entries: {len(stored_lr)}")

Completed at 2024-10-29 13:37:26.982016, execution took ~1 min
Number of stored HN entries: 220
Number of stored Lobste.rs entries: 70
