In [7]:
import typing

from bs4 import BeautifulSoup
from icecream import ic
import requests
import spacy

In [8]:
# Upper bound on the size of a text chunk. `make_chunk` compares it against
# the summed `len()` of the stripped sentences, so the unit is characters
# (the newlines used to join sentences are not counted).
CHUNK_SIZE: int = 1024

# Name of the spaCy pipeline package passed to `spacy.load` below.
SPACY_MODEL: str = "en_core_web_md"

# Pipeline loaded once here; later cells call it to parse text into a `Doc`.
nlp: spacy.Language = spacy.load(SPACY_MODEL)

What is the ideal text chunk size? 
See <https://www.llamaindex.ai/blog/evaluating-the-ideal-chunk-size-for-a-rag-system-using-llamaindex-6207e5d3fec5>

In [11]:
def make_chunk (
    doc: spacy.tokens.doc.Doc,
    chunk_id: int,
    chunk_size: typing.Optional[ int ] = None,
    ) -> int:
    """
Split the given document into text chunks and print each one, returning
the next unused chunk index.

Sentences are taken from `doc.sents`, stripped, and accumulated until
adding another sentence would push the running character count past the
size limit. At that point the current chunk is printed and a new chunk is
started which repeats the previous sentence, so that consecutive chunks
overlap by one sentence.

    doc:
the parsed document; only its `sents` iterable is used

    chunk_id:
index to assign to the first chunk emitted by this call

    chunk_size:
maximum number of characters per chunk, counting sentence text only;
defaults to the module-level `CHUNK_SIZE` when omitted

    returns:
the index which follows the last chunk emitted, or `chunk_id` unchanged
when the document has no sentences
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE

    chunks: typing.List[ str ] = []
    chunk_total: int = 0
    prev_line: str = ""

    for sent in doc.sents:
        line: str = str(sent).strip()
        line_len: int = len(line)

        # only roll over when there is something to emit: a first sentence
        # which is longer than the limit must not produce an empty chunk
        if chunks and (chunk_total + line_len) > chunk_size:
            # emit current chunk
            print("--- chunk_id: ", chunk_id)
            print("\n".join(chunks))
            print()

            # make a new chunk, repeating the previous sentence as overlap
            chunks = [ prev_line, line ]
            chunk_total = len(prev_line) + line_len
            chunk_id += 1
        else:
            # append to current chunk
            chunks.append(line)
            chunk_total += line_len

        prev_line = line

    if not chunks:
        # no sentences at all: nothing to emit, so no index is consumed
        return chunk_id

    # emit last chunk
    print("--- chunk_id: ", chunk_id)
    print("\n".join(chunks))

    return chunk_id + 1

In [12]:
# Running chunk index, threaded through `make_chunk` so that ids keep
# increasing across all of the fetched documents.
chunk_id: int = 0

# Seconds to wait on the server before giving up; without a timeout
# `requests.get` may block indefinitely on a stalled connection.
REQUEST_TIMEOUT: float = 10.0

url_list: typing.List[ str ] = [
    "https://www.theguardian.com/society/article/2024/jul/31/eating-processed-red-meat-could-increase-risk-of-dementia-study-finds",
]


for url in url_list:
    response: requests.Response = requests.get(url, timeout = REQUEST_TIMEOUT)

    # fail loudly on an HTTP error instead of chunking an error page
    response.raise_for_status()

    # name the parser explicitly: otherwise BeautifulSoup picks whichever
    # one happens to be installed and emits a GuessedAtParserWarning
    soup: BeautifulSoup = BeautifulSoup(response.text, "html.parser")

    # join the text of every <p> element, one paragraph per line, and parse
    doc: spacy.tokens.doc.Doc = nlp("\n".join([
        para.text.strip()
        for para in soup.find_all("p")
    ]))

    chunk_id = make_chunk(doc, chunk_id)

--- chunk_id:  0
US researchers say they have uncovered potential link after tracking 130,000 people over four decades
Eating processed red meat could be a significant risk factor for dementia, according to a large study that tracked more than 100,000 people over four decades.
Processed red meat has previously been shown to increase the risk of cancer, heart disease and type 2 diabetes.
Now US researchers say they have uncovered a potential link to dementia.
The study also found that replacing processed red meat with healthier foods such as nuts, beans or tofu could help reduce the risk of dementia.
The findings were presented at the Alzheimer’s Association international conference in the US.
The number of people living with dementia globally is forecast to nearly triple to 153 million by 2050, and studies looking at diet and risk of cognitive decline has become a focus of researchers.
In the latest research, experts studied the health of 130,000 nurses and other health workers working