### Simple RAG with Thedu
> We will build a simple RAG (retrieval-augmented generation) pipeline with thedu. Do not be deceived by how little code there is: a whole bunch of heavy lifting happens under the hood.

In [1]:
from thedu import *
from fastcore.all import *
import re
from selectolax.parser import HTMLParser

#### Ingest a PDF document
> We will ingest a sample PDF document from Bruegel.
> We will read the PDF document using the `read_pdf` function from the `thedu.ingest` module.
> We will then scrape the URLs from the PDF, recursively fetch all PDFs reachable from those links, and ingest them as well.

In [2]:
@patch
def mk_dest(self:Path, add='_1', suffix=None, force=False):
    """Return a destination path derived from `self` by appending `add` to the stem.

    If `self` does not exist it is returned unchanged. Otherwise `add` is
    inserted between the stem and the extension (`suffix` replaces the
    extension when given). With `force=True` the step is repeated until the
    candidate does not exist; with `force=False` the first candidate is
    returned even if it already exists.
    """
    if not self.exists(): return self
    # `/` binds tighter than `+`, so the name must be parenthesised; otherwise
    # this evaluates `Path + str` and raises TypeError.
    dest = self.parent / (self.stem + add + ifnone(suffix, self.suffix))
    return dest.mk_dest(add, suffix, force) if force and dest.exists() else dest

In [9]:
class BruegelDataset:
    ''' Dataset for Bruegel PDF documents.'''
    URL = 'https://www.bruegel.org/system/files/2024-06/Bruegel_factsheet_2024_0.pdf'
    URI_SCHEMA = r'^http://data\.europa\.eu/eli/(?P<typedoc>[^/]+)/(?P<year>\d{4})/(?P<natural_number>\d+)/(?P<date>\d{4}-\d{2}-\d{2})/(?P<lang>[a-z]{2,3})/pdfa2a$'
    def __init__(self, dest:Path=Path('bruegel_dataset')):
        self.dest = dest
        self.pdfs = self()


    @staticmethod
    def _is_pdf_link(l:str): return l.strip() and (l.lower().endswith('.pdf') or 'pdf' in l.lower())
    @staticmethod
    def url2name(url: str) -> str | None:
        if re.match(BruegelDataset.URI_SCHEMA, url):
            m = re.match(BruegelDataset.URI_SCHEMA, url)
            return f"{m['typedoc']}_{m['year']}_{m['natural_number']}_{m['date']}_{m['lang']}.pdf"
        return url.split('/')[-1] if url.split('/')[-1] != '' else url.rstrip('/').split('/')[-1]+'.html'

    def _get_meta(self, r):
        try:
            meta = HTMLParser(r).tags('meta')
            nodes = L([dict2obj(m.attributes) for m in meta]).filter(
                lambda m: 'about' in m
            ).filter(
                lambda m: ('property' in m and m['property'].lower() in ['eli:is_embodied_by','eli:title'])
            ).filter(
                lambda m: ('resource' in m and 'pdfa2a' in m['resource'].lower()) or 'content' in m
            ).groupby('about')
            for k in nodes: nodes[k] = merge(*nodes[k])
            return L([(nodes[n]['resource'], nodes[n]['content']) for n in nodes])
        except: pass


    def read_link(self, l, dest=None):
        try:
	        p = self.save_pdf(l, dest)
	        return p.read_text()
        except: return ''

    def save_pdf(self, l:str, dest:Path=None) -> Path | None:
        try:
            if not l : return None
            if not dest: dest = self.dest
            p = dest / self.url2name(l)
            if not (p.exists() and p.stat().st_size > 1024): p = urlsave(l,p)
            return p
        except Exception as ex: print(ex); return None

    def __call__(self) -> L:
        '''Make a dataset of documents from a pdf url and all linked pdfs.'''

        def get_linked_pdfs(doc, filter=True):
            links = doc.map(lambda p: p.links.map(lambda l: l.uri)).concat().unique()
            if filter: links = links.filter(self._is_pdf_link)
            return links

        pth = self.dest / self.url2name(self.URL)
        if not pth.exists(): pth = urlsave(self.URL, self.dest / self.url2name(self.URL))
        main_doc = read_pdf(pth)
        pdf_lp = Path(self.dest / 'pdf_links')
        if not pdf_lp.exists():
            links = get_linked_pdfs(main_doc, filter=False)
            print(f'Found {len(links)} linked pdfs.')
            all_c = parallel(self.read_link, links, threadpool=True, dest=self.dest/'links')
            pdf_links = parallel(self._get_meta, all_c, threadpool=True).concat().unique()
            print(f'Found {len(pdf_links)} external pdf links.')
            pdf_lp.mk_write('\n'.join([f'{l[0]},{l[1]}' for l in pdf_links]))
        url2tit = {l.split(',')[0]: l.split(',')[1] for l in pdf_lp.readlines()}
        pdf_list = L([l.split(',')[0] for l in pdf_lp.readlines()])
        name2url = {self.url2name(l): l for l in pdf_list}
        pdf_set = set(pdf_list.map(self.url2name))
        downloaded_pdfs = globtastic(self.dest / 'pdfs', file_glob='*.pdf', func=Path).filter(lambda m: m.stat().st_size > 1024).map(lambda p: p.name)
        fetch_list = [name2url.get(d) for d in pdf_set.difference(downloaded_pdfs)]
        if fetch_list:
            print(f'Downloading {len(fetch_list)} new pdfs...')
            parallel(self.save_pdf, fetch_list, dest=self.dest/'pdfs')
        return globtastic(self.dest / 'pdfs', file_glob='*.pdf', func=Path).map(lambda m: AttrDict(path=m, title=url2tit[name2url[m.name]]))

In [10]:
# Instantiating runs the whole pipeline: `__init__` calls the instance and stores the
# result in `B.pdfs`. Files are cached under the default `bruegel_dataset` folder.
B = BruegelDataset()



Downloading 417 new pdfs...


  return _extra.JM_make_textpage_dict(tp, page_dict, raw)


In [None]:
from chonkie import RecursiveChunker

In [None]:
# Create a chonkie RecursiveChunker with its default settings (no arguments passed).
chunker = RecursiveChunker()

In [None]:
# Read the first PDF of the dataset, join the `text_plain` of each item returned by
# `read_pdf` (presumably one per page) with newlines, and chunk the combined text.
chunks = chunker('\n'.join(read_pdf(B.pdfs[0].path).map(lambda p: p.text_plain)))

In [None]:
# Preview the first three chunks, each sliced to its first 500 elements (presumably
# characters; assumes chunk objects support slicing), followed by a '---' separator.
for c in chunks[:3]: print(c[:500], '\n---\n')