In [488]:
from langchain.schema import Document
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
import re
from langchain.text_splitter import CharacterTextSplitter
import warnings
import pdfplumber
from PyPDF2 import PdfReader
warnings.filterwarnings("ignore")

In [489]:
pdf_path = "../data/Study permit_ Get the right documents - Canada.ca.pdf"

In [515]:
reader = PdfReader(pdf_path)
# Loop through pages and extract hyperlink metadata
for page_number, page in enumerate(reader.pages, start=1):
    if "/Annots" in page:
        for annot in page["/Annots"]:
            annotation = annot.get_object()
            if "/A" in annotation and "/URI" in annotation["/A"]:
                # Extract hyperlink URL
                uri = annotation["/A"]["/URI"]
                # Extract bounding rectangle
                rect = annotation.get("/Rect", "N/A")
                
                print(f"Page {page_number}: Hyperlink: {uri}, Rect: {rect}")

Page 1: Hyperlink: https://www.canada.ca/en.html, Rect: [41.25, 677.25, 99.75, 701.25]
Page 1: Hyperlink: https://www.canada.ca/en/services/immigration-citizenship.html, Rect: [117, 677.25, 275.25, 701.25]
Page 1: Hyperlink: https://www.canada.ca/en/immigration-refugees-citizenship/services/study-canada.html, Rect: [292.5, 677.25, 539.25, 701.25]
Page 1: Hyperlink: https://www.canada.ca/en/immigration-refugees-citizenship/services/study-canada/study-permit.html, Rect: [52.5, 645, 125.25, 669]
Page 1: Hyperlink: https://www.canada.ca/en.html, Rect: [39.75, 721.5, 434.25, 756]
Page 1: Hyperlink: https://www.canada.ca/en/immigration-refugees-citizenship/services/study-canada/study-permit/eligibility.html#gc-document-nav, Rect: [65.25, 513.75, 570, 534]
Page 1: Hyperlink: https://www.canada.ca/en/immigration-refugees-citizenship/services/study-canada/study-permit/get-documents/proof-of-acceptance.html#gc-document-nav, Rect: [90.75, 442.5, 570, 462.75]
Page 1: Hyperlink: https://www.canada.

In [514]:
with pdfplumber.open(pdf_path) as pdf:
    sample = []
    
    for i in range(len(pdf.pages)):
        page = pdf.pages[i]
        text = page.extract_text_lines()
        sample.append(text)
        
sample

[[{'text': '1/22/25, 7:09 PM Study permit: Get the right documents - Canada.ca',
   'x0': 23.999999,
   'top': 16.199252352944086,
   'x1': 441.6563654225279,
   'bottom': 24.194251869819027,
   'chars': [{'matrix': (0.74999996875,
      0.0,
      0.0,
      0.74999996875,
      23.999999,
      769.5000009375),
     'fontname': 'DAAAAA+ArialMT',
     'adv': 5.928583833169532,
     'upright': True,
     'x0': 23.999999,
     'y0': 767.805748130181,
     'x1': 28.446436689608902,
     'y1': 775.8007476470559,
     'width': 4.446437689608903,
     'height': 7.994999516874941,
     'size': 7.994999516874941,
     'mcid': None,
     'tag': None,
     'object_type': 'char',
     'page_number': 1,
     'ncs': 'DeviceRGB',
     'text': '1',
     'stroking_color': (0, 0, 0),
     'stroking_pattern': None,
     'non_stroking_color': (0, 0, 0),
     'non_stroking_pattern': None,
     'top': 16.199252352944086,
     'bottom': 24.194251869819027,
     'doctop': 16.199252352944086},
    {'matrix':

In [491]:
SECTION_MIN_SIZE = 28.4
SECTION_MAX_SIZE = 28.6
SUBSECTION_MIN_SIZE = 26.9
SUBSECTION_MAX_SIZE = 27

In [492]:
def detect_headers_and_footers(pdf_path):
    header = ""
    footers = []
    ref_link = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            txt = page.extract_text_lines()
            
            if not header:
                header = txt[0]['text']
                
            footer = txt[-1]['text']
            if footer not in footers:
                footers.append(footer)
                
            ref_link = footer.split()[0]
                
    return header, footers, ref_link

In [493]:
header, footers, ref_link = detect_headers_and_footers(pdf_path)

In [494]:
def clean_content(content):
    content = re.sub(r'\n', ' ', content)
    content = re.sub(r'\s+', ' ', content)
    content = content.replace(f'{header}', '')
    for footer in footers:
        content = content.replace(footer, '')
    return content

In [495]:
def extract_hyperlinks(pdf_path):
    hyperlinks = []
    reader = PdfReader(pdf_path)
    for page_number, page in enumerate(reader.pages, start=1):
        if "/Annots" in page:
            for annot in page["/Annots"]:
                annotation = annot.get_object()
                if "/A" in annotation and "/URI" in annotation["/A"]:
                    uri = annotation["/A"]["/URI"]
                    rect = annotation.get("/Rect", "N/A")
                    hyperlinks.append((page_number, uri, rect))
    return hyperlinks

In [496]:
def format_text_with_links(text, hyperlinks, page_num, line_coords):
    for pnum, uri, rect in hyperlinks:
        if pnum == page_num:
            # Line coordinates (x0, top, x1, bottom)
            line_x0, line_y0, line_x1, line_y1 = line_coords

            # Rect for hyperlink (x0, y0, x1, y1)
            rect_x0, rect_y0, rect_x1, rect_y1 = rect

            # Check if the bounding boxes of the text line and hyperlink overlap
            if (rect_x0 <= line_x1 and rect_x1 >= line_x0 and
                rect_y0 <= line_y1 and rect_y1 >= line_y0):
                # If the bounding boxes overlap, format the text as a hyperlink
                text = f"[{text}]({uri})"
                break
    print(text)
    return text

In [497]:
def detect_section_with_content(pdf_path, skip_tags=None, category=None):
    if skip_tags is None:
        skip_tags = []
    
    sections_with_content = []
    current_section = None
    current_subsection = None
    current_content = []
    subsection_indexes = {}
    
    

    with pdfplumber.open(pdf_path) as pdf:
        hyperlinks = extract_hyperlinks(pdf_path)
        for page_num, page in enumerate(pdf.pages):
            text_with_coords = page.extract_text_lines()
            for line in text_with_coords:
                text = line['text']
                bbox = (line['x0'], line['top'], line['x1'], line['bottom'])
                text = format_text_with_links(text, hyperlinks, page_num, bbox)
                
                if line['chars'][0]['fontname'] == 'CAAAAA+Lato-Bold' and SECTION_MIN_SIZE <= line['chars'][0]['size'] < SECTION_MAX_SIZE:
                    if current_section and current_content:
                        sections_with_content[-1]['content'] = ' '.join(current_content).strip()
                    if text not in skip_tags:  # Skip section if in skip_tags
                        current_section = text
                        current_content = []
                        if category != None:
                            sections_with_content.append({
                                'tag': category,
                                'section': current_section,
                                'subsections': [],
                                'content': ''
                            })
                        else:
                            sections_with_content.append({
                                'section': current_section,
                                'subsections': [],
                                'content': ''
                            })
                        section_index = len(sections_with_content) - 1
                        subsection_indexes = {}
                elif line['chars'][0]['fontname'] == 'CAAAAA+Lato-Bold' and SUBSECTION_MIN_SIZE <= line['chars'][0]['size'] < SUBSECTION_MAX_SIZE:
                    if current_subsection and current_content:
                        subsection_content = current_subsection + ": " + clean_content(' '.join(current_content).strip())
                        if current_subsection in subsection_indexes:
                            sections_with_content[section_index]['subsections'][subsection_indexes[current_subsection]]['content'] = subsection_content
                        else:
                            if current_subsection not in skip_tags:  # Skip subsection if in skip_tags
                                sections_with_content[section_index]['subsections'].append({
                                    'subsection': current_subsection,
                                    'content': subsection_content
                                })
                                subsection_indexes[current_subsection] = len(sections_with_content[section_index]['subsections']) - 1
                    current_subsection = text
                    current_content = []
                elif current_subsection != "On this page":
                    current_content.append(line['text'])
                    
                  
            if current_subsection and current_content:
                subsection_content = current_subsection + ": " + clean_content(' '.join(current_content).strip())
                if current_subsection in subsection_indexes:
                    sections_with_content[section_index]['subsections'][subsection_indexes[current_subsection]]['content'] = subsection_content
                else:
                    if current_subsection not in skip_tags:  # Skip subsection if in skip_tags
                        sections_with_content[section_index]['subsections'].append({
                            'subsection': current_subsection,
                            'content': subsection_content
                        })
                        subsection_indexes[current_subsection] = len(sections_with_content[section_index]['subsections']) - 1
            elif current_section and current_content:
                sections_with_content[-1]['content'] = clean_content(' '.join(current_content).strip())

    return sections_with_content

In [498]:
test = detect_section_with_content(pdf_path, skip_tags=['Study permit'], category='Study permit')

1/22/25, 7:09 PM Study permit: Get the right documents - Canada.ca
Canada.ca  Immigration and citizenship  Study in Canada as an international student
 Study permit
Study permit
Who can apply
Get the right documents
Proof of acceptance
Provincial attestation letter or territorial attestation letter
Proof of identity
Proof of financial support
Other documents
How to apply
After you apply
Prepare for your arrival
While you study
Get the right documents
https://www.canada.ca/en/immigration-refugees-citizenship/services/study-canada/study-permit/get-documents.html#gc-document-nav 1/2
1/22/25, 7:09 PM Study permit: Get the right documents - Canada.ca
You need these documents to apply for a study permit:
proof of acceptance
provincial attestation letter or territorial attestation letter
proof of identity
[proof of financial support](https://www.canada.ca/en/immigration-refugees-citizenship/services/study-canada/study-permit/while-you-study.html#gc-document-nav)
[You may also need](https:/