In [121]:
from langchain.schema import Document
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
import re
from langchain.text_splitter import CharacterTextSplitter
import warnings
import pdfplumber
from spellchecker import SpellChecker
warnings.filterwarnings("ignore")

In [122]:
# PDF handed to every extraction function below. The path is relative, so it
# resolves against the notebook's working directory.
pdf_path = "../data/Study permit_ Who can apply - Canada.ca.pdf"

In [124]:
# Font-size windows used by detect_section_with_content to recognise headings.
# A line counts as a section/subsection heading when its first character is set
# in 'CAAAAA+Lato-Bold' and MIN <= size < MAX (the upper bound is exclusive).
# NOTE(review): the values look tuned to this particular PDF export -- confirm
# before reusing them on other documents.
SECTION_MIN_SIZE = 28.4
SECTION_MAX_SIZE = 28.6
SUBSECTION_MIN_SIZE = 26.9
SUBSECTION_MAX_SIZE = 27

In [125]:
def detect_headers_and_footers(pdf_path):
    """Collect the running header, the distinct footers and the reference link of a PDF.

    The first text line of a page is treated as its header and the last text
    line as its footer. Pages that yield no text lines (blank or image-only
    pages) are skipped instead of raising ``IndexError``.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A ``(header, footers, ref_link)`` tuple where

        * ``header`` is the first line of the first page that has a non-empty
          first line, or ``""`` if no page has one;
        * ``footers`` lists the distinct last-line texts in order of first
          appearance;
        * ``ref_link`` is the first whitespace-separated token of the footer of
          the last page that has a non-blank footer, or ``""`` if there is none.
    """
    header = ""
    footers = []
    ref_link = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            lines = page.extract_text_lines()
            if not lines:
                # Nothing extractable on this page: lines[0] / lines[-1] would raise.
                continue

            if not header:
                header = lines[0]['text']

            footer = lines[-1]['text']
            if footer not in footers:
                footers.append(footer)

            # A whitespace-only footer has no tokens; keep the previous ref_link.
            tokens = footer.split()
            if tokens:
                ref_link = tokens[0]

    return header, footers, ref_link

In [126]:
# `header` and `footers` are read as notebook-level globals by clean_content,
# so this cell has to run before any cell that calls it.
header, footers, ref_link = detect_headers_and_footers(pdf_path)

In [127]:

def clean_content(content, txt_removed=None):
    """Strip page furniture and boilerplate from text extracted from the PDF.

    Removes, in this order: the private-use glyph U+E107, newlines (turned into
    spaces), the page header, the hard-coded "Mail delivery is resuming" banner,
    any caller-supplied text, the "Date modified" stamp and every known footer.
    Runs of whitespace are then collapsed to a single space. The result is not
    stripped, so a leading or trailing space can remain.

    Reads the notebook-level globals ``header`` and ``footers``; the cell that
    defines them must have run first.

    Args:
        content: Raw text to clean.
        txt_removed: Optional extra literal text to delete -- either one string
            or an iterable of strings. Matching is exact and case-sensitive.
            ``None`` (the default) removes only the built-in boilerplate.

    Returns:
        The cleaned text.
    """
    txt_replace = "Mail delivery is resuming but there will be delays We’re still processing applications normally, but there may still be delays with sending applications mailed within or to Canada getting mail from us We still encourage you to apply online if possible. If you need to apply on paper, use a courier to send us your application more quickly."

    if txt_removed is None:
        extra_texts = []
    elif isinstance(txt_removed, str):
        extra_texts = [txt_removed]
    else:
        extra_texts = list(txt_removed)

    content = content.replace('\ue107', '')
    content = content.replace('\n', ' ')
    content = content.replace(f'{header}', '')
    content = content.replace(txt_replace, '')
    for text in extra_texts:
        if text:
            content = content.replace(text, '')
    content = re.sub(r'\ue092 Date modified: \d\d\d\d-\d\d-\d\d', '', content)
    for footer in footers:
        content = content.replace(footer, '')
    # Collapse last so the gaps left by the removals above disappear too.
    content = re.sub(r'\s+', ' ', content)
    return content

In [128]:
def extract_hyperlinks(pdf_path):
    """Return each URI link annotation of the PDF with the text it covers.

    An annotation is kept when the string form of its ``Subtype`` is
    ``"/'Link'"`` and it carries a truthy ``uri``. Its text is rebuilt from the
    page characters that lie entirely inside the annotation's bounding box
    (``x0``/``top``/``x1``/``bottom``), in the order ``page.chars`` yields them.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list of ``{"uri": ..., "text": ...}`` dicts in page and annotation
        order; ``text`` is stripped and may be empty.
    """
    def _covered_text(page, left, upper, right, lower):
        # Concatenate the characters fully contained in the rectangle.
        return "".join(
            glyph.get('text', '')
            for glyph in page.chars
            if left <= glyph['x0'] and glyph['x1'] <= right and upper <= glyph['top'] and glyph['bottom'] <= lower
        )

    found = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for annot in page.annots:
                if str(annot['data']["Subtype"]) != "/'Link'":
                    continue
                target = annot.get("uri", None)
                if not target:
                    continue
                label = _covered_text(page, annot['x0'], annot['top'], annot['x1'], annot['bottom'])
                found.append({"uri": target, "text": label.strip()})
    return found

In [129]:
def _flush_buffer(sections, section_index, subsection, subsection_indexes, buffer, skip_tags):
    """Store the buffered lines on the active subsection, or on the section itself when none is active."""
    if section_index is None or not buffer:
        # Nothing collected, or the text precedes the first kept section.
        return
    cleaned = clean_content(' '.join(buffer).strip())
    section = sections[section_index]
    if not subsection:
        section['content'] = cleaned
        return
    entry_text = subsection + ": " + cleaned
    if subsection in subsection_indexes:
        # Same subsection title seen again in this section: overwrite its entry.
        section['subsections'][subsection_indexes[subsection]]['content'] = entry_text
    elif subsection not in skip_tags:
        section['subsections'].append({'content': entry_text})
        subsection_indexes[subsection] = len(section['subsections']) - 1


def detect_section_with_content(pdf_path, skip_tags=None, category=None):
    """Split a PDF into sections and subsections based on heading font and size.

    A line is a heading when its first character uses the font
    ``'CAAAAA+Lato-Bold'`` and its size falls in
    ``[SECTION_MIN_SIZE, SECTION_MAX_SIZE)`` (section) or
    ``[SUBSECTION_MIN_SIZE, SUBSECTION_MAX_SIZE)`` (subsection). Every other
    line is body text and is buffered until the next heading or the end of the
    document, then cleaned with ``clean_content`` and stored.

    Behaviour worth knowing:

    * Text before the first kept section is discarded.
    * Body text under a subsection titled ``"On this page"`` is ignored.
    * A section heading listed in ``skip_tags`` is ignored altogether: it does
      not start a new section, and the lines after it stay attached to
      whatever section/subsection was already open.
    * A subsection whose title is in ``skip_tags`` is not stored.
    * Starting a new section closes the open subsection, so a section's intro
      text lands in its own ``'content'``.

    Relies on the notebook-level names ``pdfplumber``, ``clean_content`` and
    the four ``*_SIZE`` constants.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.
        skip_tags: Heading texts to skip (exact match). Defaults to none.
        category: Optional value stored under ``'tag'`` in every section dict;
            the key is omitted when this is ``None``.

    Returns:
        A list of dicts, one per kept section, with keys ``'section'`` (heading
        text), ``'subsections'`` (list of ``{'content': "<title>: <text>"}``)
        and ``'content'`` (cleaned text between the section heading and its
        first subsection, ``''`` if none), preceded by ``'tag'`` when
        ``category`` is given.
    """
    if skip_tags is None:
        skip_tags = []

    sections_with_content = []
    section_index = None  # stays None until the first kept section heading
    current_subsection = None
    current_content = []
    subsection_indexes = {}

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for line in page.extract_text_lines():
                text = line['text']
                first_char = line['chars'][0]
                heading_font = first_char['fontname'] == 'CAAAAA+Lato-Bold'
                size = first_char['size']

                if heading_font and SECTION_MIN_SIZE <= size < SECTION_MAX_SIZE:
                    if text in skip_tags:
                        # Skipped section heading: leave all state untouched.
                        continue
                    _flush_buffer(sections_with_content, section_index, current_subsection,
                                  subsection_indexes, current_content, skip_tags)
                    entry = {'section': text, 'subsections': [], 'content': ''}
                    if category is not None:
                        entry = {'tag': category, **entry}
                    sections_with_content.append(entry)
                    section_index = len(sections_with_content) - 1
                    # A subsection never outlives the section it was found in.
                    current_subsection = None
                    current_content = []
                    subsection_indexes = {}
                elif heading_font and SUBSECTION_MIN_SIZE <= size < SUBSECTION_MAX_SIZE:
                    _flush_buffer(sections_with_content, section_index, current_subsection,
                                  subsection_indexes, current_content, skip_tags)
                    current_subsection = text
                    current_content = []
                elif current_subsection != "On this page":
                    current_content.append(text)

    # Store whatever was still buffered when the document ended.
    _flush_buffer(sections_with_content, section_index, current_subsection,
                  subsection_indexes, current_content, skip_tags)
    return sections_with_content

In [130]:
# Parse the PDF into sections. Headings whose text is exactly 'Study permit'
# are skipped, and every section dict is tagged with 'Study permit'.
test = detect_section_with_content(pdf_path, skip_tags=['Study permit'], category='Study permit')

In [131]:
# Inspect the parsed sections.
test

[{'tag': 'Study permit',
  'section': 'Who can apply',
  'subsections': [{'content': 'Eligibility requirements: You can come to Canada to study if you are enrolled at a DLI prove you have enough money to pay for your tuition fees living expenses for yourself and any family members who come with you to Canada and return transportation for yourself and any family members who come with you to Canada obey the law, have no criminal record and get a police certificate (if required) are in good health and get a medical exam (if required) and prove to an officer that you will leave Canada when your study permit expires'},
   {'content': 'Your responsibilities as a student: While studying in Canada you must make progress towards completing your program respect any conditions listed on your study permit stop studying if you no longer meet the requirements Depending on your case, there may be conditions on your study permit such as if you’re allowed to work in Canada the specific date you must le

In [132]:
def combine_subsection_content(sections):
    """Collapse each section's text into a single ``'embedding_text'`` field.

    For every section dict the value is built as
    ``section + " > " + content + " " + <subsection contents joined by spaces>``,
    after which the ``'content'`` and ``'subsections'`` keys are removed. The
    dicts are modified in place and the same list is returned.

    The call is idempotent: a dict that already has ``'embedding_text'`` and no
    longer has ``'content'`` or ``'subsections'`` is left untouched, so
    re-running the notebook cell on an already-combined list does not raise
    ``KeyError``.

    Args:
        sections: List of section dicts as produced by
            ``detect_section_with_content``.

    Returns:
        The ``sections`` list itself.
    """
    for section in sections:
        already_combined = (
            'embedding_text' in section
            and 'content' not in section
            and 'subsections' not in section
        )
        if already_combined:
            continue

        subsection_content = ' '.join(
            subsection['content'] for subsection in section.get('subsections', [])
        )
        section['embedding_text'] = (
            section['section'] + " > " + section.get('content', '') + " " + subsection_content
        )

        # Drop the source fields only after embedding_text was built successfully.
        section.pop('content', None)
        section.pop('subsections', None)
    return sections

In [133]:
# combine_subsection_content edits the dicts in place and returns the same
# list, so `combine_test` and `test` refer to the same objects from here on.
combine_test = combine_subsection_content(test)

In [134]:
# Inspect the combined sections (one 'embedding_text' per section).
combine_test

[{'tag': 'Study permit',
  'section': 'Who can apply',
  'embedding_text': 'Who can apply >  Eligibility requirements: You can come to Canada to study if you are enrolled at a DLI prove you have enough money to pay for your tuition fees living expenses for yourself and any family members who come with you to Canada and return transportation for yourself and any family members who come with you to Canada obey the law, have no criminal record and get a police certificate (if required) are in good health and get a medical exam (if required) and prove to an officer that you will leave Canada when your study permit expires Your responsibilities as a student: While studying in Canada you must make progress towards completing your program respect any conditions listed on your study permit stop studying if you no longer meet the requirements Depending on your case, there may be conditions on your study permit such as if you’re allowed to work in Canada the specific date you must leave Canada w

In [135]:
# Notebook-level list of {'uri', 'text'} dicts; finalize_document reads it as a
# global, so this cell has to run before finalize_document is called.
hyperlinks = extract_hyperlinks(pdf_path)

In [136]:
def filter_hyperlinks(hyperlinks, doc):
    """Keep the hyperlinks whose anchor text occurs in the document text.

    Matching is a case-insensitive substring test of each link's ``'text'``
    against ``doc['embedding_text']``; links with empty text are dropped.

    Args:
        hyperlinks: Iterable of dicts that each have a ``'text'`` string.
        doc: Dict with an ``'embedding_text'`` string.

    Returns:
        A new list with the matching hyperlink dicts, in their original order.
    """
    haystack = doc['embedding_text'].lower()
    return [
        link
        for link in hyperlinks
        if link['text'].lower() in haystack and link['text'] != ''
    ]

In [137]:
# Links whose anchor text occurs in the first section's embedding text.
sample = filter_hyperlinks(hyperlinks, combine_test[0])

In [138]:
def finalize_document(doc):
    """Attach the hyperlinks that belong to ``doc`` and hand the dict back.

    The notebook-level ``hyperlinks`` list is narrowed with
    ``filter_hyperlinks`` and stored under ``doc['hyperlinks']``. ``doc`` is
    modified in place; the returned object is ``doc`` itself.

    Args:
        doc: Section dict containing an ``'embedding_text'`` string.

    Returns:
        The same ``doc`` with a ``'hyperlinks'`` key added or replaced.
    """
    matching_links = filter_hyperlinks(hyperlinks, doc)
    doc['hyperlinks'] = matching_links
    return doc

In [139]:
# Adds a 'hyperlinks' key to combine_test[0] in place; `final` is that same dict.
final = finalize_document(combine_test[0])

In [140]:
# Inspect the finished document for the first section.
final

{'tag': 'Study permit',
 'section': 'Who can apply',
 'embedding_text': 'Who can apply >  Eligibility requirements: You can come to Canada to study if you are enrolled at a DLI prove you have enough money to pay for your tuition fees living expenses for yourself and any family members who come with you to Canada and return transportation for yourself and any family members who come with you to Canada obey the law, have no criminal record and get a police certificate (if required) are in good health and get a medical exam (if required) and prove to an officer that you will leave Canada when your study permit expires Your responsibilities as a student: While studying in Canada you must make progress towards completing your program respect any conditions listed on your study permit stop studying if you no longer meet the requirements Depending on your case, there may be conditions on your study permit such as if you’re allowed to work in Canada the specific date you must leave Canada wher