In [52]:
from langchain.schema import Document
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
import re
from langchain.text_splitter import CharacterTextSplitter
import warnings
import pdfplumber
from spellchecker import SpellChecker
warnings.filterwarnings("ignore")

In [53]:
# Source PDF for the study-permit page. The path is relative, so it is
# resolved against the kernel's current working directory.
pdf_path = "../data/Study permit_ Get the right documents - Canada.ca.pdf"

In [54]:
def detect_headers_and_footers(pdf_path):
    """Find the repeated header line and the per-page footer lines of a PDF.

    The first text line of the first page that has text is taken as the
    header. The last text line of every page is taken as that page's footer;
    distinct footers are collected in page order.

    Args:
        pdf_path: Path of the PDF to inspect.

    Returns:
        A ``(header, footers, ref_link)`` tuple where ``header`` is a string
        ("" when no page has text), ``footers`` is a list of distinct footer
        strings, and ``ref_link`` is the first whitespace-separated token of
        the last footer seen ("" when there is none). Presumably this is the
        page URL in a browser-printed PDF; verify against the input file.
    """
    header = ""
    footers = []
    ref_link = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            lines = page.extract_text_lines()
            if not lines:
                # Blank or image-only page: there is no first/last line to read.
                continue

            if not header:
                header = lines[0]['text']

            footer = lines[-1]['text']
            if footer not in footers:
                footers.append(footer)

            # Guard against a whitespace-only footer, which has no tokens.
            tokens = footer.split()
            if tokens:
                ref_link = tokens[0]

    return header, footers, ref_link

In [55]:
# Module-level values: clean_content() reads `header` and `footers` as
# globals, so this cell has to run before clean_content() is called.
header, footers, ref_link = detect_headers_and_footers(pdf_path)

In [56]:
def clean_content(content, txt_removed=None):
    """Strip page furniture and normalise whitespace in extracted PDF text.

    Relies on the module-level ``header`` (string) and ``footers`` (list of
    strings), which must be defined before this function is called.

    Args:
        content: Text to clean.
        txt_removed: Optional iterable of literal strings to delete from the
            text. ``None`` (the default) means nothing extra is removed.

    Returns:
        The text with the U+E107 glyph, the header, every ``txt_removed``
        string, the "Date modified" stamp and every footer removed, and each
        run of whitespace collapsed to a single space. Leading and trailing
        spaces are not trimmed.
    """
    # The default used to be iterated directly, so every call without
    # txt_removed raised TypeError; treat None as "no extra strings".
    if txt_removed is None:
        txt_removed = []

    # Round trip leaves encodable text unchanged; it raises
    # UnicodeEncodeError if the text contains lone surrogates.
    content = content.encode('utf-8').decode('utf-8')
    content = re.sub(r'\ue107', '', content)
    # Newlines become spaces first so multi-line matches below are possible.
    content = re.sub(r'\n', ' ', content)
    content = content.replace(f'{header}', '')
    for txt in txt_removed:
        content = content.replace(txt, '')
    content = re.sub(r'\ue092 Date modified: \d\d\d\d-\d\d-\d\d', '', content)
    for footer in footers:
        content = content.replace(footer, '')
    content = re.sub(r'\s+', ' ', content)
    return content

In [58]:
def extract_hyperlinks(pdf_path):
    """Collect the web links of a PDF together with the text they cover.

    For every annotation whose URI starts with ``http://`` or ``https://``,
    the characters lying fully inside the annotation's bounding box are
    concatenated, in the order the page lists them, to form the link text.

    Args:
        pdf_path: Path of the PDF to scan.

    Returns:
        A list of ``{"uri": ..., "text": ...}`` dicts in page and annotation
        order. ``text`` is stripped and may be empty.
    """
    web_schemes = ('http://', 'https://')
    found = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for annot in page.annots:
                if not str(annot['uri']).startswith(web_schemes):
                    continue
                target = annot.get("uri", None)
                if not target:
                    continue

                # Bounding box of the clickable area.
                left, upper = annot['x0'], annot['top']
                right, lower = annot['x1'], annot['bottom']

                # Keep only characters that sit entirely inside that box.
                label = ''.join(
                    glyph.get('text', '')
                    for glyph in page.chars
                    if left <= glyph['x0'] and glyph['x1'] <= right
                    and upper <= glyph['top'] and glyph['bottom'] <= lower
                )
                found.append({"uri": target, "text": label.strip()})
    return found

In [59]:
# Headings are recognised by the font and size of a line's first character.
_HEADING_FONT = 'CAAAAA+Lato-Bold'
_SECTION_SIZE_RANGE = (28, 29)      # min inclusive, max exclusive
_SUBSECTION_SIZE_RANGE = (26, 27)   # min inclusive, max exclusive


def _heading_level(line):
    """Return 'section', 'subsection' or None from the line's first-character font."""
    chars = line.get('chars') or []
    if not chars or chars[0]['fontname'] != _HEADING_FONT:
        return None
    size = chars[0]['size']
    if _SECTION_SIZE_RANGE[0] <= size < _SECTION_SIZE_RANGE[1]:
        return 'section'
    if _SUBSECTION_SIZE_RANGE[0] <= size < _SUBSECTION_SIZE_RANGE[1]:
        return 'subsection'
    return None


def _new_section_record(title, category=None):
    """Build an empty section record, with a leading 'tag' key when category is given."""
    record = {'section': title, 'subsections': [], 'content': ''}
    if category is not None:
        record = {'tag': category, **record}
    return record


def _store_subsection(section, subsection_indexes, title, body, skip_tags):
    """Write "<title>: <body>" into the section unless the title is in skip_tags."""
    entry = title + ": " + body
    if title in subsection_indexes:
        # A title repeated within one section replaces the earlier entry.
        section['subsections'][subsection_indexes[title]]['content'] = entry
    elif title not in skip_tags:
        section['subsections'].append({'content': entry})
        subsection_indexes[title] = len(section['subsections']) - 1


def detect_section_with_content(pdf_path, skip_tags=None, category=None):
    """Split a PDF into sections and subsections using heading font and size.

    A line whose first character is in the heading font opens a section when
    its size is in [28, 29) and a subsection when its size is in [26, 27).
    Every other line is body text for the most recent heading. Body text is
    joined with spaces and passed through ``clean_content``.

    Args:
        pdf_path: Path of the PDF to parse.
        skip_tags: Heading texts to ignore. A skipped section heading does
            not open a new section, so the text after it keeps flowing into
            whatever was open. The text of a skipped subsection is discarded.
        category: Optional value stored under ``'tag'`` on every section
            opened by a heading.

    Returns:
        A list of dicts in document order, each with ``'section'`` (title),
        ``'subsections'`` (list of ``{'content': "<title>: <text>"}``) and
        ``'content'`` (text between the section heading and its first
        subsection), plus ``'tag'`` when ``category`` is given. Text or
        subsections found before any section heading are placed in an
        untagged section titled "Other Resources". Lines under a subsection
        titled "On this page" are dropped.
    """
    if skip_tags is None:
        skip_tags = []

    sections_with_content = []
    current_subsection = None
    current_content = []
    section_index = None       # position of the open section; None until one exists
    subsection_indexes = {}    # subsection title -> position inside the open section

    def flush():
        """Store the text gathered since the last heading under its owner."""
        nonlocal section_index, subsection_indexes
        if not current_content:
            return
        # txt_removed is passed explicitly so this does not depend on
        # clean_content accepting its None default.
        body = clean_content(' '.join(current_content).strip(), [])
        if section_index is None:
            # Nothing is open yet: fall back to a catch-all section instead
            # of failing on the missing index.
            sections_with_content.append(_new_section_record("Other Resources"))
            section_index = len(sections_with_content) - 1
            subsection_indexes = {}
        section = sections_with_content[section_index]
        if current_subsection is None:
            section['content'] = body
        else:
            _store_subsection(section, subsection_indexes,
                              current_subsection, body, skip_tags)

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for line in page.extract_text_lines():
                text = line['text']
                level = _heading_level(line)

                if level == 'section':
                    if text in skip_tags:
                        continue
                    flush()
                    sections_with_content.append(_new_section_record(text, category))
                    section_index = len(sections_with_content) - 1
                    subsection_indexes = {}
                    # Reset the subsection so the new section's intro text is
                    # not attributed to the previous section's last subsection.
                    current_subsection = None
                    current_content = []
                elif level == 'subsection':
                    flush()
                    current_subsection = text
                    current_content = []
                elif current_subsection != "On this page":
                    current_content.append(text)

        # Store whatever followed the final heading.
        flush()
    return sections_with_content

In [60]:
def combine_subsection_content(sections):
    """Collapse each section into a single ``embedding_text`` string.

    The text is ``"<section> > <content> <subsection contents joined by spaces>"``.
    Each dict in ``sections`` is modified in place: ``embedding_text`` is
    added and the ``content`` and ``subsections`` keys are deleted, so the
    function cannot be applied twice to the same records.

    Args:
        sections: List of section dicts with ``section``, ``content`` and
            ``subsections`` keys.

    Returns:
        The same list object that was passed in.
    """
    for record in sections:
        sub_texts = ' '.join(item['content'] for item in record['subsections'])
        heading = record['section'] + " > " + record['content']
        record['embedding_text'] = ' '.join([heading, sub_texts])

        # The merged text supersedes the structured fields.
        for obsolete_key in ('content', 'subsections'):
            del record[obsolete_key]
    return sections

In [61]:
def filter_hyperlinks(hyperlinks, doc):
    """Select the hyperlinks whose text appears in a document.

    A link is kept when its ``text`` is non-empty and occurs, ignoring case,
    as a substring of ``doc['embedding_text']``.

    Args:
        hyperlinks: List of dicts with a ``text`` key.
        doc: Dict with an ``embedding_text`` key.

    Returns:
        A new list with the matching hyperlink dicts, in their original order.
    """
    content = doc['embedding_text']
    return [
        link
        for link in hyperlinks
        if link['text'].lower() in content.lower() and link['text'] != ''
    ]

In [62]:
def finalize_document(hyperlinks, docs):
    """Attach to every document the hyperlinks that filter_hyperlinks selects for it.

    Each dict in ``docs`` is modified in place by setting its ``hyperlinks``
    key.

    Args:
        hyperlinks: Candidate hyperlink dicts, passed unchanged to
            ``filter_hyperlinks`` for every document.
        docs: List of document dicts.

    Returns:
        The same ``docs`` list that was passed in.
    """
    for record in docs:
        matching = filter_hyperlinks(hyperlinks, record)
        record.update({'hyperlinks': matching})
    return docs

### PENDING: table extraction from `prway.pdf` (work in progress)

In [63]:
# NOTE(review): rebinding pdf_path does not refresh the module-level
# `header`/`footers`, which were computed from the earlier PDF. Confirm that
# is intended before cleaning text taken from this file.
pdf_path = "../data/prway.pdf"

In [68]:
# Collect the result of extract_table() for every page, in page order.
with pdfplumber.open(pdf_path) as pdf:
    sample = [page.extract_table() for page in pdf.pages]

In [69]:
# Display the raw per-page results. Some entries are None (presumably pages
# where extract_table() found no table), and cells still contain the PDF's
# line breaks and private-use glyphs.
sample

[None,
 None,
 [['Progr\nam\n\ue093 \ue094',
   'Langua\nge\nskills\n\ue093 \ue094',
   'Type of\nwork\nexperience\n\ue093 \ue094',
   'Amount of\nwork\nexperience\n\ue093 \ue094',
   'Job\noffer \ue093 \ue094',
   'Education\n\ue093 \ue094'],
  ['Atlant\nic\nImmi\ngratio\nn Pilot',
   'You\nhave\ninterme\ndiate\nEnglish\nor\nFrench\nskills\n(CLB 4)',
   'Not required',
   'Not\nrequired',
   'Required\nJob offer must\nbe TEER\ncategory\n0, 1, 2, 3\nor 4\nlast at\nleast 1\nyear from\nthe date\npermane\nnt\nresidence\nis\ngranted\nbe for an\nemployer\nin Atlantic\nCanada',
   'You must\nhave\ngraduated\nfrom a\npublicly\nfunded\npost-\nsecondary\ninstitution\nin Atlantic\nCanada\nProgram\nmust have\nbeen at\nleast 2\nyears']],
 [['\ue093 \ue094',
   '\ue093 \ue094',
   '\ue093 \ue094',
   '\ue093 \ue094',
   '\ue093 \ue094',
   '\ue093 \ue094'],
  ['Cana\ndian\nExperi\nence\nClass',
   'You\nhave\ninterme\ndiate or\nstrong\nEnglish\nor\nFrench\nskills\n(CLB 7\nif your\nTEER is\n0 or 1)\