### Note:
- At the bottom of this notebook, it is just an example of a file. Everything needs to be tested (Test with many different files).
- This notebook is not a final file. It is just a draft file.
- The text before added to Pinecone is not perfect in this draft. It needs to be improved.
- Before running the code, please install the necessary libraries.

In [1]:
from langchain.schema import Document
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
import re
from langchain.text_splitter import CharacterTextSplitter
import warnings
import pdfplumber
from spellchecker import SpellChecker
warnings.filterwarnings("ignore")

In [2]:
pdf_path = "../data/prway.pdf"

In [3]:
def detect_headers_and_footers(pdf_path):
    header = ""
    footers = []
    ref_link = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            txt = page.extract_text_lines()
            
            if not header:
                header = txt[0]['text']
                
            footer = txt[-1]['text']
            if footer not in footers:
                footers.append(footer)
                
            ref_link = footer.split()[0]
                
    return header, footers, ref_link

In [4]:
def clean_content(content, txt_removed=None, header=None, footers=None):
    content = content.encode('utf-8').decode('utf-8')
    content = re.sub(r'[^\x00-\x7F]+', '', content)  # Remove non-ASCII characters
    content = re.sub(r'\n', ' ', content)            # Replace newlines with spaces
    content = re.sub(r'Date modified: \d\d\d\d-\d\d-\d\d', '', content)  # Remove date modified
    
    if txt_removed is not None:
        for txt in txt_removed:
            content = content.replace(txt, '')

    content = re.sub(r'\s+', ' ', content)           # Replace multiple spaces with a single space

    if header is not None and footers is not None:
        content = content.replace(header, '')            # Remove header
        for footer in footers:
            content = content.replace(footer, '')        # Remove footers

    return content

In [5]:
def extract_hyperlinks(pdf_path):
    hyperlinks = []
    with pdfplumber.open(pdf_path) as pdf:
        for _, page in enumerate(pdf.pages):
            for annotation in page.annots:
                if str(annotation['uri']).startswith(r'http://') or str(annotation['uri']).startswith(r'https://'):
                    uri = annotation.get("uri", None)
                    if uri:
                        # Get the bounding box coordinates for the link
                        x0, y0, x1, y1 = annotation['x0'], annotation['top'], annotation['x1'], annotation['bottom']
                        
                        # Extract text within the bounding box
                        text_content = ""
                        for char in page.chars:
                            if x0 <= char['x0'] and char['x1'] <= x1 and y0 <= char['top'] and char['bottom'] <= y1:
                                text_content += char.get('text', '')
                        
                        hyperlink = {
                            "uri": uri,
                            "text": text_content.strip()
                        }
                        hyperlinks.append(hyperlink)
    return hyperlinks

In [6]:
def check_tables(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        # Iterate over all pages in the PDF
        for page_num, page in enumerate(pdf.pages, start=1):
            # Extract tables from the current page
            tables = page.find_tables()

            # If no tables are found on the page, continue to the next page
            if not tables:
                continue

            # Check each table on the page
            for table in tables:
                # Get the number of rows and columns in the table
                num_rows = len(table.rows)
                num_columns = len(table.rows[0].cells) if table.rows else 0

                # Adjusted condition to check for at least 2 rows and 2 columns
                if num_rows >= 2 and num_columns >= 2:
                    return True  # Return True immediately if a matching table is found

    return False

In [7]:
def extract_table_content(pdf_path):
    is_headers = False
    headers = []
    table_content = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                if not table:
                    continue
                
                for row in table:
                    if not is_headers:
                        headers = row
                        is_headers = True
                    else:
                        table_content.append(dict(zip(headers, row)))
    
    # Clean table content
    cleaned_table_content = [
        {clean_content(key): clean_content(value) for key, value in entry.items()} for entry in table_content
    ]
    
    # Remove empty entries
    cleaned_table_content = [entry for entry in cleaned_table_content if any(value.strip() for value in entry.values())]
    return cleaned_table_content

In [8]:
def detect_section_with_content(pdf_path, skip_tags=None, category=None, header=None, footers=None):
    if skip_tags is None:
        skip_tags = []
    else:
        skip_tags = [tag.lower() for tag in skip_tags]
    
    sections_with_content = []
    current_section = None
    current_subsection = None
    current_content = []
    subsection_indexes = {}
    section_index = None
    
    
    SECTION_MIN_SIZE = 28
    SECTION_MAX_SIZE = 29
    SUBSECTION_MIN_SIZE = 26
    SUBSECTION_MAX_SIZE = 27
    is_section = False
    is_subsection = False
    

    with pdfplumber.open(pdf_path) as pdf:
        if not check_tables(pdf_path):
            is_section = [(line['chars'][0]['fontname'] == 'CAAAAA+Lato-Bold' and SECTION_MIN_SIZE <= line['chars'][0]['size'] < SECTION_MAX_SIZE) for page in pdf.pages for line in page.extract_text_lines()]
            is_subsection = [(line['chars'][0]['fontname'] == 'CAAAAA+Lato-Bold' and SUBSECTION_MIN_SIZE <= line['chars'][0]['size'] < SUBSECTION_MAX_SIZE) for page in pdf.pages for line in page.extract_text_lines()]
            for page_num, page in enumerate(pdf.pages):
                text_with_coords = page.extract_text_lines()
                for line in text_with_coords:
                    text = line['text']
                    
                    if line['chars'][0]['fontname'] == 'CAAAAA+Lato-Bold' and SECTION_MIN_SIZE <= line['chars'][0]['size'] < SECTION_MAX_SIZE:
                        if current_section and current_content:
                            sections_with_content[-1]['content'] = ' '.join(current_content).strip()
                        if text.lower() not in skip_tags:  # Skip section if in skip_tags
                            current_section = line['text']
                            current_content = []
                            if category != None:
                                sections_with_content.append({
                                    'tags': category,
                                    'section': current_section,
                                    'subsections': [],
                                    'content': ''
                                })
                            else:
                                sections_with_content.append({
                                    'section': current_section,
                                    'subsections': [],
                                    'content': ''
                                })
                            section_index = len(sections_with_content) - 1
                            subsection_indexes = {}
                    elif line['chars'][0]['fontname'] == 'CAAAAA+Lato-Bold' and SUBSECTION_MIN_SIZE <= line['chars'][0]['size'] < SUBSECTION_MAX_SIZE:
                        if current_subsection and current_content:
                            subsection_content = current_subsection + ": " + clean_content(' '.join(current_content).strip(), header=header, footers=footers)
                            if current_subsection in subsection_indexes:
                                sections_with_content[section_index]['subsections'][subsection_indexes[current_subsection]]['content'] = subsection_content
                            else:
                                if current_subsection not in skip_tags:  # Skip subsection if in skip_tags
                                    sections_with_content[section_index]['subsections'].append({
                                        'content': subsection_content
                                    })
                                    subsection_indexes[current_subsection] = len(sections_with_content[section_index]['subsections']) - 1
                        current_subsection = line['text']
                        current_content = []
                    elif current_subsection != "On this page":
                        current_content.append(line['text'])
                    
                        
                    if current_subsection and current_content:
                        subsection_content = current_subsection + ": " + clean_content(' '.join(current_content).strip(), header=header, footers=footers)
                        if current_subsection in subsection_indexes:
                            sections_with_content[section_index]['subsections'][subsection_indexes[current_subsection]]['content'] = subsection_content
                        else:
                            if current_subsection not in skip_tags:  # Skip subsection if in skip_tags
                                sections_with_content[section_index]['subsections'].append({
                                    'content': subsection_content
                                })
                                subsection_indexes[current_subsection] = len(sections_with_content[section_index]['subsections']) - 1   
                    elif current_section and current_content:
                        sections_with_content[-1]['content'] = clean_content(' '.join(current_content).strip(), header=header, footers=footers)
                        
                    
                    # handle no section or subsection detected
                    if not any(is_section) and not any(is_subsection) and current_content:
                        current_section = "Other Resources"
                        sections_with_content.append({
                            'section': current_section,
                            'subsections': [],
                            'content': clean_content(' '.join(current_content).strip(), header=header, footers=footers),
                        })
            return sections_with_content
        
        else:
            is_section = [(line['chars'][0]['fontname'] == 'CAAAAA+Lato-Bold' and SECTION_MIN_SIZE <= line['chars'][0]['size'] < SECTION_MAX_SIZE) for page in pdf.pages for line in page.extract_text_lines()]
            is_subsection = [(line['chars'][0]['fontname'] == 'CAAAAA+Lato-Bold' and SUBSECTION_MIN_SIZE <= line['chars'][0]['size'] < SUBSECTION_MAX_SIZE) for page in pdf.pages for line in page.extract_text_lines()]
            for page in pdf.pages:
                bboxes = [table for table in page.find_tables()]
                if not bboxes:
                    for line in page.extract_text_lines():
                        text = line['text']
                        if text not in skip_tags:
                            # Check if line is a section
                            if line['chars'][0]['fontname'] == 'CAAAAA+Lato-Bold' and SECTION_MIN_SIZE <= line['chars'][0]['size'] < SECTION_MAX_SIZE:
                                if current_section and current_content:
                                    sections_with_content[-1]['content'] = ' '.join(current_content).strip()
                                if text.lower() not in skip_tags:
                                    current_section = text
                                    current_content = []
                                    if category != None:
                                        sections_with_content.append({
                                            'tags': category,
                                            'section': current_section,
                                            'subsections': [],
                                            'table_content': [],
                                            'content': ''
                                        })
                                    else:
                                        sections_with_content.append({
                                            'section': current_section,
                                            'subsections': [],
                                            'table_content': [],
                                            'content': ''
                                        })
                                    section_index = len(sections_with_content) - 1
                                    subsection_indexes = {}
                            
                            # Check if line is a subsection
                            elif line['chars'][0]['fontname'] == 'CAAAAA+Lato-Bold' and SUBSECTION_MIN_SIZE <= line['chars'][0]['size'] < SUBSECTION_MAX_SIZE:
                                if current_subsection and current_content:
                                    subsection_content = current_subsection + ": " + clean_content(' '.join(current_content).strip(), header=header, footers=footers)
                                    if current_subsection in subsection_indexes:
                                        sections_with_content[section_index]['subsections'][subsection_indexes[current_subsection]]['content'] = subsection_content
                                    else:
                                        if current_subsection not in skip_tags:
                                            sections_with_content[section_index]['subsections'].append({
                                                'content': subsection_content
                                            })
                                            subsection_indexes[current_subsection] = len(sections_with_content[section_index]['subsections']) - 1
                                current_subsection = text
                                current_content = []
                            elif current_subsection != "On this page":
                                current_content.append(text)
                        if current_subsection and current_content:
                            subsection_content = current_subsection + ": " + clean_content(' '.join(current_content).strip(), header=header, footers=footers)
                            if current_subsection in subsection_indexes:
                                sections_with_content[section_index]['subsections'][subsection_indexes[current_subsection]]['content'] = subsection_content
                            else:
                                if current_subsection not in skip_tags:
                                    sections_with_content[section_index]['subsections'].append({
                                        'content': subsection_content
                                    })
                                    subsection_indexes[current_subsection] = len(sections_with_content[section_index]['subsections']) - 1
                        elif current_section and current_content:
                            sections_with_content[-1]['content'] = clean_content(' '.join(current_content).strip(), header=header, footers=footers)
                            
                    if not any(is_section) and not any(is_subsection) and current_content:
                        current_section = "Other Resources"
                        sections_with_content.append({
                            'section': current_section,
                            'subsections': [],
                            'content': clean_content(' '.join(current_content).strip(), header=header, footers=footers),
                            'table_content': [],
                        })
                else:
                    table_content = None
                    table_content_extracted = set()
                    bboxes = []
                    tables = page.find_tables()
                    for table in tables:
                        bboxes.append(table.bbox)
                    for table_bbox in bboxes:
                        if table_bbox not in table_content_extracted:  # Check if the bbox is already processed
                            table_extraction = extract_table_content(pdf_path)  # Extract table content
                            if table_extraction:
                                # Avoid adding duplicate table content
                                if table_extraction not in sections_with_content[section_index]['table_content']:
                                    sections_with_content[section_index]['table_content'].append(table_extraction)
                                table_content_extracted.add(table_bbox)
                    # # Extract content outside of tables
                    for line in page.extract_text_lines():
                        text = line['text']
                        line_bbox = (line['x0'], line['top'], line['x1'], line['bottom'])
                            
                        # Make sure the line is not part of a table
                        if not any(
                            bbox[0] <= line_bbox[2] and bbox[2] >= line_bbox[0] and
                            bbox[1] <= line_bbox[3] and bbox[3] >= line_bbox[1]
                            for bbox in bboxes
                        ):
                            if text not in skip_tags:
                                if line['chars'][0]['fontname'] == 'CAAAAA+Lato-Bold' and SECTION_MIN_SIZE <= line['chars'][0]['size'] < SECTION_MAX_SIZE:
                                    if current_section and current_content:
                                        sections_with_content[-1]['content'] = ' '.join(current_content).strip()
                                    if text.lower() not in skip_tags:
                                        current_section = text
                                        current_content = []
                                        if category != None:
                                            sections_with_content.append({
                                                'tags': category,
                                                'section': current_section,
                                                'subsections': [],
                                                'table_content': [],
                                                'content': ''
                                            })
                                        else:
                                            sections_with_content.append({
                                                'section': current_section,
                                                'subsections': [],
                                                'table_content': [],
                                                'content': ''
                                            })
                                        section_index = len(sections_with_content) - 1
                                        subsection_indexes = {}
                                elif line['chars'][0]['fontname'] == 'CAAAAA+Lato-Bold' and SUBSECTION_MIN_SIZE <= line['chars'][0]['size'] < SUBSECTION_MAX_SIZE:
                                    if current_subsection and current_content:
                                        subsection_content = current_subsection + ": " + clean_content(' '.join(current_content).strip(), header=header, footers=footers)
                                        if current_subsection in subsection_indexes:
                                            sections_with_content[section_index]['subsections'][subsection_indexes[current_subsection]]['content'] = subsection_content
                                        else:
                                            if current_subsection not in skip_tags:
                                                sections_with_content[section_index]['subsections'].append({
                                                    'content': subsection_content
                                                })
                                                subsection_indexes[current_subsection] = len(sections_with_content[section_index]['subsections']) - 1
                                    current_subsection = text
                                    current_content = []
                                elif current_subsection != "On this page":
                                    current_content.append(text)
                            if current_subsection and current_content:
                                subsection_content = current_subsection + ": " + clean_content(' '.join(current_content).strip(), header=header, footers=footers)
                                if current_subsection in subsection_indexes:
                                    sections_with_content[section_index]['subsections'][subsection_indexes[current_subsection]]['content'] = subsection_content
                                else:
                                    if current_subsection not in skip_tags:
                                        sections_with_content[section_index]['subsections'].append({
                                            'content': subsection_content
                                        })
                                        subsection_indexes[current_subsection] = len(sections_with_content[section_index]['subsections']) - 1
                            elif current_section and current_content:
                                sections_with_content[-1]['content'] = clean_content(' '.join(current_content).strip(), header=header, footers=footers)
                    if not any(is_section) and not any(is_subsection) and current_content:
                        current_section = "Other Resources"
                        sections_with_content.append({
                            'section': current_section,
                            'subsections': [],
                            'content': clean_content(' '.join(current_content).strip(), header=header, footers=footers),
                            'table_content': table_content,
                        })
            return sections_with_content

In [9]:
def combine_subsection_content(sections):
    for section in sections:
        subsection_content = ' '.join([subsection['content'] for subsection in section['subsections']])
        section['embedding_text'] = section['section'] + " > " + section['content'] + " " + subsection_content + "\n\n"
        
        # Remove content and subsections
        del section['content']
        del section['subsections']
    return sections

In [10]:
def combine_table_content(sections, pdf_path):
    if check_tables(pdf_path):
        paragraphs = []
        for section in sections:
            for tables in section['table_content']:
                for table in tables:
                    paragraph = "\n -".join([f"{key.strip()}: {value.strip()}" for key, value in table.items()])
                    paragraphs.append(paragraph)
            
            section['embedding_text'] += "\n\n".join(paragraphs)
            del section['table_content']   
        return sections
    else:
        return sections

In [11]:
def filter_hyperlinks(hyperlinks, section):
    content = section['embedding_text']
    filtered_hyperlinks = []
    for hyperlink in hyperlinks:
        if hyperlink['text'].lower() in content.lower() and hyperlink['text'] != '':
            filtered_hyperlinks.append(hyperlink)
    return filtered_hyperlinks

In [12]:
def finalize_document(hyperlinks, sections, ref_link):
    for section in sections:
        section['hyperlinks'] = filter_hyperlinks(hyperlinks, section)
        section['ref_link'] = ref_link
        del section['section']
    return sections

In [13]:
def data_preprocessing(pdf_path, skip_tags=None, category=None):
    headers, footers, ref_link = detect_headers_and_footers(pdf_path)
    hyperlinks = extract_hyperlinks(pdf_path)
    sections = detect_section_with_content(pdf_path, skip_tags=skip_tags, category=category, header=headers, footers=footers)
    sections = combine_subsection_content(sections)
    sections = combine_table_content(sections, pdf_path)
    docs = finalize_document(hyperlinks, sections, ref_link)
    return docs

#### Testing functions:

In [14]:
test = data_preprocessing(pdf_path, skip_tags=["On this page", "Study permit", "Work in Canada after you graduate"], category=["Study Permit"])

In [15]:
final_docs = []
for doc in test:
    final_doc = Document(page_content=doc['embedding_text'], metadata={'tags': doc['tags'], 'hyperlinks': doc['hyperlinks'], 'ref_link': doc['ref_link']})
    final_docs.append(final_doc)

In [17]:
for doc in final_docs:
    print(doc)

page_content='Find your path to permanent residence > Youve studied in Canada and maybe you even have Canadian work experience. Now, youd like to live here permanently. We have options for you to become a permanent resident! The Come to Canada tool can help you explore your options. You can also use the cheat sheet below to compare programs. Visit the programs eligibility page to get all the details. Before you compare programs, here are two key terms you need to know:   Canadian Language Benchmark (CLB) The Canadian standard used to describe, measure and recognize English language ability of adult immigrants and prospective immigrants who plan to live and work in Canada, or apply for citizenship. The Niveaux de comptence linguistique canadiens (NCLC) is used to assess abilities in the French language. National Occupation Code (NOC) The National Occupation Classification is a list of all the occupations in the Canadian labour market. It describes each job according to training, educati