# Section Extraction on Three Sample Documents

This notebook builds off the *Section Extraction - 1 Document* notebook to implement section extraction at scale on a small number (3) of sample documents.

Use this repo's [GitHub Codespace](https://codespaces.new/CamDuffy1/rag_bootcamp) to run the notebook with all required dependencies installed.
  - Specifically, the executable <code>[wkhtmltopdf](https://wkhtmltopdf.org/)</code> comes installed in the GitHub Codespace configuration. It is needed to convert webpages to PDF documents using the <code>[pdfkit](https://pypi.org/project/pdfkit/)</code> library.
  - These PDF documents can be too large (>25 MB) to store in the GitHub repo.

#### Download S-1 PDF Documents and Create Truncated Versions

In [1]:
# SEC EDGAR URLs of the S-1 filings to process, keyed by a short company name.
# The key is reused later as the stem of the generated PDF file name.
s1_url_dict = dict(
    reddit='https://www.sec.gov/Archives/edgar/data/1713445/000162828024006294/reddits-1q423.htm',
    facebook='https://www.sec.gov/Archives/edgar/data/1326801/000119312512034517/d287954ds1.htm',
    turo='https://www.sec.gov/Archives/edgar/data/1514587/000119312522005696/d145731ds1.htm',
)

In [12]:
import pdfkit
import os

# Directory (under the current working directory) where the converted PDFs are stored.
current_path = os.getcwd()
output_dir = 'webpage_to_pdf'
output_dir_path = os.path.join(current_path, output_dir)

# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs(output_dir_path, exist_ok=True)

# Render each S-1 web page to "<key>.pdf". pdfkit relies on the wkhtmltopdf
# executable being installed. Files that already exist are skipped so that
# re-running the cell does not download and convert them again.
for key, url in s1_url_dict.items():
    file_name = f"{key}.pdf"
    output_file_path = os.path.join(output_dir_path, file_name)

    if os.path.exists(output_file_path):
        print(f'File already exists at: {output_file_path}')
    else:
        pdfkit.from_url(url, output_file_path)

In [53]:
# Truncate downloaded PDFs and add them to Truncated dir
import fitz

# Directory that receives the truncated copies of the downloaded PDFs.
truncated_dir = os.path.join(output_dir_path, 'Truncated')
os.makedirs(truncated_dir, exist_ok=True)    # no-op when the directory already exists

for item in os.listdir(output_dir_path):
    item_path = os.path.join(output_dir_path, item)     # Construct the full path of the item

    # Skip sub-directories (including Truncated itself) and anything that is not a PDF.
    if not (os.path.isfile(item_path) and item_path.lower().endswith('.pdf')):
        continue

    filename, ext = os.path.splitext(os.path.basename(item_path))
    truncated_filename = f"{filename} - Truncated - TOC{ext}"
    truncated_out_path = os.path.join(truncated_dir, truncated_filename)  # path to save the truncated file

    # Existing truncated copies are kept as-is so the cell can be re-run cheaply.
    if os.path.exists(truncated_out_path):
        print(f"Truncated file already exists at: {truncated_out_path}")
        continue

    # The context managers close both documents even if the copy or save raises.
    with fitz.open(item_path) as source_doc, fitz.open() as truncated_doc:
        # Page numbers are zero-based and inclusive, so 0..9 copies the first ten pages.
        # NOTE(review): assumes the table of contents sits within those pages - confirm per filing.
        truncated_doc.insert_pdf(source_doc, from_page=0, to_page=9)
        truncated_doc.save(truncated_out_path)
        print(f"Saved truncated doc at: {truncated_out_path}")


Truncated file already exists at: /workspaces/rag_bootcamp/document_search/webpage_to_pdf/Truncated/reddit - Truncated - TOC.pdf
Truncated file already exists at: /workspaces/rag_bootcamp/document_search/webpage_to_pdf/Truncated/turo - Truncated - TOC.pdf
Truncated file already exists at: /workspaces/rag_bootcamp/document_search/webpage_to_pdf/Truncated/facebook - Truncated - TOC.pdf


#### Extract Table of Contents from Truncated Documents and Generate List of Sections

In [54]:
# load Cohere API key from environment variables in .env file
from dotenv import load_dotenv
import os
# Location of the .env file holding the API keys.
# NOTE: this absolute path matches the repo's Codespace checkout (/workspaces/rag_bootcamp).
dotenv_path = '/workspaces/rag_bootcamp/.env'

# Statements nested in an if/else are not echoed as cell output, so no dummy
# assignment to `_` (IPython's "last output" variable) is needed to silence them.
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path)
else:
    print(f'No file found at: {dotenv_path}\nPlease create a .env file and include your API keys.')

In [143]:
# Imports for the LLM, the embedding model and the service context that bundles them.
from llama_index.core import ServiceContext
from llama_index.embeddings.cohere import CohereEmbedding
from llama_index.llms.litellm import LiteLLM

# llama_index.llms.cohere does not support command-r model -- Use LiteLLM instead
llm = LiteLLM(model="command-r", temperature=0)

# Cohere embedding model; the API key is read from the COHERE_API_KEY environment variable.
embed_model = CohereEmbedding(
    model_name="embed-english-v3.0",
    input_type="search_document",
    cohere_api_key=os.environ['COHERE_API_KEY'],
)

# Bundle the LLM and the embedding model, with a chunk size of 1024, for index construction.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    chunk_size=1024,
)

  service_context = ServiceContext.from_defaults(


In [179]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.postprocessor.cohere_rerank import CohereRerank
import ast    # Built-in Abstract Syntax Trees module - Can convert a string containing a list to a list object


def _parse_section_list(raw_text):
    """Convert the LLM's answer into a Python object (expected: a list of section titles).

    The whole answer is first evaluated as a Python literal. If that fails
    (for example because the model wrapped the list in a code fence or added
    a sentence around it), the text from the first '[' to the last ']' is
    evaluated instead.

    Args:
        raw_text: Text returned by the query engine; may be None or empty.

    Returns:
        The object produced by ``ast.literal_eval``.

    Raises:
        ValueError: If the text contains no bracketed list at all.
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when the
            bracketed span is not a valid Python literal.
    """
    text = (raw_text or '').strip()
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError):
        pass    # fall through to the more lenient bracket-based extraction

    start, end = text.find('['), text.rfind(']')
    if start == -1 or end < start:
        raise ValueError(f"No Python list found in LLM response: {text!r}")
    return ast.literal_eval(text[start:end + 1])


sections_dict = {}  # instantiate dict to store lists of extracted sections from each document -- Format: {'company1': ['section1', 'section2', ...]}
prompt = '''
    Find the document's table of contents then list the sections in it as string elements in a Python list.
    Your output should include nothing else. 'Table of Contents' should not be included as an item in the Python list.
'''

# The reranker configuration does not depend on the document, so it is built once.
cohere_rerank = CohereRerank(
    top_n=5
)

for item in os.listdir(truncated_dir):
    truncated_path = os.path.join(truncated_dir, item)     # Construct the full path of the truncated file

    # Only the truncated PDFs should be indexed; skip stray files and directories.
    if not (os.path.isfile(truncated_path) and item.lower().endswith('.pdf')):
        continue

    # "reddit - Truncated - TOC.pdf" -> "reddit": the text before the first space is the dict key.
    filename, ext = os.path.splitext(os.path.basename(truncated_path))
    filename = filename.split(' ')[0]

    reader = SimpleDirectoryReader(input_files=[truncated_path])
    documents = reader.load_data()    # get truncated document contents

    truncated_index = VectorStoreIndex.from_documents(documents, service_context=service_context, show_progress=False)

    # Retrieve the 8 most similar chunks, then let the reranker keep the best 5.
    query_engine = truncated_index.as_query_engine(
        node_postprocessors=[cohere_rerank],
        similarity_top_k=8,
    )
    response = query_engine.query(prompt)

    document_sections = _parse_section_list(response.response)
    sections_dict[filename] = document_sections

In [182]:
# display sample output of PDF sections for each doc
for key, value in sections_dict.items():
    print(f"{key} S-1 sections:")
    # Slicing never raises, so a document with fewer than four sections still prints.
    for section in value[:4]:
        print(f"    {section}")
    print('    ...')

turo S-1 sections:
    Prospectus summary
    Risk factors
    Special note regarding forward-looking statements
    Market, industry, and other data
    ...
reddit S-1 sections:
    Letter From Our Co-Founder
    Prospectus Summary
    Risk Factors
    Special Note Regarding Forward-Looking Statements
    ...
facebook S-1 sections:
    Prospectus Summary
    Risk Factors
    Special Note Regarding Forward-Looking Statements
    Industry Data and User Metrics
    ...


#### Get the Section Body

In [202]:
from utils import get_item_after, get_words_from_PDF, get_section    # helper functions defined in utils.py

# Title of the section to extract from every document.
target_section = 'Risk Factors'

for key, value in sections_dict.items():
    # Look up the section that follows the target in THIS document's own section list
    # (`value`). Titles are lowercased first because capitalisation differs between
    # filings (e.g. "Risk factors" vs "Risk Factors"); the end marker is lowercased
    # before use anyway.
    next_section = get_item_after([section.lower() for section in value], target_section.lower())
    if not next_section:
        # NOTE(review): assumes get_item_after returns a falsy value when there is no
        # following item - confirm against utils.py.
        print(f"Could not determine the section following '{target_section}' for {key}; skipping.")
        continue

    file_name = f"{key}.pdf"
    file_path = os.path.join(output_dir_path, file_name)
    if not os.path.exists(file_path):
        print(f"File does not exist: {file_path}")
        continue

    # The context manager closes the PDF even if word extraction raises.
    with fitz.open(file_path) as source_doc:
        # NOTE(review): start_page=15 presumably skips the front matter (including the
        # table of contents, where the section titles also appear) - confirm per filing.
        lowercase_words = get_words_from_PDF(doc=source_doc, start_page=15, lowercase=True)

    # The words were lowercased above, so both markers must be lowercase as well.
    start_str = target_section.lower()
    end_str = next_section.lower()

    body = get_section(
        start_str=start_str,
        end_str=end_str,
        words=lowercase_words,
    )

    if not body:
        print(f"Could not extract {target_section} section from {file_name}.")
        continue

    print(f"Extracted {target_section} section from {file_name}.\n{' '*4}{len(body.split(' '))} words")
    section_output_dir = os.path.join(output_dir_path, 'extracted_sections', key)
    os.makedirs(section_output_dir, exist_ok=True)

    output_file_name = f'{target_section}.txt'
    section_output_file_path = os.path.join(section_output_dir, output_file_name)

    # Explicit encoding keeps the output independent of the platform's default.
    with open(section_output_file_path, "w", encoding="utf-8") as file:
        file.write(body)
    print(f"{' '*4}Saved at: {section_output_file_path}")

Extracted Risk Factors section from turo.pdf.
    44778 words
    Saved at: /workspaces/rag_bootcamp/document_search/webpage_to_pdf/extracted_sections/turo/Risk Factors.txt
Extracted Risk Factors section from reddit.pdf.
    44446 words
    Saved at: /workspaces/rag_bootcamp/document_search/webpage_to_pdf/extracted_sections/reddit/Risk Factors.txt
Extracted Risk Factors section from facebook.pdf.
    14171 words
    Saved at: /workspaces/rag_bootcamp/document_search/webpage_to_pdf/extracted_sections/facebook/Risk Factors.txt
