# 02 - Download Raw PDF documents to analyze

Use this notebook to download a set of Amazon financial reports to use as input for testing and demonstrating how the `aws-agentic-document-assistant` solution works. You can replace the links below with links to your own documents and customize the code to your use case. Alternatively, you can put your own documents on `Amazon S3` and update the code to use them instead.

Run the cells below to download them.

In [1]:
# Local directory that receives the downloaded PDFs (one sub-directory per company).
raw_base_directory = "raw_documents"

In [None]:
# Echo the configured download directory as the cell output.
raw_base_directory

In [3]:
import os

# Create the download directory up front. exist_ok=True makes the cell safe to
# re-run and avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(raw_base_directory, exist_ok=True)

In [4]:
# Documents to download, keyed by company name. Each entry holds:
#   doc_url - where the PDF is fetched from,
#   year    - used to build the local filename (annual_report_<year>.pdf),
#   pages   - optional list of 1-indexed page numbers to keep when the PDF is trimmed.
docs_mapping = {
    "Amazon": [
        {
            "doc_url": "https://s2.q4cdn.com/299287126/files/doc_financials/2023/ar/Amazon-2022-Annual-Report.pdf",
            "year": "2022",
            "pages": [15, 17, 18, 47, 48],
        },
        {
            "doc_url": "https://s2.q4cdn.com/299287126/files/doc_financials/2022/ar/Amazon-2021-Annual-Report.pdf",
            "year": "2021",
            "pages": [14, 16, 17, 18, 46, 47],
        },
        # Placeholder entry: an empty doc_url is skipped by both the download and
        # the page-trimming steps. Fill it in to add another document.
        {"doc_url": "", "year": ""},
    ]
}

In [None]:
import os  # Used for creating directories and handling file paths
import requests  # Used for sending HTTP requests to download files

def download_pdf_files(base_directory, docs_mapping, headers, timeout=30):
    """
    Download the PDF documents listed in ``docs_mapping`` into per-company folders.

    Each document is saved as ``<base_directory>/<company>/annual_report_<year>.pdf``.
    Files that already exist on disk are left untouched, and a failed download is
    reported with ``print`` and skipped so the remaining documents are still fetched.

    Args:
        base_directory (str): The root directory where files will be stored.
        docs_mapping (dict): A dictionary mapping company names to lists of document metadata.
                             Example format:
                             {
                                 "CompanyA": [{"doc_url": "http://example.com/doc1.pdf", "year": 2020}, ...],
                                 ...
                             }
                             Entries whose ``doc_url`` is empty or missing are skipped.
        headers (dict): Headers to include in the HTTP requests for downloading files.
        timeout (float or tuple): Timeout in seconds passed to ``requests.get``.
                             Without one, a stalled server would block the call
                             indefinitely. Defaults to 30.

    Returns:
        None
    """
    # exist_ok=True avoids the check-then-create race and keeps re-runs safe.
    os.makedirs(base_directory, exist_ok=True)

    for company, docs in docs_mapping.items():
        company_directory = os.path.join(base_directory, company)
        os.makedirs(company_directory, exist_ok=True)

        for doc_info in docs:
            doc_url = doc_info.get("doc_url")

            # Placeholder entries carry no URL; there is nothing to download.
            if not doc_url:
                continue

            year = doc_info["year"]
            filename = f"annual_report_{year}.pdf"
            file_path = os.path.join(company_directory, filename)

            # Never re-download a file that is already on disk.
            if os.path.exists(file_path):
                print(f"{filename} already exists for {company}")
                continue

            try:
                response = requests.get(doc_url, headers=headers, timeout=timeout)
            except requests.RequestException as error:
                # Connection errors and timeouts are handled like HTTP failures:
                # report the problem and carry on with the next document.
                print(f"Failed to download {filename} for {company} ({error})")
                continue

            if response.status_code != 200:
                print(
                    f"Failed to download {filename} for {company} "
                    f"(Status Code: {response.status_code})"
                )
                continue

            # Only a successful response is written, so a failed download never
            # leaves a file behind that a later run would mistake for a cached copy.
            with open(file_path, "wb") as pdf_file:
                pdf_file.write(response.content)
            print(f"Downloaded {filename} for {company}")


In [None]:
# Send a desktop-browser User-Agent header with every download request.
# NOTE(review): presumably the document host rejects the default python-requests
# User-Agent — confirm before removing or changing this header.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
# Fetch every document listed in docs_mapping into raw_base_directory.
download_pdf_files(raw_base_directory, docs_mapping, headers)

In [None]:
# List the files that were downloaded for Amazon.
!ls {raw_base_directory}/Amazon

## Keep relevant pages

Although you can run the full PDF documents through the solution, we suggest that you keep only the relevant pages from each PDF document to optimize extraction costs.

In [None]:
# Install pypdf, which the next cells use to read and write PDFs.
# -q keeps pip quiet and stderr is redirected to /dev/null.
%pip install -q pypdf 2> /dev/null

In [None]:
import json  # Used for handling JSON data (serialization and deserialization)
from pypdf import PdfReader, PdfWriter  # For reading and writing PDF files

def keep_relevant_pages_in_pdf(input_pdf_path, output_pdf_path, pages):
    """
    Write a new PDF that contains only the selected pages of an existing one.

    The total page count of the input and the requested page list are printed
    before the pages are copied, in the order given, into the output file.

    Args:
        input_pdf_path (str): Location of the PDF to read from.
        output_pdf_path (str): Location where the trimmed PDF is written.
        pages (list): 1-indexed page numbers to copy into the output PDF.
    """
    reader = PdfReader(input_pdf_path)
    source_pages = reader.pages
    print(f"Number of pages is {len(source_pages)}")
    print(f"Relevant pages are {pages}")

    writer = PdfWriter()
    for one_based_number in pages:
        # The reader's page list is 0-indexed, hence the shift by one.
        zero_based_index = one_based_number - 1
        writer.add_page(source_pages[zero_based_index])

    with open(output_pdf_path, "wb") as trimmed_file:
        writer.write(trimmed_file)

def save_json(json_data, file_path):
    """
    Serialize ``json_data`` to JSON and store it at ``file_path``.

    Any existing file at that location is overwritten.

    Args:
        json_data (dict or list): JSON-serializable object to persist.
        file_path (str): Destination of the JSON file.
    """
    with open(file_path, "w") as json_file:
        # Stream the serialized object straight into the open file.
        json.dump(json_data, json_file)


In [None]:
import shutil  # For file operations like copying files

def keep_relevant_pages_in_pdfs(
    raw_base_directory, prepared_base_directory, docs_mapping
):
    """
    Build the "prepared" version of every downloaded PDF and record what was done.

    For each document with a non-empty ``doc_url``, the raw file
    ``<raw_base_directory>/<company>/annual_report_<year>.pdf`` is either trimmed
    to the pages listed under ``pages`` or, when no pages are given, copied
    unchanged. The result is stored under the same company/filename layout in
    ``prepared_base_directory``. One metadata record per document (company,
    year, doc_url, local_pdf_path and, when trimmed, pages_kept) is collected
    and written to ``metadata.json`` in the prepared base directory.

    Args:
        raw_base_directory (str): Directory that holds the downloaded PDFs.
        prepared_base_directory (str): Directory that receives the prepared PDFs
                                       and ``metadata.json``; created if missing.
        docs_mapping (dict): Company name mapped to a list of document entries.
                             Example format:
                             {
                                 "CompanyA": [
                                     {"doc_url": "http://example.com/doc1.pdf",
                                      "year": 2020,
                                      "pages": [1, 2, 3]},
                                     ...
                                 ],
                                 ...
                             }

    Returns:
        bool: ``True`` once every document has been handled.
    """
    if not os.path.exists(prepared_base_directory):
        os.makedirs(prepared_base_directory)

    records = []  # One metadata record per prepared document

    for company, docs in docs_mapping.items():
        source_directory = os.path.join(raw_base_directory, company)
        target_directory = os.path.join(prepared_base_directory, company)

        if not os.path.exists(target_directory):
            os.makedirs(target_directory)

        for doc_info in docs:
            doc_url = doc_info["doc_url"]
            year = doc_info["year"]
            wanted_pages = doc_info.get("pages", [])

            # Entries without a URL were never downloaded, so there is nothing to prepare.
            if not doc_url:
                continue

            pdf_name = f"annual_report_{year}.pdf"
            source_pdf = os.path.join(source_directory, pdf_name)
            target_pdf = os.path.join(target_directory, pdf_name)

            record = {
                "company": company,
                "year": year,
                "doc_url": doc_url,
                "local_pdf_path": target_pdf,
            }

            if wanted_pages:
                # Trim the document down to the requested pages.
                record["pages_kept"] = wanted_pages
                keep_relevant_pages_in_pdf(source_pdf, target_pdf, wanted_pages)
            else:
                # No page selection: keep the whole document.
                shutil.copyfile(source_pdf, target_pdf)

            records.append(record)

    metadata_path = os.path.join(prepared_base_directory, "metadata.json")
    save_json(records, metadata_path)

    return True


In [None]:
# Prepared PDFs and metadata.json go into a "prepared" sub-directory of the raw directory.
prepared_base_directory = os.path.join(raw_base_directory, "prepared/")
# Echo the resulting path as the cell output.
prepared_base_directory

In [None]:
# Trim (or copy) every downloaded PDF into the prepared directory and write metadata.json.
keep_relevant_pages_in_pdfs(raw_base_directory, prepared_base_directory, docs_mapping)


Interesting entities:

* Amazon annual report 2022:
    * Human capital - pg 15.
    * Risks - pg 17, 18.
    * Consolidated statements of cash flows (in millions) - pg 47.
    * Consolidated statements of operations (in millions, except per share data) - pg 48
* Amazon annual report 2021:
    * Human capital - pg 14.
    * Risks - pg 16, 17, 18.
    * Consolidated statements of cash flows (in millions) - pg 46.
    * Consolidated statements of operations (in millions, except per share data) - pg 47

In [None]:
# Echo the prepared directory path as the cell output.
prepared_base_directory

In [None]:
# List the contents of the prepared directory.
!ls {prepared_base_directory}

In [None]:
# Pretty-print the generated metadata.json with Python's json.tool.
!cat {prepared_base_directory}/metadata.json | python -m json.tool