# Configure rclone (required to mount shared folder on GoogleDrive)

In [None]:
# Use the command below in this Jupyter notebook to install rclone if you haven't already:  
#!sudo apt-get install rclone # This worked on Lightning.ai but the rclone version is old
# Or use the below for installation on Codespaces (or on Lightning.ai) to install the latest rclone v1.68.2
#!curl https://rclone.org/install.sh | sudo bash  

In [1]:

import os
import subprocess
import sys

#===========================================================================
# Ensure the virtual environment is being used
#if sys.prefix != '/opt/venv':
#    raise EnvironmentError("The Jupyter notebook is not using the correct virtual environment.")
#===========================================================================


# Define the shared link and the remote name
shared_link = "https://drive.google.com/drive/folders/1wotNKTF97XG190vRVwwLH8k3U7XLoKI4?usp=sharing"
remote_name = "shared_gdrive"

# Define the token information   
# (Note: the token info should not be included in the rclone config file the first time setting it up for a new machine)

# On my Codespaces:
#token_info = '{"access_token":"ya29.a0ARW5m74G4rNiQqA1du9fWg5hOt3PKItS4LstJpoSyBbFv8LXfGSF92RDq9RJ3SpecT_gAYVAGmTdvsduRU8YFiIZM9KmxSZdNNdIZ9OD6ja-AAVW0eKcxaiJAR5Tb_bc5eupQ6j3HqVHM0Os5urgb8yYLh2-S7hmSzBaYkxPaCgYKAfsSARISFQHGX2Mi28KQ_4ZD3yGKUefNbmkJEA0175","token_type":"Bearer","refresh_token":"1//03QcJ-KAFdjVvCgYIARAAGAMSNwF-L9Ir_w9oMFLUQxMRakUPOkYCqldb_iLlqqAQ4oVzqNIJIANpTOCcP1GYiRY4f-DHDAYr7Ro"}'

# On my Lightning.ai: 
token_info = '{"access_token":"ya29.a0ARW5m74pgqLuFc1hun70t-Tdyq9MYbDEJB5x-purGrMyGwEh-AgzwvDUiF2cpLmooLK6qUkyBHwSVV_NeAR70ziN42cL9YZoDgb_Jssew6Anpyt6BCHM255F0ilCmSY7g1qnLB98tT0Mp4Y1cfGmBIKhPSa6TPCyps5wj_ibaCgYKAQgSARISFQHGX2MiaNed4lu935b5Qy3R64bygQ0175","token_type":"Bearer","refresh_token":"1//03TGP1XfkTzJkCgYIARAAGAMSNwF-L9Ir7ic8jwl7GtWw3YQsztP6XSK75VLWKBwkPELRbGeg_M1i_msP7u-Z2Niphn0rnljCj28","expiry":"2025-01-11T22:34:50.6605882Z"}'

# Create the rclone configuration file
rclone_config = f"""
[{remote_name}]
type = drive
scope = drive
root_folder_id = {shared_link.split('/')[-1].split('?')[0]}
token = {token_info}
"""
#token = {token_info} # Remove this line from the configuration file above the first time setting it up for a new machine)
#scope = drive.file # This is the scope 3 for accessing rclone-created files only 
#scope = drive # This is the scope 1 for accessing all files in Google Drive


# Write the configuration to the rclone config file
config_path = os.path.expanduser('~/.config/rclone/rclone.conf')
os.makedirs(os.path.dirname(config_path), exist_ok=True)
with open(config_path, 'w') as config_file:
    config_file.write(rclone_config)

print("Rclone configuration file created successfully.")

Rclone configuration file created successfully.


In [None]:
# Use a Terminal of VS Code to configure rclone (before using it to set a mount point)

# Open a terminal and run the following command to configure rclone interactively
# (if the token has not been obtained and included in the rclone config file yet)

# rclone config

# Follow these steps in the interactive configuration process (assuming rclone v1.68.2):

1. e, followed by 1 for 'shared_gdrive' (for existing remote already mentioned in configure file)  
   (Note: 13 for Google Drive when configuring a new remote)
2. Leave blank when asked about Client Id and Client Secret  
3. Choose scope 3 for  
       /Access to files created by rclone only.  
      | These are visible in the drive website.  
      | File authorization is revoked when the user deauthorizes the app.  
       \ (drive.file)  
4. Leave blank when asked about service_account_file 
5. Enter n when asked about 'Edit advanced config?'
6. Enter n when asked about 'Use web browser to automatically authenticate rclone with remote?' 
    (n if running rclone on a (remote) machine without web browser access, such as Lightning.ai or Codespaces)  

    You will need rclone available on a machine that has a web browser available.  
    For example, Fetch the correct binary for your processor type by clicking on these links.   
    If not sure, use the first link: Intel/AMD - 64 Bit (on https://rclone.org/install/#windows-precompiled)  
    Then extract rclone.exe, which is a portable executable that can be placed in any directory in your pc.  
    For more help and alternate methods see: https://rclone.org/remote_setup/  

    Execute the following on the machine with the web browser (same rclone version recommended):    
        `rclone authorize "drive" "..."` (where ... should be replaced by a token given by `rclone config` in the remote machine on cloud)
    Then paste the result at the 'config_token>' prompt.

    (*Note:* winnat of Windows 10 will block the rclone authorize command:
    https://github.com/rclone/rclone/issues/4688#issuecomment-1295481470

    Solution: Use 'Run as administrator' to open a Powershell. Then run the following commands:

    `net stop winnat`  
    `.\rclone authorize "drive" "..." (with ... replaced; see explanation earlier)`  
    `net start winnat`  
    
    )

7. Complete the configuration and save it.

In [None]:
# Install fuse3 (required in the next code chunk)

#!sudo apt update
#!sudo apt-get install fuse3

# Verify the installation:
#!fusermount3 --version

In [None]:
# Use rclone to mount the Google Drive shared folder as a local folder (after rclone config has been done)

import os
import subprocess

# Define the remote name for Google Drive
remote_name = "shared_gdrive"

# Create the local directory to mount the Google Drive folder
#mount_point = os.path.join(os.environ.get('PWD'), 'docArchive')
##mount_point = os.path.expanduser('~/XtractnParse/docArchive')


# Get the current working directory  
current_directory = os.getcwd()
print(f"current_directory: {current_directory}")

# Create the local directory to mount the Google Drive folder
mount_point = os.path.join(current_directory, 'docArchive')
print(f"Resolved mount point: {mount_point}")

# Check if you have write permissions to the current directory
#if not os.access(current_directory, os.W_OK):
#    raise PermissionError(f"No write permissions for the directory: {current_directory}")

# Create the directory if it does not exist
os.makedirs(mount_point, exist_ok=True)

# Mount the Google Drive shared folder using rclone
rclone_mount_command = [
    "rclone", "--vfs-cache-mode", "writes", "mount", f"{remote_name}:", mount_point
]

# Run the rclone command in the background
process = subprocess.Popen(rclone_mount_command)

# Check if the process is running
print(f"rclone process started with PID: {process.pid}")


current_directory: /workspaces/BBM104_dev/Session06
Resolved mount point: /workspaces/BBM104_dev/Session06/docArchive
rclone process started with PID: 7730


2025/03/02 04:38:04 CRITICAL: Fatal error: failed to mount FUSE fs: "/workspaces/BBM104_dev/Session06/docArchive" is not empty, use --allow-non-empty to mount anyway


In [None]:
# Use the following command (without ! if in a terminal) to unmount the drive previously mounted with rclone 
# (when this error appears: 'CRITICAL: Fatal error: failed to mount FUSE fs: directory already mounted' )

#!fusermount3 -uz ~/XtractnParse/docArchive   # Codespaces: replace ~/XtractnParse by /workspaces/BBM104_dev/Session06

# To remove the already unmounted empty local directory used for the mount point:
#!rmdir ~/XtractnParse/docArchive  # Codespaces: replace ~/XtractnParse by /workspaces/BBM104_dev/Session06

In [3]:
# Verify the mounted folder and create three subfolders (if not already created by rclone before)

import os
import subprocess

# Define the subfolders to be created
subfolders = ['DLdocs', 'pagesExtracted', 'Parsed']

# Create the subfolders if they do not already exist
for subfolder in subfolders:
    subfolder_path = os.path.join(mount_point, subfolder)
    os.makedirs(subfolder_path, exist_ok=True)
    print(f"Subfolder created or already exists: {subfolder_path}")

# List contents in the mounted Google Drive shared folder
subprocess.run(['ls', '-la', mount_point])
#!ls -la ~/XtractnParse/docArchive

# Path to the new file in the mounted Google Drive
file_path = os.path.join(mount_point, 'example.txt')

# Write to the file
with open(file_path, 'w') as file:
    file.write("This is an example file created in the mounted Google Drive folder.")

print(f"File 'example.txt' created at {file_path}")


Subfolder created or already exists: /workspaces/BBM104_dev/Session06/docArchive/DLdocs
Subfolder created or already exists: /workspaces/BBM104_dev/Session06/docArchive/pagesExtracted
Subfolder created or already exists: /workspaces/BBM104_dev/Session06/docArchive/Parsed
total 24
drwxrwxrwx+ 5 codespace codespace 4096 Mar  1 23:51 .
drwxrwxrwx+ 4 codespace root      4096 Mar  2 03:23 ..
drwxrwxrwx+ 3 codespace codespace 4096 Mar  2 03:24 DLdocs
-rw-rw-rw-  1 codespace codespace   67 Mar  2 03:11 example.txt
drwxrwxrwx+ 2 codespace codespace 4096 Mar  2 03:24 pagesExtracted
drwxrwxrwx+ 2 codespace codespace 4096 Mar  2 03:24 Parsed
File 'example.txt' created at /workspaces/BBM104_dev/Session06/docArchive/example.txt


---
---

# Read in .csv with pdf URL info

In [4]:
import pandas as pd
import ast
import requests
import os


reportList = 'annual_reports_with_sp500.csv'
# Set the 'year to download' parameter
yr2DL = 2023 # 2022 
pct2DL = 0.1    # .5


# Step 1: Read the CSV file into a pandas DataFrame
# On my Codespaces:
#df = pd.read_csv('./annual_reports_with_sp500.csv')
# On my Lightning.ai:
#df = pd.read_csv('~/XtractnParse/annual_reports_with_sp500.csv')
df = pd.read_csv(os.path.join(current_directory, reportList))  # annual_reports_with_sp500.csv'
#df = pd.read_csv('~/XtractnParse/annual_reports_with_sp500_short.csv')

# Step 2: Convert the string representation of the list in the `doc_links` column to an actual Python list for each row
df['doc_links'] = df['doc_links'].apply(ast.literal_eval)

# Display the data type of the 'doc_links' column
print("\nData type of 'doc_links' column:", df['doc_links'].dtype)

# Check the type of the first element in the 'doc_links' column
first_element_type = type(df['doc_links'].iloc[0])
print("\nType of the first element in 'doc_links' column:", first_element_type)

# Adjust pandas display options to show max colwidth
pd.set_option('display.max_colwidth', None)

# Display the dataFrame read in
print("DataFrame df:\n", df)



Data type of 'doc_links' column: object

Type of the first element in 'doc_links' column: <class 'list'>
DataFrame df:
                                         name ticker  \
0                     dallasnews corporation   DALN   
1                                 eneti inc.   NETI   
2                  agilent technologies inc.      A   
3                                 alcoa inc.     AA   
4                 american addiction centers    AAC   
...                                      ...    ...   
5499  zurn elkay water solutions corporation    ZWS   
5500                          zymeworks inc.   ZYME   
5501           zynerba pharmaceuticals, inc.   ZYNE   
5502                             zynex, inc.   ZYXI   
5503                     apogee therapeutics   apge   

                                industry            sector  \
0                Publishing - Newspapers          Services   
1                               Shipping  Industrial Goods   
2     Scientific & Technical Ins

# Create df_full to expand all elements of the full_urls list into separate rows

In [5]:
# Step 3: Create a new column 'full_urls' to store the full URLs
base_link = "https://www.annualreports.co.uk"
df['full_urls'] = df['doc_links'].apply(lambda links: [f"{base_link}{link}" for link in links])

# Display the DataFrame with the new 'full_urls' column
#print("DataFrame with full URLs:\n", df[['ticker', 'full_urls']])

# Step 4: Expand the 'full_urls' list into separate rows
df_full = df.explode('full_urls')

# Step 4.5: Add a new column 'sp500_ever' with value = 1 when 'sp500_start' is not empty
df_full['sp500_ever'] = df_full['sp500_start'].apply(lambda x: 1 if pd.notna(x) and x != '' else 0)

# Initialize 'DL' column to 0
df_full['DL'] = 0

# Initialize 'Analyzed' column to 0
df_full['Analyzed'] = 0
        
# Initialize 'Extracted' column to 0
df_full['Extracted'] = 0

# Initialize 'Parsed' column to 0
df_full['Parsed'] = 0

# Initialize 'Page Numbers with Frequencies' column to empty lists
df_full['Page Numbers with Frequencies'] = [[] for _ in range(len(df_full))]

# Step 5: Drop the undesired columns
df_full = df_full.drop(columns=['sp500_start', 'sp500_end', 'doc_links'])

# Display the expanded DataFrame
print("Expanded DataFrame:\n", df_full)


Expanded DataFrame:
                         name ticker                        industry  \
0     dallasnews corporation   DALN         Publishing - Newspapers   
0     dallasnews corporation   DALN         Publishing - Newspapers   
0     dallasnews corporation   DALN         Publishing - Newspapers   
0     dallasnews corporation   DALN         Publishing - Newspapers   
0     dallasnews corporation   DALN         Publishing - Newspapers   
...                      ...    ...                             ...   
5502             zynex, inc.   ZYXI  Medical Appliances & Equipment   
5502             zynex, inc.   ZYXI  Medical Appliances & Equipment   
5502             zynex, inc.   ZYXI  Medical Appliances & Equipment   
5502             zynex, inc.   ZYXI  Medical Appliances & Equipment   
5503     apogee therapeutics   apge                   Biotechnology   

          sector  \
0       Services   
0       Services   
0       Services   
0       Services   
0       Services   
...   

# Create df_sp500 for firms in S&P500 at least 1 year (created from df_full)

In [6]:
# Step 6: Create df_sp500 for rows with sp500_ever = 1
df_sp500 = df_full[df_full['sp500_ever'] == 1]

# Show the number of unique 'link' values in df_sp500
unique_links_count = df_sp500['link'].nunique()
print("Number of unique 'link' values in df_sp500:", unique_links_count)

# Display the DataFrame with sp500_ever = 1
print("DataFrame with sp500_ever = 1:\n", df_sp500)

Number of unique 'link' values in df_sp500: 630
DataFrame with sp500_ever = 1:
                            name ticker                            industry  \
2     agilent technologies inc.      A  Scientific & Technical Instruments   
2     agilent technologies inc.      A  Scientific & Technical Instruments   
2     agilent technologies inc.      A  Scientific & Technical Instruments   
2     agilent technologies inc.      A  Scientific & Technical Instruments   
2     agilent technologies inc.      A  Scientific & Technical Instruments   
...                         ...    ...                                 ...   
5491                zoetis inc.    ZTS          Drug Manufacturers - Major   
5491                zoetis inc.    ZTS          Drug Manufacturers - Major   
5491                zoetis inc.    ZTS          Drug Manufacturers - Major   
5491                zoetis inc.    ZTS          Drug Manufacturers - Major   
5491                zoetis inc.    ZTS          Drug Manufactu

# Download pdfs by year indicated in filename 

In [7]:
import os
import requests
import pandas as pd
import math


# Define the output directory in the mounted Google Drive
#output_dir = os.path.expanduser('~/XtractnParse/docArchive/DLdocs')
output_dir = os.path.join(current_directory, 'docArchive/DLdocs')
# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Set the DataFrame to use for sampling
# Uncomment one of the following lines to choose the DataFrame
df_use = df_sp500
#df_use = df_full

# Drop rows with NaN values in 'full_urls' column
df_use = df_use.dropna(subset=['full_urls'])
# Create the string to search for
search_str = f"_{yr2DL}"
# Refine df_use to include only rows where 'full_urls' contains the search string
df_use = df_use[df_use['full_urls'].str.contains(search_str)]

# Display the total number of rows
print(f"Total number of rows: {df_use.shape[0]}")

# Set the number of rows for downloading
num2DL = math.ceil(pct2DL*df_use.shape[0])

# Display the refined DataFrame
print("DataFrame df_use for downloading:")
print(df_use)

# Define the path for the df_DL.csv file
df_DL_path = os.path.join(output_dir, 'df_DL.csv')

# Check if df_DL.csv exists
if os.path.exists(df_DL_path):
    # Read df_DL from the CSV file
    df_DL = pd.read_csv(df_DL_path)
else:
    # Create an empty df_DL with the same columns as df_use plus an additional DL column
    df_DL = pd.DataFrame(columns=list(df_use.columns))

for index, row in df_use.sample(n=num2DL).iterrows():  # pick 2 random rows
    ticker = row['ticker']
    full_url = row['full_urls']  # full_urls is now a single URL string
    
    file_name = full_url.split('/')[-1]
    
    # Display the full URL that is being used for download
    print(f"Downloading from URL: {full_url}")

    # Step 6: Download the PDF file
    response = requests.get(full_url)
    if response.status_code == 200:
        # Save the PDF file to the output directory
        output_path = os.path.join(output_dir, file_name)
        with open(output_path, 'wb') as file:
            file.write(response.content)
        
        # Update the DL column to 1 for the downloaded row
        row['DL'] = 1
        df_DL = pd.concat([df_DL, pd.DataFrame([row])], ignore_index=True)
    else:
        print(f"Failed to download {full_url}")

# Save the updated df_DL to the CSV file
df_DL.to_csv(df_DL_path, index=False)

# Display the updated df_DL
print("DataFrame df_DL with all rows:")
print(df_DL)

Total number of rows: 5
DataFrame df_use for downloading:
                         name ticker                    industry  \
1253     salesforce.com, inc.    CRM        Application Software   
2085         gen digital inc.    GEN        Application Software   
4438  signet jewelers limited    SIG              Jewelry Stores   
5150         v.f. corporation    VFC  Textile - Apparel Clothing   
5334              walmart inc    WMT    Discount, Variety Stores   

              sector  \
1253      Technology   
2085      Technology   
4438        Services   
5150  Consumer Goods   
5334  Consumer Goods   

                                                                 link  exchcd  \
1253        https://www.annualreports.co.uk/Company/salesforcecom-inc       1   
2085          https://www.annualreports.co.uk/Company/gen-digital-inc       3   
4438  https://www.annualreports.co.uk/Company/signet-jewelers-limited       1   
5150           https://www.annualreports.co.uk/Company/vf-corpor

# Turning the 'Option 1 Analyze' code into a function call

In [8]:
# Latest revised code

import string
import unicodedata
from collections import defaultdict
import os
import fitz  # PyMuPDF
import pandas as pd

def process_pdf(pdf_document):
    
    keywords = ['segment', 'segments']
    print(f"Keywords: {keywords}")

    # Initialize a dictionary to store the actual page numbers and the frequency of the keyword
    keyword_counts = defaultdict(int)

    # Function to normalize and clean words
    def clean_word(word):
        # Normalize the word to NFKD form
        normalized_word = unicodedata.normalize('NFKD', word)
        # Remove diacritics and special characters
        cleaned_word = ''.join(c for c in normalized_word if unicodedata.category(c) != 'Mn')
        # Remove punctuation
        cleaned_word = cleaned_word.translate(str.maketrans('', '', string.punctuation))
        # Remove non-ASCII characters
        cleaned_word = ''.join(c for c in cleaned_word if c in string.printable)
        return cleaned_word.lower()

    # Loop through each page in the PDF
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        
        # Extract the words from each page
        words = page.get_text("words")
        
        # ==================== Debugging ====================
        # Display a bit of the extracted words
        #words_snippet = ' '.join(word[4] for word in words[:20])  # Display the first 20 words
        #print(f"Page {page_num + 1} words snippet: {words_snippet}...")
        
        # Display the complete words of page 89
        #if page_num == 88:  # Page numbers are zero-indexed
        #    complete_words = ' '.join(word[4] for word in words)
        #    print(f"Complete words of page 89:\n{complete_words}")
        # ==================== Debugging ====================
        
        # Count the occurrences of the keywords "segments" or "segment" (case-insensitive) in the words
        count = sum(1 for word in words if any(keyword in clean_word(word[4]) for keyword in keywords))
        keyword_counts[page_num + 1] += count  # Store the actual page number (1-indexed)

            # Inaccurate check here: Count the occurrences of the keywords "segments" or "segment" (case-insensitive) in the words
            #count = sum(1 for word in words if clean_word(word[4]) in keywords)
            #keyword_counts[page_num + 1] += count  # Store the actual page number (1-indexed)

        # ==================== Debugging ====================
        # Debug: Print the count for each page
        print(f"Page {page_num + 1} count of detected keyword: {count}")
    
        # Additional debug: Print each word on page 89 after cleaning
        if page_num == 88:
            for word in words:
                cleaned_word = clean_word(word[4])
                print(f"Original Word: {word[4]}, Cleaned Word: {cleaned_word}")
        # ==================== Debugging ====================

    # Convert the dictionary to a pandas DataFrame
    data = {'Page Number': list(keyword_counts.keys()), 'Frequency': list(keyword_counts.values())}
    df = pd.DataFrame(data)

    # Sort the DataFrame by frequency in descending order
    df = df.sort_values(by='Frequency', ascending=False)

    # Display the DataFrame
    print("DataFrame after sorting:\n", df)

    # Filter the DataFrame to include only rows with non-zero frequencies
    df_non_zero = df[df['Frequency'] > 0]

    # Create a list of page numbers with frequencies
    page_numbers_with_frequencies = df_non_zero[['Page Number', 'Frequency']].values.tolist()

    return page_numbers_with_frequencies

#  Call process_pdf() to analyze the frequency of detected keywords in each page

In [9]:
# Debugged and keep

import os
import fitz  # PyMuPDF
import pandas as pd
import shutil

# Define the output directory in the mounted Google Drive
#output_dir = os.path.expanduser('~/XtractnParse/docArchive/DLdocs')
output_dir = os.path.join(current_directory, 'docArchive/DLdocs')
# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Define the directory paths
#pdf_dir = os.path.expanduser('~/XtractnParse/docArchive/DLdocs')
pdf_dir = output_dir 
bad_pdf_dir = os.path.join(pdf_dir, 'bad')
os.makedirs(bad_pdf_dir, exist_ok=True)

# Define the path for the df_DL.csv file
df_DL_path = os.path.join(output_dir, 'df_DL.csv')

# Read df_DL from the CSV file
df_DL = pd.read_csv(df_DL_path)

#================================================
def has_proper_page_range(pdf_document):
    # Check if the first page number is 1
    first_page_label = pdf_document[0].get_label()
    print(f"First page label: {first_page_label}")  # Debug statement
    return first_page_label == "1"

def fix_page_range(pdf_document):
    # Create a new PDF with corrected page numbers
    new_pdf = fitz.open()
    for page_num in range(len(pdf_document)):
        new_pdf.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
    return new_pdf
#================================================

# List all PDF files in the directory
pdf_files = [f for f in os.listdir(output_dir) if f.endswith('.pdf')]

# Process each PDF file
for pdf_file in pdf_files:
    print(f"PDF file: {pdf_file}")
    pdf_path = os.path.join(output_dir, pdf_file)
    pdf_document = fitz.open(pdf_path)

    #================================================
    # Check if the PDF has a proper page range
    first_page_label = pdf_document[0].get_label()
    if first_page_label == "":
        print(f"First page label is empty for {pdf_file}")
        # Copy the original PDF document to the 'bad' subfolder with _nolabel suffix
        nolabel_pdf_path = os.path.join(bad_pdf_dir, pdf_file.replace('.pdf', '_nolabel.pdf'))
        shutil.copy(pdf_path, nolabel_pdf_path)
    elif not has_proper_page_range(pdf_document):
        print(f"Fixing page range for {pdf_file}")

        # Copy the original problematic PDF document to the 'bad' subfolder
        bad_pdf_path = os.path.join(bad_pdf_dir, pdf_file.replace('.pdf', '_bad.pdf'))
        shutil.copy(pdf_path, bad_pdf_path)

        # Fix the page range
        pdf_document = fix_page_range(pdf_document)

        # Overwrite the original problematic PDF document with the fixed PDF document
        pdf_document.save(pdf_path, incremental=False)
    #================================================
    
    # Process the PDF document and get the page numbers with frequencies
    page_numbers_with_frequencies = process_pdf(pdf_document)

    # Close the PDF document
    pdf_document.close()

    # Update df_DL
    df_DL.loc[df_DL['full_urls'].str.contains(pdf_file), 'Analyzed'] = 1
    df_DL.loc[df_DL['full_urls'].str.contains(pdf_file), 'Page Numbers with Frequencies'] = str(page_numbers_with_frequencies)

# Save the updated df_DL to the CSV file
df_DL.to_csv(df_DL_path, index=False)

# Display the updated df_DL
print("Updated df_DL DataFrame:")
print(df_DL)

PDF file: NASDAQ_GEN_2023.pdf
First page label is empty for NASDAQ_GEN_2023.pdf
Keywords: ['segment', 'segments']
Page 1 count of detected keyword: 0
Page 2 count of detected keyword: 0
Page 3 count of detected keyword: 0
Page 4 count of detected keyword: 0
Page 5 count of detected keyword: 1
Page 6 count of detected keyword: 0
Page 7 count of detected keyword: 0
Page 8 count of detected keyword: 0
Page 9 count of detected keyword: 0
Page 10 count of detected keyword: 0
Page 11 count of detected keyword: 0
Page 12 count of detected keyword: 0
Page 13 count of detected keyword: 0
Page 14 count of detected keyword: 0
Page 15 count of detected keyword: 0
Page 16 count of detected keyword: 0
Page 17 count of detected keyword: 0
Page 18 count of detected keyword: 0
Page 19 count of detected keyword: 0
Page 20 count of detected keyword: 0
Page 21 count of detected keyword: 0
Page 22 count of detected keyword: 0
Page 23 count of detected keyword: 0
Page 24 count of detected keyword: 0
Page 25

# Store extracted pages in local folder mounted to GoogleDrive

In [10]:
# Debugged and keep

import os
import pymupdf
import fitz  # PyMuPDF
import pandas as pd
import ast

# Assuming df_DL and file_name are already defined and populated

# Define the extraction directory in the mounted Google Drive
#extract_dir = os.path.expanduser('~/XtractnParse/docArchive/pagesExtracted')
extract_dir = os.path.join(current_directory, 'docArchive/pagesExtracted')
os.makedirs(extract_dir, exist_ok=True)

# Define the directory in the mounted Google Drive with downloaded PDFs
#pdf_dir = os.path.expanduser('~/XtractnParse/docArchive/DLdocs')
pdf_dir = os.path.join(current_directory, 'docArchive/DLdocs')

# Define the path for the df_DL.csv file
df_DL_path = os.path.join(pdf_dir, 'df_DL.csv')
# Read df_DL from the CSV file
df_DL = pd.read_csv(df_DL_path)


# Iterate over the rows of df_DL
for index, row in df_DL.iterrows():
    # Check if the row should be analyzed
    if row['Analyzed'] == 1:
        # Extract the PDF filename from the full_urls column
        pdf_file_name = os.path.basename(row['full_urls'])
        pdf_file_path = os.path.join(pdf_dir, pdf_file_name)
        
        # Load the PDF document
        pdf_document = fitz.open(pdf_file_path)

        # Extract the page numbers with frequencies from the DataFrame
        page_numbers_with_frequencies_str = row['Page Numbers with Frequencies']
        
        # Convert the string back to a list of tuples
        page_numbers_with_frequencies = ast.literal_eval(page_numbers_with_frequencies_str)

        # Flag to track if all pages are extracted
        all_pages_extracted = True

        # Iterate over the list of page numbers and their frequencies (where the stored page numbers are 1-indexed)
        for page_number, frequency in page_numbers_with_frequencies:
            try:

                #==============================================
                # Set page number to be 0-indexed                
                page_zeroIdx = int(page_number) - 1   # Convert page_number to an integer
                # Create a new PDF with the extracted page
                new_pdf = pymupdf.open()                 # new empty PDF
                new_pdf.insert_pdf(pdf_document, to_page = page_zeroIdx)  # up to the page_zeroIdx page
                # Remove all pages except the last one
                if new_pdf.page_count > 1:
                    new_pdf.delete_pages(range(new_pdf.page_count - 1))  # Delete all pages except the last one
                # Now new_pdf contains only the last page
                #==============================================

                # Define the output file name
                output_file_name = f"{pdf_file_name[:-4]}_[{page_number},{frequency}].pdf"
                output_file_path = os.path.join(extract_dir, output_file_name)
                
                # Save the new PDF
                new_pdf.save(output_file_path)
                new_pdf.close()
            except Exception as e:
                # If any page extraction fails, set the flag to False
                all_pages_extracted = False
                print(f"Failed to extract page {page_number} from {pdf_file_name}: {e}")

        # Close the original PDF document
        pdf_document.close()

        # Update the 'Extracted' column if all pages were successfully extracted
        if all_pages_extracted:
            df_DL.at[index, 'Extracted'] = 1

# Save the updated DataFrame to the corresponding .csv file
df_DL.to_csv(df_DL_path, index=False)

print("Pages extracted and saved successfully.")

print(df_DL)


Pages extracted and saved successfully.
               name ticker              industry      sector  \
0  gen digital inc.    GEN  Application Software  Technology   

                                                      link  exchcd  \
0  https://www.annualreports.co.uk/Company/gen-digital-inc       3   

                                                                              full_urls  \
0  https://www.annualreports.co.uk/HostedData/AnnualReportArchive/g/NASDAQ_GEN_2023.pdf   

   sp500_ever  DL  Analyzed  Extracted  Parsed  \
0           1   1         1          1       0   

                                                                                 Page Numbers with Frequencies  
0  [[121, 8], [127, 3], [207, 2], [33, 1], [5, 1], [138, 1], [178, 1], [183, 1], [162, 1], [128, 1], [182, 1]]  


---
# List pdf pages in the extraction directory for parsing

In [11]:
import os
from llama_parse import LlamaParse

# Define the extraction directory in the mounted Google Drive
#extract_dir = os.path.expanduser('~/XtractnParse/docArchive/pagesExtracted')
extract_dir = os.path.join(current_directory, 'docArchive/pagesExtracted')
os.makedirs(extract_dir, exist_ok=True)

# List all PDF files in the extraction directory
pdf_pages = [f for f in os.listdir(extract_dir) if f.endswith('.pdf')]

total_pages = len(pdf_pages)
print(f"Total number of PDF pages in the extraction directory: {total_pages}")

# Set the number of PDF pages to parse in the extraction directory
num2Parse = total_pages  # 2
print(f"PDF pages to parse: First {num2Parse} pages")


# Accurate mode: Parser options defined
parser = LlamaParse(
    api_key="llx-rXLBNDYeD7iO9JhcSqvhtmZvSnQHhsxtoEmvILcfZ56ig9Fh", # of a.yim@city.ac.uk # (your LlamaParse api key here)
    invalidate_cache=True,
    #premium_mode=True,  #Accurate_mode is the default
    parsing_instruction = "The provided pdf document is extracted from the Notes to financial statements in a company annual report. The document contains tables (with financial amounts) and texts from a note on segment reporting. Each page of the pdf document has a footer (with the company name and the year of the report, followed by the page number) and a header (with the section name of the company annual report). Some tables may have table footnotes that are numbered. The table footnotes explain details about certain items in the table that are tagged with superscript numbers. Please convert the pdf document into a markdown document with an equivalent page formatting as similar to the original pdf document as possible. Make sure you convert heading to heading and paragraph to paragraph with exact words and table to table with exact structure of rows and columns.",
    is_formatting_instruction=True,
    #disable_ocr=False,#True,
    skip_diagonal_text=True,
    do_not_unroll_columns=False, # unroll columns
    #target_pages="0,2,7",  # Pages are numbered starting at 0
    bounding_box="0.08,0,0.08,0",
    take_screenshot=True,
    #disable_image_extraction=False,#True,
    result_type="markdown",  # "markdown" and "text" are available
)



    # Define the directory in the mounted Google Drive with downloaded PDFs
    #pdf_dir = os.path.expanduser('~/XtractnParse/docArchive/DLdocs')
    # Define the path for the df_DL.csv file
    #df_DL_path = os.path.join(pdf_dir, 'df_DL.csv')
    # Read df_DL from the CSV file
    #df_DL = pd.read_csv(df_DL_path)


#print(f"PDF files in the extraction directory: {pdf_pages}")
for pdf_page in pdf_pages:
    print(f"PDF file in the extraction directory: {pdf_page}")
    

Total number of PDF pages in the extraction directory: 11
PDF pages to parse: First 11 pages
PDF file in the extraction directory: NASDAQ_GEN_2023_[178,1].pdf
PDF file in the extraction directory: NASDAQ_GEN_2023_[128,1].pdf
PDF file in the extraction directory: NASDAQ_GEN_2023_[207,2].pdf
PDF file in the extraction directory: NASDAQ_GEN_2023_[121,8].pdf
PDF file in the extraction directory: NASDAQ_GEN_2023_[162,1].pdf
PDF file in the extraction directory: NASDAQ_GEN_2023_[182,1].pdf
PDF file in the extraction directory: NASDAQ_GEN_2023_[138,1].pdf
PDF file in the extraction directory: NASDAQ_GEN_2023_[183,1].pdf
PDF file in the extraction directory: NASDAQ_GEN_2023_[33,1].pdf
PDF file in the extraction directory: NASDAQ_GEN_2023_[127,3].pdf
PDF file in the extraction directory: NASDAQ_GEN_2023_[5,1].pdf


# Parse the first num2Parse extracted pages in the full list

In [12]:
# Debugged and keep

import os
import subprocess

# Set myFontSize for generating the PDF file from the markdown file
myFontSize = '10pt'   # '10pt' worked well; '10.5pt' or larger gives a warning  

# Ensure pandoc is installed
try:
    subprocess.run(['pandoc', '--version'], check=True)
except subprocess.CalledProcessError:
    print("Pandoc not found. Please install pandoc.")
    raise SystemExit("Pandoc is required to run this script.")

# Ensure pdfroff is installed
pdf_engine = 'pdfroff'
try:
    subprocess.run([pdf_engine, '--version'], check=True)
except subprocess.CalledProcessError:
    print("pdfroff not found. Please install pdfroff.")
    print("You can install pdfroff using the following command:")
    print("sudo apt-get install pandoc groff ghostscript")
    raise SystemExit("pdfroff is required to run this script.")

# Define the extraction and parse directories in the mounted Google Drive
#extract_dir = os.path.expanduser('~/XtractnParse/docArchive/pagesExtracted')
extract_dir = os.path.join(current_directory, 'docArchive/pagesExtracted')
#parse_dir = os.path.expanduser('~/XtractnParse/docArchive/Parsed')
parse_dir = os.path.join(current_directory, 'docArchive/Parsed')
os.makedirs(extract_dir, exist_ok=True)
os.makedirs(parse_dir, exist_ok=True)


# Process the first 'num2Parse' PDF pages of the list in the extraction directory
for pdf_page in pdf_pages[:num2Parse]:
    print(f"PDF page file: {pdf_page}")
    page_path = os.path.join(extract_dir, pdf_page)
    docs = await parser.aload_data(page_path)
#    print(docs[0].text) # uncomment this for review
    
    # Remove the opening and closing markdown lines
    text = docs[0].text.strip('```markdown').strip('```')
    
    # Save the cleaned text to a markdown file with the same filename as pdf_page but with a .md extension
    input_md_path = os.path.join(parse_dir, os.path.splitext(pdf_page)[0] + '.md')
    with open(input_md_path, 'w', encoding='utf-8') as md_file:
        md_file.write(text)

    # Generate a PDF copy of the markdown file using pandoc via subprocess
    pdf_path = os.path.join(parse_dir, os.path.splitext(pdf_page)[0] + '_md.pdf')
    subprocess.run([
        'pandoc', input_md_path, '-o', pdf_path, 
        f'--pdf-engine={pdf_engine}', 
        '-V', f'fontsize={myFontSize}',  # Specify the font size here
        '-V', 'hyphenate=false'  # Disable auto-hyphenation
    ], check=True)


pandoc 3.1.3
Features: -server +lua
Scripting engine: Lua 5.4
User data directory: /home/codespace/.local/share/pandoc
Copyright (C) 2006-2023 John MacFarlane. Web: https://pandoc.org
This is free software; see the source for copying conditions. There is no
warranty, not even for merchantability or fitness for a particular purpose.
GNU groff version 1.23.0
Copyright (C) 2022 Free Software Foundation, Inc.
GNU groff comes with ABSOLUTELY NO WARRANTY.
You may redistribute copies of groff and its subprograms
under the terms of the GNU General Public License.
For more information about these matters, see the file
named COPYING.

called subprograms:

GNU grops (groff) version 1.23.0
GNU troff (groff) version 1.23.0
PDF page file: NASDAQ_GEN_2023_[178,1].pdf


GNU pdfroff (groff) version 1.23.0



Started parsing the file under job_id eb694a75-4cb6-462d-b209-da7f1474ac41
PDF page file: NASDAQ_GEN_2023_[128,1].pdf
Started parsing the file under job_id 02ab38cc-5d44-41d0-8d50-c5b650cbba1e
PDF page file: NASDAQ_GEN_2023_[207,2].pdf
Started parsing the file under job_id ec203ff3-e884-45cb-a0be-559469f6dbff
PDF page file: NASDAQ_GEN_2023_[121,8].pdf
Started parsing the file under job_id c2d6a2ee-3ef9-45d0-89a4-d4f169b02db5
PDF page file: NASDAQ_GEN_2023_[162,1].pdf
Started parsing the file under job_id a3fb6ba5-1fee-49b0-b8a0-1b51e74531a6
PDF page file: NASDAQ_GEN_2023_[182,1].pdf
Started parsing the file under job_id 81ef3594-0622-43c4-ac89-785dd59377f0
.....

CancelledError: 

In [24]:
print(pdf_pages)

['NYSE_CRM_2023_[88,5].pdf', 'NYSE_CRM_2023_[58,3].pdf', 'NYSE_CRM_2023_[87,3].pdf', 'NYSE_CRM_2023_[32,1].pdf']


---
---

# 2beDel -- Option 1: Extract words from each page; Count detected keywords

In [10]:
# old original Option 1 code

import string
import unicodedata
from collections import defaultdict
import pandas as pd

# Assuming pdf_document is already defined and loaded

keywords = ['segment', 'segments']
print(f"Keywords: {keywords}")

# Step 3: Initialize a dictionary to store the page numbers and the frequency of the keyword
keyword_counts = defaultdict(int)

# Function to normalize and clean words
def clean_word(word):
    # Normalize the word to NFKD form
    normalized_word = unicodedata.normalize('NFKD', word)
    # Remove diacritics and special characters
    cleaned_word = ''.join(c for c in normalized_word if unicodedata.category(c) != 'Mn')
    # Remove punctuation
    cleaned_word = cleaned_word.translate(str.maketrans('', '', string.punctuation))
    # Remove non-ASCII characters
    cleaned_word = ''.join(c for c in cleaned_word if c in string.printable)
    return cleaned_word.lower()


# Step 4: Loop through each page in the PDF
for page_num in range(len(pdf_document)):
    page = pdf_document.load_page(page_num)
    
    # Step 5: Extract the words from each page
    words = page.get_text("words")
    
    # Display a bit of the extracted words
    words_snippet = ' '.join(word[4] for word in words[:20])  # Display the first 20 words
    print(f"Page {page_num + 1} words snippet: {words_snippet}...")
    
    # Display the complete words of page 22
    if page_num == 21:  # Page numbers are zero-indexed
        complete_words = ' '.join(word[4] for word in words)
        print(f"Complete words of page 22:\n{complete_words}")
    
    # Step 6: Count the occurrences of the keywords "segments" or "segment" (case-insensitive) in the words
    count = sum(1 for word in words if clean_word(word[4]) in keywords)
    keyword_counts[page_num] += count

    # Debug: Print the count for each page
    print(f"Page {page_num + 1} count of detected keyword: {count}")

    # Additional debug: Print each word on page 299 after cleaning
    if page_num == 21:
        for word in words:
            cleaned_word = clean_word(word[4])
            print(f"Original Word: {word[4]}, Cleaned Word: {cleaned_word}")


Keywords: ['segment', 'segments']
Page 1 words snippet: Table of Contents UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, DC 20549 Form 10-K (Mark One) ☒ ANNUAL REPORT PURSUANT...
Page 1 count of detected keyword: 0
Page 2 words snippet: Table of Contents i TABLE OF CONTENTS Item No. Page No. PART I Item 1. Business 1 Item 1A.Risk Factors...
Page 2 count of detected keyword: 0
Page 3 words snippet: Table of Contents 1 PART I Forward-looking statements This Form 10-K contains forward-looking statements within the meaning of Section 21E...
Page 3 count of detected keyword: 0
Page 4 words snippet: Table of Contents 2 indications, including, initially, two key rare disease target indications: Short bowel syndrome (“SBS”) with intestinal failure...
Page 4 count of detected keyword: 0
Page 5 words snippet: Table of Contents 3 Crofelemer powder for oral solution is being developed to support orphan or rare disease indications for...
Page 5 count of detected keyword: 0
Page 6 word

# 2beConverted -- Option 2: Extract words from each page; Count detected keywords
### (!!!Note: Use OCR with Tesseract when a page does not seem to be text-searchable)

In [None]:
#!sudo apt-get install tesseract-ocr
#!pip install pytesseract nltk

In [32]:

import nltk
from nltk.corpus import words as nltk_words
# Load the set of English words
nltk.download('words')
english_words = set(nltk_words.words())

#================================================================================

import re
import string
import unicodedata
from collections import defaultdict
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io


# Assuming pdf_document is already defined and loaded

keywords = ['segment', 'segments']
print(f"Keywords: {keywords}")

# Step 3: Initialize a dictionary to store the page numbers and the frequency of the keyword
keyword_counts = defaultdict(int)

# Define a regular expression pattern for numerical figures with the specified currency symbols and thousand separators
numerical_pattern = re.compile(r'^(?:[\$£€]|kr|SEK|DKK|NOK|CZK|PLN|HUF|RON|BGN|ISK|CHF)?\d{1,3}(?:,\d{3})*(\.\d+)?%?$')

# \d{1,3}(?:,\d{3})*: This matches one to three digits optionally followed by groups of three digits separated by commas.
# (\.\d+)?: This matches an optional decimal point followed by one or more digits.


# Function to normalize and clean words
def clean_word(word):
    # Normalize the word to NFKD form
    normalized_word = unicodedata.normalize('NFKD', word)
    # Remove diacritics
    cleaned_word = ''.join(c for c in normalized_word if unicodedata.category(c) != 'Mn')
    # Remove punctuation selectively
    cleaned_word = re.sub(r'(?<!\d)[.,%](?!\d)', '', cleaned_word)
    # Remove other punctuation
    cleaned_word = cleaned_word.translate(str.maketrans('', '', string.punctuation.replace('.', '').replace(',', '').replace('%', '')))
    # Remove non-ASCII characters
    cleaned_word = ''.join(c for c in cleaned_word if c in string.printable)
    return cleaned_word.lower()

# Function to extract text using OCR with Tesseract configuration for small fonts
def ocr_extract_text(page):
    # Increase the DPI of the image
    zoom = 2  # Increase the zoom factor to get a higher resolution image
    mat = fitz.Matrix(zoom, zoom)
    pix = page.get_pixmap(matrix=mat)
    
    img = Image.open(io.BytesIO(pix.tobytes()))
    custom_config = r'--oem 3 --psm 6 -l eng'  # Use LSTM OCR Engine, assume a single uniform block of text, and English language
    text = pytesseract.image_to_string(img, config=custom_config)
    return text

# Function to check if a word is meaningful
def is_meaningful(word):
    cleaned_word = clean_word(word)
    # Check if the word is alphabetic and in the English words set
    if cleaned_word.isalpha() and cleaned_word in english_words:
        return True
    # Check if the word matches the numerical pattern
    if numerical_pattern.match(word):
        return True
    return False

# Step 4: Loop through each page in the PDF
for page_num in range(len(pdf_document)):  #range(min(30, len(pdf_document))): 
    page = pdf_document.load_page(page_num)
    
    # Step 5: Extract the words from each page
    words = page.get_text("words")
    
    if not words:
        # If no words are extracted, use OCR
        text = ocr_extract_text(page)
        words = [(0, 0, 0, 0, word) for word in text.split()]
    
    # Check if most words are meaningless
    if words:
        meaningful_count = sum(1 for word in words if is_meaningful(word[4]))
        total_count = len(words)
        if meaningful_count / total_count < 0.5:  # Threshold can be adjusted
            # If less than 50% of the words are meaningful, use OCR
            text = ocr_extract_text(page)
            words = [(0, 0, 0, 0, word) for word in text.split()]
    
    # Display a bit of the extracted words
    words_snippet = ' '.join(word[4] for word in words[:20])  # Display the first 20 words
    print(f"Page {page_num + 1} words snippet: {words_snippet}...")
    
    page_in_concern = 21
    # Display the complete words of {page_in_concern+1}
    if page_num == page_in_concern:  # Page numbers are zero-indexed
        complete_words = ' '.join(word[4] for word in words)
        print(f"Complete words of {page_in_concern+1}:\n{complete_words}")
    
    # Step 6: Count the occurrences of the keywords "segments" or "segment" (case-insensitive) in the words
    count = sum(1 for word in words if clean_word(word[4]) in keywords)
    keyword_counts[page_num] += count
    
    # Debug: Print the count for each page
    print(f"Page {page_num + 1} count of detected keyword: {count}")

    # Additional debug: Print each word on the page in concern after cleaning
    if page_num == page_in_concern:
        for word in words:
            cleaned_word = clean_word(word[4])
            print(f"Original Word: {word[4]}, Cleaned Word: {cleaned_word}")



[nltk_data] Downloading package words to /home/codespace/nltk_data...
[nltk_data]   Package words is already up-to-date!


Keywords: ['segment', 'segments']
Page 1 words snippet: Morningstar® Document Research℠ FORM 10-K EXCO RESOURCES INC - XCOOQ Filed: March 15, 2018 (period: December 31, 2017) Annual report...
Page 1 count of detected keyword: 0
Page 2 words snippet: UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 FORM 10-K þ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d)...
Page 2 count of detected keyword: 0
Page 3 words snippet: EXCO RESOURCES, INC. TABLE OF CONTENTS PART I. Item 1. Business 2 Item 1A. Risk Factors 29 Item 1B. Unresolved...
Page 3 count of detected keyword: 0
Page 4 words snippet: EXCO RESOURCES, INC. PART I Item 1. Business General Unless the context requires otherwise, references in this Annual Report on...
Page 4 count of detected keyword: 0
Page 5 words snippet: among other things, permit the issuance of the 1.5 Lien Notes and the exchanges of Second Lien Term Loans, reduce...
Page 5 count of detected keyword: 0
Page 6 words snippet: Our strengths High qual

---

# Accurate mode: Parser options defined

In [None]:
from llama_parse import LlamaParse

# define parser options
parser = LlamaParse(
    api_key="llx-rXLBNDYeD7iO9JhcSqvhtmZvSnQHhsxtoEmvILcfZ56ig9Fh", # of a.yim@city.ac.uk # (your LlamaParse api key here)
    invalidate_cache=True,
    #premium_mode=True,  #Accurate_mode is the default
    parsing_instruction = "The provided pdf document is extracted from the Notes to financial statements in a company annual report. The document contains tables (with financial amounts) and texts from a note on segment reporting. Each page of the pdf document has a footer (with the company name and the year of the report, followed by the page number) and a header (with the section name of the company annual report). Some tables may have table footnotes that are numbered. The table footnotes explain details about certain items in the table that are tagged with superscript numbers. Please convert the pdf document into a markdown document with an equivalent page formatting as similar to the original pdf document as possible. Make sure you convert heading to heading and paragraph to paragraph with exact words and table to table with exact structure of rows and columns.",
    is_formatting_instruction=True,
    #disable_ocr=False,#True,
    skip_diagonal_text=True,
    do_not_unroll_columns=False, # unroll columns
    #target_pages="0,2,7",  # Pages are numbered starting at 0
    bounding_box="0.08,0,0.08,0",
    take_screenshot=True,
    #disable_image_extraction=False,#True,
    result_type="markdown",  # "markdown" and "text" are available
)

#pdf_path = os.path.join(current_dir, 'XtractnParse', 'PwC2024_segment.pdf')
#docs = await parser.aload_data(pdf_path)
#docs = await parser.aload_data("4481ea38b08ecea7dae84e4c698da1fc802a0bc6.pdf")
docs = await parser.aload_data("XtractnParse/docs4Parsing/PwC2024_segment.pdf")
print(docs[0].text)


---
# Premium mode: Parser options defined

In [None]:
from llama_parse import LlamaParse

parser = LlamaParse(
    api_key="llx-rXLBNDYeD7iO9JhcSqvhtmZvSnQHhsxtoEmvILcfZ56ig9Fh", # of a.yim@city.ac.uk # (your LlamaParse api key here)
    invalidate_cache=True,
    premium_mode=True,  #Accurate_mode is the default
    parsing_instruction = "The provided pdf document is extracted from the Notes to financial statements in a company annual report. The document contains tables (with financial amounts) and texts from a note on segment reporting. Each page of the pdf document has a footer (with the company name and the year of the report, followed by the page number) and a header (with the section name of the company annual report). Some tables may have table footnotes that are numbered. The table footnotes explain details about certain items in the table that are tagged with superscript numbers. Please convert the pdf document into a markdown document with an equivalent page formatting as similar to the original pdf document as possible. Make sure you convert heading to heading and paragraph to paragraph with exact words and table to table with exact structure of rows and columns.",
    is_formatting_instruction=True,
    #disable_ocr=False,#True,
    skip_diagonal_text=True,
    do_not_unroll_columns=False, # unroll columns
    #target_pages="0,2,7",  # Pages are numbered starting at 0
    bounding_box="0.08,0,0.08,0",
    take_screenshot=True,
    #disable_image_extraction=False,#True,
    result_type="markdown",  # "markdown" and "text" are available
)

#pdf_path = os.path.join(current_dir, 'XtractnParse', 'PwC2024_segment.pdf')
#docs = await parser.aload_data(pdf_path)
#docs = await parser.aload_data("4481ea38b08ecea7dae84e4c698da1fc802a0bc6.pdf")
docs = await parser.aload_data("XtractnParse/docs4Parsing/Nextplc2008_segment.pdf")
print(docs[0].text)
#docs = await parser.aload_data("4481ea38b08ecea7dae84e4c698da1fc802a0bc6.pdf")
#print(docs[0].text)
