# Download pdfs and extract pages with the specified keywords

In [1]:
#==============================================================================
# Parameters adjustable by the user
#==============================================================================
reportList = 'pdfscreenedURLs.csv'          # Contains the list of reports to download
#reportList = 'annual_reports_with_sp500.csv'   # Contains the list of reports to download
col_name = 'URL' # Column name in the reportList file that contains the URLs

# Set parameter for the 'year to download' 
years = 2019 # 2023 # 2022  

# Set keywords to search for in a report  
keywords = ['water'] # ['air', 'water']  # ['pollution']  # ['segment', 'segments']   

# Set parameter for the 'percentage (in decimal point) of reports to download' 
pct2DL = 1    # .5    
# [For debugging] Set parameter for reporting details of the page numbered 'DetailedPage'
DetailedPage = 1   # Note: Python is 0-indexed; so DetailedPage = 1 means the second page of the report

#==============================================================================
# Unused parameters
#==============================================================================
#yearEnd = 2023
#yearStart = 2014
#years = [yearStart - i for i in range(yearStart - yearEnd + 1)]
#==============================================================================

# Prepare folders for storing downloaded and processed documents

- Identify the repository name
- Create folder docArchive in the repo directory and its subfolders
- List the folder contents to confirm its creation of the subfolders

In [2]:
# Import the required libraries
import os
import subprocess
from urllib.parse import urljoin, urlparse

# Print all environment variables
#for key, value in os.environ.items():
#    print(f'{key}={value}')

# Get the current working directory and create a local directory there
current_directory = os.getcwd()
print(f"current_directory: {current_directory}\n")


#==================================================================================
# Get the value of the RepositoryName environment variable
repository_name = os.getenv('RepositoryName')

# Find the position of the repository name in the current directory path
repo_index = current_directory.find(repository_name)

# Retain only the part of the path up to and including the repository name
if repo_index != -1:
    repo_dir = current_directory[:repo_index + len(repository_name)]
else:
    repo_dir = current_directory

print(f"Repo directory: {repo_dir}\n")
#==================================================================================


# Path to the document directory  
doc_dir = os.path.join(repo_dir, 'docArchive')
# Create the directory if it does not exist
os.makedirs(doc_dir, exist_ok=True)


# Create three subfolders (if not already created)
subfolders = ['DLdocs', 'pagesExtracted', 'Parsed'] # Define the subfolders to be created
# Create the subfolders if they do not already exist
for subfolder in subfolders:
    subfolder_path = os.path.join(doc_dir, subfolder)
    os.makedirs(subfolder_path, exist_ok=True)
    print(f"Subfolder created or already exists: {subfolder_path}")
print(" ")

# List contents in the document directory
subprocess.run(['ls', '-la', doc_dir])

current_directory: /workspaces/BBM104/Session06

Repo directory: /workspaces/BBM104

Subfolder created or already exists: /workspaces/BBM104/docArchive/DLdocs
Subfolder created or already exists: /workspaces/BBM104/docArchive/pagesExtracted
Subfolder created or already exists: /workspaces/BBM104/docArchive/Parsed
 
total 20
drwxrwxrwx+ 5 codespace codespace 4096 Mar  6 02:18 .
drwxrwxrwx+ 8 codespace root      4096 Mar  6 02:18 ..
drwxrwxrwx+ 2 codespace codespace 4096 Mar  6 02:18 DLdocs
drwxrwxrwx+ 2 codespace codespace 4096 Mar  6 02:18 pagesExtracted
drwxrwxrwx+ 2 codespace codespace 4096 Mar  6 02:18 Parsed


CompletedProcess(args=['ls', '-la', '/workspaces/BBM104/docArchive'], returncode=0)

---
---

# Read in .csv with pdf URL info

In [3]:
import pandas as pd
import ast
import requests
import os


# Step 1: Read the CSV file into a pandas DataFrame
df = pd.read_csv(os.path.join(repo_dir, reportList))   

# Rename the column with the name in col_name to 'full_urls'
df.rename(columns={col_name: 'full_urls'}, inplace=True)

# Display the data type of the 'full_urls' column
print("\nData type of 'full_urls' column:", df['full_urls'].dtype)

# Check the type of the first element in the 'full_urls' column
first_element_type = type(df['full_urls'].iloc[0])
print("\nType of the first element in 'full_urls' column:", first_element_type)

# Adjust pandas display options to show max colwidth
pd.set_option('display.max_colwidth', None)

# Create a copy of the DataFrame for further processing
# Note: This ensures the read-in DataFrame, df, is not altered in subsequent processing
df_use0 = df.copy()  # pandas's method to create a deep copy; to create a shallow copy, use .copy(deep=False)

# Display the dataFrame used for further processing
print("DataFrame df:\n", df_use0)


Data type of 'full_urls' column: object

Type of the first element in 'full_urls' column: <class 'str'>
DataFrame df:
                                                                                                                                                                  full_urls
0                                     https://group.mercedes-benz.com/documents/company/corporate-governance/declarations/daimler-corporategovernancestatement-en-2016.pdf
1                                     https://group.mercedes-benz.com/documents/company/corporate-governance/declarations/daimler-corporategovernancestatement-en-2017.pdf
2                                     https://group.mercedes-benz.com/documents/company/corporate-governance/declarations/daimler-corporategovernancestatement-en-2018.pdf
3                                     https://group.mercedes-benz.com/documents/company/corporate-governance/declarations/daimler-corporategovernancestatement-en-2019.pdf
4                        

# Prepare the list of pdfs for download

- include only pdfs with URLs containing the search string

In [4]:
import os
import requests
import pandas as pd
import math


# Define the output directory
output_dir = os.path.join(repo_dir, 'docArchive/DLdocs')
# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Drop rows with NaN values in 'full_urls' column
df_use = df_use0.dropna(subset=['full_urls'])


# Create the string to search for
search_str = f"{years}"   # e.g., '2002'
# Display search string used
print(f"Search string: {search_str}")


# Refine df_use to include only rows where 'full_urls' contains the search string
df_use = df_use[df_use['full_urls'].str.contains(search_str)]

# Display the total number of rows
print(f"Total number of rows: {df_use.shape[0]}")


# Display the refined DataFrame
print("DataFrame df_use for downloading:")
print(df_use)



Search string: 2019
Total number of rows: 2
DataFrame df_use for downloading:
                                                                                                                               full_urls
3   https://group.mercedes-benz.com/documents/company/corporate-governance/declarations/daimler-corporategovernancestatement-en-2019.pdf
10                               https://group.mercedes-benz.com/documents/sustainability/reports/daimler-sustainability-report-2019.pdf


# Download pdfs by year indicated in filename 

In [5]:
import os
import requests
import pandas as pd
import math


# Define function to get the filename (incl. the extension) from the URL
#   to be used for saving the downloaded file
def get_filenameDOText(url):
    segments = urlparse(url).path.split('/')
    last_segment = segments[-1]
#    if '.' in last_segment:
#        return last_segment.split('.')[0]
    return last_segment


# Define the path for the df_DL.csv file
df_DL_path = os.path.join(output_dir, 'df_DL.csv')


# Read in/Create df_DL.csv for keeping track of downloaded files
if os.path.exists(df_DL_path):
    # Read df_DL from the CSV file
    df_DL = pd.read_csv(df_DL_path)
else:
    # Create an empty df_DL with the same columns as df_use plus an additional DL column
    df_DL = pd.DataFrame(columns=list(df_use.columns) + ['DL'])


# Set the number of rows for downloading
num2DL = math.ceil(pct2DL*df_use.shape[0])


for index, row in df_use.sample(n=num2DL).iterrows():  # pick n random rows
    full_url = row['full_urls']  # full_urls is now a single URL string
            
    # Check if the URL is already in df_DL
    if full_url in df_DL['full_urls'].values:
        print(f"URL already downloaded: {full_url}")
        continue

    # Display the full URL that is being used for download
    print(f"Downloading from URL: {full_url}")

    # Step 6: Download the PDF file
    response = requests.get(full_url)
    if response.status_code == 200:
        # Save the PDF file to the output directory
        output_path = os.path.join(output_dir, get_filenameDOText(full_url))
        with open(output_path, 'wb') as file:
            file.write(response.content)
        
        # Update the DL column to 1 for the downloaded row
        row['DL'] = 1
        df_DL = pd.concat([df_DL, pd.DataFrame([row])], ignore_index=True)
    else:
        print(f"Failed to download {full_url}")

# Save the updated df_DL to the CSV file
df_DL.to_csv(df_DL_path, index=False)

# Display the updated df_DL
print("DataFrame df_DL with all rows:")
print(df_DL)

Downloading from URL: https://group.mercedes-benz.com/documents/company/corporate-governance/declarations/daimler-corporategovernancestatement-en-2019.pdf
Downloading from URL: https://group.mercedes-benz.com/documents/sustainability/reports/daimler-sustainability-report-2019.pdf


DataFrame df_DL with all rows:
                                                                                                                              full_urls  \
0  https://group.mercedes-benz.com/documents/company/corporate-governance/declarations/daimler-corporategovernancestatement-en-2019.pdf   
1                               https://group.mercedes-benz.com/documents/sustainability/reports/daimler-sustainability-report-2019.pdf   

  DL  
0  1  
1  1  


---
---

# Define function to extract words from each page and count detected keywords  

### Option 1 here for text-searchable pdfs
### (see Option 2 later for using the OCR approach)

In [6]:

import string
import unicodedata
from collections import defaultdict
import os
import fitz  # PyMuPDF
import pandas as pd

def process_pdf(pdf_document, keywords, DetailedPage):
    
    print(f"Keywords: {keywords}\n")

    # Initialize a dictionary to store the actual page numbers and the frequency of the keyword
    keyword_counts = defaultdict(int)

    # Function to normalize and clean words
    def clean_word(word):
        # Normalize the word to NFKD form
        normalized_word = unicodedata.normalize('NFKD', word)
        # Remove diacritics and special characters
        cleaned_word = ''.join(c for c in normalized_word if unicodedata.category(c) != 'Mn')
        # Remove punctuation
        cleaned_word = cleaned_word.translate(str.maketrans('', '', string.punctuation))
        # Remove non-ASCII characters
        cleaned_word = ''.join(c for c in cleaned_word if c in string.printable)
        return cleaned_word.lower()

    # Loop through each page in the PDF
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        
        # Extract the words from each page
        words = page.get_text("words")
        
        # ==================== Debugging ====================
        # Display a bit of the extracted words
        #words_snippet = ' '.join(word[4] for word in words[:20])  # Display the first 20 words
        #print(f"Page {page_num + 1} words snippet: {words_snippet}...")
        
        # Display the complete words of page 'DetailedPage'
        #if page_num == DetailedPage:  # Page numbers are zero-indexed
        #    complete_words = ' '.join(word[4] for word in words)
        #    print(f"Complete words of page {DetailedPage}:\n{complete_words}")
        # ==================== Debugging ====================
        
        # Count the occurrences of the keywords "segments" or "segment" (case-insensitive) in the words
        count = sum(1 for word in words if any(keyword in clean_word(word[4]) for keyword in keywords))
        keyword_counts[page_num + 1] += count  # Store the actual page number (1-indexed)

        # ==================== Debugging ====================
        # Debug: Print the count for each page
        #print(f"Page {page_num + 1}'s count of detected keyword(s): {count}\n")
    
        # Additional debug: Print each word on page 'DetailedPage' after cleaning
        if page_num == DetailedPage:
            # Debug: Print the count for each page
            print(f"Page {page_num + 1}'s count of detected keyword(s): {count}\n")
            print(f"Displaying only first 15 or less of ...")
            for word in words[:15]:  # Limit to the first 15 words
                cleaned_word = clean_word(word[4])
                print(f"Original Word: {word[4]}, Cleaned Word: {cleaned_word}")
            print(" ")
        # ==================== Debugging ====================

    # Convert the dictionary to a pandas DataFrame
    data = {'Page Number': list(keyword_counts.keys()), 'Frequency': list(keyword_counts.values())}
    df = pd.DataFrame(data)

    # Sort the DataFrame by frequency in descending order
    df = df.sort_values(by='Frequency', ascending=False)

    # Display the DataFrame
    #print("Frequency table of detected keywords (after sorting):\n", df)

    # Filter the DataFrame to include only rows with non-zero frequencies
    df_non_zero = df[df['Frequency'] > 0]

    # Display the DataFrame
    print("Frequency table of detected keywords (after sorting):\n", df_non_zero, "\n")

    # Create a list of page numbers with frequencies
    page_numbers_with_frequencies = df_non_zero[['Page Number', 'Frequency']].values.tolist()

    return page_numbers_with_frequencies

#  Call process_pdf() to analyze frequency of detected keywords

In [7]:
# Debugged and keep

import os
import fitz  # PyMuPDF
import pandas as pd
import shutil

# Define the output directory in the mounted Google Drive
#output_dir = os.path.expanduser('~/XtractnParse/docArchive/DLdocs')
output_dir = os.path.join(repo_dir, 'docArchive/DLdocs')
# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Define the directory paths
#pdf_dir = os.path.expanduser('~/XtractnParse/docArchive/DLdocs')
pdf_dir = output_dir 
bad_pdf_dir = os.path.join(pdf_dir, 'bad')
os.makedirs(bad_pdf_dir, exist_ok=True)

# Define the path for the df_DL.csv file
df_DL_path = os.path.join(output_dir, 'df_DL.csv')

# Read df_DL from the CSV file
df_DL = pd.read_csv(df_DL_path)

#================================================
def has_proper_page_range(pdf_document):
    # Check if the first page number is 1
    first_page_label = pdf_document[0].get_label()
    print(f"First page label: {first_page_label}\n")  # Debug statement
    return first_page_label == "1"

def fix_page_range(pdf_document):
    # Create a new PDF with corrected page numbers
    new_pdf = fitz.open()
    for page_num in range(len(pdf_document)):
        new_pdf.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
    return new_pdf
#================================================

# List all PDF files in the directory
pdf_files = [f for f in os.listdir(output_dir) if f.endswith('.pdf')]

# Process each PDF file
for pdf_file in pdf_files:
    print(f"PDF file: {pdf_file}\n")
    pdf_path = os.path.join(output_dir, pdf_file)
    pdf_document = fitz.open(pdf_path)

    #================================================
    # Check if the PDF has a proper page range
    first_page_label = pdf_document[0].get_label()
    if first_page_label == "":
        print(f"First page label is empty for {pdf_file}\n")
        # Copy the original PDF document to the 'bad' subfolder with _nolabel suffix
        nolabel_pdf_path = os.path.join(bad_pdf_dir, pdf_file.replace('.pdf', '_nolabel.pdf'))
        shutil.copy(pdf_path, nolabel_pdf_path)
    elif not has_proper_page_range(pdf_document):
        print(f"Fixing page range for {pdf_file}\n")

        # Copy the original problematic PDF document to the 'bad' subfolder
        bad_pdf_path = os.path.join(bad_pdf_dir, pdf_file.replace('.pdf', '_bad.pdf'))
        shutil.copy(pdf_path, bad_pdf_path)

        # Fix the page range
        pdf_document = fix_page_range(pdf_document)

        # Overwrite the original problematic PDF document with the fixed PDF document
        pdf_document.save(pdf_path, incremental=False)
    #================================================
    
    # Process the PDF document and get the page numbers with frequencies
    page_numbers_with_frequencies = process_pdf(pdf_document, keywords, DetailedPage)

    # Close the PDF document
    pdf_document.close()

    # Update df_DL
    df_DL.loc[df_DL['full_urls'].str.contains(pdf_file), 'Analyzed'] = 1
    df_DL.loc[df_DL['full_urls'].str.contains(pdf_file), 'Page Numbers with Frequencies'] = str(page_numbers_with_frequencies)


# Ensure the 'Analyzed' column is integer after updates
df_DL['Analyzed'] = df_DL['Analyzed'].astype(int)

# Display the updated df_DL
print("Updated df_DL DataFrame: ")
print(df_DL)

# Save the updated df_DL to the CSV file (overwrite the existing file)
df_DL.to_csv(df_DL_path, index=False) # To append, use mode='a' 




PDF file: daimler-sustainability-report-2019.pdf

First page label is empty for daimler-sustainability-report-2019.pdf

Keywords: ['water']

Page 2's count of detected keyword(s): 0

Displaying only first 15 or less of ...
Original Word: Foreword, Cleaned Word: foreword
Original Word: 3, Cleaned Word: 3
Original Word: Change, Cleaned Word: change
Original Word: A, Cleaned Word: a
Original Word: changing, Cleaned Word: changing
Original Word: world, Cleaned Word: world
Original Word: 5, Cleaned Word: 5
Original Word: A, Cleaned Word: a
Original Word: changing, Cleaned Word: changing
Original Word: culture, Cleaned Word: culture
Original Word: 13, Cleaned Word: 13
Original Word: Partners, Cleaned Word: partners
Original Word: for, Cleaned Word: for
Original Word: change, Cleaned Word: change
Original Word: 20, Cleaned Word: 20
 
Frequency table of detected keywords (after sorting):
      Page Number  Frequency
121          122         17
120          121          9
119          120      

# Store extracted pages in folder 

In [8]:
# Debugged and keep

import os
import pymupdf
import fitz  # PyMuPDF
import pandas as pd
import ast

# Assuming df_DL and file_name are already defined and populated

# Define the extraction directory in the mounted Google Drive
extract_dir = os.path.join(repo_dir, 'docArchive/pagesExtracted')
os.makedirs(extract_dir, exist_ok=True)

# Define the directory in the mounted Google Drive with downloaded PDFs
pdf_dir = os.path.join(repo_dir, 'docArchive/DLdocs')

# Define the path for the df_DL.csv file
df_DL_path = os.path.join(pdf_dir, 'df_DL.csv')
# Read df_DL from the CSV file
df_DL = pd.read_csv(df_DL_path)


# Iterate over the rows of df_DL
for index, row in df_DL.iterrows():
    # Check if the row should be analyzed
    if row['Analyzed'] == 1:
        # Extract the PDF filename from the full_urls column
        pdf_file_name = os.path.basename(row['full_urls'])
        pdf_file_path = os.path.join(pdf_dir, pdf_file_name)
        
        # Load the PDF document
        pdf_document = fitz.open(pdf_file_path)

        # Extract the page numbers with frequencies from the DataFrame
        page_numbers_with_frequencies_str = row['Page Numbers with Frequencies']
        
        # Convert the string back to a list of tuples
        page_numbers_with_frequencies = ast.literal_eval(page_numbers_with_frequencies_str)

        # Flag to track if all pages are extracted
        all_pages_extracted = True

        # Iterate over the list of page numbers and their frequencies (where the stored page numbers are 1-indexed)
        for page_number, frequency in page_numbers_with_frequencies:
            try:

                #==============================================
                # Set page number to be 0-indexed                
                page_zeroIdx = int(page_number) - 1   # Convert page_number to an integer
                # Create a new PDF with the extracted page
                new_pdf = pymupdf.open()                 # new empty PDF
                new_pdf.insert_pdf(pdf_document, to_page = page_zeroIdx)  # up to the page_zeroIdx page
                # Remove all pages except the last one
                if new_pdf.page_count > 1:
                    new_pdf.delete_pages(range(new_pdf.page_count - 1))  # Delete all pages except the last one
                # Now new_pdf contains only the last page
                #==============================================

                # Define the output file name
                output_file_name = f"{pdf_file_name[:-4]}_[{page_number},{frequency}].pdf"
                output_file_path = os.path.join(extract_dir, output_file_name)
                
                # Save the new PDF
                new_pdf.save(output_file_path)
                new_pdf.close()
            except Exception as e:
                # If any page extraction fails, set the flag to False
                all_pages_extracted = False
                print(f"Failed to extract page {page_number} from {pdf_file_name}: {e}\n")

        # Close the original PDF document
        pdf_document.close()

        # Update the 'Extracted' column if all pages were successfully extracted
        if all_pages_extracted:
            df_DL.at[index, 'Extracted'] = 1

# Save the updated DataFrame to the corresponding .csv file
df_DL.to_csv(df_DL_path, index=False)

print("Pages extracted and saved successfully.")

print(df_DL)


Pages extracted and saved successfully.
                                                                                                                              full_urls  \
0  https://group.mercedes-benz.com/documents/company/corporate-governance/declarations/daimler-corporategovernancestatement-en-2019.pdf   
1                               https://group.mercedes-benz.com/documents/sustainability/reports/daimler-sustainability-report-2019.pdf   

   DL  Analyzed  \
0   1         1   
1   1         1   

                                                                                                                       Page Numbers with Frequencies  \
0                                                                                                                                                 []   
1  [[122, 17], [121, 9], [120, 8], [123, 4], [126, 3], [47, 2], [124, 1], [108, 1], [125, 1], [7, 1], [4, 1], [45, 1], [201, 1], [180, 1], [192, 1]]   

   Extracted  
0        1.0

---
# List pdf pages in the extraction directory

In [9]:
import os
from llama_parse import LlamaParse

# Define the extraction directory in the mounted Google Drive
extract_dir = os.path.join(repo_dir, 'docArchive/pagesExtracted')
os.makedirs(extract_dir, exist_ok=True)

# List all PDF files in the extraction directory
pdf_pages = [f for f in os.listdir(extract_dir) if f.endswith('.pdf')]

total_pages = len(pdf_pages)
print(f"Total number of PDF pages in the extraction directory: {total_pages}\n")

# Set the number of PDF pages to parse in the extraction directory
num2Parse = total_pages  # 2
print(f"PDF pages to parse: First {num2Parse} pages")

#print(f"PDF files in the extraction directory: {pdf_pages}")
for pdf_page in pdf_pages:
    print(f"PDF file in the extraction directory: {pdf_page}")

print(f"\n{pdf_pages}")


Total number of PDF pages in the extraction directory: 15

PDF pages to parse: First 15 pages
PDF file in the extraction directory: daimler-sustainability-report-2019_[123,4].pdf
PDF file in the extraction directory: daimler-sustainability-report-2019_[180,1].pdf
PDF file in the extraction directory: daimler-sustainability-report-2019_[4,1].pdf
PDF file in the extraction directory: daimler-sustainability-report-2019_[45,1].pdf
PDF file in the extraction directory: daimler-sustainability-report-2019_[108,1].pdf
PDF file in the extraction directory: daimler-sustainability-report-2019_[122,17].pdf
PDF file in the extraction directory: daimler-sustainability-report-2019_[192,1].pdf
PDF file in the extraction directory: daimler-sustainability-report-2019_[201,1].pdf
PDF file in the extraction directory: daimler-sustainability-report-2019_[121,9].pdf
PDF file in the extraction directory: daimler-sustainability-report-2019_[7,1].pdf
PDF file in the extraction directory: daimler-sustainability

---
---
### !!! Note: The following chunks need to be turned into a function call like in the 'Option 1' code chunk
###
# Option 2: Use OCR with Tesseract for pages not text-searchable
### Then proceed to extract words from each page and count detected keywords



In [21]:
#!sudo apt-get install tesseract-ocr
#!pip install pytesseract nltk

In [None]:

import nltk
from nltk.corpus import words as nltk_words
# Load the set of English words
nltk.download('words')
english_words = set(nltk_words.words())

#================================================================================

import re
import string
import unicodedata
from collections import defaultdict
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io


# Assuming pdf_document is already defined and loaded

keywords = ['segment', 'segments']
print(f"Keywords: {keywords}")

# Step 3: Initialize a dictionary to store the page numbers and the frequency of the keyword
keyword_counts = defaultdict(int)

# Define a regular expression pattern for numerical figures with the specified currency symbols and thousand separators
numerical_pattern = re.compile(r'^(?:[\$£€]|kr|SEK|DKK|NOK|CZK|PLN|HUF|RON|BGN|ISK|CHF)?\d{1,3}(?:,\d{3})*(\.\d+)?%?$')

# \d{1,3}(?:,\d{3})*: This matches one to three digits optionally followed by groups of three digits separated by commas.
# (\.\d+)?: This matches an optional decimal point followed by one or more digits.


# Function to normalize and clean words
def clean_word(word):
    # Normalize the word to NFKD form
    normalized_word = unicodedata.normalize('NFKD', word)
    # Remove diacritics
    cleaned_word = ''.join(c for c in normalized_word if unicodedata.category(c) != 'Mn')
    # Remove punctuation selectively
    cleaned_word = re.sub(r'(?<!\d)[.,%](?!\d)', '', cleaned_word)
    # Remove other punctuation
    cleaned_word = cleaned_word.translate(str.maketrans('', '', string.punctuation.replace('.', '').replace(',', '').replace('%', '')))
    # Remove non-ASCII characters
    cleaned_word = ''.join(c for c in cleaned_word if c in string.printable)
    return cleaned_word.lower()

# Function to extract text using OCR with Tesseract configuration for small fonts
def ocr_extract_text(page):
    # Increase the DPI of the image
    zoom = 2  # Increase the zoom factor to get a higher resolution image
    mat = fitz.Matrix(zoom, zoom)
    pix = page.get_pixmap(matrix=mat)
    
    img = Image.open(io.BytesIO(pix.tobytes()))
    custom_config = r'--oem 3 --psm 6 -l eng'  # Use LSTM OCR Engine, assume a single uniform block of text, and English language
    text = pytesseract.image_to_string(img, config=custom_config)
    return text

# Function to check if a word is meaningful
def is_meaningful(word):
    cleaned_word = clean_word(word)
    # Check if the word is alphabetic and in the English words set
    if cleaned_word.isalpha() and cleaned_word in english_words:
        return True
    # Check if the word matches the numerical pattern
    if numerical_pattern.match(word):
        return True
    return False

# Step 4: Loop through each page in the PDF
for page_num in range(len(pdf_document)):  #range(min(30, len(pdf_document))): 
    page = pdf_document.load_page(page_num)
    
    # Step 5: Extract the words from each page
    words = page.get_text("words")
    
    if not words:
        # If no words are extracted, use OCR
        text = ocr_extract_text(page)
        words = [(0, 0, 0, 0, word) for word in text.split()]
    
    # Check if most words are meaningless
    if words:
        meaningful_count = sum(1 for word in words if is_meaningful(word[4]))
        total_count = len(words)
        if meaningful_count / total_count < 0.5:  # Threshold can be adjusted
            # If less than 50% of the words are meaningful, use OCR
            text = ocr_extract_text(page)
            words = [(0, 0, 0, 0, word) for word in text.split()]
    
    # Display a bit of the extracted words
    words_snippet = ' '.join(word[4] for word in words[:20])  # Display the first 20 words
    print(f"Page {page_num + 1} words snippet: {words_snippet}...")
    
    page_in_concern = 21
    # Display the complete words of {page_in_concern+1}
    if page_num == page_in_concern:  # Page numbers are zero-indexed
        complete_words = ' '.join(word[4] for word in words)
        print(f"Complete words of {page_in_concern+1}:\n{complete_words}")
    
    # Step 6: Count the occurrences of the keywords "segments" or "segment" (case-insensitive) in the words
    count = sum(1 for word in words if clean_word(word[4]) in keywords)
    keyword_counts[page_num] += count
    
    # Debug: Print the count for each page
    print(f"Page {page_num + 1} count of detected keyword: {count}")

    # Additional debug: Print each word on the page in concern after cleaning
    if page_num == page_in_concern:
        for word in words:
            cleaned_word = clean_word(word[4])
            print(f"Original Word: {word[4]}, Cleaned Word: {cleaned_word}")

