In [None]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/232.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import os
import google.generativeai as genai
import requests
from PyPDF2 import PdfReader
import shutil
import time
from functools import wraps
from google.api_core.exceptions import ResourceExhausted

In [None]:
FOLDER_NAME = './input'

# Gemini API key. It is read from the GOOGLE_API_KEY environment variable so the
# secret never has to be saved inside the notebook; an unset variable yields ''.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY', '')

genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-1.5-pro-latest')


In [None]:
def retry_on_resource_exhausted(max_retries=5, base_delay=1.0, backoff_factor=2):
    """
    Decorator that re-invokes a function when it raises ResourceExhausted.

    The wait between attempts grows geometrically (exponential backoff).
    Any other exception propagates immediately without a retry.

    :param max_retries: Total number of attempts, counting the first call.
        Values below 1 are treated as 1 so the wrapped function always runs.
    :param base_delay: Seconds to wait after the first failed attempt.
    :param backoff_factor: Multiplier applied to the delay after every wait.
    :return: A decorator; the wrapped function returns whatever ``func`` returns.
    :raises ResourceExhausted: Re-raised unchanged when the final attempt fails.
    """
    # With max_retries <= 0 the loop below would never run, so the wrapped
    # function would never be called and the wrapper would silently return None.
    attempts = max(1, max_retries)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            delay = base_delay
            for attempt in range(attempts):
                try:
                    return func(*args, **kwargs)
                except ResourceExhausted:
                    if attempt == attempts - 1:
                        raise  # Out of attempts: surface the last error to the caller
                    print(f"Resource exhausted, retrying in {delay} seconds...")
                    time.sleep(delay)
                    delay *= backoff_factor
        return wrapper
    return decorator

In [None]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``, concatenated in page order.

    Pages are joined with no separator between them.
    """
    pdf = PdfReader(file_path)
    return "".join(page.extract_text() for page in pdf.pages)

In [None]:
@retry_on_resource_exhausted(max_retries=5, base_delay=20, backoff_factor=1.5)
def ask_question_about_text(text, question):
    """
    Send ``text`` followed by ``question`` to the Gemini model and return its reply.

    The prompt is the fixed preamble "You are a helpful assistant. ", then
    ``text``, a newline and ``question``.

    :param text: Context the question refers to.
    :param question: Instruction or question appended after the context.
    :return: The reply text, or '' when reading ``response.text`` fails
        (the raw response is printed in that case).
    """
    response = model.generate_content("You are a helpful assistant. " + text + "\n" + question)
    try:
        return response.text
    except Exception:
        # Deliberately best-effort, but narrower than a bare ``except``:
        # KeyboardInterrupt / SystemExit must still be able to stop the run.
        print('Failed response %s' % response)
        return ''

In [None]:
def get_check_name(pdf_file_path, max_chars=1000):
    """
    Ask the model to propose a descriptive file name for a PDF.

    The PDF text is extracted, truncated to ``max_chars`` characters and sent
    to the model together with naming instructions.

    :param pdf_file_path: Path of the PDF to analyse.
    :param max_chars: Number of leading characters of the extracted text that
        are sent to the model.
    :return: The model's answer with surrounding whitespace removed, or ''
        when the PDF yields no text (no request is made in that case).
        The answer is NOT validated here; callers must check it before using
        it as a file name.
    """
    question = '''
    Instructions:
        Using next information identify date of document, company (who sent it, if unable to identify - assume 'unknown') and summary of document in a few words and as answer return it in format: yyyy-mm-dd-{company-readable-name}-{summary}.pdf
        If you can't identify day or month - assume '00', if you cant identify year - assume '0000'.
        This will be a file name so do not put characters that can not be in file name. Split words with -.
        The information with some typos. Please identify and correct them. Remember to maintain the original meaning of the text, only correcting spelling mistakes.
        As the response - return filename only (even if you assumed as instructed), no additional explanations. If you unable to return file name - explain why.
    '''
    # Extract text from PDF
    pdf_text = extract_text_from_pdf(pdf_file_path)

    # Nothing to analyse: skip the API call rather than let the model invent a name
    if not pdf_text.strip():
        return ''

    # Only the beginning of the document is sent to keep the prompt small
    answer = ask_question_about_text(pdf_text[:max_chars], question)
    return answer.strip()

In [None]:
def get_input_pdf_names(folder):
    """Return the names of the entries in ``folder`` whose name ends in ``.pdf``.

    The extension check ignores case. Names are returned without the folder
    prefix, in the order ``os.listdir`` reports them.
    """
    pdf_names = []
    for entry in os.listdir(folder):
        # Compare in lower case so '.PDF' and '.Pdf' are picked up too
        if entry.lower().endswith('.pdf'):
            pdf_names.append(entry)
    return pdf_names

In [None]:
def move_file_with_unique_name(source, destination):
    """
    Move ``source`` to ``destination`` without overwriting anything.

    If ``destination`` already exists, ``-1``, ``-2``, ... is inserted before
    the extension until an unused path is found. The destination folder is
    created when it is missing.

    :param source: Path of the file to move.
    :param destination: Desired target path (folder plus file name).
    :return: The path the file was actually moved to.
    """
    # On a fresh run the target folder may not exist yet, which would make
    # shutil.move fail with FileNotFoundError.
    target_dir = os.path.dirname(destination)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    base, extension = os.path.splitext(destination)
    final_destination = destination
    counter = 0
    # Increment the counter until the candidate path is free
    while os.path.exists(final_destination):
        counter += 1
        final_destination = f"{base}-{counter}{extension}"

    shutil.move(source, final_destination)
    print(f"File moved to {final_destination}")
    return final_destination

In [None]:
def _is_usable_pdf_name(name, max_length=160):
    """Return True if ``name`` is a plain ``.pdf`` file name that is safe to create."""
    if not name.endswith('.pdf') or len(name) > max_length:
        return False
    # The name is model output: refuse path separators and control characters
    # so it can neither escape the output folder nor span several lines.
    forbidden = ('/', '\\', '\0', '\r', '\n')
    return not any(char in name for char in forbidden)


def main(input_dir=None, output_dir='./output', failed_dir='./unable_to_scan'):
    """
    Rename every PDF in the input folder using a model-suggested file name.

    For each PDF, the suggested name is requested via ``get_check_name``. A
    usable suggestion moves the file to ``output_dir`` under the new name;
    otherwise the file is moved to ``failed_dir`` under its original name.

    :param input_dir: Folder to scan; defaults to the notebook's ``FOLDER_NAME``.
    :param output_dir: Folder that receives successfully renamed files.
    :param failed_dir: Folder that receives files without a usable suggestion.
    """
    if input_dir is None:
        input_dir = FOLDER_NAME

    for pdf_file in get_input_pdf_names(input_dir):
        source_path = os.path.join(input_dir, pdf_file)
        check_file_name = get_check_name(source_path)
        print(pdf_file)
        print(check_file_name)
        if _is_usable_pdf_name(check_file_name):
            move_file_with_unique_name(source_path, os.path.join(output_dir, check_file_name))
        else:
            move_file_with_unique_name(source_path, os.path.join(failed_dir, pdf_file))
            print ('Unable to scan requested file: %s got %s' % (pdf_file, check_file_name))
        print("")

In [None]:
# Run the renaming pass over every PDF currently in the input folder.
main()

123_test_for_grmini.pdf
2024-09-25-Spectrum-Account-Statement.pdf
File moved to ./output/2024-09-25-Spectrum-Account-Statement.pdf

