In [None]:
# Clone the PDF-Metadata-Extractor repository from GitHub into the current working directory.
!git clone https://github.com/EmanueleCeglia/PDF-Metadata-Extractor.git

In [2]:
# Change into the cloned repository so that relative paths used below (e.g. requirements.txt) resolve against it.
# NOTE(review): the path is relative, so re-running this cell without restarting the runtime will fail.
%cd PDF-Metadata-Extractor

/content/PDF-Metadata-Extractor


In [None]:
!pip install -r requirements.txt

In [None]:
# Install the Ghostscript system package; -y answers the confirmation prompt automatically.
# NOTE(review): presumably required by one of the PDF-processing dependencies — confirm against requirements.txt.
!apt-get install ghostscript -y

## Extract Dates

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from pdf_extractor import PDFDatesFinderSemanticSearch  #old method
from pdf_extractor import PDFDatesFinderSpace   #new method
from pdf_extractor import PDFDeductiblesFinder
import os
import re
from tqdm import tqdm
from google.colab import drive
import json

Download model

In [None]:
# Hub identifier shared by the tokenizer and the model weights.
model_checkpoint = "Open-Orca/OpenOrca-Platypus2-13B"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Weights are loaded in 8-bit and placed on the available devices automatically.
model = AutoModelForCausalLM.from_pretrained(
    model_checkpoint,
    device_map="auto",
    load_in_8bit=True,
)

In [None]:
def post_process_response_dates(text):
    """Pull the labelled start and end dates out of a generated response.

    The text is searched for "Start date: <date>" and "End date: <date>",
    where <date> looks like "1 January 2020" or "1st January 2020"
    (day number with optional lowercase suffix, capitalised month, 4-digit year).

    Args:
        text: The decoded model output to inspect.

    Returns:
        A dict with the keys 'Start date' and 'End date' when both dates are
        present, otherwise None.
    """
    patterns = {
        'Start date': r"Start date:\s*([0-9]+[a-z]*\s+[A-Z][a-z]+\s+[0-9]{4})",
        'End date': r"End date:\s*([0-9]+[a-z]*\s+[A-Z][a-z]+\s+[0-9]{4})",
    }
    matches = {label: re.search(regex, text) for label, regex in patterns.items()}

    # Both dates are required; a partial hit counts as a failure.
    if not all(matches.values()):
        return None

    return {label: found.group(1) for label, found in matches.items()}

# second level filter
def find_dates_regex(string):
    """Regex fallback that reports the first two dates found in a string.

    Recognised formats are "d/m/yyyy", "d Month yyyy" (with an optional
    st/nd/rd/th suffix on the day) and "Month d yyyy" (same optional suffix).

    Args:
        string: The text to scan.

    Returns:
        "Start date: <first> End date: <second>" when at least two dates are
        found, "Start date: <first> End date: None" when exactly one is found,
        and "No dates found" otherwise.
    """
    pattern = r'\b\d{1,2}/\d{1,2}/\d{4}\b|\b\d{1,2} [A-Za-z]+ \d{4}\b|\b\d{1,2}(?:st|nd|rd|th)? [A-Za-z]+ \d{4}\b|\b[A-Za-z]+ \d{1,2}(?:st|nd|rd|th)? \d{4}\b'
    dates = re.findall(pattern, string)
    how_many = len(dates)

    if how_many == 0:
        return "No dates found"
    if how_many == 1:
        return f"Start date: {dates[0]} End date: None"
    # Two or more hits: the first is taken as the start, the second as the end.
    return f"Start date: {dates[0]} End date: {dates[1]}"

Dates extraction with the OLD METHOD (deprecated — do not use)

In [None]:
insurances_folder_path = "/content/PDF-Metadata-Extractor/Insurances"

# File name -> dict of extracted dates (the last successful extraction for a file wins).
extracted_dates = {}

# Legacy pipeline: semantic search proposes candidate phrases, then the language
# model is asked to read the start/end dates out of the first candidate of each group.
for root, dirs, files in os.walk(insurances_folder_path):
    for file_name in tqdm(files):
        full_file_path = os.path.join(root, file_name)

        extraction = PDFDatesFinderSemanticSearch(full_file_path)
        extraction.load_pdf()
        extraction.process_text()
        dates = extraction.find_dates()

        # `dates` is iterated as a mapping of mappings whose values are indexable
        # collections of phrases.
        for value in dates.values():
            for phrase in value.values():
                # Nothing to ask the model about when the group is empty.
                if not len(phrase):
                    continue

                sentence = phrase[0]

                prompt = "Find start date and end date from the following sentence: " + sentence
                inputs = tokenizer(prompt, return_tensors="pt")
                generate_ids = model.generate(inputs.input_ids, max_length=1000)
                response = tokenizer.batch_decode(
                    generate_ids,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=False,
                )[0]
                response = post_process_response_dates(response)

                if not response:
                    print(str(file_name) + ' NOT FOUND!')
                    continue

                extracted_dates[str(file_name)] = response
                print(str(file_name) + ' EXTRACTED!')

## Dates extraction with the NEW METHOD

In [None]:
insurances_folder_path = "/content/PDF-Metadata-Extractor/Insurances"

# File name -> list of extractions (dicts from the model, or strings from the regex fallback).
extracted_dates = {}

# Case-insensitive whole-word match for the keyword "period".
period_pattern = re.compile(r'\bperiod\b', re.IGNORECASE)

for root, dirs, files in os.walk(insurances_folder_path):
    for file_name in tqdm(files):
        full_file_path = os.path.join(root, file_name)

        # Dates extraction: split every page into paragraphs (one list of sentences per page).
        extractor_dates = PDFDatesFinderSpace(full_file_path)
        pages, tables = extractor_dates.extract_mytext()
        paragraphs = [extractor_dates.identify_paragraphs_space(page) for page in pages]

        # First-level keyword filter: if the document mentions "period" anywhere,
        # keep only the sentences that contain it (and drop pages left empty).
        # Otherwise every sentence is kept and sent to the model.
        check_kw = any(
            period_pattern.search(sentence)
            for phrase in paragraphs
            for sentence in phrase
        )

        if check_kw:
            paragraphs = [
                [phrase for phrase in sublist if period_pattern.search(phrase)]
                for sublist in paragraphs
            ]
            paragraphs = [sublist for sublist in paragraphs if sublist]

        # Ask the language model for the dates, one sentence at a time.
        responses = []
        for phrase in paragraphs:
            for sentence in phrase:
                # Commas are stripped so that dates such as "1 January, 2020" fit the
                # expected format. (The original assigned this to a misspelled name,
                # so the cleaned text never reached the prompt.)
                sentence = sentence.replace(',', '')
                prompt = "Find start date and end date from the following sentence: " + sentence
                inputs = tokenizer(prompt, return_tensors="pt")
                # Move the token ids to the model's device before generating.
                input_ids = inputs.input_ids.to(model.device)
                generate_ids = model.generate(input_ids, max_length=200)
                response = tokenizer.batch_decode(
                    generate_ids,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=False,
                )[0]
                response = post_process_response_dates(response)

                if response:
                    responses.append(response)

        if not responses:
            # Second-level fallback: the model produced nothing usable, so scan the
            # text of each page with the regex instead.
            for phrase in paragraphs:
                if not phrase:
                    continue
                # All sentences of a page are merged into one string
                # (a single sentence is left unchanged by the join).
                sentence = ' '.join(phrase).replace(',', '')
                response = find_dates_regex(sentence)
                # Skip the "nothing found" sentinel so that files without any date
                # are reported as NOT FOUND instead of being stored as extracted.
                if response != "No dates found":
                    responses.append(response)

        if responses:
            print(str(file_name) + ' EXTRACTED!')
            extracted_dates[str(file_name)] = responses
        else:
            print(str(file_name) + ' NOT FOUND!')

Save the final output to Google Drive

In [11]:
# Mount Google Drive at /content/drive so results can be written to it;
# force_remount=True remounts even when the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
path_dates = "/content/drive/MyDrive/PDFExtractor /Extracted Dates/extracted_dates.json"

# Serialise the dictionary first, then write it to the JSON file in one go.
serialized_dates = json.dumps(extracted_dates)
with open(path_dates, 'w') as file:
    file.write(serialized_dates)

# Confirmation message (Italian for "JSON file saved successfully!").
print("File JSON salvato con successo!")

File JSON salvato con successo!


## Deductibles extraction

In [12]:
insurances_folder_path = "/content/PDF-Metadata-Extractor/Insurances"

# Destination folder in which the deductibles .txt files are saved
path_deductibles = "/content/drive/MyDrive/PDFExtractor /Extracted Deductibles"

# File name -> pages identified as containing deductibles.
deductibles = {}

for root, dirs, files in os.walk(insurances_folder_path):
    for file_name in tqdm(files):
        full_file_path = os.path.join(root, file_name)

        # Extract the text, then keep only the pages flagged as containing deductibles.
        extractor_deductibles = PDFDeductiblesFinder(full_file_path)
        pages, pages_words, tables = extractor_deductibles.extract_mytext()
        pages_with_ded = extractor_deductibles.identify_deductibles_pages(pages, pages_words)
        deductibles[file_name] = pages_with_ded

        # Output file: same base name as the source file, with a .txt extension.
        file_name_txt = os.path.splitext(file_name)[0] + ".txt"
        file_path_txt = os.path.join(path_deductibles, file_name_txt)

        # Pages are written back to back, with no separator added between them.
        with open(file_path_txt, 'w', encoding='utf-8') as file:
            file.writelines(pages_with_ded)

100%|██████████| 1/1 [00:37<00:00, 37.72s/it]
