In [190]:
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import io
import pandas as pd
from datetime import datetime

In [37]:
# Constants
# Column header labels matched against span text by find_ROI. Order matters:
# post_process_roi uses the entries at index+1 / index-1 as the neighbouring columns.
CATEGORIES = ['Date & Time', 'Source/Destination', 'Transaction Details', 'Notes', 'Amount', 'Balance']
# Default path opened by read_pdf().
PDF_PATH = '../data/transaction/Jago_Main Pocket_History.pdf'
# Text that find_ROI looks for inside a span; that span's bbox is stored under the 'EOF' key.
EOF_CONST = 'PT Bank Jago Tbk is licensed and supervised by Financial Services Authority (OJK), Bank Indonesia, and'

In [108]:
# Scratch check: the slice that post_process_roi tests membership against.
CATEGORIES[:3]

['Date & Time', 'Source/Destination', 'Transaction Details']

In [109]:
# Scratch check: list.index gives the position of a label, as used in post_process_roi.
CATEGORIES.index('Date & Time')

0

In [184]:
def read_pdf(pdf_path=PDF_PATH):
    """Open the PDF at ``pdf_path`` with ``fitz.open`` and return the document.

    ``pdf_path`` defaults to the module-level ``PDF_PATH``.
    """
    document = fitz.open(pdf_path)
    return document
def get_pdf_pages(pdf_document):
    """Return a list with every page of ``pdf_document``, in page-number order.

    Pages are obtained by calling ``pdf_document.load_page(n)`` for each
    ``n`` in ``range(pdf_document.page_count)``.
    """
    return [pdf_document.load_page(index) for index in range(pdf_document.page_count)]
def find_ROI(pdf_page):
    """Locate the bounding boxes of the column headers and the end marker on a page.

    Walks every span of ``pdf_page.get_text("dict")``. A span whose text is
    exactly one of ``CATEGORIES`` is recorded under that text; a span whose
    text contains ``EOF_CONST`` is recorded under ``'EOF'``. When several
    spans match the same key, the last one wins.

    Returns:
        dict mapping each label found (and possibly ``'EOF'``) to the span's
        ``bbox`` value.
    """
    layout = pdf_page.get_text("dict")
    # Lazily flatten blocks -> lines -> spans; blocks without a "lines" entry are skipped.
    spans = (
        span
        for block in layout["blocks"]
        if "lines" in block
        for text_line in block["lines"]
        for span in text_line["spans"]
    )
    found = {}
    for span in spans:
        label = span['text']
        if label in CATEGORIES:
            found[label] = span['bbox']
        if EOF_CONST in label:
            found['EOF'] = span['bbox']
    return found
def post_process_roi(roi_pos):
    """Adjust the horizontal extent of each column box, in ``CATEGORIES`` order.

    ``roi_pos`` maps labels to ``(x_start, y_start, x_end, y_end)`` boxes. It
    is updated in place and also returned. Columns are processed left to
    right, so a later column sees the already-adjusted box of an earlier one.

    * first three categories: ``x_end`` becomes the next column's ``x_start`` minus 5
    * ``'Notes'``: ``x_end`` becomes the midpoint of its own ``x_end`` and the
      next column's ``x_start``
    * ``'Amount'``: ``x_start`` becomes the midpoint of its own ``x_start`` and
      the previous column's ``x_end``
    * ``'Balance'``: ``x_start`` becomes the previous column's ``x_end`` plus 5
    """
    head_columns = CATEGORIES[:3]
    for idx, name in enumerate(CATEGORIES):
        if name in head_columns:
            # Stretch up to just before the next header (5 units of separation).
            box = roi_pos[name]
            right_neighbour = roi_pos[CATEGORIES[idx + 1]]
            roi_pos[name] = (box[0], box[1], right_neighbour[0] - 5, box[3])
        if name == 'Notes':
            # New x_end = (current x_end + next column's x_start) / 2.
            box = roi_pos[name]
            right_neighbour = roi_pos[CATEGORIES[idx + 1]]
            midpoint = (box[2] + right_neighbour[0]) / 2
            roi_pos[name] = (box[0], box[1], midpoint, box[3])
        if name == 'Amount':
            # New x_start = (current x_start + previous column's x_end) / 2.
            box = roi_pos[name]
            left_neighbour = roi_pos[CATEGORIES[idx - 1]]
            midpoint = (box[0] + left_neighbour[2]) / 2
            roi_pos[name] = (midpoint, box[1], box[2], box[3])
        if name == 'Balance':
            # New x_start = previous column's x_end + 5 (5 units of separation).
            box = roi_pos[name]
            left_neighbour = roi_pos[CATEGORIES[idx - 1]]
            roi_pos[name] = (left_neighbour[2] + 5, box[1], box[2], box[3])
    return roi_pos
def crop_image(roi_pos, image=None):
    """Crop one vertical strip per column out of the page image.

    For every entry of ``roi_pos`` other than ``'EOF'`` the crop box is
    ``(x_start, y_start, x_end, eof_y_start)``: the column's own horizontal
    extent, from the top of its header box down to the top of the ``'EOF'`` box.

    Args:
        roi_pos: mapping of label -> ``(x_start, y_start, x_end, y_end)``;
            must contain an ``'EOF'`` entry when any other entry is present.
        image: object exposing ``crop(box)``. When omitted, the function
            falls back to a module/notebook-level variable named ``image``
            (the behaviour of earlier revisions). Passing it explicitly is
            preferred, because no visible cell defines that global.

    Returns:
        dict mapping each non-``'EOF'`` label to its cropped image.

    Raises:
        KeyError: ``roi_pos`` has column entries but no ``'EOF'`` entry.
        NameError: no ``image`` was passed and no global ``image`` exists.
    """
    column_boxes = {label: bbox for label, bbox in roi_pos.items() if label != 'EOF'}
    if not column_boxes:
        return {}
    if 'EOF' not in roi_pos:
        raise KeyError("roi_pos has no 'EOF' entry; the end marker was not found on the page")
    if image is None:
        # Legacy call path: crop_image(roi_pos) relied on hidden notebook state.
        image = globals().get('image')
        if image is None:
            raise NameError(
                "crop_image needs the page image: pass image=... or define a global 'image'"
            )
    # NOTE(review): the boxes are used as-is, so `image` presumably shares the
    # coordinate scale of the boxes in roi_pos -- confirm against how the page is rendered.
    bottom = roi_pos['EOF'][1]
    return {
        label: image.crop((bbox[0], bbox[1], bbox[2], bottom))
        for label, bbox in column_boxes.items()
    }

The date column needs to be post-processed first, because the y-start of each date is needed to process the other categories.

In [192]:
def extract_date_df(cropped_image):
    """OCR a cropped column and return its text grouped by vertical position.

    Runs ``pytesseract.image_to_data`` on ``cropped_image``, keeps the entries
    whose confidence is greater than 0, and joins (with single spaces) the
    texts that share the same ``top`` value.

    Args:
        cropped_image: image accepted by ``pytesseract.image_to_data``.

    Returns:
        pandas.DataFrame with columns ``y_start`` (the OCR ``top`` value) and
        ``text`` (the joined words), one row per distinct ``y_start``.
    """
    ocr_data = pytesseract.image_to_data(cropped_image, output_type=pytesseract.Output.DICT)
    kept_texts = []
    kept_tops = []
    for text, top, raw_conf in zip(ocr_data['text'], ocr_data['top'], ocr_data['conf']):
        # Confidence can arrive as an int, a float, or a float-formatted string
        # such as '96.06'; int() alone raises ValueError on the latter.
        conf = int(float(raw_conf))
        if conf > 0:  # drop entries with no or zero confidence
            kept_texts.append(text)
            kept_tops.append(top)
    words = pd.DataFrame({'text': kept_texts, 'y_start': kept_tops})
    # Words sharing the exact same top coordinate are treated as one line.
    return words.groupby('y_start').agg(lambda parts: ' '.join(parts)).reset_index()

In [194]:
# Keep only the rows whose text is a valid date.
def post_process_date_df(df):
    """Return the rows of ``df`` whose ``text`` parses as a ``'%d %b %Y'`` date.

    The result has a ``date`` column formatted as ``'%d/%m/%Y'`` in place of
    the ``text`` column; all other columns and the original index labels of
    the surviving rows are kept. The input frame is not modified.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        A new DataFrame containing only the rows with a valid date.
    """
    def validate_and_convert_date(date_str):
        """Return ``date_str`` reformatted as dd/mm/YYYY, or None when it is not a date."""
        try:
            # Surrounding whitespace is not significant for a date.
            parsed = datetime.strptime(date_str.strip(), '%d %b %Y')
        except (ValueError, TypeError, AttributeError):
            # Unparseable text, or a non-string cell (e.g. NaN).
            return None
        return parsed.strftime('%d/%m/%Y')

    dates = df['text'].apply(validate_and_convert_date)
    # assign() builds a new frame, so the caller's DataFrame keeps its columns.
    return df.assign(date=dates).loc[dates.notna()].drop(columns='text')

In [185]:
# Open the default PDF (PDF_PATH) and load all of its pages.
pdf_file = read_pdf()
pdf_pages = get_pdf_pages(pdf_file)
# Locate the column-header boxes on the first page, then adjust their horizontal extents.
roi_pos = find_ROI(pdf_pages[0])
roi_pos = post_process_roi(roi_pos)
# Crop one strip per column. NOTE: called this way, crop_image takes the page image
# from a notebook-level variable named `image`, which must exist before this cell runs.
all_crop = crop_image(roi_pos)

In [195]:
# OCR the 'Date & Time' strip, keep only the rows that parse as dates, and display them.
date_df = extract_date_df(all_crop['Date & Time'])
date_df = post_process_date_df(date_df)
date_df

Unnamed: 0,y_start,date
2,138,16/02/2024
4,278,16/02/2024
6,382,16/02/2024
8,486,16/02/2024
10,618,18/02/2024
12,722,18/02/2024
14,862,18/02/2024
16,1002,19/02/2024
18,1134,19/02/2024
20,1271,20/02/2024


Process the next category: Source/Destination.