## Imports

In [None]:
import pdfplumber
import pandas as pd
import numpy as np
from datetime import datetime
import os

## Functions

In [None]:
def get_date(str_date):
    """Parse a statement date string into a ``datetime``.

    The expected layout is ``%d-%b%Y``: day, hyphen, abbreviated month
    name, then a four-digit year with no separator (e.g. ``"05-Mar2021"``).

    Raises:
        ValueError: if ``str_date`` does not match that layout.
    """
    return datetime.strptime(str_date, '%d-%b%Y')

def is_date(date):
    """Report whether ``date`` can be parsed by ``get_date``.

    Returns True when ``get_date`` succeeds and False when it raises
    ``ValueError``; any other exception propagates.
    """
    try:
        get_date(date)
    except ValueError:
        return False
    else:
        return True

def is_float(str_float):
    """Return True when ``str_float`` looks like a two-decimal amount.

    A string qualifies when ``float()`` accepts it *and* its third
    character from the end is a decimal point (``"12.50"``, ``".50"``).
    Whole numbers and other precisions (``"12"``, ``"1.5"``, ``"1.234"``)
    are rejected.

    Args:
        str_float: the token to test.

    Returns:
        bool: True only for parseable strings with exactly two digits
        after the decimal point.
    """
    try:
        float(str_float)
    except ValueError:
        return False
    # The length guard matters: parseable strings shorter than three
    # characters (e.g. "5" or "12") used to raise IndexError on [-3].
    return len(str_float) >= 3 and str_float[-3] == "."

In [None]:
def enter_row_into_statement_dict(date, details, amount, balance):
    """Append one transaction row to the module-level ``statement_dict``.

    Each argument is appended to the list stored under its column:
    ``"Date"``, ``"Transaction Details"``, ``"Amount"`` and ``"Balance"``.
    """
    row = (
        ("Date", date),
        ("Transaction Details", details),
        ("Amount", amount),
        ("Balance", balance),
    )
    for column, value in row:
        statement_dict[column].append(value)
    
def clear_statement_dict():
    """Reset every column of the module-level ``statement_dict``.

    Each of the four columns is rebound to a fresh empty list.
    """
    for column in ("Date", "Transaction Details", "Amount", "Balance"):
        statement_dict[column] = []
    
def clear_statement_globals():
    """Blank out the module-level ``temp_row`` scratch record.

    All four fields (date, balance, amount, details) become ``""``.
    """
    for field in ("date", "balance", "amount", "details"):
        temp_row[field] = ""

In [None]:
def adjust_year(line, invoice):
    """Attach the statement year to a line's leading date token.

    Statement lines carry dates without a year (e.g. ``"05-Mar"``). The
    year is read from the invoice file name, taken as everything before
    the first ``-`` (presumably names look like ``YYYY-MM.pdf`` -- confirm
    against the real file names). The result is shaped so that
    ``get_date`` can parse the first token (``"05-Mar2021"``).

    Only the first token is ever modified, and only when it really is a
    date. Previously the year was appended to the first word of *every*
    line, which corrupted continuation/detail lines.

    Args:
        line: one text line of the statement.
        invoice: path of the statement file; a path containing ``-01.``
            in its file name is treated as a January statement.

    Returns:
        str: ``line`` unchanged when it is blank, already starts with a
        full date, or does not start with a date at all; otherwise the
        line with the year appended to its first token and its tokens
        joined by single spaces.

    Raises:
        ValueError: if a December line of a January statement is seen
            and the year part of the file name is not an integer.
    """
    tokens = line.split()
    if not tokens:
        # Blank line: nothing to adjust (and no first token to inspect).
        return line

    first, rest = tokens[0], tokens[1:]
    if is_date(first):
        # The year is already present.
        return line

    # Use the file name only, so directories in the path cannot disturb
    # the year extraction.
    file_name = invoice[invoice.rfind('/') + 1:]
    year = file_name.partition('-')[0]

    if not is_date(first + year):
        # First token is ordinary text (a continuation of the details).
        return line

    if "-01." in file_name and first.endswith("-Dec"):
        # A January statement still listing December entries: those
        # belong to the previous year.
        year = str(int(year) - 1)

    return " ".join([first + year] + rest)

In [None]:
def add_to_previous_details(items):
    """Extend the pending row's details with a continuation line.

    The tokens in ``items`` are joined with single spaces and appended,
    after one separating space, to ``temp_row["details"]``.
    """
    temp_row["details"] += " " + " ".join(items)

def add_row_to_dic(items, starts_with_date):
    """Flush the pending row, then start a new one from ``items``.

    The row currently held in ``temp_row`` is written to the statement
    via ``enter_row_into_statement_dict`` and ``temp_row`` is refilled
    from the tokens of the new line.

    Args:
        items: whitespace-split tokens of a line whose last token is an
            amount.
        starts_with_date: True when ``items[0]`` is a date. The pending
            row is then flushed with its stored balance and the date is
            replaced. When False the pending row is flushed with an
            empty balance and the previous date is kept for the new row.

    Raises:
        ValueError: if ``starts_with_date`` is True but ``items[0]`` is
            not parseable by ``get_date``.
    """
    # A one-token line has no second-to-last token; the length guard
    # avoids the IndexError that items[-2] used to raise there.
    has_two_amounts = len(items) >= 2 and is_float(items[-2])

    if starts_with_date:
        enter_row_into_statement_dict(
            temp_row["date"], temp_row["details"],
            temp_row["amount"], temp_row["balance"],
        )
        temp_row["date"] = get_date(items[0])
        details_start = 1  # skip the date token
    else:
        enter_row_into_statement_dict(
            temp_row["date"], temp_row["details"], temp_row["amount"], "",
        )
        details_start = 0

    if has_two_amounts:
        # Line ends with "<amount> <balance>".
        temp_row["balance"] = items[-1]
        temp_row["amount"] = items[-2]
        temp_row["details"] = " ".join(items[details_start:-2])
    else:
        # Line ends with a single amount. The stored balance is left as
        # it was, exactly as before.
        # NOTE(review): looks intentional -- the closing flush in
        # prepare_statement_dict writes that retained balance; confirm.
        temp_row["amount"] = items[-1]
        temp_row["details"] = " ".join(items[details_start:-1])

def add_first_row_to_dic(items):
    """Seed ``temp_row`` from the first line of a statement.

    The first token is parsed as the date, the last token (with commas
    removed) becomes the balance, and everything in between becomes the
    details. The amount field is not touched.
    """
    temp_row.update(
        date=get_date(items[0]),
        details=" ".join(items[1:-1]),
        balance=items[-1].replace(",", ""),
    )

def prepare_statement_dict(statement, invoice):
    """Feed the lines of one statement into ``statement_dict``.

    Each line is year-adjusted with ``adjust_year``, split into tokens
    and classified by whether it starts with a date and whether it ends
    with an amount:

    * date + amount, nothing pending yet -> first row of the statement
    * amount (with or without date)      -> flush pending row, start new
    * neither                            -> continuation of the details
    * date without trailing amount       -> ignored

    After the last line the still-pending row is written with an empty
    amount and the balance currently held in ``temp_row``.

    Args:
        statement: iterable of text lines.
        invoice: path of the statement file, passed on to ``adjust_year``.
    """
    for line in statement:
        # Blank lines have no tokens; indexing items[-1] / items[0]
        # below would raise IndexError, so skip them up front.
        if not line.strip():
            continue

        items = adjust_year(line, invoice).split()

        ends_with_amount = is_float(items[-1])
        starts_with_date = is_date(items[0])

        if not ends_with_amount:
            if not starts_with_date:
                add_to_previous_details(items)
            # A dated line without a trailing amount is dropped, as before.
        elif starts_with_date and temp_row["details"] == "":
            # Nothing pending yet: this is the opening row.
            add_first_row_to_dic(items)
        else:
            add_row_to_dic(items, starts_with_date)

    enter_row_into_statement_dict(temp_row["date"], temp_row["details"], "", temp_row["balance"])


In [None]:
def get_statement(text):
    """Cut the transaction lines out of a page's extracted text.

    Returns the lines from the first one containing ``B/F BALANCE``
    through the first one containing ``C/F BALANCE`` (inclusive), with
    every comma removed.

    Raises:
        IndexError: if either marker is missing from ``text``.
    """
    rows = text.split("\n")

    def first_index(marker):
        return [idx for idx, row in enumerate(rows) if marker in row][0]

    begin = first_index('B/F BALANCE')
    finish = first_index('C/F BALANCE')

    return [row.replace(",", "") for row in rows[begin:finish + 1]]

In [None]:
def convert(invoices):
    """Parse statement PDFs into a single DataFrame.

    For each path in ``invoices`` the text of the first page only is
    extracted, the statement section is isolated with ``get_statement``
    and its rows are appended to the module-level ``statement_dict``.
    Rows accumulate in ``statement_dict`` across calls until
    ``clear_statement_dict`` is run.

    Args:
        invoices: iterable of PDF paths.

    Returns:
        pandas.DataFrame with the columns ``Date``, ``Transaction
        Details``, ``Amount`` and ``Balance``.
    """
    for invoice in invoices:
        with pdfplumber.open(invoice) as pdf:
            text = pdf.pages[0].extract_text(x_tolerance=1)

        try:
            prepare_statement_dict(get_statement(text), invoice)
        finally:
            # Reset the scratch row even when parsing fails, so a
            # half-parsed row cannot leak into the next statement/run.
            clear_statement_globals()

    return pd.DataFrame(
        statement_dict,
        columns=["Date", "Transaction Details", "Amount", "Balance"],
    )

## Global Variables

In [None]:
# Column-oriented accumulator for parsed rows: one independent list per
# output column, appended to in lock-step.
statement_dict = {
    column: []
    for column in ("Date", "Transaction Details", "Amount", "Balance")
}

In [None]:
# Scratch record for the row currently being assembled; every field is
# the empty string until a statement line fills it.
temp_row = dict(date="", balance="", amount="", details="")

# ==========================================

## START HERE

In [None]:
# Collect every non-hidden file in ./statements. The assignment was
# commented out, which left `invoices` undefined on a fresh run.
# Paths stay in the relative "statements/<name>" form; names are sorted
# so the processing order is deterministic.
invoices = [
    f'statements/{name}'
    for name in sorted(os.listdir(os.path.join(os.getcwd(), 'statements')))
    if not name.startswith('.')
]
invoices

In [None]:
# Sort chronologically with a stable algorithm: the default quicksort
# may reorder rows that share a date, scrambling same-day transactions.
df = convert(invoices).sort_values(by="Date", kind="mergesort")

In [None]:
# Write the sorted transactions to statement.csv, replacing any existing
# file; the DataFrame index is left out.
df.to_csv(path_or_buf="statement.csv", mode='w+', index=False)

# ==========================================

## Clean Up

In [None]:
# Empty the accumulated columns so a later convert() run starts from a
# clean statement_dict instead of appending to the previous rows.
clear_statement_dict()