In [185]:
from pypdf import PdfReader
import re

reader = PdfReader("output/open-statement.pdf") 


### Extract text

In [186]:
# Extracts without the footer part
text = ""
for page in reader.pages:
    whole_page = page.extract_text() + "\n"
    end = whole_page.find("Disclaimer")
    text+=whole_page[:end]   

In [187]:
# Remove redundant table headers
def remove_table_headers(text):
    start = text.find("Receipt")
    end = text[start:].find("\n") + 1
    text = text [:start + end] + text[start + end:].replace("Receipt No. Completion Time Details Transaction Status Paid In Withdrawn Balance ", "")
    return text
text = remove_table_headers(text=text)

### Customer Information

In [188]:
def get_customer_name():
    start = text.find("Customer Name")
    length = text[start:].find("\n")
    line = text[start:start+length]
    return line.split(":")[1]

In [189]:
print(get_customer_name())

 ANDREW KAMAU KIMANI


### Statement Summary

In [190]:
text.find("Statement Period")

153

In [191]:
text.find("Receipt")

530

### Transactions

In [207]:
transaction_types = {
    "Customer Transfer": "Send money",
    "Funds received": "Receive money",
    "Send Money Reversal": "Reversal",
    "Recharge": "Airtime",
    "Airtime Purchase":"Airtime",
    "Customer Bundle Purchase": "Airtime",
    "Pay Bill": "Paybill",
    "Customer Payment to SmallBusiness": "Pochi la Biashara",
    "Merchant Payment": "Buy Goods",
    "Business Payment": "Receive money from bank/org",
    "Receive funds from": "Receive money from bank/org",
    "Transfer from Bank": "Receive money from bank/org",
    "IMTReceive": "International Money Transfer Receipt",
    "Receive International": "International Money Transfer Receipt",
    "Send Money Abroad": "International Money Transfer Sent",
    "Customer Withdrawal": "Withdraw Money",
    "Deposit": "Deposit Cash",
    "M-Shwari Loan": "M-Shwari Loan",
    "Withdrawal Charge": "Safaricom Charges",
    "IMT Send Charge": "Safaricom Charges",
    "Pay Merchant Charge": "Safaricom Charges",
    "": "No description" 
}

In [208]:
def transaction_mapper(description: str):
    if 'to' in description:
        direction = "Out"
    elif "from" in description:
        direction = "In"
    else:
        direction = "Out"
        
    for transaction_type in transaction_types.keys():
        if transaction_type in description:
            t_type = transaction_types[transaction_type]
            break
        else:
            t_type = f"Unidentified: {description}"
    return t_type, direction
        

In [213]:
def get_transactions():
    start = text.find("Receipt")
    start += text[start:].find("\n") + 1
    section = text[start:]
    pattern = r'(?=\bS[A-Z][A-Z0-9]{8}\b)'
    chunks = re.split(pattern, section)
    chunks = [chunk.strip() for chunk in chunks if chunk.strip()]
    transactions = []
    for chunk in chunks:
        transaction = {}
        transaction["Transaction code"] = chunk[:10]
        transaction["Date"] = chunk[10:30].strip()
        info = chunk[30:].split("Completed ")
        mid = info[0].find(" -")
        
        if mid != -1:
            description = info[0].split(" -")
            transaction["Type"], transaction["Direction"] = transaction_mapper(description[0].strip().replace("\n",""))
            transaction["Party"] = description[1].strip().replace("\n","")
        if mid == -1:
            transaction["Type"] = transaction_mapper(info[0].strip())
            transaction["Party"] = "Safaricom"            
        
        
        
        amounts = info[1].split(" ")
        transaction["Amount"] = amounts[0]
        transaction["Balance"] = amounts[1]
        
        transactions.append(transaction)
    return transactions

In [214]:
transactions = get_transactions()

In [215]:
for transaction in transactions:
    print(transaction, "\n", "-"*40)

{'Transaction code': 'SLS85T3UIS', 'Date': '2024-12-28 17:14:15', 'Type': 'Send money', 'Direction': 'Out', 'Party': '07******374 Kevin Kuya', 'Amount': '-50.00', 'Balance': '9,573.02'} 
 ----------------------------------------
{'Transaction code': 'SLR73F2QIX', 'Date': '2024-12-27 22:56:45', 'Type': ('Paybill', 'Out'), 'Party': 'Safaricom', 'Amount': '-5.00', 'Balance': '9,623.02'} 
 ----------------------------------------
{'Transaction code': 'SLR73F2QIX', 'Date': '2024-12-27 22:56:45', 'Type': 'Paybill', 'Direction': 'Out', 'Party': 'KPLCPREPAID Acc. 54607518138', 'Amount': '-300.00', 'Balance': '9,628.02'} 
 ----------------------------------------
{'Transaction code': 'SLR93EZMK5', 'Date': '2024-12-27 22:55:21', 'Type': 'Receive money', 'Direction': 'In', 'Party': '2547******189 HANNAH KAMAU', 'Amount': '330.00', 'Balance': '9,928.02'} 
 ----------------------------------------
{'Transaction code': 'SLP0TLJO9S', 'Date': '2024-12-25 17:26:22', 'Type': 'Pochi la Biashara', 'Direct