# 💳 OCR Transaction Extractor - April 2025

This notebook extracts transactions from bank statement screenshots using Tesseract OCR and saves them to a CSV.

In [8]:
# 📦 Install dependencies
!pip install pytesseract opencv-python pandas numpy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [9]:
import re
import unicodedata
from datetime import datetime

import cv2
import pandas as pd
import pytesseract

In [10]:
# Print the installed Tesseract version; a quick check that the `tesseract`
# binary can be launched from this kernel's shell.
!tesseract --version

tesseract 5.5.1
 leptonica-1.85.0
  libgif 5.2.2 : libjpeg 8d (libjpeg-turbo 3.0.4) : libpng 1.6.50 : libtiff 4.7.0 : zlib 1.2.12 : libwebp 1.5.0 : libopenjp2 2.5.3
 Found NEON
 Found libarchive 3.8.1 zlib/1.2.12 liblzma/5.8.1 bz2lib/1.0.8 liblz4/1.10.0 libzstd/1.5.7 expat/expat_2.7.1 CommonCrypto/system libb2/system
 Found libcurl/8.7.1 SecureTransport (LibreSSL/3.3.6) zlib/1.2.12 nghttp2/1.64.0


### 🔧 Month mapping for Croatian abbreviations

In [11]:
# Croatian three-letter month abbreviations mapped to month numbers (1-12).
# The tuple is ordered January -> December, so the position gives the number.
month_map = {
    abbreviation: number
    for number, abbreviation in enumerate(
        (
            "SIJ", "VEL", "OŽU", "TRA", "SVI", "LIP",
            "SRP", "KOL", "RUJ", "LIS", "STU", "PRO",
        ),
        start=1,
    )
}

In [12]:
# Year used when building transaction dates: the date lines matched by the
# parser contain only a day number and a month abbreviation.
year = 2025

### 📥 OCR + Regex Parser Function

In [13]:
def _fold_diacritics(text):
    """Return `text` with combining accents removed (e.g. "OŽU" -> "OZU")."""
    decomposed = unicodedata.normalize("NFKD", text)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


def _parse_date_line(line, months_by_abbr, default_month):
    """Return 'YYYY-MM-DD' if `line` starts with '<day> <MON>', else None.

    `months_by_abbr` maps diacritic-folded abbreviations to month numbers.
    `default_month` is used for unrecognised abbreviations; when it is None
    such lines are not treated as dates.
    """
    date_match = re.match(r'(\d{1,2})\s+([A-ZČŽŠ]{3})', line.strip())
    if date_match is None:
        return None
    day_text, month_abbr = date_match.groups()
    month = months_by_abbr.get(_fold_diacritics(month_abbr.upper()), default_month)
    if month is None:
        return None
    try:
        return datetime(year, month, int(day_text)).strftime('%Y-%m-%d')
    except ValueError:
        # Impossible calendar date (e.g. day 31 in a 30-day month, or OCR
        # noise such as "99 ABC"): not a date line, so don't abort the run.
        return None


def _parse_amount(line):
    """Return the '<number> EUR' amount in `line` as a float, or None."""
    amount_match = re.search(
        r'([-+]?(?:\d{1,3}(?:\.\d{3})+|\d+),\d+)\s*EUR', line)
    if amount_match is None:
        return None
    # "1.234,56" -> "1234.56": dots group thousands, the comma marks decimals.
    return float(amount_match.group(1).replace('.', '').replace(',', '.'))


def _description_before(lines, index):
    """Return the nearest non-blank line above `lines[index]`, stripped.

    Returns "" when there is no such line, or when that line is itself an
    EUR amount line (it then belongs to another transaction).
    """
    for candidate in reversed(lines[:index]):
        text = candidate.strip()
        if text:
            return "" if _parse_amount(text) is not None else text
    return ""


def extract_transactions_from_image(image_path, default_month=4):
    """OCR a bank-statement screenshot and return its transactions.

    The image is converted to grayscale and passed to Tesseract. The OCR text
    is scanned line by line:

    * a line starting with ``<day> <3-letter month>`` sets the date applied to
      the transactions that follow (the year comes from the global ``year``,
      month numbers from the global ``month_map``; lookup ignores diacritics
      so an OCR reading of "OZU" still resolves to "OŽU");
    * a line containing ``<amount> EUR`` produces one transaction whose
      description is the nearest non-blank line above it.

    Parameters
    ----------
    image_path : str
        Path of the screenshot to read.
    default_month : int or None, default 4
        Month number assumed when the three letters after the day are not a
        known abbreviation. NOTE(review): with a fallback in place, any line
        shaped like "27 UBR* ..." is treated as a date line; pass ``None`` to
        accept only recognised month abbreviations.

    Returns
    -------
    pandas.DataFrame
        One row per transaction with columns ``Date`` ('YYYY-MM-DD', or '' if
        no date line has been seen yet), ``Description`` and ``Amount (EUR)``.
        Empty (no columns) when no transaction is found.

    Raises
    ------
    OSError
        If OpenCV cannot load ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(
            f"cv2.imread could not load {image_path!r} "
            "(missing file, no permission, or unsupported format)"
        )
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    raw_text = pytesseract.image_to_string(gray)
    lines = raw_text.split('\n')

    months_by_abbr = {
        _fold_diacritics(abbr.upper()): number
        for abbr, number in month_map.items()
    }

    data = []
    current_date = ""
    for i, line in enumerate(lines):
        parsed_date = _parse_date_line(line, months_by_abbr, default_month)
        if parsed_date is not None:
            current_date = parsed_date
            continue

        amount = _parse_amount(line)
        # An amount on the very first line has no description line above it
        # and is skipped.
        if amount is not None and i > 0:
            data.append({
                'Date': current_date,
                'Description': _description_before(lines, i),
                'Amount (EUR)': amount
            })
    return pd.DataFrame(data)

### 🔍 Load an Image and Extract Transactions

In [14]:
# Point the first path at your own statement screenshot; the extracted rows
# are written as CSV to the second path, then the first rows are previewed.
df = extract_transactions_from_image(image_path="data/sensitive/test_image.png")
df.to_csv(path_or_buf="data/sensitive/transactions.csv", index=False)
df.head(n=5)

Unnamed: 0,Date,Description,Amount (EUR)
0,,>— Ukupan iznos,909.69
1,,27 Wolt Zagreb,-21.92
2,,27 Glovo 26APR ZAJI68RUD Zagreb,-21.44
3,2025-04-27,27 UBR* PENDING.UBER.COM Amsterdam,-9.4
4,2025-04-27,,4.9
