# 💳 OCR Transaction Extractor - April 2025

This notebook extracts transactions from bank statement screenshots using Tesseract OCR and saves them to a CSV.

In [None]:
# 📦 Install dependencies
!pip install pytesseract opencv-python pandas numpy

In [None]:
import cv2
import pytesseract
import pandas as pd
import re
from datetime import datetime

In [None]:
# Sanity check: show which Tesseract version the shell finds on PATH.
!tesseract --version

### 🔧 Mapping of Croatian month abbreviations to month numbers

In [None]:
# Croatian three-letter month abbreviations -> month number (1-12).
month_map = {
    "SIJ": 1, "VEL": 2, "OŽU": 3, "TRA": 4, "SVI": 5,
    "LIP": 6, "SRP": 7, "KOL": 8, "RUJ": 9, "LIS": 10,
    "STU": 11, "PRO": 12,
    # OCR run without Croatian language data tends to read "Ž" as "Z".
    # Without this alias "OZU" would miss the map and be treated as the
    # parser's fallback month instead of March.
    "OZU": 3,
}

In [None]:
# Statement year: the date headers parsed from the OCR text carry only a day
# and a month abbreviation, so the year has to be supplied here.
year = 2025

### 📥 OCR + Regex Parser Function

In [None]:
def extract_transactions_from_image(image_path):
    """Extract dated EUR transactions from a bank-statement screenshot via OCR.

    The image is converted to grayscale and passed to Tesseract. The OCR text
    is then scanned line by line:

    * a line starting with ``<day> <3-letter month>`` (e.g. ``14 TRA``) sets
      the date applied to the transactions that follow;
    * a line containing ``<amount> EUR`` yields one transaction whose
      description is the line directly above it.

    Relies on the notebook-level ``month_map`` and ``year`` variables.

    Args:
        image_path: Path of the screenshot to read.

    Returns:
        pandas.DataFrame with the columns ``Date`` (``YYYY-MM-DD`` string, or
        ``""`` when no valid date header precedes the transaction),
        ``Description`` and ``Amount (EUR)`` (float). The columns are present
        even when no transaction was recognised.

    Raises:
        FileNotFoundError: If OpenCV cannot load ``image_path`` (missing file
            or unsupported/corrupt image).
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with a clear message rather than inside cvtColor.
        raise FileNotFoundError(
            f"Could not read image (missing or unsupported file): {image_path!r}"
        )
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    lines = pytesseract.image_to_string(gray).split('\n')

    # Compiled once, outside the loop.
    date_re = re.compile(r'(\d{1,2})\s+([A-ZČŽŠ]{3})')
    # Accepts optional "." thousands groups, e.g. "-1.234,56 EUR"; without
    # them the amount would be truncated to "234,56".
    amount_re = re.compile(r'([-+]?\d+(?:\.\d{3})*,\d+)\s*EUR')

    columns = ['Date', 'Description', 'Amount (EUR)']
    data = []
    current_date = ""

    for i, line in enumerate(lines):
        date_match = date_re.match(line.strip())
        if date_match:
            day, month_abbr = date_match.groups()
            # Unknown abbreviations fall back to April (month 4).
            month = month_map.get(month_abbr.upper(), 4)
            try:
                current_date = datetime(year, month, int(day)).strftime('%Y-%m-%d')
            except ValueError:
                # OCR produced an impossible day (e.g. "00" or "31" in a
                # 30-day month). Do not abort the whole extraction; leave the
                # date visibly empty instead of reusing the previous one.
                current_date = ""
        elif "EUR" in line:
            amount_match = amount_re.search(line)
            # i > 0: a description line must exist above the amount line.
            if amount_match and i > 0:
                # "1.234,56" -> "1234.56": drop thousands dots, then turn the
                # decimal comma into a decimal point.
                amount = amount_match.group(1).replace('.', '').replace(',', '.')
                data.append({
                    'Date': current_date,
                    'Description': lines[i - 1].strip(),
                    'Amount (EUR)': float(amount)
                })

    # Explicit columns keep the schema (and the CSV header) stable even when
    # nothing was recognised.
    return pd.DataFrame(data, columns=columns)

### 🔍 Load an Image and Extract Transactions

In [None]:
# Run the extractor on one statement screenshot, save the result as CSV and
# preview the first rows. Edit the two paths below to process another file.
image_path = "data/sensitive/test_image.png"
csv_path = "data/sensitive/transactions.csv"

df = extract_transactions_from_image(image_path)
df.to_csv(csv_path, index=False)
df.head()