In [93]:
import json
import re

with open('output/text_with_label.json', encoding='utf-8') as file:
    data = json.load(file)
    
texts = [item['text'] for item in data]

for text in texts[:1]:
    print(text)

Nasi	Campur	Baii	75,ooo
Bbk Bengil	Nasi	125,ooo
Milkshake	Starwb
Ice Lemon	Tea	24 , OOo
Nasi	Ayan	Dewata	70,ooo
Fr2s	Ice	Tea
Organic	Ereen	5a	65,Ooo
ice	Tea	16,Ooo
Ice Orange	29,Qoo
Ayan Suir	Pali
2	Tahu Goreng	36,OQo
36,0oo
2
Tahu Telor	Asin	40,Ooo.
70.Ooo
Nasi	Gcrang	Sanc
San	366.Ooo
3	Bbk Panggang
Ayan Samtzl	Hsja	52,Ooo
2	Hot Tea
Ice Kopi	32 ,Qoo
Tahu	Telor Asin
Free	Ice	Ted
Bebek	Street
Ice	Tez	Tawar
Sub-Total	1,346 , Qoo
Service	1oo,950
PBI	144,695
Rounding
1	591
Grand	Total


In [94]:
def clean_ocr_number(text):
    char_corrections = {
        'o': '0',
        'O': '0',
        'Q': '0',
        'q': '0',
        'D': '0',
        'l': '1',
        'I': '1',
        'i': '1',
        'Z': '2',
        'z': '2',
        'B': '8',
        'b': '8',
        'g': '9',
        'G': '9',
        's': '5',
        'S': '5',
        '': ''  
    }
    
    cleaned = ''
    for char in text:
        cleaned += char_corrections.get(char, char)
    
    return cleaned

In [None]:
def clean_amount(amount_str):
    allowed_chars = re.sub(r'[^0-9,oOQqlIiZzBbgGsS ]', '', amount_str)
    cleaned = clean_ocr_number(allowed_chars)
 
    try:
        if ',' in cleaned and '.' in cleaned:
            cleaned = cleaned.replace(',', '.')
        elif ',' in cleaned:
            cleaned = cleaned.replace(',', '')
        elif '.' in cleaned:
            cleaned = cleaned.replace('.', '')
        return float(cleaned)
    except ValueError:
        return None


In [96]:
def process_matches(patterns, text):
    results = []
    
    for pattern in patterns:
        matches = re.finditer(pattern, text)
        for match in matches:
            label = match.group(1)  
            raw_amount = match.group(2)
            
            cleaned_amount = clean_amount(raw_amount)
            if cleaned_amount:
                results.append({
                    'label': label,
                    'amount': cleaned_amount,
                    'raw_amount': raw_amount,
                    'cleaned_amount': clean_ocr_number(raw_amount),
                    'full_match': match.group(0)
                })
    return results

In [108]:
def extract_amounts(text):
    total_patterns = [
    r'(?i)(total|grand\s*-?\s*total|sub\s*-?\s*total|subtotal|total\s*bayar|jumlah)\s*:?\s*([0-9,.oOQqlIiZzBbgGsS\s]+)',
    ]   

    tax_patterns = [
    r'(?i)(tax|pajak|ppn|vat|service\s*tax|pajak\s*ppn|pbi)\s*:?\s*([0-9,.oOQqlIiZzBbgGsS\s]+)',  
    ]   


    total = process_matches(total_patterns, text)
    tax = process_matches(tax_patterns, text)
    
    return total, tax

In [98]:
def validate_amount(amount, min_value=200, max_value=10000000):
    if amount is None:
        return False
    return min_value <= amount <= max_value

In [None]:
def process_receipts(file_path):

    results = []
    
    for item in data:
        text = item['text']
        totals, taxes = extract_amounts(text)
        
        valid_totals = [t for t in totals if validate_amount(t['amount'])]

        final_total = max([t['amount'] for t in valid_totals]) if valid_totals else None

        valid_taxes = [t for t in taxes if validate_amount(t['amount'])]
        
        result = {
            'original_text': text,
            'found_totals': [{
                'label': t['label'],
                'amount': t['amount'],
                'raw_text': t['raw_amount'],
                'cleaned_text': t['cleaned_amount']
            } for t in valid_totals],
            'found_taxes': [{
                'label': t['label'],
                'amount': t['amount'],
                'raw_text': t['raw_amount'],
                'cleaned_text': t['cleaned_amount']
            } for t in valid_taxes],
            'predicted_total': final_total,
            'actual_total': item.get('total_harga')
        }
        results.append(result)
    
    return results

In [100]:
def save_results_to_json(results, output_file):
    with open(output_file, 'w') as json_file:
        json.dump(results, json_file, indent=4)

In [None]:
results = process_receipts(data)
output_file = 'receipt-.json'  
save_results_to_json(results, output_file)

In [110]:

def test_patterns():
    """
    Fungsi untuk testing pattern regex
    """
    test_cases = [
        "TOTAL 100, 000",
        "Total: 100,000",
        "total: 100,0o0",
        "ToTaL: 100,000",
        "GRAND TOTAL: 100,000",
        "Grand Total: 100,000",
        "SUB TOTAL: 100,000",
        "Sub-Total: 100,000",
        "SUBTOTAL: 100,000",
        "TAX: 10,000",
        "Tax: 10,000",
        "tax: 10,000",
        "PAJAK PPN 10 10,000",
        "Pajak ppn 10%: 10,000",
        "SERVICE TAX: 10,000",
        "Service Tax: 10,000",
        "toTaL 1.000.000"
    ]
    
    print("Testing patterns...")
    for test_text in test_cases:
        totals, taxes = extract_amounts(test_text)
        print(f"\nTesting: {test_text}")
        if totals:
            print("Found totals:", [t['amount'] for t in totals])
        if taxes:
            print("Found taxes:", [t['amount'] for t in taxes])

In [114]:
test_patterns()

Testing patterns...

Testing: TOTAL 100, 000

Testing: Total: 100,000
Found totals: [100000.0]

Testing: total: 100,0o0
Found totals: [100000.0]

Testing: ToTaL: 100,000
Found totals: [100000.0]

Testing: GRAND TOTAL: 100,000
Found totals: [100000.0]

Testing: Grand Total: 100,000
Found totals: [100000.0]

Testing: SUB TOTAL: 100,000
Found totals: [100000.0]

Testing: Sub-Total: 100,000
Found totals: [100000.0]

Testing: SUBTOTAL: 100,000
Found totals: [100000.0]

Testing: TAX: 10,000
Found taxes: [10000.0]

Testing: Tax: 10,000
Found taxes: [10000.0]

Testing: tax: 10,000
Found taxes: [10000.0]

Testing: PAJAK PPN 10 10,000

Testing: Pajak ppn 10%: 10,000
Found taxes: [10.0]

Testing: SERVICE TAX: 10,000
Found taxes: [10000.0]

Testing: Service Tax: 10,000
Found taxes: [10000.0]

Testing: toTaL 1.000.000
Found totals: [1000000.0]
