From 2d826cf9e6d96f8498411ea945494cabdd0dd7a8 Mon Sep 17 00:00:00 2001 From: Bizzaro Date: Fri, 15 Jul 2022 19:34:17 -0400 Subject: [PATCH] add fix for incorrect years for jan statements --- README.md | 3 ++- requirements.txt | 10 +++++++++- teller/model.py | 1 - teller/pdf_processor.py | 16 +++++++++++++--- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index e4a7257..40e8572 100644 --- a/README.md +++ b/README.md @@ -33,11 +33,12 @@ tl;dr - this parses PDFs with pdfplumber into text, then runs a bunch of regex o - Use a venv ``` -# linux +# macOS/linux python3 -m venv venv source venv/bin/activate # windows +python3 -m venv venv source venv/Scripts/activate ``` diff --git a/requirements.txt b/requirements.txt index c180914..17d46ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,9 @@ -pdfplumber==0.5.23 \ No newline at end of file +chardet==5.0.0 +pdfminer.six==20200517 +pdfplumber==0.5.23 +Pillow==9.2.0 +pycryptodome==3.15.0 +python-dateutil==2.8.2 +six==1.16.0 +sortedcontainers==2.4.0 +Wand==0.6.7 diff --git a/teller/model.py b/teller/model.py index 94cefb3..0451ab8 100644 --- a/teller/model.py +++ b/teller/model.py @@ -1,6 +1,5 @@ from enum import Enum - class AccountType(Enum): AMEX = 'AMEX' TD = 'TD' diff --git a/teller/pdf_processor.py b/teller/pdf_processor.py index 53bddff..b592624 100644 --- a/teller/pdf_processor.py +++ b/teller/pdf_processor.py @@ -3,6 +3,7 @@ from pathlib import Path from datetime import datetime, timedelta +from dateutil.relativedelta import relativedelta from teller.model import Transaction, AccountType TARGET_FI = 'BMO' @@ -15,7 +16,7 @@ 'txn': (r"^(?P(?:\w{3}(\.|)+ \d{1,2} ){2})" r"(?P.+)\s" r"(?P-?[\d,]+\.\d{2})(?P(\-|\s?CR))?"), - 'startyear': r'PERIOD COVERED BY THIS STATEMENT\n.+(?P-?\,.[0-9][0-9][0-9][0-9])', + 'startyear': r'PERIOD COVERED BY THIS STATEMENT\n\w+\.\s{1}\d+\,\s{1}(?P[0-9]{4})', 'openbal': r'Previous Balance.*(?P-?\$[\d,]+\.\d{2})(?P(\-|\s?CR))?', 'closingbal': r'(?:New) Balance,.* (?P-?\$[\d,]+\.\d{2})(?P(\-|\s?CR))?' }, @@ -81,6 +82,8 @@ def _parse_visa(pdf_path): opening_bal = _get_opening_bal(text, TARGET_FI) closing_bal = _get_closing_bal(text, TARGET_FI) # add_seconds = 0 + + endOfYearWarning = True # debugging transaction mapping - all 3 regex in 'txn' have to find a result in order for it to be considered a 'match' for match in re.finditer(regexes[TARGET_FI]['txn'], text, re.MULTILINE): @@ -90,13 +93,20 @@ def _parse_visa(pdf_path): date[0] = date[0].strip('.') # Aug. -> Aug date.append(str(year)) date = ' '.join(date) # ['Aug', '10', '2021'] -> Aug 10 2021 - + try: date = datetime.strptime(date, '%b %d %Y') # try Aug 10 2021 first except: # yes I know this is horrible, but this script runs once if you download your .csvs monthly, what do you want from me date = datetime.strptime(date, '%m %d %Y') # if it fails, 08 10 2021 - # checks credit balance regex + # need to account for current year (Jan) and previous year (Dec) in statements + endOfYearCheck = date.strftime("%m") + + if (endOfYearCheck == '12' and endOfYearWarning == False): + endOfYearWarning = True + if (endOfYearCheck == '01' and endOfYearWarning): + date = date + relativedelta(years = 1) + if (match_dict['cr']): print("Credit balance found in transaction: '%s'" % match_dict['amount']) amount = -float("-" + match_dict['amount'].replace('$', '').replace(',', ''))