In [None]:
from pathlib import Path
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
import unidecode
import pandas as pd
import logging

# Module-level logger; the scraping cell reports rows it cannot unpack through it.
logger = logging.getLogger(__name__)

In [None]:
# Directory whose entries are read as saved HTML pages by the scraping cell.
pages_path = Path("./pages")

In [None]:
# Scrape every saved page in `pages_path` into `transactions`: a list with one
# dict of raw (text) fields per operation row of the page's transaction table.
transactions = []

# Sorted so the page order is reproducible: glob() yields entries in arbitrary
# filesystem order, and the row order later becomes the transaction index.
# Directories (e.g. editor/checkpoint folders) are not pages and are ignored.
page_files = sorted(entry for entry in pages_path.glob("*") if entry.is_file())

pbar = tqdm(page_files)
for p in pbar:
    pbar.set_description(p.as_posix(), refresh=True)
    # NOTE(review): decoded with the platform default encoding, exactly as
    # before — confirm the saved pages use it, or pass encoding=... here.
    page_str = p.read_text()

    soup = BeautifulSoup(page_str, 'lxml')

    table = soup.find("table", {"id": 'transaction-table'})
    table_body = table.find('tbody') if table is not None else None
    if table_body is None:
        # Without this guard a stray or changed page aborts the whole run
        # with an AttributeError on None; report it and carry on instead.
        logger.error("%s: no <tbody> in table#transaction-table, page skipped", p)
        continue

    rows = table_body.find_all('tr')
    for row in tqdm(rows, leave=False):
        # Only rows that open an `operation in operations` repeat are parsed.
        if row.get("ng-repeat-start") != 'operation in operations':
            continue

        cols = [ele.text.strip() for ele in row.find_all('td')]

        try:
            # A parsed row must have exactly nine cells; two are not used.
            _, data_transakcji, data_ksiegowania, odbiorca_nadawca, tytul, kwota, _, saldo, podkategoria = cols
        except ValueError as e:
            # Skip the row. Falling through would either raise NameError (first
            # row) or re-append the previous row's values as a duplicate.
            logger.error("%s: skipping row with unexpected cell count (%s): %r", p, e, cols)
            continue

        transactions.append({
            "data_transakcji": data_transakcji,
            "data_ksiegowania": data_ksiegowania,
            "odbiorca_nadawca": odbiorca_nadawca,
            "tytul": tytul,
            # unidecode transliterates the amount text to plain ASCII.
            "kwota": unidecode.unidecode(kwota),
            "saldo": unidecode.unidecode(saldo),
            "podkategoria": podkategoria,
        })



In [None]:
# Sanity check: how many transaction records were collected from all pages.
len(transactions)

In [None]:
# One row per scraped transaction; every column still holds raw text here.
df = pd.DataFrame(transactions)
df

In [None]:
# Inspect the column dtypes before any conversion is applied.
df.dtypes

In [None]:
def _pln_text_to_number_text(raw: str) -> str:
    """Turn an amount such as '1 234,56 PLN' into the float-parsable '1234.56'.

    Keeps the text before ' PLN', removes spaces and replaces ',' with '.'.
    """
    return raw.split(" PLN")[0].replace(" ", "").replace(",", ".")


# Typed copy of `df`: amounts as floats, dates as datetimes, index named
# "numer_transakcji". `df` itself is left untouched.
saldo_text = df["saldo"].map(_pln_text_to_number_text)

df_transformed = df.assign(
    # A cleaned balance of "-.--" is a placeholder and becomes NaN.
    # mask() is used instead of .replace("-.--", None): pandas < 1.4 ignores
    # an explicit None there and forward-fills the previous row's balance.
    saldo=saldo_text.mask(saldo_text == "-.--").astype(float),
    kwota=df["kwota"].map(_pln_text_to_number_text).astype(float),
    # NOTE(review): the date format is inferred by pandas; if the pages use a
    # day-first layout, pass format=/dayfirst= explicitly — confirm on real data.
    data_transakcji=pd.to_datetime(df["data_transakcji"]),
    data_ksiegowania=pd.to_datetime(df["data_ksiegowania"]),
).rename_axis("numer_transakcji")
df_transformed

In [None]:
# Persist the typed transactions; the index is written as the first CSV column.
df_transformed.to_csv("wydatki.csv")

In [None]:
# Lookup table from kategorie.csv; it is joined to the transactions on the
# "podkategoria" column, so that column must be present in the file.
df_kategorie = pd.read_csv("kategorie.csv")
df_kategorie

In [None]:
# Attach the kategorie.csv columns to every transaction.
# The default inner join silently dropped transactions whose subcategory is
# missing from kategorie.csv; a left join keeps them, with NaN category columns.
unmapped = sorted(
    set(df_transformed["podkategoria"]) - set(df_kategorie["podkategoria"]),
    key=str,
)
if unmapped:
    logger.warning("No entry in kategorie.csv for subcategories: %s", unmapped)

df_merged = (
    df_transformed
    .rename_axis("numer_transakcji")
    .reset_index()  # merge() discards the index, so carry it through as a column
    .merge(df_kategorie, on="podkategoria", how="left")
    .set_index("numer_transakcji")
)
df_merged

In [None]:
# Persist the category-enriched transactions; the index is written as the first CSV column.
df_merged.to_csv("wydatki_kategorie.csv")