In [None]:
from dotenv import find_dotenv, load_dotenv
from kink import di
from bunq_ynab_connect.data.storage.abstract_storage import AbstractStorage
from bunq_ynab_connect.models.matched_transaction import MatchedTransaction
import numpy as np

import pandas as pd
from sklearn_pandas import DataFrameMapper
import os

# Load the variables from the .env file that find_dotenv() locates into the
# process environment; os.environ["BUDGET_ID_FOR_TESTING"] is read further down.
load_dotenv(find_dotenv())

In [21]:
# Budget whose matched transactions are loaded below; a missing
# BUDGET_ID_FOR_TESTING environment variable raises KeyError here.
budget_id = os.environ["BUDGET_ID_FOR_TESTING"]
# Storage implementation looked up in the kink dependency-injection container.
storage = di[AbstractStorage]
# NOTE(review): presumably the target column of the YNAB frame; it is not
# referenced by any of the visible cells - confirm it is still needed.
label_column = "category_id"

In [14]:
from sklearn.base import TransformerMixin
from sklearn.pipeline import FeatureUnion
from bunq_ynab_connect.classification.budget_category_encoder import (
    BudgetCategoryEncoder,
)
from feature_engine.datetime import DatetimeFeatures


def get_transactions() -> list[MatchedTransaction]:
    """Fetch the matched transactions stored for the configured budget.

    Queries the ``matched_transactions`` collection through the module-level
    ``storage`` for rows whose ``ynab_transaction.budget_id`` equals the
    module-level ``budget_id``, then converts the rows into
    ``MatchedTransaction`` entities.
    """
    budget_filter = ("ynab_transaction.budget_id", "eq", budget_id)
    rows = storage.find("matched_transactions", [budget_filter])
    return storage.rows_to_entities(rows, MatchedTransaction)


def transactions_to_xy(
    transactions: list[MatchedTransaction],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split matched transactions into a feature frame and a target frame.

    Each transaction contributes one row to both frames, in input order:
    ``X`` holds the ``model_dump()`` of its ``bunq_payment`` and ``y`` holds
    the ``model_dump()`` of its ``ynab_transaction``.

    Returns:
        A ``(X, y)`` tuple of DataFrames.
    """
    payment_rows = [matched.bunq_payment.model_dump() for matched in transactions]
    budget_rows = [matched.ynab_transaction.model_dump() for matched in transactions]
    payments_frame = pd.DataFrame(payment_rows)
    budget_frame = pd.DataFrame(budget_rows)
    return payments_frame, budget_frame


def encode_y(y: pd.DataFrame) -> np.ndarray:
    """Encode the category columns of the target frame.

    Fits a ``DataFrameMapper`` that hands the ``category_id`` and
    ``category_name`` columns of ``y`` (as a DataFrame) to a
    ``BudgetCategoryEncoder`` and returns the transformed result, with
    DataFrame output disabled (``df_out=False``).
    """
    category_columns = ["category_id", "category_name"]
    category_mapper = DataFrameMapper(
        [(category_columns, BudgetCategoryEncoder())],
        input_df=True,
        df_out=False,
    )
    return category_mapper.fit_transform(y)

def date_extractor() -> TransformerMixin:
    """Build an unfitted mapper that derives datetime features from ``created``.

    The ``created`` column is passed as a DataFrame to a feature_engine
    ``DatetimeFeatures`` transformer configured for that same column. The
    mapper uses the ``date_feature`` alias for the column and returns a
    DataFrame (``df_out=True``).
    """
    created_features = DatetimeFeatures(variables=["created"])
    created_spec = (["created"], created_features, {"alias": "date_feature"})
    return DataFrameMapper([created_spec], input_df=True, df_out=True)


def pipeline(X: pd.DataFrame) -> np.ndarray:
    """Fit the feature union on ``X`` and return the transformed features.

    The union currently has a single member, ``date``, built by
    ``date_extractor()``.
    """
    union = FeatureUnion([("date", date_extractor())])
    return union.fit_transform(X)


# Run the steps end to end: load the matched transactions for the budget,
# split them into bunq-payment rows (X) and YNAB-transaction rows (y),
# encode the category columns of y, and build the feature matrix from X.
transactions = get_transactions()
X, y = transactions_to_xy(transactions)
y_encoded = encode_y(y)
features = pipeline(X)