In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import tempfile
from abc import abstractmethod
from logging import LoggerAdapter
from pathlib import Path
from typing import ClassVar

import numpy as np
from kink import inject
from sklearn.base import ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

import mlflow
from bunq_ynab_connect.classification.budget_category_encoder import (
    BudgetCategoryEncoder,
)
from bunq_ynab_connect.classification.experiments.base_payment_classification_experiment import (  # noqa: E501
    BasePaymentClassificationExperiment,
)
from bunq_ynab_connect.classification.feature_extractor_old import FeatureExtractor
from bunq_ynab_connect.classification.feature_store import FeatureStore
from bunq_ynab_connect.data.storage.abstract_storage import AbstractStorage
from bunq_ynab_connect.data.storage.mongo_storage import MongoStorage
from bunq_ynab_connect.models.matched_transaction import MatchedTransaction
from mlflow.client import MlflowClient
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
from math import ceil

In [None]:
# Shared notebook state used by the helper functions defined below.
storage = MongoStorage()
label_encoder = BudgetCategoryEncoder()
# NOTE(review): "todo" looks like a placeholder — set this to the id of the
# YNAB budget whose matched transactions should be loaded.
budget_id = "todo"
feature_store = FeatureStore()

# Cross-validation folds and the seed shared by every stochastic component.
N_FOLDS = 3
RANDOM_STATE = 42

# Candidate classifiers compared by the experiment. `ClassVar` is only valid
# inside a class body, so a plain list annotation is used at module level.
# Stochastic estimators are seeded so repeated runs are comparable.
CLASSIFIERS: list[ClassifierMixin] = [
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    # GradientBoostingClassifier(),
    GaussianNB(),
    MLPClassifier(max_iter=1000, random_state=RANDOM_STATE),
    # ExplainableBoostingClassifier(),
]

# Load x, y

In [None]:
# NOTE(review): presumably refreshes the feature store before any features are
# extracted — confirm what FeatureStore.update() does and how long it takes.
feature_store.update()

In [None]:
def load_data() -> list[MatchedTransaction]:
    """Fetch the matched transactions of the configured budget.

    Queries the "matched_transactions" collection of the notebook-level
    ``storage`` for rows whose ``ynab_transaction.budget_id`` equals the
    notebook-level ``budget_id`` and converts them to entities.

    Returns
    -------
        List of MatchedTransaction entities

    """
    budget_filter = ("ynab_transaction.budget_id", "eq", budget_id)
    rows = storage.find("matched_transactions", [budget_filter])
    return storage.rows_to_entities(rows, MatchedTransaction)


def transactions_to_xy(
    transactions: list[MatchedTransaction],
    min_category_count: int = 5,
    n_top_categories: int = 2,
) -> tuple[np.ndarray, np.ndarray]:
    """Convert a list of MatchedTransactions to X and y.

    The labels are encoded with the notebook-level ``label_encoder``, which is
    re-fitted on every call (``fit_transform``). Afterwards two filters are
    applied in order:

    1. rows whose category occurs ``min_category_count`` times or fewer are
       dropped;
    2. rows whose category is not among the ``n_top_categories`` most frequent
       categories (ranked on the unfiltered labels) are dropped.

    Parameters
    ----------
    transactions
        Matched transactions to convert.
    min_category_count
        Categories with this many samples or fewer are removed.
    n_top_categories
        Number of most frequent categories to keep.

    Returns
    -------
        X: Array with one ``model_dump()`` dict per bunq payment
        y: Array of encoded categories, aligned with X

    """
    X = np.array([t.bunq_payment.model_dump() for t in transactions])  # noqa: N806
    y = np.array([t.ynab_transaction.model_dump() for t in transactions])
    y = label_encoder.fit_transform(y)

    # Frequency of every encoded category, computed before any filtering so the
    # "top" ranking and the rarity threshold refer to the same counts.
    category_counts = pd.Series(y).value_counts()
    top_categories = category_counts.nlargest(n_top_categories).index

    # Categories that occur `min_category_count` times or fewer
    categories_to_drop = category_counts[category_counts <= min_category_count].index

    # Positions in y that hold one of the rare categories
    indexes_to_drop = np.where(np.isin(y, categories_to_drop))[0]

    # Drop these positions from X and y
    X_filtered = np.delete(X, indexes_to_drop, axis=0)  # noqa: N806
    y_filtered = np.delete(y, indexes_to_drop, axis=0)

    print(f"Dropped {len(indexes_to_drop)} items from X and y")

    # Drop everything that is not one of the most frequent categories
    indexes_to_drop = np.where(np.isin(y_filtered, top_categories, invert=True))[0]
    X_filtered = np.delete(X_filtered, indexes_to_drop, axis=0)  # noqa: N806
    y_filtered = np.delete(y_filtered, indexes_to_drop, axis=0)

    print(f"Dropped {len(indexes_to_drop)} items from X and y")

    return X_filtered, y_filtered


def run() -> None:
    """Run the experiment.

    - Load data; return early when no transactions were found
    - Enable autolog
        Skip logging of models, because this takes a lot of space
    - Start the parent MLflow run, tag it with the budget id and call _run
    """
    transactions = load_data()
    experiment_name = get_experiment_name()
    if not len(transactions):
        # print() does not interpolate logging-style "%s" arguments, so the
        # messages are built with f-strings.
        print(f"Skipping experiment {experiment_name}, because no dataset was found")
        return
    X, y = transactions_to_xy(transactions)  # noqa: N806
    print(f"Running experiment {experiment_name}")
    print(f"Dataset has size {len(transactions)}")
    mlflow.set_experiment(experiment_name)
    mlflow.sklearn.autolog(log_models=False)
    # The run object itself is not needed; binding it as `run` would also
    # shadow this function's own name.
    with mlflow.start_run():
        mlflow.set_tag("budget", budget_id)
        _run(X, y)


def get_sample_values(y):
    """Build the per-class target sizes for undersampling.

    The threshold is the 75th percentile of the *distinct* class frequencies
    found in ``y``, rounded up. Every class that occurs more often than the
    threshold is mapped to the threshold; all other classes are left out.

    Returns
    -------
        Dict mapping each over-represented class to its target sample count

    """
    per_category = pd.Series(y).value_counts()

    # Each frequency counts once, no matter how many classes share it
    distinct_frequencies = per_category.drop_duplicates()
    threshold = ceil(distinct_frequencies.quantile(0.75))

    # Classes that are larger than the threshold
    oversized = per_category[per_category > threshold].index

    print(f"Categories to undersample to {threshold}: {oversized}")

    return dict.fromkeys(oversized, threshold)


def create_pipeline(
    classifier: ClassifierMixin
) -> Pipeline:
    """Build the imblearn pipeline around the given classifier.

    Steps, in order: feature extraction, random undersampling of the classes
    selected by ``get_sample_values``, SMOTE oversampling with 3 neighbours,
    and finally the classifier itself. Both samplers are seeded with the
    notebook-level ``RANDOM_STATE`` so repeated runs resample identically.

    Parameters
    ----------
    classifier
        Estimator used as the final pipeline step.

    Returns
    -------
        Unfitted imblearn Pipeline

    """
    # NOTE(review): kept as a local import — a later notebook cell rebinds the
    # global name `FeatureExtractor` to the class from `feature_extractor`, so
    # importing here pins the pipeline to `feature_extractor_old` regardless
    # of which cells have been executed.
    from bunq_ynab_connect.classification.feature_extractor_old import FeatureExtractor
    feature_extractor = FeatureExtractor()
    return Pipeline(
        [
            ("feature_extractor", feature_extractor),
            (
                "undersample",
                RandomUnderSampler(
                    sampling_strategy=get_sample_values,
                    random_state=RANDOM_STATE,
                ),
            ),
            ("oversample", SMOTE(k_neighbors=3, random_state=RANDOM_STATE)),
            ("classifier", classifier),
        ]
    )


def get_experiment_name() -> str:
    """Return the name of the MLflow experiment used by this notebook."""
    experiment_name = "NOTEBOOK TESTING"
    return experiment_name


def _run(X: np.ndarray, y: np.ndarray) -> None:  # noqa: N803
    """Evaluate every classifier in CLASSIFIERS in its own nested MLflow run.

    Each nested run is named after the classifier's class. An MLflow run is
    expected to be active already, as set up by ``run``.

    The ``@abstractmethod`` decorator was dropped: it only has meaning on
    methods of an ABC, and this is a concrete module-level function.
    """
    for classifier in CLASSIFIERS:
        with mlflow.start_run(run_name=classifier.__class__.__name__, nested=True):
            run_classifier(classifier, X, y)


def run_classifier(
    model: ClassifierMixin,
    X: np.ndarray,  # noqa: N803
    y: np.ndarray,
) -> float:
    """Run the experiment for a single classifier.

    - Create the pipeline
    - Use stratified K-fold cross-validation, scored with Cohen's kappa
    - Log the per-fold scores and their mean
    - Fit the pipeline on all data and log it as the model

    Parameters
    ----------
    model
        Classifier placed at the end of the pipeline.
    X
        Samples, as returned by ``transactions_to_xy``.
    y
        Encoded labels aligned with ``X``.

    Returns
    -------
        Mean Cohen's kappa over the folds

    """
    mlflow.set_tag("classifier", model.__class__.__name__)
    classifier = create_pipeline(model)
    k_fold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    scores = cross_val_score(
        classifier,
        X,
        y,
        cv=k_fold,
        n_jobs=-1,
        scoring=make_scorer(cohen_kappa_score),
    )
    mlflow.log_text(str(scores), "scores.txt")
    # Plain float to match the annotated return type
    avg_score = float(np.mean(scores))
    mlflow.log_metric("cohen_kappa", avg_score)
    # cross_val_score fits clones of the pipeline, so `classifier` itself is
    # still unfitted here. Fit it before logging, otherwise the stored model
    # cannot predict.
    classifier.fit(X, y)
    mlflow.sklearn.log_model(classifier, "model")
    return avg_score

In [None]:
# Load the matched transactions, then turn them into samples and labels
matched_transactions = load_data()
X, y = transactions_to_xy(matched_transactions)

In [None]:
# Display the encoded labels returned by transactions_to_xy
y

In [None]:
X, y = transactions_to_xy(load_data())

# How often each encoded category occurs in y
category_counts = pd.Series(y).value_counts()

# Categories occurring at most 3 times, and the positions in y holding them
categories_to_drop = category_counts.loc[category_counts <= 3].index
indexes_to_drop = np.flatnonzero(np.isin(y, categories_to_drop))

# Remove those positions from both arrays
X_filtered = np.delete(X, indexes_to_drop, axis=0)
y_filtered = np.delete(y, indexes_to_drop, axis=0)

# 'count' holds each distinct frequency, 'occurrences' how many categories share it
category_counts_counts = category_counts.rename("n").value_counts().reset_index()
category_counts_counts.columns = ['count', 'occurrences']
percentile_75_value = ceil(category_counts_counts['count'].quantile(0.75))

# Categories that occur more often than the rounded-up 75th percentile
categories_to_undersample = category_counts.loc[
    category_counts > percentile_75_value
].index

print(f"Dropped {len(indexes_to_drop)} items from X and y")
print(f"Categories to undersample to {percentile_75_value}: {categories_to_undersample}")

In [None]:
# https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
# https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
X_extracted = FeatureExtractor().fit_transform(X_filtered)

# undersample: cap every over-represented category at the percentile value.
# Seeded so that re-running the cell selects the same rows.
X_resampled, y_resampled = RandomUnderSampler(
    sampling_strategy={
        category: percentile_75_value for category in categories_to_undersample
    },
    random_state=RANDOM_STATE,
).fit_resample(X_extracted, y_filtered)

# oversample: synthesize minority samples with SMOTE (seeded for reproducibility)
X_resampled, y_resampled = SMOTE(
    k_neighbors=3, random_state=RANDOM_STATE
).fit_resample(X_resampled, y_resampled)

In [None]:
# Raw rows (not yet converted to entities) for the configured budget
budget_filter = ("ynab_transaction.budget_id", "eq", budget_id)
transactions = storage.find("matched_transactions", [budget_filter])

In [None]:
# Materialize the query result into a list
ynab = list(transactions)

In [None]:
# NOTE(review): this import rebinds the notebook-global name `FeatureExtractor`,
# which the import cell bound to the class from `feature_extractor_old`. Cells
# executed after this one that use the bare name get the class from
# `feature_extractor` instead — confirm that is intended.
from bunq_ynab_connect.classification.feature_extractor import FeatureExtractor
test = FeatureExtractor()

# Fit the extractor on the samples currently bound to X
test.fit(X)

In [None]:
from sklearn.model_selection import cross_validate, train_test_split

from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    cohen_kappa_score,
    f1_score,
)

mlflow.set_experiment("Testing with sampling")
with mlflow.start_run():
    X, y = transactions_to_xy(load_data())

    # Hold out 20% for a final test; seeded with the notebook-wide constant
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE
    )

    model = RandomForestClassifier(random_state=RANDOM_STATE)
    mlflow.set_tag("classifier", model.__class__.__name__)
    classifier = create_pipeline(model)
    k_fold = StratifiedKFold(
        n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE
    )
    # Logged metric name -> scorer. The plain "f1" scorer is binary with
    # pos_label=1, which does not fit arbitrary encoded labels and disagrees
    # with the macro-averaged F1 reported on the test split below.
    metrs = {"accuracy": "accuracy", "f1": "f1_macro"}
    scores = cross_validate(
        classifier,
        X_train,
        y_train,
        cv=k_fold,
        n_jobs=-1,
        scoring=metrs,
    )
    # cross_validate fits clones, so fit the pipeline itself before predicting
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    for metric in metrs:
        mlflow.log_metric(f"{metric}_validate", np.mean(scores[f"test_{metric}"]))
    mlflow.log_metric("accuracy_test", accuracy_score(y_test, y_pred))
    mlflow.log_metric("f1_test", f1_score(y_test, y_pred, average="macro"))
    mlflow.log_metric("cohen_kappa_test", cohen_kappa_score(y_test, y_pred))
    mlflow.log_metric("balanced_accuracy_test", balanced_accuracy_score(y_test, y_pred))
    mlflow.sklearn.log_model(classifier, "model")

In [None]:
# Samples per category after resampling, as a two-column frame
category_counts_df = (
    pd.Series(y_resampled)
    .value_counts()
    .rename_axis("category")
    .reset_index(name="cat_count")
)

category_counts_df

In [None]:
# Total number of labels after under- and oversampling
len(y_resampled)

In [None]:
# NOTE(review): `FeatureExtractor` here is whichever class was bound last in
# this kernel (the import cell uses `feature_extractor_old`, a later cell
# re-imports it from `feature_extractor`). `test` is also rebound here from an
# extractor instance to the transformed output.
test = FeatureExtractor().fit_transform(X)

In [None]:
# Display the encoded labels currently bound to y
y

In [None]:
# Same query and entity conversion as load_data(), so reuse the helper instead
# of repeating its body.
transactions = load_data()

In [None]:
import pandas as pd


# One model_dump() dict per YNAB transaction, then a frame with one row each
y_trans = [matched.ynab_transaction.model_dump() for matched in transactions]
y_trans_pd = pd.DataFrame(data=y_trans)

In [None]:
import matplotlib.pyplot as plt

# Number of transactions per category name
category_counts = y_trans_pd['category_name'].value_counts()

# Bar chart drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(12, 6))
category_counts.plot(kind='bar', ax=ax)
ax.set_xlabel('Category')
ax.set_ylabel('Count')
ax.set_title('Counts for Each Category')
plt.show()

In [None]:
# NOTE(review): `count_of_counts` is only assigned in the cell that follows, so
# on a fresh kernel (Restart & Run All) this cell raises NameError. Move it
# below that cell or remove it.
count_of_counts

In [None]:
# Transactions per category name
category_counts_df = y_trans_pd['category_name'].value_counts().reset_index()
# rename count col
category_counts_df.columns = ['category', 'cat_count']

# 'count' holds each distinct category size, 'occurrences' how many categories
# have that size
count_of_counts = category_counts_df['cat_count'].value_counts().reset_index()
count_of_counts.columns = ['count', 'occurrences']
# sort_values returns a new frame; assign it so the ordering is actually kept
count_of_counts = count_of_counts.sort_values('count')
# 75th percentile over the distinct category sizes (not rounded in this cell)
percentile_75_value = count_of_counts['count'].quantile(0.75)
percentile_75_value