# Prediction of "Sachkonto" - Trainingsdata from SAP BSAK

Environment: "catboostEnv", see README.md

### Possible improvements
7. Unbalanced data-set: find some strategy to counter this
8. use CATBOOST instead of XGBoost - just for comparison
9. Optimization of model with "AutoML" / (Bayesian) Parameter-Search of some sort (possibly better than grid-search )
10. Consider to use Scikit columnTransformer and pipeline instead of doing everything manually: this would make the entire pipeline testable.
11. Split data preparation into separate notebook


# Imports

In [1]:
import pandas as pd
import numpy as np
from utils_bsak import printSamplesFromSaktos
from utils_bsak import is_date_column, is_decimal_column, convert_column_decimal2float, listMostlyNanColumns, listMostlyNullColumns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from utils_bsak import target_min_value_records

import joblib

# Load Data

In [None]:
path_data_folder = "../data_raw"

path_file_csv1 = r"\Export_bsak_0124-prctr.csv".replace("\\", "/")
path_file_csv2 = r"\Export_bsak_0224-prctr.csv".replace("\\", "/")
path_file_csv3 = r"\export_bsak_010123-311223-SaktoExclude.csv".replace("\\", "/")

path_files_csv = [path_file_csv1, path_file_csv2, path_file_csv3]

df = pd.DataFrame()
for path_file in path_files_csv:
    path_data = path_data_folder + path_file
    df_path = pd.read_csv(path_data, sep=';', encoding='latin1')
    df = pd.concat([df, df_path], ignore_index=True)

# Erste Zeilen anzeigen, um sicherzustellen, dass die Daten korrekt geladen wurden
print(df.head(2))


## Make Catboost Dataset

In [3]:
target = 'Sachkonto'
X = df.drop(columns=[target])
y = df[target]

In [None]:
# no datetime-columns:
df.dtypes.unique()

In [None]:
target = 'Sachkonto'
proceed = target in df.columns
#proceed = False

if proceed:
    # drop empty, i.e. nan-only rows:
    df = df.dropna(axis=0, how='all')

    # # drop empty, i.e. nan-only columns:
    # df = df.dropna(axis=1, how='all')

    # drop all columns with more than 95% null values:
    df=df.drop(columns=listMostlyNullColumns(df, fraction = 0.95))

    # drop all columns with more than 95% nan values:
    df=df.drop(columns=listMostlyNanColumns(df, fraction=0.95))

    # drop duplicate rows:
    df = df.drop_duplicates()

    # drop rows with target being nan:
    df = df.dropna(subset=[target], axis=0)

    # keep only rows with target not being empty string:
    df = df[df[target] != ""]

    # keep only columns with non-constant value:
    df = df.loc[:, df.nunique() > 1]

    # remove columns containing dates - no time-series analysis components in this notebook:
    # define a function to identify date columns:
    import re

    date_cols = [col for col in df.columns if is_date_column(df[col])]
    df = df.drop(columns=date_cols)

    # Identify decimal columns:
    decimal_cols = [col for col in df.columns if is_decimal_column(df[col]) and df[col].dtype != 'int64']

    # Convert decimal-colums to floats:
    for col in decimal_cols:
        df[col] = convert_column_decimal2float(df[col])

    # Label-encode all columns of type "object":
    from sklearn.preprocessing import LabelEncoder

    """ columns_to_encode = df.dtypes[df.dtypes == "object"].index.to_list()
    column_encoders = {}
    label_encoder = LabelEncoder()

    for col in columns_to_encode:
        label_encoder = LabelEncoder()
        df[col] = label_encoder.fit_transform(df[col].astype(str))
        # save encoding for each column
        column_encoders[col] = label_encoder """


    # drop columns that contain only identifiers (that we do not want to analyze further here):
    identifier_columns = ["Referenz", "Ausgleichsbeleg", "Zuordnung", "Belegnummer", "Auftrag", "Einkaufsbeleg", "Rechnungsbezug"]
    
    for id_col in identifier_columns:
        if id_col in df.columns:
            df = df.drop(id_col, axis='columns')
   
    # finally drop columns that contain data that we can not / do not want to analyze - drop this columns:
    unwanted_columns = ["Text", "Zuordnung", "Hauptbuchkonto"]
    nan_columns = list(df.columns[df.isna().sum() >0].values)
    unwanted_columns.extend(nan_columns)
    for u_col in unwanted_columns:
        if u_col in df.columns:
            df.drop(columns=[u_col], inplace=True)

print("Done!")


In [None]:
df.shape

In [None]:
df.isnull().sum().sum()

In [None]:
df.dtypes.unique()

In [None]:
df.select_dtypes(['O'])

## Stratified Train-Test Split:

In [None]:
# stratified split of data

target = 'Sachkonto'

# for stratification, all target classes have to have more than 1 record - we choose 4 as minimum here:
stratifiable_target_values = target_min_value_records(dataframe=df, target_column=target, min_value_records=4).astype(int)

print("--------------------------------")
print(f"df.shape with all targets: {df.shape}")
df = df[df[target].isin(stratifiable_target_values)]
print(f"df.shape with stratifiable targets: {df.shape}")
print("--------------------------------")

# prepare target encoding:
target_label_encoder = LabelEncoder()
y = df[target]
target_label_encoder.fit(y)

df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=y)

print("--------------------------------")
print(f"df_train.shape : {df_train.shape}")
print(f"df_test.shape : {df_test.shape}")
print("--------------------------------")

# split target from data:
X_train = df_train.drop(target, axis=1)
X_test = df_test.drop(target, axis=1)
y_train = target_label_encoder.transform(df_train[target].values)
y_test = target_label_encoder.transform(df_test[target].values)

print("--------------------------------")
print("y_train unique values:")
print(np.sort(np.unique(y_train)))
print("y_test unique values:")
print(np.sort(np.unique(y_test)))
print("--------------------------------")


## Train Catboost Model

In [None]:
from catboost import CatBoostClassifier, Pool


# Pool erstellen (für mehr Kontrolle)
train_pool = Pool(data=X_train, label=y_train, cat_features=["Belegart", "Soll/Haben-Kennz.", "Vorgangsart GL", "Referenzvorgang"])

model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='Logloss',
    eval_metric='AUC',
    verbose=100,
    early_stopping_rounds=50
)

model.fit(train_pool)

In [None]:
from catboost import CatBoostClassifier, Pool

# Beispiel: welche Spalten sind kategorisch?
cat_features = df.select_dtypes(['object']).columns.tolist()

# Optional: Datumsspalten verarbeiten (wenn du nicht CatBoost automatisch arbeiten lassen willst)
df['transaction_date'] = pd.to_datetime(df['transaction_date'])
df['year'] = df['transaction_date'].dt.year
df['month'] = df['transaction_date'].dt.month
df['weekday'] = df['transaction_date'].dt.weekday

# Target und Features
X = df.drop(columns=['target'])
y = df['target']

# Pool erstellen (für mehr Kontrolle)
train_pool = Pool(data=X, label=y, cat_features=cat_features)

## Use Boruta to find the most relevant features in the dataset

In [15]:
#Use Boruta after the split to avoid data leakage

# for notebook control:
apply_Boruta = False
#apply_Boruta = True

if(apply_Boruta):

    from boruta import BorutaPy
    from sklearn.ensemble import RandomForestClassifier

    # Define the Random Forest model
    rf_model = xgb.XGBClassifier(n_jobs=-1, verbosity=0)

    boruta_selector = BorutaPy(rf_model, n_estimators='auto', random_state=42)
    boruta_selector.fit(X_train.values, y_train)

    # Check the results
    selected_features = X_train.columns[boruta_selector.support_].tolist()
    print("--------------------------------")
    print("Selected Features:", selected_features)

    # Optional: Features that were rejected
    rejected_features = X_train.columns[~boruta_selector.support_].tolist()
    print("Rejected Features:", rejected_features)
    print("--------------------------------")

### All datasets combined Boruta selected_features (runtime: 21m 40s):
Selected Features: ['Buchungskreis', 'Lieferant', 'Position', 'WÃ¤hrung', 'Belegart', 'Buchungsperiode', 'Steuerkennzeichen', 'Betrag', 'Funktionale WÃ¤hrung', 'Zahlungsbedingung', 'Tage 1', 'Skontoprozentsatz 1', 'Skontobasis', 'Skontobetrag', 'Skontobetrag.1', 'Zahlweg', 'Zahlungssperre', 'Hausbank', 'Partnerbanktyp', 'Steuerkennzeichen.1', 'Steuerkennzeichen.2', 'HW-2-Betrag', 'Skontobetrag HW2', 'ReferenzschlÃ¼ssel 2', 'WÃ¤hrung Hauptbuch', 'Betrag Hauptbuch', 'Profitcenter', 'Position im Sender System', 'Währung', 'Funktionale Währung', 'Währung Hauptbuch']

# Train XGBoost Model

## Define a factory-function for the model

In [16]:
# define a factory-function for the model - to always work with the same type of model (... could be implemented as singleton...)
def create_Model():
    return xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

## Define Selected Features

Using the results from the Boruta-analysis - define different sets of columns to train on. <br>
Since this is a demonstrator I tried to use different features to prove flexibility. <br>
Which columns to use is dependent on the front-end application in which the model is going to be used. <br>

In [None]:

from sklearn.model_selection import train_test_split

# Using the results from the Boruta-analysis - define different sets of columns to train on.
# Since this is a demonstrator I tried to use different features to prove flexibility.
# Which columns to use is dependent on the front-end application in which the model is going to be used.

# all combined:
#all_combined =  ['Buchungskreis', 'Lieferant', 'Position'] # Accuracy (in %): 96.62 +/- 2.62 --- "Position" is weird!!!
all_combined =  ['Buchungskreis', 'Lieferant', 'Steuerkennzeichen']


## Train on Selected Features

In [None]:
# chose one of the feature selections:
final_seleced_features =  all_combined

X_train = X_train[final_seleced_features]
X_test = X_test[final_seleced_features]

# automatic feature selection from Boruta selected_features:
""" if selected_features and len(selected_features) > 3:
    X_train = X_train[selected_features[:3]]
    X_test = X_test[selected_features[:3]]
    print("--------------------------------")
    print(f"selected_features[:3]: {selected_features[:3]}")
    print("--------------------------------")
else:
    X_train = X_train[selected_features]
    X_test = X_test[selected_features]
    print("--------------------------------")
    print(f"selected_features: {selected_features}") 
    print("--------------------------------") """


print("--------------------------------")
print(f"X_train.shape: {X_train.shape}")
print(f"X_test.shape : {X_test.shape}")
print("--------------------------------")
print(f"X_train.head(3):{X_train.head(3)}")
print(f"X_test.head(3) : {X_test.head(3)}")
print("--------------------------------")

model = create_Model()
model.fit(X_train, y_train)


# Model Quality Assessment

## Simple Run Accuracy

In [None]:
# make predictions for test data
print("--------------------------------")
print(f"We have {y_test.shape[0]} rows of test-data.")
print("--------------------------------")

y_pred = model.predict(X_test)
""" 
print("--------------------------------")
print(y_pred[:10])
print("--------------------------------")
#predictions = [round(value) for value in y_pred] """

# evaluate predictions
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
#accuracy = accuracy_score(y_test, predictions)
print("--------------------------------")
print("Simple One-Run Accuracy: %.2f%%" % (accuracy * 100.0))
print("--------------------------------")

## Check Scikits classification_report:

In [None]:
from sklearn.metrics import classification_report

# Evaluate the model with precision, recall, and F1-score
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=target_label_encoder.classes_.astype(str)))
print("--------------------------------")

## Accuracy with Cross-Validation

In [None]:
import warnings

warnings.simplefilter('ignore', UserWarning)

from sklearn.model_selection import cross_val_score, KFold

X = df.drop(target, axis=1)
y_trans = target_label_encoder.fit_transform(y).astype(int)
print("----------- Check encoding ---------------------")
print(y_trans[:10])
print("--------------------------------")

results = cross_val_score(model, X, y_trans, cv=5)
print("--------------------------------")
print(f"Accuracy (in %): {results.mean() * 100:.2f} +/- {results.std() * 100:.2f}")
print("--------------------------------")

## Confusion Matrices

### Straight Confusion Matrix

In [None]:
from utils_bsak import plot_confusion_matrix

y_pred = model.predict(X_test)

plot_confusion_matrix(y_test=y_test, y_pred=y_pred, labels=target_label_encoder.classes_)

### Top-k Confusion Matrix

In [None]:
from utils_bsak import plot_top_k_confusion_matrix

k = 3
y_pred_prob = model.predict_proba(X_test)
plot_top_k_confusion_matrix(y_test=y_test, y_pred_prob=y_pred_prob, labels=target_label_encoder.classes_, top_k=k, show_off_top_k_info=False)

# Export to ONNX

In [None]:
import xgboost as xgb
from onnxmltools import convert_xgboost
from onnxmltools.convert.common.data_types import FloatTensorType

# rename the columns of X to make Onnx conversion possible:
X_old_columns = { f"f{i}" : col for i, col in enumerate(X_train.columns)}
X_train.columns = [f"f{i}" for i in range(X_train.shape[1])]

# DMatrix (i.e. dense-matrix) erstellen und enable_categorical setzen
dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)


# Assuming you have a trained XGBClassifier model
model = create_Model()
model.fit(X_train, y_train)

# Convert the model to ONNX format
initial_type = [('float_input', FloatTensorType([None, X_train.shape[1]]))]
onnx_model = convert_xgboost(model, initial_types=initial_type)

# Save the ONNX model to a file in D:\1000_DataScience_MachineLearning\1000_ML_Projects\1000_Github_ML_2\projects_planned\Psinova_Examples\saktoBsik_private :

onnx_model_name = "model_" + "Sachkonto_stratified_All3" + ".onnx"

with open(onnx_model_name, "wb") as f:
    f.write(onnx_model.SerializeToString())

print("------------------------------------------------------------------------")
print(f"model name: {onnx_model_name}")
print(f"Sachkonten Codierung für das Model : {target_label_encoder.classes_}")
print("------------------------------------------------------------------------")



In [None]:
# Store data and LabelEncoder for later use:
import joblib, onnxmltools, xgboost

data_folder = "data_preprocessed/"
file_joblib_dump = 'OnnxParams_Sachkonto_stratified_All3.pkl'

onnx_parameters = {
    "onnx_model_name" : onnx_model_name,
    "trained_features" : final_seleced_features,
    "xgboost_version" : xgboost.__version__,
    "onnxmltools_version" : onnxmltools.__version__,
}

joblib.dump(onnx_parameters, data_folder + file_joblib_dump)

In [None]:
import xgboost
print(xgboost.__version__)

#Answer:
# 1.4.2

In [None]:
import onnxmltools
print(onnxmltools.__version__)

# Answer:
# 1.7.0

Working combination of versions of xgboost and onnxmltools:

xgboost : 1.4.2 <br>
onnxmltools : 1.7.0

# TO DO : 
+ explain Classification Report
+ deal with imbalance

#### Note: Strategies to Address Imbalance 

+ Cost-Sensitive Learning: Assign higher misclassification costs to the minority class, encouraging the model to consider it more seriously.
+ Alternative Splitting Criteria: Use metrics like Hellinger distance (?) instead of traditional ones like information gain, as it better handles skewed distributions (why?).
+ Sampling Techniques: Balance the dataset by oversampling the minority class or undersampling the majority class, or by using wrapper frameworks that combine sampling with the splitting metric.
+ Adjusted Evaluation Metrics: Accuracy alone is misleading in imbalanced settings. Instead, prioritize metrics like precision, recall, and F1-score to assess the model’s performance on the minority class more accurately.