In [1]:
import csv
import pandas as pd

import numpy as np

# Load the transaction export; fields in the file are separated by semicolons.
df = pd.read_csv("transactions.csv", delimiter=";")

def numberfy(x):
    """Convert an amount cell into a non-negative whole number.

    Accepts either a string using a decimal comma (e.g. ``"-12,50"``,
    optionally with ``.`` as thousands separator such as ``"1.234,56"``),
    a string using a decimal point, or an already-numeric value.

    Parameters
    ----------
    x : str | int | float | None
        Raw cell value.

    Returns
    -------
    int | float
        ``abs(int(value))`` -- the sign is dropped and the fractional part
        is truncated. Missing input (``None``, NaN, blank string) is returned
        as ``float("nan")`` so that a downstream imputer can fill it instead
        of this function raising.

    Raises
    ------
    ValueError
        If a non-blank string cannot be parsed as a number.
    """
    missing = float("nan")

    if x is None:
        return missing

    if isinstance(x, str):
        text = x.strip()
        if not text:
            return missing
        if "," in text:
            # With a decimal comma present, any dots can only be thousands
            # separators ("1.234,56"); drop them so float() can parse.
            text = text.replace(".", "")
        value = float(text.replace(",", "."))
    else:
        # Cell is already numeric (e.g. the column held no commas, or NaN).
        value = float(x)

    # NaN is the only value that is not equal to itself; int(nan) would raise.
    if value != value:
        return missing

    return abs(int(value))

# Replace the raw "Betrag" cells with their numberfy() result, element by element.
df["Betrag"] = df["Betrag"].apply(numberfy)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer, OneHotEncoder

# Feature columns handed to the model; the target is the 'label' column.
feature_columns = ['Verwendungszweck', "Betrag", "Buchungstext"]

# Fixed random_state keeps the train/test partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns],
    df['label'],
    random_state=1,
)

# Report how many rows ended up in each partition.
print(f'Number of rows in the total set: {len(df)}')
print(f'Number of rows in the training set: {len(X_train)}')
print(f'Number of rows in the test set: {len(X_test)}')


# Text features: token counts first, then TF-IDF weighting of those counts.
text_transformer = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
])

# Amount feature: fill missing values with the median, then cut the value
# range into 3 equal-width bins encoded as ordinal bin indices.
# NOTE(review): the step name "dicretizer" is misspelled; it is kept as-is
# because step names are part of the pipeline's parameter paths.
betrag_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ("dicretizer", KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')),
])


# Categorical feature: missing entries become the literal category 'missing',
# then one-hot encode; categories unseen during fit are ignored at transform.
category_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

Number of rows in the total set: 209
Number of rows in the training set: 156
Number of rows in the test set: 53


In [3]:
from sklearn.compose import ColumnTransformer

# Route each input column to its own sub-pipeline.
# "Verwendungszweck" is selected with a plain string (not a list) so the text
# vectorizer receives a 1-D sequence of documents; the other two are selected
# as single-column 2-D frames.
preprocessor = ColumnTransformer([
    ('num', betrag_transformer, ["Betrag"]),
    ('text', text_transformer, "Verwendungszweck"),
    ('categories', category_transformer, ["Buchungstext"]),
])


In [4]:
from sklearn.naive_bayes import MultinomialNB

# Full model: column-wise preprocessing followed by a multinomial naive Bayes
# classifier.
# NOTE(review): the variable is named `rf`, but the estimator is MultinomialNB.
rf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', MultinomialNB()),
])



In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Train on the training partition, then score on the held-out test partition.
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)

print('Accuracy score: ', accuracy_score(y_test, predictions))

# The remaining metrics are averaged across classes, weighted by class support.
weighted_metrics = (
    ('Precision score(weighted): ', precision_score),
    ('Recall score(weighted): ', recall_score),
    ('F1 score(weighted): ', f1_score),
)
for label, metric in weighted_metrics:
    print(label, metric(y_test, predictions, average='weighted'))

Accuracy score:  0.9056603773584906
Precision score(weighted):  0.9318658280922432
Recall score(weighted):  0.9056603773584906
F1 score(weighted):  0.9041520910054082
