# 0. Dependencies

In [1]:
import os
# Work from the project root so the relative "./components/..." data path and the
# `components.scripts` imports below are found. Set TRANSACTIONS_ML_ROOT to point at
# another checkout instead of editing this cell; the fallback is the original location.
os.chdir(os.environ.get("TRANSACTIONS_ML_ROOT", "/Users/billydodds/Documents/Projects/Transactions_ML"))

import re
from typing import Dict, List, Any, Tuple, Callable, Union

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

from components.scripts.process_data import clean, clean_chop, get_corpora, get_lookup, get_webscrape, NLP_distances
from components.scripts.load_data import load_data


In [2]:

# Set path
path = "./components/private_files/"

# Load in labels and join to data
data = load_data(path)

# The first 50 rows play the role of unlabelled transactions. Take an explicit copy
# before blanking `category`, so the assignment is not made on a slice of `data`
# (which triggers SettingWithCopyWarning and has view-vs-copy ambiguity).
testing = data.iloc[0:50].copy()
testing["category"] = np.nan
training = data.iloc[50:]

# Seed the tree: scikit-learn permutes candidate features at every split, so an
# unseeded classifier can give different predictions from one run to the next.
model = DecisionTreeClassifier(random_state=0)


In [86]:
def add_to_corpora(corpora:Dict, row:pd.DataFrame):
    """Add the tokens of one labelled transaction to its category's token list.

    Args:
        corpora: Mapping of category label -> collection of tokens. Updated in place.
        row: DataFrame holding the transaction in its first row; the `category`
            and `desc_corpus` columns are read (`desc_corpus` is split on " ").

    Returns:
        The same `corpora` object. `corpora[category]` is a duplicate-free list with
        the previously known tokens first, followed by any new ones in the order
        they appear in `desc_corpus`.
    """
    record = row.iloc[0]
    category = record["category"]
    new_tokens = record["desc_corpus"].split(" ")

    # A category that has not been seen yet starts from an empty token list
    # instead of raising KeyError.
    known_tokens = corpora.get(category, [])

    # dict.fromkeys de-duplicates while keeping first-seen order, so the result is
    # deterministic (a set round-trip would shuffle tokens between runs).
    corpora[category] = list(dict.fromkeys([*known_tokens, *new_tokens]))
    return corpora
    


In [89]:
# NOTE(review): `first` is not assigned in any visible cell, so this cell only runs
# if the flag already exists in the kernel. It selects between the full first pass
# and the incremental pass - TODO define it explicitly before this cell.

# The most recently appended training row, used for the incremental corpora update
last_added = training.tail(1)

# Get Webscraping (first pass only; later passes use one NaN placeholder per test row)
google_preds = get_webscrape(testing, path) if first else [np.nan for _ in range(len(testing))]

# Join from lookup table
lookup_preds = get_lookup(training, testing)

# Train model: build the corpora from scratch on the first pass, otherwise extend the
# existing corpora with the last added row
corpora = get_corpora(training) if first else add_to_corpora(corpora, last_added)


# NLP_distances returns a frame that still carries `category`; split it into the
# target column and the feature matrix.
X_train = NLP_distances(training, corpora)
y_train = X_train["category"]
X_train = X_train.drop(columns="category")

X_test = NLP_distances(testing, corpora).drop(columns="category")

# Refit on the current training set and predict the held-out rows
model_preds = model.fit(X_train, y_train).predict(X_test)

# Keep the three prediction sources side by side on the test features
X_test = X_test.assign(
    model_preds=model_preds,
    google_preds=google_preds,
    lookup_preds=lookup_preds,
)

# Missing predictions are detected with pd.isna rather than by comparing str(x) to
# "nan": it also recognises None / pd.NA / NaT and cannot be fooled by a real label
# that happens to be spelled "nan".

# Prioritise the lookup values over the google values
corrections = [goog if pd.isna(look) else look for goog, look in zip(google_preds, lookup_preds)]

# Take correction if present, else go with model pick
X_test['pred_category'] = [mod if pd.isna(corr) else corr for corr, mod in zip(corrections, model_preds)]

# Don't bother reviewing a label that has been found through lookup or google
# (a correction exists exactly when at least one of the two sources gave a value)
X_test['certain'] = [not pd.isna(corr) for corr in corrections]

print(f"{sum(X_test.certain)} of {len(X_test.certain)} predictions were found without the model.\n")

# Merge categories back on raw data (index-on-index; the raw `category` column is
# dropped so the predicted one can take its name below)
X_test = data.merge(X_test, left_index=True, right_index=True, how="inner").drop("category", axis=1)

# Columns present in both frames come back with an `_x` suffix for the `data` side
X_test = X_test.rename(columns={'amount_x':'amount', 'pred_category':'category', 'weekday_x':'weekday'})
base_testing = X_test[['amount', 'description', 'category', 'date', 'weekday', 'desc_corpus', 'desc_features', 'certain']]


certain = base_testing[base_testing.certain].drop("certain", axis=1)
# NOTE(review): only the `amount` column is appended to transactions_labelled.csv
# here, while the commented-out code further down selects date/amount/description/
# category - confirm which columns the file is supposed to receive.
certain[['amount']].to_csv(path + "transactions_labelled.csv", mode = 'a', header = False, index=False)

# Append certainties to training for next iteration, redefine test set.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
training = pd.concat([training, certain]).sort_index()
testing = base_testing[~base_testing.certain].drop("certain", axis=1)


# certain = certain[["date", "amount", "description", "category"]]
# Add certain transactions to transactions_labelled.csv
# 



# X_test = X_test[["date", "amount_x", "description", "pred_category", "certain"]]

# # X_test.columns = ["date", "amount", "description", "pred_category", "certain"]
# # certain = X_test[X_test.certain].drop("certain", axis=1)
# # certain = certain[["date", "amount", "description", "pred_category"]]
# # Add certain transactions to transactions_labelled.csv
# # certain.to_csv(path + "transactions_labelled.csv", mode = 'a', header = False, index=False)



29 of 50 predictions were found without the model.

     amount                                        description category  \
0      50.0  direct credit 002962 citigroup ptyltd paddy walsh    wages   
1      -4.8           mcdonalds gladesville gladesville  nswau     food   
2      -4.0           mcdonalds gladesville gladesville  nswau     food   
3      50.0   transfer from margaret a jorgensen izzy tutoring    wages   
4     -14.5           chargrill mosman         mosman       au     food   
..      ...                                                ...      ...   
595   -36.0  liquorland 3638 glebe au aus card xx6725 value...    beers   
596   -42.2  uberuae_eats sydney au aus card xx6725 value d...     food   
597   -13.0  eastern suburbs dist bellevue hill ns aus card...    beers   
598    19.0  return uberuae_eats sydney au aus card xx6725 ...     food   
599    -5.0           mcdonaldscamperdownca camperdown   nswau     food   

          date  weekday                        

In [53]:
# Define instructions

# Menu number typed by the reviewer -> category label
classes = {1:'beers', 2:'food', 3:'life/wellbeing', 4:'shopping', 5:'transfer', 6:'transport', 7:'wages'}

# Prompt line: the "accept" hint, then one tab-terminated "[number] label" entry per
# class, closed with a newline
instr = (
    "\n[↵] accept\t"
    + "".join(f"[{number}] {label}\t" for number, label in classes.items())
    + "\n"
)

# Same rename as in the prediction cell, but the selection below leaves out `certain`
X_test = X_test.rename(
    {'amount_x': 'amount', 'pred_category': 'category', 'weekday_x': 'weekday'},
    axis="columns",
)
base_testing = X_test.loc[:, [
    'amount', 'description', 'category', 'date', 'weekday', 'desc_corpus', 'desc_features',
]]


In [None]:
def get_correction(uncertain, instr):
    """Show the last uncertain transaction and ask the user for its class.

    Prints a divider, the number of rows left in `uncertain`, the `instr` text and
    the final row of `uncertain`, then prompts until the answer is either blank
    (accept) or an integer that is a key of the module-level `classes` dict.

    The accepted answer is appended, as the raw input string, to the module-level
    `changes` list; nothing is returned.

    Args:
        uncertain: DataFrame of transactions still awaiting review.
        instr: Instruction text listing the available choices.
    """
    print("-" * 130)
    print(f"{len(uncertain)} transactions to go")
    print(instr)
    print(uncertain.iloc[[-1], :], "\n")

    answer = input("Correct class: ")
    # A blank answer accepts the current prediction; anything else must be a known key
    while answer != "":
        try:
            choice = int(answer)
        except ValueError:
            print("Input must be a number (the things in square brackets). Try again")
        else:
            if choice in classes:
                break
            print("Input must be a valid number (1-7, or blank if you back my model). Try again")
        answer = input("Correct class: ")

    changes.append(answer)

In [None]:


# Full rebuild of lookup predictions and corpora from the current training set
# (repeats the fit/predict steps of the prediction cell above)
lookup_preds = get_lookup(training, testing)
corpora = get_corpora(training)

# Split the NLP_distances output into target column and feature matrix
X_train = NLP_distances(training, corpora)
y_train = X_train["category"]
X_train = X_train.drop(columns="category")

X_test = NLP_distances(testing, corpora).drop(columns="category")

model_preds = model.fit(X_train, y_train).predict(X_test)

# `google_preds` is whatever an earlier cell left in the kernel; it is not recomputed here
X_test = X_test.assign(
    model_preds=model_preds,
    google_preds=google_preds,
    lookup_preds=lookup_preds,
)



In [6]:
# Scratch check: amount of the test row labelled 0
testing.loc[0, "amount"]

50.0

In [9]:
# Scratch check of how NaN stringifies. `np.NaN` was an alias that NumPy 2.0
# removed; `np.nan` is the canonical spelling and works on every version.
print(str(np.nan))

nan
