# Transaction Category Classifier Training

This notebook trains a TF-IDF + Logistic Regression model for categorizing transactions.


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import os


In [None]:
# Seed examples as (raw transaction description, category label) pairs
training_data = [
    ("WALMART GROCERY", "groceries"),
    ("KROGER", "groceries"),
    ("STARBUCKS", "dining"),
    ("UBER TRIP", "transportation"),
    ("NETFLIX", "subscriptions"),
    # Add more training examples...
]

# Expand every seed example into three variants, in this order:
# the description as written, its lowercase form, and a "POS "-prefixed form.
augmented_data = [
    (variant, label)
    for text, label in training_data
    for variant in (text, text.lower(), f"POS {text}")
]

# Split the pairs into parallel lists (features / targets)
descriptions = [text for text, _ in augmented_data]
categories = [label for _, label in augmented_data]

print(f"Total training samples: {len(descriptions)}")


In [None]:
# Create and train pipeline
#
# TF-IDF features over 1- and 2-grams (vocabulary capped at 5000 terms)
# feeding a logistic-regression classifier.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    # `multi_class` is intentionally not passed: it is deprecated since
    # scikit-learn 1.5 and slated for removal, and the default solver already
    # fits a multinomial model when there are more than two classes.
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Hold out 20% of the samples; the fixed seed keeps the split reproducible.
# NOTE: `descriptions`/`categories` are the *augmented* lists, so the
# original / lowercase / "POS " variants of one description can end up on
# both sides of the split. Treat the held-out score as optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    descriptions, categories, test_size=0.2, random_state=42
)

pipeline.fit(X_train, y_train)
print("Model trained!")


In [None]:
# Evaluate and save
model_path = '../backend/ml_models/transaction_classifier.joblib'

# Score the fitted pipeline on the training split and on the held-out split
train_score, test_score = (
    pipeline.score(X_train, y_train),
    pipeline.score(X_test, y_test),
)
print(f"Train: {train_score:.4f}, Test: {test_score:.4f}")

# Make sure the destination directory exists before writing the artifact
output_dir = os.path.dirname(model_path)
os.makedirs(output_dir, exist_ok=True)
joblib.dump(pipeline, model_path)
print(f"Model saved to {model_path}")
