# Expense Classifier Training

This notebook trains and evaluates the expense classification model.

## Objectives:
1. Load and prepare training data
2. Train classification model
3. Evaluate model performance
4. Optimize hyperparameters (planned; not yet implemented in this notebook)
5. Save trained model

In [None]:
# Import libraries
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

from backend.config.constants import EXPENSE_CATEGORIES
from backend.services.classifier_service import expense_classifier

%matplotlib inline
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

## 1. Load Training Data

In [None]:
# Load every expense that has a category from the application database
# and flatten the ORM rows into a DataFrame (`df`) used by all later cells.
from backend.models.database import SessionLocal
from backend.models import tables

db = SessionLocal()
try:
    # Joining on Category restricts the result to expenses with a category.
    expenses = db.query(tables.Expense).join(tables.Category).all()

    # Build the records while the session is still open, because
    # `expense.category` is read here (presumably a relationship that may
    # need the live session -- confirm against the model definition).
    data = [
        {
            'merchant': expense.merchant or '',        # missing merchant -> ''
            'description': expense.description or '',  # missing description -> ''
            'category': expense.category.slug,
            'amount': expense.amount,
        }
        for expense in expenses
    ]
finally:
    # Always release the session, even if the query or the conversion fails.
    db.close()

# Explicit columns keep the schema stable when the query returns no rows;
# otherwise df['category'] below would raise KeyError on an empty frame.
df = pd.DataFrame(data, columns=['merchant', 'description', 'category', 'amount'])

print(f"Loaded {len(df)} expenses")
print("\nCategory distribution:")
print(df['category'].value_counts())

## 2. Data Exploration

In [None]:
# Bar chart of how many expenses fall into each category.
category_counts = df['category'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
category_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Expense Categories')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
plt.xticks(rotation=45)  # slant the category names so they do not overlap
fig.tight_layout()
plt.show()

In [None]:
# For each category (in order of first appearance), list its ten most
# frequent merchants.
for category_slug in df['category'].unique():
    print(f"\n{category_slug.upper()}:")
    in_category = df['category'] == category_slug
    top_merchants = df.loc[in_category, 'merchant'].value_counts().head(10)
    print(top_merchants)

## 3. Feature Engineering

In [None]:
# Build the single text feature the models are trained on:
# "<merchant> <description>", lower-cased and with outer whitespace removed.
combined_text = (df['merchant'] + ' ' + df['description']).str.lower().str.strip()
df = df.assign(text=combined_text)

# Rows where both merchant and description were empty carry no signal.
has_text = df['text'].str.len() > 0
df = df[has_text]

print(f"Data after cleaning: {len(df)} records")
print("\nSample texts:")
print(df['text'].head(10))

## 4. Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both partitions, and the fixed seed makes
# the split reproducible.
X, y = df['text'], df['category']

split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print("\nClass distribution in training:")
print(y_train.value_counts())

## 5. Model Training

In [None]:
# Naive Bayes baseline: TF-IDF over unigrams and bigrams (capped at 500
# features) feeding a multinomial Naive Bayes classifier.
nb_steps = [
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=500)),
    ('clf', MultinomialNB()),
]
nb_pipeline = Pipeline(steps=nb_steps)

print("Training Naive Bayes model...")
# fit() returns the pipeline itself, so the held-out accuracy can be chained.
nb_score = nb_pipeline.fit(X_train, y_train).score(X_test, y_test)
print(f"✅ Naive Bayes Accuracy: {nb_score:.2%}")

In [None]:
# Random Forest candidate: same TF-IDF features as the Naive Bayes pipeline,
# classified by a 100-tree forest with a fixed seed for reproducibility.
rf_steps = [
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=500)),
    ('clf', RandomForestClassifier(random_state=42, n_estimators=100)),
]
rf_pipeline = Pipeline(steps=rf_steps)

print("Training Random Forest model...")
# fit() returns the pipeline itself, so the held-out accuracy can be chained.
rf_score = rf_pipeline.fit(X_train, y_train).score(X_test, y_test)
print(f"✅ Random Forest Accuracy: {rf_score:.2%}")

## 6. Model Evaluation

In [None]:
# Keep whichever pipeline scored higher on the test set; a tie goes to the
# Random Forest because the comparison is strict.
if nb_score > rf_score:
    best_model, model_name = nb_pipeline, "Naive Bayes"
else:
    best_model, model_name = rf_pipeline, "Random Forest"

print(f"Best model: {model_name}")

# Test-set predictions are reused by the confusion-matrix cell.
y_pred = best_model.predict(X_test)

# Per-class precision / recall / F1.
print("\nClassification Report:")
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Confusion matrix of the selected model on the test set.
#
# `categories` fixes the row/column order. It is passed to confusion_matrix
# explicitly so the matrix always has one row and column per category in the
# dataset, in the same order as the tick labels. Without `labels=`, sklearn
# only includes labels that occur in y_test or y_pred, so a rare category
# missing from the test split would shift or truncate the axes relative to
# the tick labels.
categories = sorted(df['category'].unique())
cm = confusion_matrix(y_test, y_pred, labels=categories)

plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=categories, yticklabels=categories)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()

## 7. Cross-Validation

In [None]:
# 5-fold cross-validation of the selected pipeline over the full dataset.
cv_scores = cross_val_score(best_model, X, y, cv=5)

print("Cross-validation scores:")
for fold_number, fold_score in enumerate(cv_scores, start=1):
    print(f"  Fold {fold_number}: {fold_score:.2%}")

# Report the mean together with a two-standard-deviation spread.
cv_mean = cv_scores.mean()
cv_spread = cv_scores.std() * 2
print(f"\nMean: {cv_mean:.2%} (+/- {cv_spread:.2%})")

## 8. Feature Importance (for Random Forest)

In [None]:
# Feature importances only exist for the Random Forest classifier, so this
# cell does nothing when Naive Bayes was selected.
final_classifier = best_model.named_steps['clf']

if isinstance(final_classifier, RandomForestClassifier):
    vocabulary = best_model.named_steps['tfidf'].get_feature_names_out()
    weights = final_classifier.feature_importances_

    # argsort is ascending, so the last 20 positions are the 20 largest
    # importances; the most important feature ends up at the top of the chart.
    top_idx = np.argsort(weights)[-20:]
    bar_positions = range(len(top_idx))
    bar_labels = [vocabulary[idx] for idx in top_idx]

    plt.figure(figsize=(10, 8))
    plt.barh(bar_positions, weights[top_idx])
    plt.yticks(bar_positions, bar_labels)
    plt.xlabel('Importance')
    plt.title('Top 20 Most Important Features')
    plt.tight_layout()
    plt.show()

## 9. Test on Sample Data

In [None]:
# Smoke-test the selected model on a few hand-written expense texts and show
# the predicted category with the model's highest class probability.
test_samples = [
    "walmart groceries",
    "shell gas station",
    "netflix subscription",
    "gym membership",
    "amazon purchase",
    "electric bill",
    "mcdonalds lunch",
]

print("Sample Predictions:")
print("=" * 60)

for sample_text in test_samples:
    single_input = [sample_text]  # the pipeline expects an iterable of texts
    predicted_label = best_model.predict(single_input)[0]
    class_probabilities = best_model.predict_proba(single_input)[0]
    top_probability = max(class_probabilities)

    print(f"{sample_text:30s} → {predicted_label:15s} ({top_probability:.1%})")

## 10. Save Model

In [None]:
# Persist the selected pipeline and a JSON summary of how it performed.
import json
import pickle
from pathlib import Path

model_path = Path('../backend/ml/saved_models/classifier.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: unpickling executes arbitrary code, so this artifact must only ever
# be loaded from a trusted location.
with open(model_path, 'wb') as f:
    pickle.dump(best_model, f)

print(f"✅ Model saved to {model_path}")

# Save model metadata.
# Scores are cast to plain floats so they serialize to JSON regardless of
# which numpy scalar type the scoring functions return.
metadata = {
    'model_type': model_name,
    'accuracy': float(best_model.score(X_test, y_test)),
    'cv_mean': float(cv_scores.mean()),
    'cv_std': float(cv_scores.std()),
    'training_samples': len(X_train),
    'test_samples': len(X_test),
    'categories': list(categories)
}

# Previously the metadata was only printed and never written anywhere; store
# it next to the model so the two artifacts stay together.
metadata_path = model_path.with_name('classifier_metadata.json')
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

print(f"✅ Metadata saved to {metadata_path}")

print("\nModel Metadata:")
for key, value in metadata.items():
    print(f"  {key}: {value}")

## Summary

### Model Performance
- **Algorithm**: [Model Name]
- **Accuracy**: [XX.X%]
- **Cross-Validation**: [XX.X% ± YY.Y%]

### Next Steps
1. Collect more training data
2. Experiment with different features
3. Try ensemble methods
4. Implement active learning
5. Monitor model performance in production