# Smart Expense Categorizer — ML Model

Goal:
Train an NLP-based classifier that predicts a normalized expense category
from TF-IDF features of item descriptions.

Input: Item (text)
Output: Category (normalized)


In [1]:
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Text processing & ML
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Models (you will choose one later)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# Utilities
import re
# Built-in Python module
# Used for text cleaning
# No install needed
import joblib
# Used to persist trained ML models to disk
# Efficient for objects holding large NumPy arrays; it is pickle-based, so only load files you trust
# Save the trained model here
# Load it again in the backend

In [2]:
# Fixed seed so that every split is reproducible (same split on every run).
# "Why 42?" - the value is arbitrary; any fixed integer ensures reproducibility.
RANDOM_STATE = 42

# Fractions of the full dataset reserved for the test and validation sets.
# Together they correspond to the 60/20/20 train/validation/test split
# performed further down in the notebook.
TEST_SIZE = 0.2
VAL_SIZE = 0.2


In [3]:
# Relative path to the raw spending-patterns CSV.
DATA_PATH = "../data/raw/primary_spending_patterns_detailed.csv"

# Read the whole file into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(n=5)

Unnamed: 0,Customer ID,Category,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,CUST_0159,Groceries,Milk,1,1.28,1.28,Debit Card,Mobile App,2024-11-13
1,CUST_0017,Friend Activities,Dinner with Friends,2,74.69,149.39,Debit Card,In-store,2023-05-04
2,CUST_0094,Housing and Utilities,Water Bill,1,76.06,76.06,Digital Wallet,Mobile App,2023-12-16
3,CUST_0162,Fitness,Yoga Class,5,11.24,56.18,Cash,In-store,2024-01-28
4,CUST_0129,Gifts,Flowers,3,43.71,131.12,Debit Card,In-store,2023-07-28


In [7]:
# Preview only the model input (Item) next to its raw label (Category).
df.loc[:, ["Item", "Category"]].head(n=5)


Unnamed: 0,Item,Category
0,Milk,Groceries
1,Dinner with Friends,Friend Activities
2,Water Bill,Housing and Utilities
3,Yoga Class,Fitness
4,Flowers,Gifts


In [8]:
# Raw dataset categories grouped under the normalized label they collapse into.
_NORMALIZED_TO_RAW = {
    "Food": ("Groceries", "Food"),
    "Shopping": ("Shopping",),
    "Bills": ("Subscriptions", "Housing and Utilities"),
    "Transport": ("Transportation",),
    "Entertainment": ("Hobbies", "Friend Activities", "Travel"),
    "Personal": ("Personal Hygiene", "Fitness"),
    "Other": ("Medical/Dental", "Gifts"),
}

# Inverted lookup: raw category -> normalized category (13 raw labels, 7 targets).
category_mapping = {
    raw_label: normalized_label
    for normalized_label, raw_labels in _NORMALIZED_TO_RAW.items()
    for raw_label in raw_labels
}

In [9]:
# Collapse each raw category into its normalized label; a raw value that is
# missing from category_mapping would become NaN here.
df["Normalized Category"] = df["Category"].map(category_mapping)

In [11]:
# Sanity check: list any rows whose category was left unmapped (NaN) by the
# normalization step. An empty frame means every raw category was covered.
print(df.loc[df["Normalized Category"].isna()])

Empty DataFrame
Columns: [Customer ID, Category, Item, Quantity, Price Per Unit, Total Spent, Payment Method, Location, Transaction Date, Normalized Category]
Index: []


In [12]:
# Number of rows per normalized category, most frequent first.
df["Normalized Category"].value_counts(sort=True)

Normalized Category
Entertainment    2240
Food             1593
Personal         1567
Other            1559
Bills            1504
Shopping          775
Transport         762
Name: count, dtype: int64

In [14]:
# Number of distinct target classes (NaN is not counted).
df["Normalized Category"].nunique(dropna=True)

7

In [15]:
# Features are the raw item text; targets are the normalized category labels.
X, y = df["Item"], df["Normalized Category"]

### Step 1: Train + Temp

In [18]:
# Step 1 of the two-stage split: carve off the combined validation + test
# share (TEST_SIZE + VAL_SIZE) as a temporary set; the rest is the training set.
# Sizes and seed are taken from the config constants rather than repeated as
# literals, so the split cannot silently drift from the declared configuration.
# stratify=y keeps the class proportions the same in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=TEST_SIZE + VAL_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

`stratify=y` <br>
When the target variable `y` holds **categorical / classification labels**, stratification ensures that:

- `X_train` and `X_temp` have **roughly the same percentage of each class**
- You don’t end up with one split missing a class or being heavily imbalanced


### Step 2: Validation + Test

In [19]:
# Step 2 of the two-stage split: divide the temporary hold-out set into
# validation and test sets. The test share is expressed relative to the
# hold-out set (TEST_SIZE out of TEST_SIZE + VAL_SIZE), using the config
# constants and seed instead of repeated literals.
# stratify=y_temp keeps the class proportions the same in both parts.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=TEST_SIZE / (TEST_SIZE + VAL_SIZE),
    random_state=RANDOM_STATE,
    stratify=y_temp,
)


In [23]:
# Verify stratification: the class proportions should be nearly identical
# across the train, validation and test label sets.
for split_labels in (y_train, y_val, y_test):
    print(split_labels.value_counts(normalize=True))


Normalized Category
Entertainment    0.224000
Food             0.159333
Personal         0.156667
Other            0.156000
Bills            0.150333
Shopping         0.077500
Transport        0.076167
Name: proportion, dtype: float64
Normalized Category
Entertainment    0.2240
Food             0.1590
Personal         0.1565
Other            0.1560
Bills            0.1505
Shopping         0.0775
Transport        0.0765
Name: proportion, dtype: float64
Normalized Category
Entertainment    0.2240
Food             0.1595
Personal         0.1570
Other            0.1555
Bills            0.1505
Shopping         0.0775
Transport        0.0760
Name: proportion, dtype: float64


In [22]:
# Absolute number of rows per class in the train, validation and test label sets.
for split_labels in (y_train, y_val, y_test):
    print(split_labels.value_counts())


Normalized Category
Entertainment    1344
Food              956
Personal          940
Other             936
Bills             902
Shopping          465
Transport         457
Name: count, dtype: int64
Normalized Category
Entertainment    448
Food             318
Personal         313
Other            312
Bills            301
Shopping         155
Transport        153
Name: count, dtype: int64
Normalized Category
Entertainment    448
Food             319
Personal         314
Other            311
Bills            301
Shopping         155
Transport        152
Name: count, dtype: int64
