# Smart Expense Categorizer

This notebook classifies short transaction messages into categories such as Food, Travel, Education, and Bills using TF-IDF vectorization and Logistic Regression.


In [37]:
!pip install scikit-learn pandas joblib



In [38]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [39]:
# Hand-labeled examples kept as (message, category) pairs so that each text
# sits next to its label. Every message is a short description followed by
# an amount.
labeled_messages = [
    ('Swiggy 300', 'Food'),
    ('Zomato 250', 'Food'),
    ('Dominos 500', 'Food'),
    ('Uber 200', 'Travel'),
    ('Ola 150', 'Travel'),
    ('Bus 70', 'Travel'),
    ('Train 120', 'Travel'),
    ('Book 300', 'Education'),
    ('Tuition 2000', 'Education'),
    ('Course 500', 'Education'),
    ('Electricity Bill 700', 'Bills'),
    ('Rent 5000', 'Bills'),
    ('Groceries 1200', 'Groceries'),
    ('Movie 350', 'Entertainment'),
]

# Split the pairs back into the two parallel columns of the frame.
messages, labels = zip(*labeled_messages)
data = {'text': list(messages), 'category': list(labels)}

df = pd.DataFrame(data)
print(df)



                    text       category
0             Swiggy 300           Food
1             Zomato 250           Food
2            Dominos 500           Food
3               Uber 200         Travel
4                Ola 150         Travel
5                 Bus 70         Travel
6              Train 120         Travel
7               Book 300      Education
8           Tuition 2000      Education
9             Course 500      Education
10  Electricity Bill 700          Bills
11             Rent 5000          Bills
12        Groceries 1200      Groceries
13             Movie 350  Entertainment


In [40]:
# Normalize case once so training text and later ad-hoc inputs look alike.
df['text'] = df['text'].str.lower()

# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['category'], test_size=0.2, random_state=42
)

# The default token pattern (\b\w\w+\b) also turns amounts such as "300" into
# vocabulary terms, so unrelated messages like "swiggy 300" and "book 300"
# end up sharing a feature. Keep only tokens that start with a non-digit so
# the classifier keys on the description words instead of the amount.
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b[^\W\d]\w+\b")
X_train_tfidf = vectorizer.fit_transform(X_train)  # vocabulary/IDF learned from train rows only
X_test_tfidf = vectorizer.transform(X_test)        # reuse the fitted vocabulary

model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)


In [41]:
# Score the classifier on the held-out messages.
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))

# The hold-out set is tiny, so some classes have no true or no predicted
# samples and their precision/recall is undefined. zero_division=0 reports
# 0.0 for those entries without emitting a warning for each metric.
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.0
              precision    recall  f1-score   support

       Bills       0.00      0.00      0.00       1.0
   Education       0.00      0.00      0.00       1.0
        Food       0.00      0.00      0.00       1.0
      Travel       0.00      0.00      0.00       0.0

    accuracy                           0.00       3.0
   macro avg       0.00      0.00      0.00       3.0
weighted avg       0.00      0.00      0.00       3.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [42]:
# Quick sanity check on a few hand-written messages, lowercased the same way
# as the training text.
test_samples = list(map(str.lower, ('Swiggy 300', 'Bus 70', 'Book 300')))

# Vectorize with the already-fitted vocabulary, then classify.
X_test_custom = vectorizer.transform(test_samples)
predictions = model.predict(X_test_custom)

# One "message → category" line per sample.
print("\n".join(f"{message} → {category}" for message, category in zip(test_samples, predictions)))

swiggy 300 → Travel
bus 70 → Travel
book 300 → Education


In [43]:
# Persist both fitted objects: the vectorizer is needed alongside the model
# to turn new text into the same feature space at inference time.
artifacts = (
    (model, "expense_model.pkl"),
    (vectorizer, "tfidf_vectorizer.pkl"),
)
for fitted_object, filename in artifacts:
    joblib.dump(fitted_object, filename)
print("Model saved successfully!")

Model saved successfully!
