In [None]:
%pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/150.0 MB 493.7 kB/s eta 0:05:03
   ---------------------------------------- 0.5/150.0 MB 493.7 kB/s eta 0:05:03
   ---------------------------------------- 0.8/150.0 MB 633.2 kB/s eta 0:03:56
   ---------------------------------------- 1.0/150.0 MB 699.0 kB/s eta 0:03:34
   ---------------------------------------- 1.0/150.0 MB 699.0 kB/s eta 0:03:34
   ---------------------------------------- 1.3/150.0 MB 729.7 kB/s eta 0:03:24
   --------------

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Load the dataset
df = pd.read_csv("synthetic_transaction_data.csv")

# Feature engineering: extract month and day of week from date
df["date"] = pd.to_datetime(df["date"])
df["month"] = df["date"].dt.month
df["day_of_week"] = df["date"].dt.dayofweek

# Define features and target
X = df[["transaction_description", "amount", "month", "day_of_week"]]
y = df["category"]

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Preprocessing pipelines
text_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", max_features=300))
])

numeric_pipeline = Pipeline([
    ("scaler", StandardScaler())
])

# Combine text and numeric features
preprocessor = ColumnTransformer([
    ("text", text_pipeline, "transaction_description"),
    ("num", numeric_pipeline, ["amount", "month", "day_of_week"])
])

# Final pipeline with XGBoost classifier (cleaned up)
model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", XGBClassifier(
        eval_metric="mlogloss",
        random_state=42,
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1
    ))
])

# Train the model
model_pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = model_pipeline.predict(X_test)
print("\nðŸ“Š Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Example predictions
examples = pd.DataFrame({
    "transaction_description": ["Spotify", "Apollo Pharmacy", "IRCTC", "Swiggy"],
    "amount": [199.0, 450.0, 750.0, 300.0],
    "month": [10, 5, 12, 7],
    "day_of_week": [2, 1, 4, 6]
})

predicted = model_pipeline.predict(examples)
print("\nðŸ”® Example Predictions:\n")
for desc, label in zip(examples["transaction_description"], predicted):
    print(f"{desc} â†’ {label_encoder.inverse_transform([label])[0]}")


ðŸ“Š Classification Report:

               precision    recall  f1-score   support

       Dining       1.00      1.00      1.00        47
    Education       1.00      1.00      1.00        58
Entertainment       1.00      1.00      1.00        57
    Groceries       1.00      1.00      1.00        50
   Healthcare       1.00      1.00      1.00        57
       Others       1.00      1.00      1.00        63
     Shopping       1.00      1.00      1.00        58
       Travel       1.00      1.00      1.00        49
    Utilities       1.00      1.00      1.00        61

     accuracy                           1.00       500
    macro avg       1.00      1.00      1.00       500
 weighted avg       1.00      1.00      1.00       500


ðŸ”® Example Predictions:

Spotify â†’ Entertainment
Apollo Pharmacy â†’ Healthcare
IRCTC â†’ Travel
Swiggy â†’ Dining


In [29]:
import joblib

# Save the trained model pipeline to a file
joblib.dump(model_pipeline, "expense_classifier_pipeline.pkl")
print("âœ… Model saved as 'expense_classifier_pipeline.pkl'")

âœ… Model saved as 'expense_classifier_pipeline.pkl'


In [31]:
import os
print(os.listdir())

['.ipynb_checkpoints', 'expense_classifier_pipeline.pkl', 'Model Code.ipynb', 'synthetic_transaction_data.csv']


In [None]:
zip install streamlit

Note: you may need to restart the kernel to use updated packages.


In [43]:
import streamlit as st
import pandas as pd
import joblib

# Load the trained model
model = joblib.load("expense_classifier_pipeline.pkl")

st.title("ðŸ’¸ Expense Category Predictor")

with st.form("predict_form"):
    transaction_description = st.text_input("Transaction Description", "Spotify")
    amount = st.number_input("Amount", min_value=0.0, value=199.0)
    month = st.selectbox("Month", list(range(1, 13)), index=9)
    day_of_week = st.selectbox("Day of Week (0=Mon, 6=Sun)", list(range(7)), index=2)
    submitted = st.form_submit_button("Predict")

if submitted:
    input_df = pd.DataFrame([{
        "transaction_description": transaction_description,
        "amount": amount,
        "month": month,
        "day_of_week": day_of_week
    }])
    prediction = model.predict(input_df)
    category = model.named_steps["classifier"].classes_[prediction[0]]
    st.success(f"ðŸ§¾ Predicted Category: **{category}**")