In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import re
import random

### **Data Generation**

In [None]:
# Merchant names grouped by the spending category they belong to.
categories = {
    'Food': ['SWIGGY', 'ZOMATO', 'DOMINOS', 'CAFE COFFEE DAY', 'STARBUCKS', 'RESTAURANT', 'PIZZA HUT'],
    'Travel': ['UBER', 'OLA', 'METRO CARD RECHARGE', 'INDIAN RAILWAYS', 'PETROL PUMP', 'FLIGHT TICKET'],
    'Groceries': ['GROFER', 'BIGBASKET', 'RELIANCE FRESH', 'SUPERMARKET', 'DAILY NEEDS'],
    'Shopping': ['AMAZON', 'FLIPKART', 'MYNTRA', 'SHOE STORE', 'ELECTRONICS'],
    'Utilities': ['ELECTRICITY BILL', 'MOBILE RECHARGE', 'BROADBAND BILL', 'WATER SUPPLY'],
    'Entertainment': ['NETFLIX', 'PVR CINEMAS', 'BOOKMYSHOW', 'SPOTIFY', 'GAME PASS']
}

RANDOM_SEED = 42           # fixed seed so a fresh run regenerates the same data
SAMPLES_PER_MERCHANT = 50  # number of synthetic transactions per merchant

# Dedicated generator: re-running this cell always restarts the same sequence,
# and the module-level `random` state is left untouched.
rng = random.Random(RANDOM_SEED)

# One record per (merchant, sample). The amount is a random integer between
# 50 and 5000 (both inclusive) embedded in a fixed message template.
data = [
    {
        'Transaction_Text': f"TXN SUCCESSFUL FOR {rng.randint(50, 5000)} AT {merchant} VIA DEBIT CARD XXXX",
        'Category': category,
    }
    for category, merchants in categories.items()
    for merchant in merchants
    for _ in range(SAMPLES_PER_MERCHANT)
]

df = pd.DataFrame(data)

In [None]:
# Show the first rows of the generated frame, then how many samples
# each category received. A single print with a newline separator
# emits the same four pieces in the same order.
print(
    "Generated Data Samples:",
    df.head(),
    "\nCategory Distribution:",
    df['Category'].value_counts(),
    sep="\n",
)

Generated Data Samples:
                                    Transaction_Text Category
0  TXN SUCCESSFUL FOR 4158 AT SWIGGY VIA DEBIT CA...     Food
1  TXN SUCCESSFUL FOR 4209 AT SWIGGY VIA DEBIT CA...     Food
2  TXN SUCCESSFUL FOR 4132 AT SWIGGY VIA DEBIT CA...     Food
3  TXN SUCCESSFUL FOR 3230 AT SWIGGY VIA DEBIT CA...     Food
4  TXN SUCCESSFUL FOR 4838 AT SWIGGY VIA DEBIT CA...     Food

Category Distribution:
Category
Food             350
Travel           300
Groceries        250
Shopping         250
Entertainment    250
Utilities        200
Name: count, dtype: int64


### **Data Preprocessing**

In [None]:
# Boilerplate phrases and filler tokens to drop from a transaction message
# (e.g., 'TXN SUCCESSFUL FOR', 'VIA DEBIT CARD XXXX'). The \b anchors restrict
# matches to whole words, so short tokens such as 'at' or 'ref' are not cut
# out of the middle of merchant names ('zomato', 'water supply').
_NOISE_PATTERN = re.compile(
    r'\b(?:txn successful for|via debit card xxxx|at|for|a/c|ref|debit|credit)\b'
)


def clean_text(text):
    """Reduce a raw transaction message to its informative words.

    Steps: lowercase the text, drop the boilerplate phrases/tokens listed in
    ``_NOISE_PATTERN`` (whole-word matches only), drop digit runs (amounts),
    and collapse the remaining whitespace.

    Args:
        text: Raw transaction message (str).

    Returns:
        The cleaned, lowercase text with words separated by single spaces and
        no leading/trailing whitespace. May be empty if nothing remains.
    """
    text = text.lower()
    # Replace with a space (not '') so neighbouring words are never glued together.
    text = _NOISE_PATTERN.sub(' ', text)
    text = re.sub(r'\d+', ' ', text)  # Remove numbers (amounts), as we want the model to rely on merchant names
    # split()/join collapses the gaps left by the removals and strips both ends.
    return ' '.join(text.split())

# Add a cleaned copy of every transaction message as a new column.
df['Cleaned_Text'] = df['Transaction_Text'].map(clean_text)

# Features are the cleaned texts; labels are the categories.
X, y = df['Cleaned_Text'], df['Category']

# 80/20 split, stratified on the label and seeded for repeatability.
# NOTE(review): samples of one merchant differ only in the amount, which
# clean_text removes, so identical cleaned strings presumably end up in both
# train and test; verify before reading the test score as generalization.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


### **TF-IDF Vectorizer**

In [None]:
# TF-IDF + Logistic Regression: feature-extraction step.

# Unigram and bigram features, with the vocabulary capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
)

# Fit on the training texts only, then reuse the fitted vectorizer
# to encode the held-out texts.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"\nTF-IDF Matrix Shape: {X_train_tfidf.shape}")


TF-IDF Matrix Shape: (1280, 64)


In [None]:
# Train the classifier on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and training chain.
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_tfidf, y_train)

# Predict labels for the held-out split.
y_pred_lr = lr_model.predict(X_test_tfidf)

# Report overall accuracy followed by the per-class metrics.
print(
    "\n TF-IDF + Logistic Regression Results : ",
    f"Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}",
    "Classification Report:",
    classification_report(y_test, y_pred_lr),
    sep="\n",
)


 TF-IDF + Logistic Regression Results : 
Accuracy: 1.0000
Classification Report:
               precision    recall  f1-score   support

Entertainment       1.00      1.00      1.00        50
         Food       1.00      1.00      1.00        70
    Groceries       1.00      1.00      1.00        50
     Shopping       1.00      1.00      1.00        50
       Travel       1.00      1.00      1.00        60
    Utilities       1.00      1.00      1.00        40

     accuracy                           1.00       320
    macro avg       1.00      1.00      1.00       320
 weighted avg       1.00      1.00      1.00       320



In [None]:
# Prediction test on two hand-written messages.
test_transaction = ["txn for 450 at PIZZA HUT", "txn for 2500 at FLIPKART"]

# Apply the same cleaning function and the already-fitted vectorizer and model.
test_cleaned = list(map(clean_text, test_transaction))
test_tfidf = tfidf_vectorizer.transform(test_cleaned)
predictions = lr_model.predict(test_tfidf)

print("\nPrediction Test:")
for raw_message, predicted_label in zip(test_transaction, predictions):
    print(f"Text: '{raw_message}' -> Predicted Category: {predicted_label}")


Prediction Test:
Text: 'txn for 450 at PIZZA HUT' -> Predicted Category: Food
Text: 'txn for 2500 at FLIPKART' -> Predicted Category: Shopping
