In [75]:
import pandas as pd

# Load the transaction export. The path is absolute, so adjust it when running in another environment.
df = pd.read_csv("/content/transaction_data.csv")
# Preview the first rows (description, amount, category).
df.head()

Unnamed: 0,description,amount,category
0,UBER,25.5,Travel
1,UBER TRIP,35.2,Travel
2,UBER RIDE,22.3,Travel
3,BRITISH GAS,89.99,Utilities
4,BRITISH GAS - ELECT,75.5,Utilities


In [76]:
# List the distinct category labels. The result contains both 'Office Supplies' and
# 'Office Supplies ' (trailing space), so labels need whitespace stripping before modelling.
df.category.unique()

array(['Travel', 'Utilities', 'Payroll', 'Marketing', 'Office Supplies',
       'Software & IT', 'Insurance', 'Transportation', 'Food & Dining',
       'Office Supplies '], dtype=object)

Clustering

In [77]:
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer

In [78]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight embedding model
# Embed every transaction description: one vector per row, in df order
# (the cluster labels computed from these are later assigned back to df positionally).
embeddings = model.encode(df['description'].tolist())

In [79]:
# Peek at the first five embedding vectors.
embeddings[:5]

array([[ 0.04065343,  0.04586978,  0.01049653, ...,  0.05226987,
         0.09943408, -0.03537086],
       [ 0.09144842,  0.0366412 ,  0.02692257, ...,  0.05127009,
         0.04457774, -0.07760168],
       [ 0.02320764,  0.0322987 ,  0.03156928, ...,  0.07888345,
         0.0909534 , -0.07793154],
       [-0.02557846,  0.01111243, -0.03563021, ...,  0.01006075,
        -0.03489269, -0.03021103],
       [-0.0482884 ,  0.02963824, -0.00977195, ..., -0.01235049,
        -0.02569318, -0.01543424]], dtype=float32)

In [80]:
# Density-based clustering on cosine distance: eps=0.5 is the neighbourhood radius and
# min_samples=2 lets as few as two similar descriptions form a cluster.
clustering = DBSCAN(eps=0.5, min_samples=2, metric='cosine').fit(embeddings)
# Store each row's cluster label on df; DBSCAN uses label -1 for points it leaves unclustered (noise).
df['cluster'] = clustering.labels_

In [81]:
# Confirm the new 'cluster' column is present.
df.head()

Unnamed: 0,description,amount,category,cluster
0,UBER,25.5,Travel,0
1,UBER TRIP,35.2,Travel,0
2,UBER RIDE,22.3,Travel,0
3,BRITISH GAS,89.99,Utilities,1
4,BRITISH GAS - ELECT,75.5,Utilities,1


In [82]:
# Group by cluster to inspect patterns
clusters = df.groupby('cluster')['description'].apply(list)
sorted_clusters = clusters.sort_values(key=lambda x: x.map(len), ascending=False)

In [83]:
print("Clustered Patterns:")
for cluster_id, messages in sorted_clusters.items():
    # Skip small clusters: only those with more than three members are shown
    if len(messages) <= 3:
        continue
    print(f"Cluster {cluster_id}:")
    # Limit the preview to the first five descriptions of the cluster
    preview = messages[:5]
    for msg in preview:
        print(f"  {msg}")

Clustered Patterns:
Cluster -1:
  OCTOPUS ENERGY
  WATER PLUS LTD
  VODAFONE BILL
  HMRC PAYE
  MAILCHIMP
Cluster 1:
  BRITISH GAS
  BRITISH GAS - ELECT
  BRGAS HOME INSUR
  BRGAS GAS BILL
  BRITISH GAS SMART HOME
Cluster 4:
  AMZN UK
  AMZN UK PRIME
  AMZN MKTPLACE
  AMZN UK OFFICE
  AMZN UK MKTP B2B OFFICE
Cluster 0:
  UBER
  UBER TRIP
  UBER RIDE
  UBER BUSINESS TRAVEL
Cluster 3:
  GOOGLE ADS
  FACEBOOK ADS
  LINKEDIN ADS
  FB ENTERPRISE ADS


In [84]:
print("Cluster Sizes:")
# One line per cluster, in the same largest-first order as sorted_clusters
for cluster_id, messages in sorted_clusters.items():
    member_count = len(messages)
    print(f"Cluster {cluster_id}: {member_count} messages")

Cluster Sizes:
Cluster -1: 34 messages
Cluster 1: 5 messages
Cluster 4: 5 messages
Cluster 0: 4 messages
Cluster 3: 4 messages
Cluster 5: 3 messages
Cluster 2: 2 messages
Cluster 6: 2 messages
Cluster 7: 2 messages
Cluster 8: 2 messages
Cluster 9: 2 messages
Cluster 10: 2 messages


In [85]:
import re
# Rule table for the regex stage: (compiled pattern, category, confidence).
# Rules are tried in order and the first match wins.
# The (?<![A-Za-z]) lookbehind stops a merchant name from matching in the middle of
# another word (e.g. "TUBER", "HUBER") while still allowing it anywhere in the
# description after a space, digit or punctuation (e.g. "PAYPAL *UBER").
_REGEX_RULES = (
    (re.compile(r"(?<![A-Za-z])UBER"), "Travel", 0.99),  # covers "UBER RIDE", "UBER TRIP", ...
    (re.compile(r"(?<![A-Za-z])BRITISH GAS"), "Utilities", 0.99),
)

# Results whose confidence is below this value are flagged for manual review.
_REVIEW_THRESHOLD = 0.7


def classify_with_regex(description):
    """Classify a transaction description with hand-written merchant rules.

    Args:
        description: Raw transaction description. Matching is case-sensitive.
            Non-string values (e.g. None or the float NaN pandas uses for a
            missing cell) are treated as "no match" instead of raising.

    Returns:
        A dict with keys "category" (str), "confidence" (float) and
        "needs_review" (bool, True when confidence < 0.7) for the first rule
        that matches, or None when no rule matches.
    """
    # Missing descriptions reach this function as None/NaN when applied over a
    # DataFrame column; re would raise TypeError on them.
    if not isinstance(description, str):
        return None

    for pattern, label, confidence in _REGEX_RULES:
        if pattern.search(description):
            return {
                "category": label,
                "confidence": confidence,
                "needs_review": confidence < _REVIEW_THRESHOLD
            }

    return None


In [86]:
# Spot-check: a British Gas description should be labelled Utilities.
classify_with_regex("BRITISH GAS SMART HOME")

{'category': 'Utilities', 'confidence': 0.99, 'needs_review': False}

In [87]:
# Spot-check: an Uber description should be labelled Travel.
classify_with_regex("UBER TRIP")

{'category': 'Travel', 'confidence': 0.99, 'needs_review': False}

In [88]:
# Spot-check a description no rule covers: the function returns None, so the cell shows no output.
classify_with_regex("GOOGLE ADS")

In [89]:
# Apply regex classification
df['regex_label'] = df['description'].apply(lambda x: classify_with_regex(x))
df[df['regex_label'].notnull()]

Unnamed: 0,description,amount,category,cluster,regex_label
0,UBER,25.5,Travel,0,"{'category': 'Travel', 'confidence': 0.99, 'ne..."
1,UBER TRIP,35.2,Travel,0,"{'category': 'Travel', 'confidence': 0.99, 'ne..."
2,UBER RIDE,22.3,Travel,0,"{'category': 'Travel', 'confidence': 0.99, 'ne..."
3,BRITISH GAS,89.99,Utilities,1,"{'category': 'Utilities', 'confidence': 0.99, ..."
4,BRITISH GAS - ELECT,75.5,Utilities,1,"{'category': 'Utilities', 'confidence': 0.99, ..."
42,UBER BUSINESS TRAVEL,175.3,Travel,0,"{'category': 'Travel', 'confidence': 0.99, 'ne..."
45,BRITISH GAS SMART HOME,150.0,Utilities,1,"{'category': 'Utilities', 'confidence': 0.99, ..."


In [90]:
# Preview the first rows the regex stage left unlabelled.
df[df['regex_label'].isnull()].head(5)

Unnamed: 0,description,amount,category,cluster,regex_label
5,OCTOPUS ENERGY,65.25,Utilities,-1,
6,WATER PLUS LTD,45.0,Utilities,-1,
7,VODAFONE BILL,35.99,Utilities,-1,
8,PAYROLL TRANSFER,2500.0,Payroll,2,
9,SALARY PAYMENT,3000.0,Payroll,2,


Classification Stage 2

In [91]:
# Rows without a regex label move on to the ML stage; .copy() makes this an independent frame
# so later changes to it do not touch df.
df_non_regex = df[df['regex_label'].isnull()].copy()
df_non_regex.shape

(60, 5)

In [92]:
# Inspect the Payroll rows among the regex-unmatched transactions.
df_Payroll = df_non_regex[df_non_regex.category=="Payroll"]
df_Payroll

Unnamed: 0,description,amount,category,cluster,regex_label
8,PAYROLL TRANSFER,2500.0,Payroll,2,
9,SALARY PAYMENT,3000.0,Payroll,2,
10,HMRC PAYE,580.25,Payroll,-1,


In [93]:
# Everything except Payroll: this subset is what the embedding classifier below is trained on.
df_non_Payroll = df_non_regex[df_non_regex.category!="Payroll"]
df_non_Payroll

Unnamed: 0,description,amount,category,cluster,regex_label
5,OCTOPUS ENERGY,65.25,Utilities,-1,
6,WATER PLUS LTD,45.0,Utilities,-1,
7,VODAFONE BILL,35.99,Utilities,-1,
11,GOOGLE ADS,250.0,Marketing,3,
12,FACEBOOK ADS,180.0,Marketing,3,
13,LINKEDIN ADS,150.0,Marketing,3,
14,MAILCHIMP,29.99,Marketing,-1,
15,AMZN UK,45.99,Office Supplies,4,
16,AMZN UK PRIME,7.99,Software & IT,4,
17,AMZN MKTPLACE,129.99,Marketing,4,


In [94]:
# Row/column count of the non-Payroll subset.
df_non_Payroll.shape

(57, 5)

In [95]:
# Instantiate the same 'all-MiniLM-L6-v2' encoder again and embed only the non-Payroll descriptions.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings_filtered = model.encode(df_non_Payroll['description'].tolist())

In [96]:
# Sanity check: the number of embeddings should equal the number of non-Payroll rows.
len(embeddings_filtered)

57

In [97]:
# Features are the sentence embeddings; targets are the category labels of the same rows.
# Labels are used as-is here (no whitespace stripping or category merging has been applied yet).
X = embeddings_filtered
y = df_non_Payroll['category'].values

In [98]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Baseline: hold out 30% of the rows and fit a plain logistic regression on the embeddings.
# random_state pins the split so the report is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# zero_division=0 reports 0.0 for classes that were never predicted (the same numbers the
# default produces) without emitting an UndefinedMetricWarning for each metric; this matches
# the classification_report call used for the 5x-duplicated run.
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

                 precision    recall  f1-score   support

  Food & Dining       0.00      0.00      0.00         1
      Marketing       0.00      0.00      0.00         4
Office Supplies       0.00      0.00      0.00         1
  Software & IT       0.44      1.00      0.62         8
 Transportation       0.00      0.00      0.00         1
         Travel       0.00      0.00      0.00         1
      Utilities       0.00      0.00      0.00         2

       accuracy                           0.44        18
      macro avg       0.06      0.14      0.09        18
   weighted avg       0.20      0.44      0.27        18



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training with a duplicated dataset (training rows repeated 5x, test set left unchanged) to try to improve performance

In [99]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Clean and merge categories.
# Strip whitespace BEFORE merging so padded labels (e.g. 'Office Supplies ') collapse onto
# their clean form and a padded 'Transportation ' would still be folded into 'Travel'.
df_non_payroll = df_non_regex[df_non_regex.category != "Payroll"].copy()
df_non_payroll['category'] = df_non_payroll['category'].str.strip()
df_non_payroll['category'] = df_non_payroll['category'].replace("Transportation", "Travel")

# Initial train-test split on original data.
# Splitting before duplicating keeps copies of a training row out of the test set.
df_train_orig, df_test_orig = train_test_split(df_non_payroll, test_size=0.3, random_state=42)
print("Original Test Set Size:", len(df_test_orig))

# Duplicate training data 5x (the test set is left at its original size).
# NOTE(review): exact duplicates add no new information. For an L2-regularised logistic
# regression this should mostly act like a weaker penalty (roughly C multiplied by 5) —
# confirm that is the intended effect, or tune C directly instead.
df_train_5x = pd.concat([df_train_orig] * 5)
print("Class Distribution (Train 5x):")
print(df_train_5x['category'].value_counts())
print("Class Distribution (Test):")
print(df_test_orig['category'].value_counts())

# Load SentenceTransformer and embed the train/test descriptions separately.
model = SentenceTransformer('all-MiniLM-L6-v2')
X_train = model.encode(df_train_5x['description'].tolist())
y_train = df_train_5x['category'].values
X_test = model.encode(df_test_orig['description'].tolist())
y_test = df_test_orig['category'].values

# Model: class_weight='balanced' reweights classes inversely to their frequency,
# which counters the Software & IT majority in the training split.
clf = LogisticRegression(max_iter=1000, class_weight='balanced')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# zero_division=0: report 0.0 (without warnings) for classes with no predictions or no support.
report = classification_report(y_test, y_pred, zero_division=0)
print("\nClassification Report (5x Train, Original Test):")
print(report)



Original Test Set Size: 18
Class Distribution (Train 5x):
category
Software & IT      85
Office Supplies    25
Insurance          20
Travel             20
Marketing          20
Food & Dining      15
Utilities          10
Name: count, dtype: int64
Class Distribution (Test):
category
Software & IT      8
Marketing          4
Utilities          2
Travel             2
Office Supplies    1
Food & Dining      1
Name: count, dtype: int64

Classification Report (5x Train, Original Test):
                 precision    recall  f1-score   support

  Food & Dining       0.00      0.00      0.00         1
      Insurance       0.00      0.00      0.00         0
      Marketing       1.00      1.00      1.00         4
Office Supplies       0.50      1.00      0.67         1
  Software & IT       0.78      0.88      0.82         8
         Travel       0.50      0.50      0.50         2
      Utilities       0.00      0.00      0.00         2

       accuracy                           0.72        18


In [102]:
# Confidence scores
probs = clf.predict_proba(X_test)
predictions = []
for i, (pred, prob) in enumerate(zip(y_pred, probs)):
    confidence = max(prob)
    desc = df_test_orig['description'].iloc[i]
    predictions.append({
        "description": desc,
        "category": pred,
        "confidence": confidence,
        "needs_review": confidence < 0.7
    })

print("\nSample Predictions:")
for pred in predictions[:5]:
    print(pred)


Sample Predictions:
{'description': 'OCTOPUS ENERGY', 'category': 'Software & IT', 'confidence': np.float64(0.4243881734885938), 'needs_review': np.True_}
{'description': 'LINKEDIN ADS', 'category': 'Marketing', 'confidence': np.float64(0.2780038481332487), 'needs_review': np.True_}
{'description': 'CLOUDFLARE PRO DNS', 'category': 'Software & IT', 'confidence': np.float64(0.5348447911070454), 'needs_review': np.True_}
{'description': 'GOOGLE CLOUD', 'category': 'Software & IT', 'confidence': np.float64(0.6901897410267946), 'needs_review': np.True_}
{'description': 'GOOGLE CLOUDTRAINING', 'category': 'Software & IT', 'confidence': np.float64(0.7092293910549515), 'needs_review': np.False_}


In [104]:
import joblib
# Persist only the fitted LogisticRegression. The sentence encoder is not stored in this file,
# so whoever loads it must embed descriptions with the same 'all-MiniLM-L6-v2' model first.
joblib.dump(clf, 'description_classifier.joblib')

['description_classifier.joblib']