In [1]:
import re # Using re for basic text cleaning
from functools import lru_cache

import pandas as pd

print("--- Transaction Categorizer ---")

--- Transaction Categorizer ---


In [3]:
# 1. Define our category-to-keyword mapping
# This is the "brain" of our rule-based categorizer.
CATEGORY_KEYWORDS = {
    "Groceries": [
        "WALMART", "KROGER", "SAFEWAY", "PUBLIX", "COSTCO", "SUPERCENTER", 
        "GROCERY", "SUPERMARKET"
    ],
    "Gas/Automotive": [
        "SHELL", "EXXON", "MOBIL", "BP", "CHEVRON", "76", "GAS", "AUTO", "JIFFY LUBE"
    ],
    "Restaurants/Dining": [
        "MCDONALD'S", "STARBUCKS", "SUBWAY", "CAFE", "RESTAURANT", "DINER", 
        "PIZZA", "COFFEE"
    ],
    "Utilities": [
        "COMCAST", "VERIZON", "AT&T", "T-MOBILE", "ELECTRIC", "WATER", "GAS", "UTILITY"
    ],
    "Subscriptions/Entertainment": [
        "NETFLIX", "SPOTIFY", "HULU", "DISNEY+", "AMAZON PRIME", "AMC", "THEATRE"
    ],
    "Shopping/General": [
        "AMAZON", "TARGET", "BEST BUY", "HOME DEPOT", "LOWE'S", "AMZ"
    ],
    "Travel/Transport": [
        "UBER", "LYFT", "AMERICAN", "DELTA", "AIRLINES", "MARRIOTT", "HYATT", "HOTEL"
    ],
    "Health/Wellness": [
        "CVS", "WALGREENS", "PHARMACY", "FITNESS", "GYM"
    ],
}

In [4]:
def categorize_transaction(description):
    """
    Categorizes a transaction based on keywords in its description.
    
    Args:
        description (str): The raw transaction description.
        
    Returns:
        str: The assigned category.
    """
    if not isinstance(description, str):
        return "Miscellaneous"

    # Clean and uppercase the description for easier matching
    # 1. Remove non-alphabetic characters (like #, 1234, etc.)
    # 2. Convert to uppercase
    # 3. Split into individual words (tokens)
    cleaned_desc = re.sub(r'[^a-zA-Z\s]', '', description).upper()
    tokens = set(cleaned_desc.split()) # Use a set for fast lookups
    
    # Iterate through our keyword mapping
    for category, keywords in CATEGORY_KEYWORDS.items():
        # Check for intersection between keywords and tokens
        if any(keyword in tokens for keyword in keywords):
            return category
            
    # If no match is found, assign a default category
    return "Miscellaneous"

print("Categorizer function defined.")

Categorizer function defined.


In [5]:
# --- Test the Function ---

print("--- Testing Categorizer ---")
# Let's create a list of example transactions to test
test_transactions = [
    "WALMART SUPERCENTER #1234",
    "SHELL 05/21 8765",
    "STARBUCKS 456",
    "AMZ*Prime Subscription",
    "VERIZON WIRELESS BILL",
    "UBER TRIP 05/22",
    "Random Corner Cafe",
    "SOME BIZ 001",
    "CVS PHARMACY #9876",
    "LOWE'S"
]

# Create a DataFrame to show the results
test_df = pd.DataFrame(test_transactions, columns=['Description'])
test_df['Category'] = test_df['Description'].apply(categorize_transaction)

print(test_df)

--- Testing Categorizer ---
                 Description            Category
0  WALMART SUPERCENTER #1234           Groceries
1           SHELL 05/21 8765      Gas/Automotive
2              STARBUCKS 456  Restaurants/Dining
3     AMZ*Prime Subscription       Miscellaneous
4      VERIZON WIRELESS BILL           Utilities
5            UBER TRIP 05/22    Travel/Transport
6         Random Corner Cafe  Restaurants/Dining
7               SOME BIZ 001       Miscellaneous
8         CVS PHARMACY #9876     Health/Wellness
9                     LOWE'S       Miscellaneous
