In [1]:
# Re-import edited project modules automatically before each cell runs, so
# changes made in .py files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import sqlite3
from contextlib import closing
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
# Load settings from a local .env file into the process environment
# (presumably the credentials used by the LLM client below — TODO confirm).
load_dotenv()

True

In [3]:
def ingest_csv(file_path, llm, db_path: str = "money_rag.db") -> None:
    """Normalize one bank/credit-card CSV export and append it to SQLite.

    The LLM is shown the file name, the CSV headers and the first 10 rows and
    is asked to return a JSON mapping of the CSV columns to the standard
    fields plus the file's sign convention. The rows are then rewritten into
    a common schema (``transaction_date``, ``description``, ``amount``,
    ``category``, ``source_file``) in which spending is POSITIVE and
    payments/credits are NEGATIVE, and appended to the ``transactions`` table.

    Args:
        file_path: Path to the CSV file (``str`` or ``os.PathLike``).
        llm: Chat model (LangChain runnable) used to infer the column mapping.
        db_path: SQLite database file to append to.

    Raises:
        KeyError: If the mapping returned by the LLM lacks a required key or
            names a column that does not exist in the CSV.
        ValueError: If the returned ``sign_convention`` is not one of the two
            supported values. Nothing is written to the database in that case.

    Note:
        Rows are appended (``if_exists="append"``), so ingesting the same file
        twice stores its transactions twice.
    """
    df = pd.read_csv(file_path)
    headers = df.columns.tolist()
    # Taking 10 rows gives the LLM a better chance to see both a purchase and a payment
    sample_data = df.head(10).to_json()

    prompt = ChatPromptTemplate.from_template("""
    Act as a financial data parser. Analyze this CSV data:
    Filename: {filename}
    Headers: {headers}
    Sample Data: {sample}

    TASK:
    1. Map the CSV columns to standard fields: date, description, amount, and category.
    2. Determine the 'sign_convention' by looking at the filename and the data:
       
       RULES:
       - If the filename suggests 'Discover' credit card, spending are usually POSITIVE.
       - If the filename suggests 'Chase' credit card, spending are usually NEGATIVE.
                                              
       - Analyze the 'sign_convention' for spending (outflows):
          - Look at the sample data for known merchants or spending patterns.
          - If spending (like a restaurant or store) is NEGATIVE (e.g., -25.00), the convention is 'spending_is_negative'.
          - If spending is POSITIVE (e.g., 25.00), the convention is 'spending_is_positive'.
                                              

    OUTPUT FORMAT (JSON ONLY):
    {{
      "date_col": "column_name",
      "desc_col": "column_name",
      "amount_col": "column_name",
      "category_col": "column_name or null",
      "sign_convention": "spending_is_negative" | "spending_is_positive"
    }}
    """)

    chain = prompt | llm | JsonOutputParser()
    mapping = chain.invoke({"headers": headers, "sample": sample_data, "filename": str(file_path)})

    # Validate the convention up front: an unexpected value must not be
    # silently treated as "keep the sign" and written to the database.
    sign_convention = mapping['sign_convention']
    if sign_convention not in ("spending_is_negative", "spending_is_positive"):
        raise ValueError(
            f"Unsupported sign_convention {sign_convention!r} returned for {file_path}; "
            "expected 'spending_is_negative' or 'spending_is_positive'."
        )

    # The prompt asks for "column_name or null", so the model may answer with
    # the *string* "null"/"none" instead of JSON null. Treat that as "no
    # category column" unless the CSV really has a column with that name.
    category_col = mapping.get('category_col')
    if (
        isinstance(category_col, str)
        and category_col.strip().lower() in ("", "null", "none")
        and category_col not in df.columns
    ):
        category_col = None

    # Standardizing Data
    standard_df = pd.DataFrame()
    standard_df['transaction_date'] = pd.to_datetime(df[mapping['date_col']])
    standard_df['description'] = df[mapping['desc_col']]

    # --- The Normalization Logic ---
    # Goal: All spending (outflow) = POSITIVE, All payments (inflow) = NEGATIVE
    raw_amounts = pd.to_numeric(df[mapping['amount_col']])

    if sign_convention == "spending_is_negative":
        # If the bank shows spending as -100 and payments as +100,
        # we flip everything so spending is +100 and payments are -100.
        standard_df['amount'] = raw_amounts * -1
    else:
        # If the bank already shows spending as +100 and payments as -100, keep it.
        standard_df['amount'] = raw_amounts

    standard_df['category'] = df[category_col] if category_col else 'Uncategorized'
    # Path(...).name works for str and PathLike inputs and for the platform's
    # native separators, unlike splitting on "/".
    standard_df['source_file'] = Path(file_path).name

    # Save to DB; closing() guarantees the connection is released even when
    # the insert fails (sqlite3's own context manager does not close it).
    with closing(sqlite3.connect(db_path)) as conn:
        standard_df.to_sql("transactions", conn, if_exists="append", index=False)

    print(f"✅ Ingested {file_path}. Logic: {mapping['sign_convention']}")

In [4]:
# Machine-specific settings can be overridden through environment variables
# (for example in the .env file); the defaults are the original values, so the
# cell behaves as before when nothing is set.
DATA_DIR = Path(
    os.environ.get(
        "MONEY_RAG_DATA_DIR",
        "/Users/sawale/Documents/learning/money_rag/demo_data",
    )
)
# Kept as plain strings so ingest_csv receives the same kind of argument as before.
path1 = str(DATA_DIR / "Discover-AllAvailable-20260110.csv")
path2 = str(DATA_DIR / "Chase5282_Activity20240110_20260110_20260111.CSV")

# Initialize the Gemini model via Vertex AI
llm = init_chat_model(
    "gemini-2.5-flash",
    model_provider="google_vertexai",
    project=os.environ.get("GOOGLE_CLOUD_PROJECT", 'gen-lang-client-0311515393'),
    location=os.environ.get("GOOGLE_CLOUD_LOCATION", 'us-central1'),
)

# NOTE: ingest_csv appends to the database, so re-running this cell stores the
# same statements again.
for statement_path in (path1, path2):
    ingest_csv(statement_path, llm)


  from google.cloud.aiplatform.utils import gcs_utils


✅ Ingested /Users/sawale/Documents/learning/money_rag/demo_data/Discover-AllAvailable-20260110.csv. Logic: spending_is_positive
✅ Ingested /Users/sawale/Documents/learning/money_rag/demo_data/Chase5282_Activity20240110_20260110_20260111.CSV. Logic: spending_is_negative


In [1]:
import sqlite3
import pandas as pd

# Load every ingested transaction into a DataFrame for inspection.
conn = sqlite3.connect("money_rag.db")
try:
    df_view = pd.read_sql_query("SELECT * FROM transactions", conn)
finally:
    # Release the connection even if the query fails
    # (e.g. the transactions table has not been created yet).
    conn.close()

# Display the data
df_view

Unnamed: 0,transaction_date,description,amount,category,source_file
0,2024-10-17 00:00:00,BACK MARKET BROOKLYN NY,231.19,Merchandise,Discover-AllAvailable-20260110.csv
1,2024-10-18 00:00:00,TEMU.COM 8884958368 DE,16.51,Merchandise,Discover-AllAvailable-20260110.csv
2,2024-10-18 00:00:00,WALMART STORE 00332 HUNTSVILLE AL,146.73,Merchandise,Discover-AllAvailable-20260110.csv
3,2024-10-18 00:00:00,$100 STATEMENT CREDIT W 1ST PU,-100.00,Awards and Rebate Credits,Discover-AllAvailable-20260110.csv
4,2024-11-02 00:00:00,PY *KUNG-FU TEA AL HUNTSVILLE AL,8.09,Restaurants,Discover-AllAvailable-20260110.csv
...,...,...,...,...,...
245,2025-06-18 00:00:00,PANDA EXPRESS #2005,52.87,Food & Drink,Chase5282_Activity20240110_20260110_20260111.CSV
246,2025-06-14 00:00:00,Payment Thank You-Mobile,-62.07,,Chase5282_Activity20240110_20260110_20260111.CSV
247,2025-06-12 00:00:00,STARS AND STRIKES - HUNTS,21.80,Entertainment,Chase5282_Activity20240110_20260110_20260111.CSV
248,2025-06-11 00:00:00,WAL-MART #332,4.47,Groceries,Chase5282_Activity20240110_20260110_20260111.CSV


In [2]:
# Sum of the `amount` column over every row loaded into df_view.
amount_total = df_view.loc[:, "amount"].sum()
amount_total

np.float64(57.089999999999804)