In [1]:
from langchain_community.llms import Ollama

In [2]:
llm = Ollama(model="llama2")

  llm = Ollama(model="llama2")


### Read transaction data

In [3]:
# Read the transactions_debit_card_2024.csv file
import pandas as pd
pd.set_option('display.max_rows', None)
df = pd.read_csv("../data/transactions_2024_no_categorized.csv")
df

Unnamed: 0,Date,Name / Description,Expense/Income,Amount (Argentinian Peso)
0,19/01/24,TRANSF. CLIENTE,Income,2000.0
1,19/01/24,TRANSFERENCIA INMEDIATA,Income,980.0
2,22/01/24,IMP PAIS SD,Expense,130.49
3,22/01/24,APPLE.COM/BILL,Expense,1731.3
4,22/01/24,RG 4815/20,Expense,489.36
5,23/01/24,IVA SERV DIGIT-RG AFIP 4240,Expense,363.57
6,28/02/24,TRANSF. CLIENTE,Income,30000.0
7,29/02/24,OPENPAY*VIDA POINT,Expense,2900.0
8,29/02/24,DIA TIENDA 268,Expense,3518.75
9,29/02/24,FEI LI,Expense,3000.0


In [4]:
# Get unique transactions in the Name / Description column
unique_transactions = df["Name / Description"].unique()
len(unique_transactions)

28

In [5]:
unique_transactions[1:10]

array(['TRANSFERENCIA INMEDIATA', 'IMP PAIS SD', 'APPLE.COM/BILL',
       'RG 4815/20', 'IVA SERV DIGIT-RG AFIP 4240', 'OPENPAY*VIDA POINT',
       'DIA TIENDA 268', 'FEI LI', 'MERPAGO*HAKUNA'], dtype=object)

### Categorise bank transactions with Llama2

In [6]:
# Get index list
#https://stackoverflow.com/questions/47518609/for-loop-range-and-interval-how-to-include-last-step
def hop(start, stop, step):
    for i in range(start, stop, step):
        yield i
    yield stop

index_list = list(hop(0, len(unique_transactions), 30))
index_list

[0, 28]

In [7]:
def categorize_transactions(transaction_names, llm):
    response = llm.invoke("Can you add an appropriate category to the following expenses. For example: TRANSF. CLIENTE - Bank Transfer , DIA TIENDA 268 - Groceries, etc.. Categories should be less than 4 words. " + transaction_names)
    response = response.split('\n')

    print(response)

    # Put in dataframe
    categories_df = pd.DataFrame({'Transaction vs category': response})
    categories_df[['Transaction', 'Category']] = categories_df['Transaction vs category'].str.split(' - ', expand=True)
    
    return categories_df

In [8]:
# Test out the function
categorize_transactions('TRANSF. CLIENTE, APPLE.COM/BILL,IVA SERV DIGIT-RG AFIP 4240,DIA TIENDA 268',
                        llm)

['Of course! Here are the appropriate categories for each expense you provided:', '', '1. TRANSF. CLIENTE - Client Transfer', '2. APPLE.COM/BILL - Apple Subscription (Bill)', '3. IVA SERV DIGIT-RG AFIP 4240 - Italian VAT Service (Digital Receipt)', '4. DIA TIENDA 268 - Groceries']


Unnamed: 0,Transaction vs category,Transaction,Category
0,Of course! Here are the appropriate categories...,Of course! Here are the appropriate categories...,
1,,,
2,1. TRANSF. CLIENTE - Client Transfer,1. TRANSF. CLIENTE,Client Transfer
3,2. APPLE.COM/BILL - Apple Subscription (Bill),2. APPLE.COM/BILL,Apple Subscription (Bill)
4,3. IVA SERV DIGIT-RG AFIP 4240 - Italian VAT S...,3. IVA SERV DIGIT-RG AFIP 4240,Italian VAT Service (Digital Receipt)
5,4. DIA TIENDA 268 - Groceries,4. DIA TIENDA 268,Groceries


In [9]:
# Intialise the categories_df_all dataframe
categories_df_all = pd.DataFrame()

# Loop through the index_list
for i in range(0, len(index_list)-1):
    transaction_names = unique_transactions[index_list[i]:index_list[i+1]]
    transaction_names = ','.join(transaction_names)

    categories_df = categorize_transactions(transaction_names, llm)
    categories_df_all = pd.concat([categories_df_all, categories_df], ignore_index=True)

['Here are the expenses with appropriate categories:', '', '1. TRANSF. CLIENTE - Bank Transfer', '2. TRANSFERENCIA INMEDIATA - Immediate Transaction', '3. IMP PAIS SD - Important Payment for Software Subscription', '4. APPLE.COM/BILL - Apple Bill', '5. RG 4815/20 - Utility Bills', '6. IVA SERV DIGIT-RG AFIP 4240 - Italian VAT Services', '7. OPENPAY*VIDA POINT - Openpay Point of Sale', '8. DIA TIENDA 268 - Groceries', '9. FEI LI - Fei Li', '10. MERPAGO*HAKUNA - Merpage Transaction', '11. INTERESES GANADOS - Investment Interests', '12. JULIO CESAR ESTRADA - Julio Cesar Estravia', '13. DIA TIENDA 679 - Groceries', '14. COTO SUCURSAL 163 - Local Store', '15. PVS*SUPER URUBURU JOSE E - Supermarket Jose E', '16. MERPAGO*JUANLIN - Merpage Transaction', "17. VERDULERIA PAOLA - Paola's Organic Market", '18. FARMACITY - Pharmacy', '19. PIGMENTO - Pigment', '20. TRANSF. CLIENTE TEJERINA MARIA - Tejerina Maria Transfer', '21. MERPAGO*LANATURALEZA1855 - Merpage Transaction', '22. Market Avenida Sta

In [10]:
categories_df_all

Unnamed: 0,Transaction vs category,Transaction,Category
0,Here are the expenses with appropriate categor...,Here are the expenses with appropriate categor...,
1,,,
2,1. TRANSF. CLIENTE - Bank Transfer,1. TRANSF. CLIENTE,Bank Transfer
3,2. TRANSFERENCIA INMEDIATA - Immediate Transac...,2. TRANSFERENCIA INMEDIATA,Immediate Transaction
4,3. IMP PAIS SD - Important Payment for Softwar...,3. IMP PAIS SD,Important Payment for Software Subscription
5,4. APPLE.COM/BILL - Apple Bill,4. APPLE.COM/BILL,Apple Bill
6,5. RG 4815/20 - Utility Bills,5. RG 4815/20,Utility Bills
7,6. IVA SERV DIGIT-RG AFIP 4240 - Italian VAT S...,6. IVA SERV DIGIT-RG AFIP 4240,Italian VAT Services
8,7. OPENPAY*VIDA POINT - Openpay Point of Sale,7. OPENPAY*VIDA POINT,Openpay Point of Sale
9,8. DIA TIENDA 268 - Groceries,8. DIA TIENDA 268,Groceries


In [11]:
categories_df_all.to_csv("categories_df_all.csv", index=False)

In [12]:
# Get unique categories in categories_df_all
unique_categories = categories_df_all["Category"].unique()
unique_categories

array([None, 'Bank Transfer', 'Immediate Transaction',
       'Important Payment for Software Subscription', 'Apple Bill',
       'Utility Bills', 'Italian VAT Services', 'Openpay Point of Sale',
       'Groceries', 'Fei Li', 'Merpage Transaction',
       'Investment Interests', 'Julio Cesar Estravia', 'Local Store',
       'Supermarket Jose E', "Paola's Organic Market", 'Pharmacy',
       'Pigment', 'Tejerina Maria Transfer', 'Local Market', 'New Garden',
       'La Finca', 'Transfer', 'Complete Your First Mortgage',
       'Any Mortgage Investment'], dtype=object)

In [13]:
# Drop NA values
categories_df_all = categories_df_all.dropna()

# First split the columns
categories_df_all[['Transaction', 'Category']] = categories_df_all['Transaction vs category'].str.split(' - ', expand=True)

# Clean up the Transaction column
categories_df_all['Transaction'] = categories_df_all['Transaction'].str.strip() 
categories_df_all['Transaction'] = categories_df_all['Transaction'].str.replace(r'^\d+\.\s*', '', regex=True)

# Standardize basic categories
categories_df_all.loc[categories_df_all['Category'].str.contains("Transfer|BANKING", case=False), 'Category'] = "Banking"
categories_df_all.loc[categories_df_all['Category'].str.contains("Groceries|Supermarket", case=False), 'Category'] = "Groceries"
categories_df_all.loc[categories_df_all['Category'].str.contains("Insurance", case=False), 'Category'] = "Insurance"
categories_df_all.loc[categories_df_all['Category'].str.contains("Merchant|Services", case=False), 'Category'] = "Services"
categories_df_all.loc[categories_df_all['Category'].str.contains("Rent|Mortgage", case=False), 'Category'] = "Housing"
categories_df_all.loc[categories_df_all['Category'].str.contains("Utilities", case=False), 'Category'] = "Utilities"
categories_df_all.loc[categories_df_all['Category'].str.contains("Pharmacy", case=False), 'Category'] = "Healthcare"
categories_df_all.loc[categories_df_all['Category'].str.contains("Home Improvement", case=False), 'Category'] = "Home Improvement"
categories_df_all.loc[categories_df_all['Category'].str.contains("Online Shopping|Shopping", case=False), 'Category'] = "Shopping"
categories_df_all.loc[categories_df_all['Category'].str.contains("Taxes", case=False), 'Category'] = "Taxes"
categories_df_all.loc[categories_df_all['Category'].str.contains("Miscellaneous", case=False), 'Category'] = "Miscellaneous"

categories_df_all

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categories_df_all[['Transaction', 'Category']] = categories_df_all['Transaction vs category'].str.split(' - ', expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categories_df_all['Transaction'] = categories_df_all['Transaction'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c

Unnamed: 0,Transaction vs category,Transaction,Category
2,1. TRANSF. CLIENTE - Bank Transfer,TRANSF. CLIENTE,Banking
3,2. TRANSFERENCIA INMEDIATA - Immediate Transac...,TRANSFERENCIA INMEDIATA,Immediate Transaction
4,3. IMP PAIS SD - Important Payment for Softwar...,IMP PAIS SD,Important Payment for Software Subscription
5,4. APPLE.COM/BILL - Apple Bill,APPLE.COM/BILL,Apple Bill
6,5. RG 4815/20 - Utility Bills,RG 4815/20,Utility Bills
7,6. IVA SERV DIGIT-RG AFIP 4240 - Italian VAT S...,IVA SERV DIGIT-RG AFIP 4240,Services
8,7. OPENPAY*VIDA POINT - Openpay Point of Sale,OPENPAY*VIDA POINT,Openpay Point of Sale
9,8. DIA TIENDA 268 - Groceries,DIA TIENDA 268,Groceries
10,9. FEI LI - Fei Li,FEI LI,Fei Li
11,10. MERPAGO*HAKUNA - Merpage Transaction,MERPAGO*HAKUNA,Merpage Transaction


In [14]:
# Merge the categories_df_all with the transactions dataframe
df = pd.read_csv("../data/transactions_2024_no_categorized.csv")
df = pd.merge(df, categories_df_all, left_on='Name / Description', right_on='Transaction', how='left')

# Drop unnecessary columns after merge
df = df.drop(['Transaction vs category', 'Transaction'], axis=1)

# Standardize categories for NaN values based on transaction descriptions
mask = df['Category'].isna()
df.loc[mask & df['Name / Description'].str.contains('TRANSF|BANKING', case=False, na=False), 'Category'] = 'Banking'
df.loc[mask & df['Name / Description'].str.contains('SUPER|MERCADO|VERDULERIA|COTO|DIA|CARREFOUR', case=False, na=False), 'Category'] = 'Groceries'
df.loc[mask & df['Name / Description'].str.contains('SEGURO', case=False, na=False), 'Category'] = 'Insurance'
df.loc[mask & df['Name / Description'].str.contains('SERVICIOS|MERCHANT', case=False, na=False), 'Category'] = 'Services'
df.loc[mask & df['Name / Description'].str.contains('ALQUILER|RENTA', case=False, na=False), 'Category'] = 'Housing'
df.loc[mask & df['Name / Description'].str.contains('LUZ|GAS|AGUA|SERVICIOS', case=False, na=False), 'Category'] = 'Utilities'
df.loc[mask & df['Name / Description'].str.contains('FARMACIA|SALUD', case=False, na=False), 'Category'] = 'Healthcare'
df.loc[mask & df['Name / Description'].str.contains('EASY|SODIMAC', case=False, na=False), 'Category'] = 'Home Improvement'
df.loc[mask & df['Name / Description'].str.contains('SHOPPING|COMPRA|MERCADOLIBRE|AMAZON', case=False, na=False), 'Category'] = 'Shopping'
df.loc[mask & df['Name / Description'].str.contains('IMPUESTO|IMP|AFIP|ARBA', case=False, na=False), 'Category'] = 'Taxes'

# Fill remaining NaN with Miscellaneous
df['Category'] = df['Category'].fillna('Miscellaneous')
df

Unnamed: 0,Date,Name / Description,Expense/Income,Amount (Argentinian Peso),Category
0,19/01/24,TRANSF. CLIENTE,Income,2000.0,Banking
1,19/01/24,TRANSFERENCIA INMEDIATA,Income,980.0,Immediate Transaction
2,22/01/24,IMP PAIS SD,Expense,130.49,Important Payment for Software Subscription
3,22/01/24,APPLE.COM/BILL,Expense,1731.3,Apple Bill
4,22/01/24,RG 4815/20,Expense,489.36,Utility Bills
5,23/01/24,IVA SERV DIGIT-RG AFIP 4240,Expense,363.57,Services
6,28/02/24,TRANSF. CLIENTE,Income,30000.0,Banking
7,29/02/24,OPENPAY*VIDA POINT,Expense,2900.0,Openpay Point of Sale
8,29/02/24,DIA TIENDA 268,Expense,3518.75,Groceries
9,29/02/24,FEI LI,Expense,3000.0,Fei Li


In [15]:
df.to_csv("../data/transactions_2024_categorized.csv", index=False)