In [1]:
 #!pip3 install langchain_community

In [2]:
from langchain_community.llms import Ollama

In [3]:
# Create the Llama 2 client via Ollama and send a quick sanity-check prompt.
# NOTE(review): assumes an Ollama server with the "llama2" model is reachable — confirm before running.
llm = Ollama(model="llama2")
llm.invoke("The first man on the moon was ...")

  llm = Ollama(model="llama2")


'\nThe first man on the moon was Neil Armstrong, who landed on the moon on July 20, 1969 as part of the Apollo 11 mission. Armstrong stepped out of the lunar module Eagle and onto the moon\'s surface at 2:56 UTC on July 20, 1969, famously declaring "That\'s one small step for man, one giant leap for mankind" as he took his first steps. He was followed by fellow astronaut Edwin "Buzz" Aldrin, who also walked on the moon during the mission.'

In [4]:
# Ad-hoc prompt: ask the model to label four expenses, giving two worked examples.
# The trailing backslashes continue the string literal, so the prompt is sent as one line.
llm.invoke("Can you add an appropriate category next to each of the following expenses. Respond with a list of categories separated by commas. For example, Spotify AB by Adyen - \
Entertainment, Beta Boulders Ams Amsterdam Nld - Sports, etc.: \
Taxi Utrecht, Ministerie van Justitie en Veiligheid, Etos AMSTERDAM NLD, Bistro Bar Amsterdam")

'\nCertainly! Here are the expenses you provided, categorized appropriately:\n\nTaxi Utrecht - Transportation\nMinisterie van Justitie en Veiligheid - Government\nEtos AMSTERDAM NLD - Groceries\nBistro Bar Amsterdam - Food and Beverage'

### Read transaction data

In [None]:
# Read the transactions CSV file (transactions.csv here; see the inline note for the alternative file)
import pandas as pd
import re
df = pd.read_csv("transactions.csv") # Or transactions_2022_2023.csv
df.head()

Unnamed: 0,Date,Name / Description,Expense/Income,Amount (EUR)
0,2023-12-30,Belastingdienst,Expense,9.96
1,2023-12-30,Tesco Breda,Expense,17.53
2,2023-12-30,Monthly Appartment Rent,Expense,451.0
3,2023-12-30,Vishandel Sier Amsterdam,Expense,12.46
4,2023-12-29,Selling Paintings,Income,13.63


In [6]:
# Get unique transactions in the Name / Description column.
# These distinct names (not every transaction row) are what gets sent to the LLM further down.
unique_transactions = df["Name / Description"].unique()
len(unique_transactions)

23

In [7]:
unique_transactions[1:10]

array(['Tesco Breda', 'Monthly Appartment Rent',
       'Vishandel Sier Amsterdam', 'Selling Paintings',
       'Spotify Ab By Adyen', 'Tk Maxx Amsterdam Da', 'Consulting',
       'Aidsfonds', 'Tls Bv Inz Ov-Chipkaart'], dtype=object)

### Categorise bank transactions with Llama2

In [8]:
# Get index list
##https://stackoverflow.com/questions/47518609/for-loop-range-and-interval-how-to-include-last-step
def hop(start, stop, step):
    """Yield the values of ``range(start, stop, step)`` followed by ``stop`` itself.

    Unlike a plain ``range``, the end point is always produced as the final
    value, so consecutive pairs of the output describe slices that together
    cover ``start`` up to ``stop``.
    """
    yield from range(start, stop, step)
    yield stop

# Batch boundaries into unique_transactions: with a step of 30, each pair of
# consecutive entries delimits a slice of at most 30 names for one LLM call.
index_list = list(hop(0, len(unique_transactions), 30))
index_list

[0, 23]

In [9]:
def categorize_transactions(transaction_names, llm):
    """Ask the LLM to label each transaction and parse its reply into a DataFrame.

    Parameters
    ----------
    transaction_names : str
        Transaction names already joined into a single string; appended
        verbatim to the prompt.
    llm : object
        Any object exposing ``invoke(prompt)`` that returns the reply as a string.

    Returns
    -------
    pandas.DataFrame
        One row per line of the reply, with columns 'Transaction vs category'
        (the raw line), 'Transaction' and 'Category'. Lines without a ' - '
        separator (intro text, blank lines) keep the whole line in
        'Transaction' and have a missing 'Category'.
    """
    response = llm.invoke("Can you add an appropriate category to the following expenses. For example: Spotify AB by Adyen - Entertainment, Beta Boulders Ams Amsterdam Nld - Sport, etc.. Categories should be less than 4 words. " + transaction_names)
    response = response.split('\n')

    # Echo the raw reply lines so unexpected formatting is visible in the notebook
    print(response)

    # Put in dataframe
    categories_df = pd.DataFrame({'Transaction vs category': response})

    # Split on the LAST ' - ' only: the category is the trailing part, while a
    # transaction name may itself contain ' - '. An unbounded split produces a
    # variable number of columns and makes the two-column assignment fail.
    # reindex guarantees exactly two columns even when no line has a separator.
    parts = (
        categories_df['Transaction vs category']
        .str.rsplit(' - ', n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    categories_df['Transaction'] = parts[0]
    categories_df['Category'] = parts[1]

    return categories_df

In [10]:
# Test out the function
categorize_transactions('ISS Catering Services De Meern, Taxi Utrecht, Etos AMSTERDAM NLD, Bistro Bar Amsterdam',
                        llm)

['', 'Of course! Here are the appropriate categories for each expense:', '', '1. Spotify AB by Adyen - Entertainment', '2. Beta Boulders Ams Amsterdam Nld - Sport', '3. ISS Catering Services De Meern - Food and Beverage', '4. Taxi Utrecht - Transportation', '5. Etos AMSTERDAM NLD - Grocery', '6. Bistro Bar Amsterdam - Food and Beverage']


Unnamed: 0,Transaction vs category,Transaction,Category
0,,,
1,Of course! Here are the appropriate categories...,Of course! Here are the appropriate categories...,
2,,,
3,1. Spotify AB by Adyen - Entertainment,1. Spotify AB by Adyen,Entertainment
4,2. Beta Boulders Ams Amsterdam Nld - Sport,2. Beta Boulders Ams Amsterdam Nld,Sport
5,3. ISS Catering Services De Meern - Food and B...,3. ISS Catering Services De Meern,Food and Beverage
6,4. Taxi Utrecht - Transportation,4. Taxi Utrecht,Transportation
7,5. Etos AMSTERDAM NLD - Grocery,5. Etos AMSTERDAM NLD,Grocery
8,6. Bistro Bar Amsterdam - Food and Beverage,6. Bistro Bar Amsterdam,Food and Beverage


In [12]:
# Categorise the unique transaction names batch by batch and collect the
# per-batch results; they are combined once after the loop instead of
# re-copying a growing DataFrame on every iteration.
batch_frames = []

# Consecutive entries of index_list are the [start, stop) bounds of one batch
for batch_start, batch_stop in zip(index_list[:-1], index_list[1:]):
    transaction_names = unique_transactions[batch_start:batch_stop]
    transaction_names = ','.join(transaction_names)

    batch_frames.append(categorize_transactions(transaction_names, llm))

# pd.concat raises on an empty list, so fall back to an empty frame when
# there was nothing to categorise.
categories_df_all = (
    pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()
)

['Of course! Here are the categories I came up with for each expense:', '', '1. Belastingdienst - Taxes', '2. Tesco Breda - Groceries', '3. Monthly Appartment Rent - Housing', '4. Vishandel Sier Amsterdam - Food', '5. Selling Paintings - Art', '6. Spotify AB by Adyen - Entertainment', '7. Tk Maxx Amsterdam Da - Shopping', '8. Consulting - Professional Services', '9. Aidsfonds - Charity', '10. TLS BV Inz Ov-Chipkaart - Transportation', '11. Etos Amsterdam - Health and Wellness', '12. Beta Boulders Ams Amsterdam - Fitness', '13. Salary - Personal Expenses', '14. Bouldermuur BV Amsterdam - Business', '15. Birtat Restaurant Amsterdam - Dining Out', '16. Freelancing - Work-Related Expenses', '17. Tikkie - Online Services', '18. Blogging - Personal Interest', '19. Taxi Utrecht - Transportation', '20. Apple Services - Tech and Gadgets', '21. Amazon Lux - Online Shopping', '22. Classpass* Monthly - Fitness', '', 'Note that some of these categories may overlap with each other, but I tried to ke

In [13]:
categories_df_all

Unnamed: 0,Transaction vs category,Transaction,Category
0,Of course! Here are the categories I came up w...,Of course! Here are the categories I came up w...,
1,,,
2,1. Belastingdienst - Taxes,1. Belastingdienst,Taxes
3,2. Tesco Breda - Groceries,2. Tesco Breda,Groceries
4,3. Monthly Appartment Rent - Housing,3. Monthly Appartment Rent,Housing
5,4. Vishandel Sier Amsterdam - Food,4. Vishandel Sier Amsterdam,Food
6,5. Selling Paintings - Art,5. Selling Paintings,Art
7,6. Spotify AB by Adyen - Entertainment,6. Spotify AB by Adyen,Entertainment
8,7. Tk Maxx Amsterdam Da - Shopping,7. Tk Maxx Amsterdam Da,Shopping
9,8. Consulting - Professional Services,8. Consulting,Professional Services


In [14]:
# Persist the parsed LLM output as-is (intro and blank rows included);
# the merge step further down re-reads this file.
categories_df_all.to_csv("categories_df_all.csv", index=False)

In [15]:
# Get unique categories in categories_df_all
unique_categories = categories_df_all["Category"].unique()
unique_categories

array([None, 'Taxes', 'Groceries', 'Housing', 'Food', 'Art',
       'Entertainment', 'Shopping', 'Professional Services', 'Charity',
       'Transportation', 'Health and Wellness', 'Fitness',
       'Personal Expenses', 'Business', 'Dining Out',
       'Work-Related Expenses', 'Online Services', 'Personal Interest',
       'Tech and Gadgets', 'Online Shopping'], dtype=object)

In [16]:
# Discard rows that have a missing value in any column
categories_df_all = categories_df_all.dropna()

# Consolidate similar categories. Each pair is (pattern searched for in the
# current category, label written to every matching row); the rules run in
# the order listed.
category_rules = [
    ('Food', "Food and Drinks"),
    ('Clothing', "Clothing"),
    ('Services', "Services"),
    ('Health|Wellness', "Health and Wellness"),
    ('Sport', "Sport and Fitness"),
    ('Travel', "Travel"),
]
for pattern, label in category_rules:
    matches = categories_df_all['Category'].str.contains(pattern)
    categories_df_all.loc[matches, 'Category'] = label


In [23]:

# Clean the 'Name / Description' column in the transactions DataFrame:
# cast to str, trim surrounding whitespace and lower-case it so it can be
# matched against the lower-cased 'Transaction' key of the categories table.
# Note: this overwrites the column in df in place.
df['Name / Description'] = df['Name / Description'].astype(str).str.strip().str.lower()
# Collapse every description containing "spotify" to one canonical name
df.loc[df['Name / Description'].str.contains("spotify"), 'Name / Description'] = "spotify ab by adyen"


# 2. Load and preprocess categories_df_all.csv
# Load the categories CSV file
# NOTE(review): this file appears to be written before the in-memory category
# consolidation ("Food and Drinks", "Services", ...), so those merged labels
# are not present here — confirm that this is intended.
categories_df_raw = pd.read_csv("categories_df_all.csv")

# Filter rows that actually contain category data.
# Keep a row when its 'Transaction' starts with a number and a period,
# OR when it has a 'Category' AND its 'Transaction' is not the introductory text.
# The three masks are named so the OR/AND grouping is explicit.
transaction_text = categories_df_raw['Transaction'].astype(str)
is_numbered = transaction_text.str.match(r'^\d+\.')
has_category = ~categories_df_raw['Category'].isna()
is_intro = transaction_text.str.contains("Certainly!")
categories_df_cleaned = categories_df_raw[
    is_numbered | (has_category & ~is_intro)
].copy() # Avoid SettingWithCopyWarning

# Clean the 'Transaction' column to remove numbering (if still present)
categories_df_cleaned['Transaction'] = categories_df_cleaned['Transaction'].astype(str).str.replace(r'^\d+\.\s*', '', regex=True)
categories_df_cleaned['Transaction vs category'] = categories_df_cleaned['Transaction vs category'].astype(str).str.replace(r'^\d+\.\s*', '', regex=True)

# Clean spaces and convert to lowercase for both merge columns
categories_df_cleaned['Transaction'] = categories_df_cleaned['Transaction'].astype(str).str.strip().str.lower()
# astype(str) would turn a missing category into the literal text 'nan', which
# hides uncategorised rows from later isna() checks; keep missing values missing.
category_raw = categories_df_cleaned['Category']
categories_df_cleaned['Category'] = (
    category_raw.astype(str).str.strip().str.lower().where(category_raw.notna())
)
categories_df_cleaned['Transaction vs category'] = categories_df_cleaned['Transaction vs category'].astype(str).str.strip()


# Ensure only necessary columns are present and have consistent names
# ('Transaction' is the merge key used against the transactions table)
categories_df_final = categories_df_cleaned[['Transaction vs category', 'Transaction', 'Category']].copy()

print("First rows of categories_df_final after cleaning:")
print(categories_df_final.head(10))
print("\nUnique values in categories_df_final['Transaction'] after cleaning:")
print(categories_df_final['Transaction'].unique())


# 3. Perform the merge
# We use a left merge to keep all original transactions.
# The lookup must hold one row per 'Transaction' key: a repeated key (for
# example the same name labelled in more than one LLM reply) would duplicate
# the matching transaction rows and inflate the amounts. Keep the first label
# seen for each key.
category_lookup = categories_df_final.drop_duplicates(subset='Transaction', keep='first')

df_merged = pd.merge(df, category_lookup,
                     left_on='Name / Description',
                     right_on='Transaction',
                     how='left',
                     validate='many_to_one')  # raises MergeError if lookup keys are not unique

# If the 'Transaction' column from the merge is not needed, you can drop it.
# As it's the same as 'Name / Description' if there was a match.
# COMMENT OUT OR REMOVE THIS LINE to keep the 'Transaction' column
# df_merged = df_merged.drop(columns=['Transaction'])


print("\nFinal DataFrame after merge:")
print(df_merged.head(20))

print("\nTransactions with NaN in the category after merge (that could not be categorized):")
print(df_merged[df_merged['Category'].isna()]['Name / Description'].unique())

First rows of categories_df_final after cleaning:
                     Transaction vs category               Transaction  \
2                    Belastingdienst - Taxes           belastingdienst   
3                    Tesco Breda - Groceries               tesco breda   
4          Monthly Appartment Rent - Housing   monthly appartment rent   
5            Vishandel Sier Amsterdam - Food  vishandel sier amsterdam   
6                    Selling Paintings - Art         selling paintings   
7        Spotify AB by Adyen - Entertainment       spotify ab by adyen   
8            Tk Maxx Amsterdam Da - Shopping      tk maxx amsterdam da   
9         Consulting - Professional Services                consulting   
10                       Aidsfonds - Charity                 aidsfonds   
11  TLS BV Inz Ov-Chipkaart - Transportation   tls bv inz ov-chipkaart   

                 Category  
2                   taxes  
3               groceries  
4                 housing  
5                    fo

In [24]:
# Display the merged transactions (df_merged is built in the previous cell).
# The commented-out lines below appear to be an earlier merge against the
# in-memory categories_df_all; they are not executed.
#df = pd.read_csv("transactions_2022_2023.csv")
#df.loc[df['Name / Description'].str.contains("Spotify"), 'Name / Description'] = "Spotify Ab By Adyen"
#df = pd.merge(df, categories_df_all, left_on='Name / Description', right_on='Transaction', how='left')
df_merged


Unnamed: 0,Date,Name / Description,Expense/Income,Amount (EUR),Transaction vs category,Transaction,Category
0,2023-12-30,belastingdienst,Expense,9.96,Belastingdienst - Taxes,belastingdienst,taxes
1,2023-12-30,tesco breda,Expense,17.53,Tesco Breda - Groceries,tesco breda,groceries
2,2023-12-30,monthly appartment rent,Expense,451.0,Monthly Appartment Rent - Housing,monthly appartment rent,housing
3,2023-12-30,vishandel sier amsterdam,Expense,12.46,Vishandel Sier Amsterdam - Food,vishandel sier amsterdam,food
4,2023-12-29,selling paintings,Income,13.63,Selling Paintings - Art,selling paintings,art
5,2023-12-29,spotify ab by adyen,Expense,12.19,Spotify AB by Adyen - Entertainment,spotify ab by adyen,entertainment
6,2023-12-23,tk maxx amsterdam da,Expense,27.08,Tk Maxx Amsterdam Da - Shopping,tk maxx amsterdam da,shopping
7,2023-12-22,consulting,Income,541.57,Consulting - Professional Services,consulting,professional services
8,2023-12-22,aidsfonds,Expense,10.7,Aidsfonds - Charity,aidsfonds,charity
9,2023-12-20,consulting,Income,2641.93,Consulting - Professional Services,consulting,professional services


In [25]:
df_merged.to_csv("transactions_2022_2023_categorized.csv", index=False)