In [1]:
!pip3 install langchain_community

Collecting langchain_community
  Using cached langchain_community-0.3.29-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core<2.0.0,>=0.3.75 (from langchain_community)
  Using cached langchain_core-0.3.75-py3-none-any.whl.metadata (5.7 kB)
Collecting langchain<2.0.0,>=0.3.27 (from langchain_community)
  Using cached langchain-0.3.27-py3-none-any.whl.metadata (7.8 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain_community)
  Using cached sqlalchemy-2.0.43-cp313-cp313-macosx_11_0_arm64.whl.metadata (9.6 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain_community)
  Using cached aiohttp-3.12.15-cp313-cp313-macosx_11_0_arm64.whl.metadata (7.7 kB)
Collecting tenacity!=8.4.0,<10,>=8.1.0 (from langchain_community)
  Using cached tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting dataclasses-json<0.7,>=0.6.7 (from langchain_community)
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.10.1 (from langchain_communi

In [2]:
from langchain_community.llms import Ollama

In [3]:
# Create the client for the "llama2" model served through Ollama.
# NOTE(review): presumably requires a local Ollama server with the llama2 model pulled — confirm setup.
llm = Ollama(model="llama2")
# Smoke test: left as the last expression so the model's reply is rendered as the cell output.
llm.invoke("The first man on the moon was ...")

  llm = Ollama(model="llama2")


'\nThe first man on the moon was Neil Armstrong. He stepped foot on the lunar surface on July 20, 1969, during the Apollo 11 mission. Armstrong famously declared, "That\'s one small step for man, one giant leap for mankind," as he became the first person to walk on the moon.'

In [4]:
# Trial prompt: ask the model to label a handful of expenses, giving two worked
# "name - category" examples so it imitates that format in its reply.
llm.invoke("Can you add an appropriate category next to each of the following expenses. Respond with a list of categories separated by commas. For example, Spotify AB by Adyen - \
Entertainment, Beta Boulders Ams Amsterdam Nld - Sports, etc.: \
ISS Catering Services De Meern, Vishandel Sier AMSTELVEEN, Ministerie van Justitie en Veiligheid, Etos AMSTERDAM NLD, Bistro Bar Amsterdam")

'\nOf course! Here are the categories for each of the expenses you provided:\n\nSpotify AB by Adyen - Entertainment\nBeta Boulders Ams Amsterdam Nld - Sports\nISS Catering Services De Meern - Food and Beverage\nVishandel Sier AMSTELVEEN - Food and Beverage\nMinisterie van Justitie en Veiligheid - Government\nEtos AMSTERDAM NLD - Retail\nBistro Bar Amsterdam - Food and Beverage'

# Read Transaction Data

In [22]:
# Load the raw transactions from transactions_2023_2024.csv into a DataFrame
# and show it as the cell output.
import pandas as pd
df = pd.read_csv("transactions_2023_2024.csv")
df

Unnamed: 0,Date,Name / Description,Expense/Income,Amount (VND)
0,2023-01-05,Rent,Expense,6000000
1,2023-01-05,Electricity (EVN),Expense,500000
2,2023-01-05,Water,Expense,200000
3,2023-01-05,Internet (Viettel),Expense,250000
4,2023-01-05,Phone bill,Expense,150000
...,...,...,...,...
322,2024-12-01,Monthly Salary,Income,21765767
323,2024-12-20,Shopping (Shopee/Clothes),Expense,215527
324,2024-12-16,"Food & Drinks (restaurants, cafes)",Expense,204343
325,2024-12-13,Shopping (Shopee/Clothes),Expense,288558


In [9]:
# Distinct values of the "Name / Description" column, in order of first appearance,
# followed by how many there are.
unique_transactions = pd.unique(df["Name / Description"])
unique_transactions.size

18

In [18]:
# Preview entries 1 through 9 of the unique names (the entry at index 0 is not included in this slice).
unique_transactions[1:10]

array(['Electricity (EVN)', 'Water', 'Internet (Viettel)', 'Phone bill',
       'Spotify Premium', 'Netflix', 'Gym membership', 'English course',
       'Monthly Salary'], dtype=object)

# Categorise Bank Transactions With Llama 2

In [19]:
# Get index list
def hop(start, stop, step):
    """Yield batch boundaries from ``start`` up to and including ``stop``.

    Emits every value of ``range(start, stop, step)`` and then ``stop`` itself,
    so consecutive pairs of the yielded values can be used as slice bounds
    that together cover ``[start, stop)``.
    """
    yield from range(start, stop, step)
    # range() excludes its upper bound, so emit it explicitly as the last boundary.
    yield stop

# Slice boundaries for sending the unique names to the model in batches of at most 30;
# the last boundary is always len(unique_transactions).
index_list = list(hop(0, len(unique_transactions), 30))
index_list

[0, 18]

In [21]:
def categorize_transactions(transaction_names, llm):
    """Ask the LLM to categorise transactions and parse its reply into a DataFrame.

    Parameters
    ----------
    transaction_names : str
        Transaction names already joined into a single string; appended
        verbatim to the prompt.
    llm : object
        Any object exposing ``invoke(prompt)`` that returns the reply as a string.

    Returns
    -------
    pandas.DataFrame
        One row per line of the reply, with columns
        'Transaction vs category' (the raw line), 'Transaction' and 'Category'.
        Lines that contain no ' - ' separator (intro text, blank lines) keep
        the whole line in 'Transaction' and get a missing 'Category'.
    """
    response = llm.invoke("Can you add an appropriate category to the following expenses. For example: Rent - Housing , Netflix - Entertainment, etc.. Categories should be less than 4 words. " + transaction_names)
    response = response.split('\n')

    print(response)

    # Put in dataframe
    categories_df = pd.DataFrame({'Transaction vs category': response})

    # Split on the LAST ' - ' only: the category is the trailing part, while a
    # transaction name may itself contain ' - '. Limiting to one split caps the
    # result at two columns, and reindex guarantees both columns exist even
    # when no line contains the separator, so the assignment below cannot fail
    # with a column-count mismatch.
    parts = (
        categories_df['Transaction vs category']
        .str.rsplit(' - ', n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    categories_df['Transaction'] = parts[0]
    categories_df['Category'] = parts[1]

    return categories_df

In [23]:

# Try the function on a short, hand-written list of names to inspect the raw
# reply (printed by the function) and the parsed DataFrame (cell output).
categorize_transactions('Rent, Phone bill, Water, Monthly Salary',
                        llm)

['', 'Of course! Here are the categories for the expenses you provided:', '', '* Rent - Housing', '* Phone bill - Utilities', '* Water - Utilities', '* Monthly Salary - Income']


Unnamed: 0,Transaction vs category,Transaction,Category
0,,,
1,Of course! Here are the categories for the exp...,Of course! Here are the categories for the exp...,
2,,,
3,* Rent - Housing,* Rent,Housing
4,* Phone bill - Utilities,* Phone bill,Utilities
5,* Water - Utilities,* Water,Utilities
6,* Monthly Salary - Income,* Monthly Salary,Income


In [24]:
# Collect the per-batch results in a list and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all earlier rows on every iteration.
batch_frames = []

# Consecutive entries of index_list are the [start, stop) bounds of each batch.
for i in range(len(index_list) - 1):
    transaction_names = unique_transactions[index_list[i]:index_list[i+1]]
    transaction_names = ','.join(transaction_names)

    categories_df = categorize_transactions(transaction_names, llm)
    batch_frames.append(categories_df)

# pd.concat raises on an empty list, so fall back to an empty frame when there were no batches.
categories_df_all = (
    pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()
)


['Of course! Here are the appropriate categories for each of the expenses you listed:', '', '1. Rent - Housing', '2. Electricity (EVN) - Utilities', '3. Water - Utilities', '4. Internet (Viettel) - Technology', '5. Phone bill - Communication', '6. Spotify Premium - Entertainment', '7. Netflix - Entertainment', '8. Gym membership - Fitness', '9. English course - Education', '10. Monthly Salary - Income', '11. Freelance Project - Work', '12. Transportation (Grab, Gas) - Transportation', '13. Shopping (Shopee/Clothes) - Shopping', '14. Healthcare - Health', '15. Additional courses - Education', '16. Consulting - Work', '17. Skincare & Cosmetics - Beauty', '18. Food & Drinks (restaurants, cafes) - Food', '', 'I hope this helps! Let me know if you have any other questions.']


In [25]:
# Inspect the combined parse result; it still contains the model's filler lines, whose Category is missing.
categories_df_all

Unnamed: 0,Transaction vs category,Transaction,Category
0,Of course! Here are the appropriate categories...,Of course! Here are the appropriate categories...,
1,,,
2,1. Rent - Housing,1. Rent,Housing
3,2. Electricity (EVN) - Utilities,2. Electricity (EVN),Utilities
4,3. Water - Utilities,3. Water,Utilities
5,4. Internet (Viettel) - Technology,4. Internet (Viettel),Technology
6,5. Phone bill - Communication,5. Phone bill,Communication
7,6. Spotify Premium - Entertainment,6. Spotify Premium,Entertainment
8,7. Netflix - Entertainment,7. Netflix,Entertainment
9,8. Gym membership - Fitness,8. Gym membership,Fitness


In [26]:
# List the distinct categories the model produced. A missing value appears here
# for reply lines that had no ' - ' separator and therefore no Category.
unique_categories = categories_df_all["Category"].unique()
unique_categories

array([None, 'Housing', 'Utilities', 'Technology', 'Communication',
       'Entertainment', 'Fitness', 'Education', 'Income', 'Work',
       'Transportation', 'Shopping', 'Health', 'Beauty', 'Food'],
      dtype=object)

In [27]:
# Drop every row that has a missing value in any column. This removes the reply
# lines that carried no category (intro/outro text and blank lines).
categories_df_all = categories_df_all.dropna()

In [30]:
# Remove the list marker the model puts at the start of a line so the names
# match the raw data again: numbering such as "1. ", "12) ", " 003. " and
# bullets such as "* ", "- " or "• " (the model uses either style between runs).
# A bullet must be followed by whitespace so names that merely start with
# one of those characters are left alone.
pat = r'^\s*(?:\d+[\.\)]\s*|[\*\-\u2022]\s+)'

for col in ['Transaction', 'Transaction vs category']:
    categories_df_all.loc[:, col] = (
        categories_df_all[col].astype(str)
        .str.replace(pat, '', regex=True)
        .str.strip(' "\'')              # drop stray surrounding quotes and spaces
    )

categories_df_all

Unnamed: 0,Transaction vs category,Transaction,Category
2,Rent - Housing,Rent,Housing
3,Electricity (EVN) - Utilities,Electricity (EVN),Utilities
4,Water - Utilities,Water,Utilities
5,Internet (Viettel) - Technology,Internet (Viettel),Technology
6,Phone bill - Communication,Phone bill,Communication
7,Spotify Premium - Entertainment,Spotify Premium,Entertainment
8,Netflix - Entertainment,Netflix,Entertainment
9,Gym membership - Fitness,Gym membership,Fitness
10,English course - Education,English course,Education
11,Monthly Salary - Income,Monthly Salary,Income


In [31]:
# Attach the model's category to every transaction in transactions_2023_2024.csv.
# The CSV is re-read so that re-running this cell always starts from the raw
# data instead of merging onto an already merged frame.
df = pd.read_csv("transactions_2023_2024.csv")
# The names are matched exactly as they appear in the data: rewriting the
# Spotify rows to another label would stop them from matching the
# "Spotify Premium" entry in categories_df_all and leave their Category empty.
# The lookup is de-duplicated on 'Transaction' so that a name the model
# returned more than once cannot multiply the transaction rows in the left join.
df = pd.merge(
    df,
    categories_df_all.drop_duplicates(subset='Transaction'),
    left_on='Name / Description',
    right_on='Transaction',
    how='left',
)
df

Unnamed: 0,Date,Name / Description,Expense/Income,Amount (VND),Transaction vs category,Transaction,Category
0,2023-01-05,Rent,Expense,6000000,Rent - Housing,Rent,Housing
1,2023-01-05,Electricity (EVN),Expense,500000,Electricity (EVN) - Utilities,Electricity (EVN),Utilities
2,2023-01-05,Water,Expense,200000,Water - Utilities,Water,Utilities
3,2023-01-05,Internet (Viettel),Expense,250000,Internet (Viettel) - Technology,Internet (Viettel),Technology
4,2023-01-05,Phone bill,Expense,150000,Phone bill - Communication,Phone bill,Communication
...,...,...,...,...,...,...,...
322,2024-12-01,Monthly Salary,Income,21765767,Monthly Salary - Income,Monthly Salary,Income
323,2024-12-20,Shopping (Shopee/Clothes),Expense,215527,Shopping (Shopee/Clothes) - Shopping,Shopping (Shopee/Clothes),Shopping
324,2024-12-16,"Food & Drinks (restaurants, cafes)",Expense,204343,"Food & Drinks (restaurants, cafes) - Food","Food & Drinks (restaurants, cafes)",Food
325,2024-12-13,Shopping (Shopee/Clothes),Expense,288558,Shopping (Shopee/Clothes) - Shopping,Shopping (Shopee/Clothes),Shopping


In [32]:
# Write the categorised transactions to a new CSV; index=False keeps the DataFrame index out of the file.
df.to_csv("transactions_2023_2024_categorized.csv", index=False)