In [2]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.embeddings.openai import OpenAIEmbeddings
import faiss
import numpy as np
import pandas as pd
import json

import os
from dotenv import load_dotenv

In [4]:
# Load environment variables from .env file
load_dotenv()

# Get the OpenAI API key
openai_api_key = os.getenv("OPENAI_API_KEY")

# Verify it's loaded correctly
if openai_api_key:
    print("OpenAI API Key loaded successfully.")
else:
    print("Failed to load OpenAI API Key.")

OpenAI API Key loaded successfully.


# Langchain

In [41]:
# Initialize LangChain OpenAI model with the API key
llm = ChatOpenAI(
    openai_api_key=openai_api_key,
    model_name="gpt-4o"
)

# Example usage
response = llm.predict("What is ENCIK TAN NTU NORTH SP ?")
print(response)

"Encik Tan NTU North SP" likely refers to a specific outlet of the Encik Tan food chain located at Nanyang Technological University (NTU) in Singapore. Encik Tan is known for offering a variety of local Singaporean hawker-style dishes such as Hainanese chicken rice, laksa, and nasi lemak. The "North SP" could refer to a specific part of the campus or a student plaza where the outlet is situated. For precise details, you might want to check NTU's campus dining directory or contact the university directly.


In [42]:
# Initialize OpenAI LLM
llm = ChatOpenAI(openai_api_key=openai_api_key, model_name="gpt-4o", temperature=0)

# Define Classification Prompt
prompt_template = PromptTemplate(
    input_variables=["description"],
    template="""
    Given the following transaction description, classify it into one of these categories:
    - Housing
    - Food
    - Transportation
    - Shopping
    - Entertainment
    - Utilities
    - Others
    
    If unsure, return "Others".

    Transaction Description:
    {description}

    Classification:
    """
)
chain = LLMChain(llm=llm, prompt=prompt_template)

In [43]:
description = "ENCIK TAN NTU NORTH SP SI SGP 04MAR"

chain.run(description=description).strip()

'Food'

In [33]:
description = "BUS/MRT 595432162      SI SGP 02MAR"

chain.run(description=description).strip()

'Transportation'

In [44]:
description = "SHOPEE SINGAPORE MP    SI SGP 05MAR"

chain.run(description=description).strip()

'Shopping'

In [45]:
description = "PayNow Transfer 9673376"

chain.run(description=description).strip()

'Others'

In [46]:
# FAISS for Embedding-Based Classification
embeddings_model = OpenAIEmbeddings()
index = faiss.IndexFlatL2(1536)  # Adjust embedding size

  embeddings_model = OpenAIEmbeddings()


# Chat completions API

In [2]:
from openai import OpenAI
client = OpenAI()

completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {
            "role": "user",
            "content": "Write a one-sentence bedtime story about a unicorn."
        }
    ]
)

print(completion.choices[0].message.content)

As the moonlight shimmered on the enchanted meadow, a gentle unicorn named Luna spread her sparkling wings and soared through the starry sky, carrying the dreams of all the children to the land where wishes come true.


In [12]:
# Load the dataset
df = pd.read_csv("transactions_test.csv")

#df = df[df['Debit Amount'].str.strip().ne('')]

# Display first few rows
df

Unnamed: 0,Transaction Date,Value Date,Statement Code,Reference,Debit Amount,Credit Amount,Client Reference,Additional Reference,Misc Reference
0,11-Mar-25,,ADV,ICT,6,,PayNow Transfer 9983265,To: samul,OTHR PayNow Transfer


In [6]:
categorize_system_prompt = '''
Classify the following bank transaction into one of these categories: [housing, food, transportation, shopping, entertainment, utilities, others]
If you cannot determine the class, classify as: others
Output a json object containing the following information:
{
    categories: string // Category based on the transaction description,
}
'''

def get_categories(description):
    response = client.chat.completions.create(
    model="gpt-4o-mini",
    temperature=0,
    # This is to enable JSON mode, making sure responses are valid json objects
    response_format={ 
        "type": "json_object"
    },
    messages=[
        {
            "role": "system",
            "content": categorize_system_prompt
        },
        {
            "role": "user",
            "content": description
        }
    ],
    )

    return response.choices[0].message.content

In [7]:
# Testing on a few examples
for _, row in df[:5].iterrows():
    description = row['Client Reference']
    result = get_categories(description)
    print(f"TRANSACTION: {description}\n\nCLASS: {result}")
    print("\n\n----------------------------\n\n")

TRANSACTION: PayNow Transfer 9983265

CLASS: {
    "categories": "others"
}


----------------------------


TRANSACTION: PayNow Transfer 9673376

CLASS: {
    "categories": "others"
}


----------------------------


TRANSACTION: SHOPEE SINGAPORE MP    SI SGP 05MAR

CLASS: {
    "categories": "shopping"
}


----------------------------


TRANSACTION: BUS/MRT 595432162      SI SGP 02MAR

CLASS: {
    "categories": "transportation"
}


----------------------------


TRANSACTION: ENCIK TAN NTU NORTH SP SI SGP 04MAR

CLASS: {
    "categories": "others"
}


----------------------------




In [14]:
categorize_system_prompt = '''Classify the following bank transaction into one of these categories: [housing, food, transportation, shopping, entertainment, utilities, others]
If you cannot determine the class, classify as: others
Output a json object containing the following information:
{
    categories: string // Category based on the transaction description,
}
'''

# Creating an array of json tasks

tasks = []

for index, row in df.iterrows():
    
    description = row['Client Reference']
    
    task = {
        "custom_id": f"task-{index}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            # This is what you would have in your Chat Completions API call
            "model": "gpt-4o-mini",
            "temperature": 0,
            "response_format": { 
                "type": "json_object"
            },
            "messages": [
                {
                    "role": "system",
                    "content": categorize_system_prompt
                },
                {
                    "role": "user",
                    "content": description
                }
            ],
        }
    }
    
    tasks.append(task)

In [15]:
# Creating the file
file_name = "batch_tasks_transactions.jsonl"

with open(file_name, 'w') as file:
    for obj in tasks:
        file.write(json.dumps(obj) + '\n')

In [60]:
# Upload batch file
batch_file = client.files.create(
  file=open(file_name, "rb"),
  purpose="batch"
)

In [49]:
print(batch_file)

FileObject(id='file-PAhGNCpo1jhaMUu7WAmmrC', bytes=618, created_at=1742028215, filename='batch_tasks_transactions.jsonl', object='file', purpose='batch', status='processed', expires_at=None, status_details=None)


In [61]:
# Create batch job
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

In [62]:
import time

batch_job = client.batches.retrieve(batch_job.id)
while(batch_job.status == 'in_progress'):
    print(batch_job.status)
    batch_job = client.batches.retrieve(batch_job.id)
    time.sleep(3)

in_progress
in_progress
in_progress


In [93]:
client.batches.cancel("batch_67d545583c748190914607696c2cdd7b")

Batch(id='batch_67d545583c748190914607696c2cdd7b', completion_window='24h', created_at=1742030168, endpoint='/v1/chat/completions', input_file_id='file-YETYHuY2uj7Rg1kE1FiawZ', object='batch', status='cancelling', cancelled_at=None, cancelling_at=1742030247, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1742116568, failed_at=None, finalizing_at=None, in_progress_at=1742030169, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=92, failed=0, total=241))

In [106]:
import json

response = client.batches.list()
print(json.dumps(response.dict(), indent=4))

{
    "data": [
        {
            "id": "batch_67d58a75cc508190b2070220d262703c",
            "completion_window": "24h",
            "created_at": 1742047862,
            "endpoint": "/v1/chat/completions",
            "input_file_id": "file-YVB43f2ArWk5i4Lz396bE8",
            "object": "batch",
            "status": "in_progress",
            "cancelled_at": null,
            "cancelling_at": null,
            "completed_at": null,
            "error_file_id": null,
            "errors": null,
            "expired_at": null,
            "expires_at": 1742134262,
            "failed_at": null,
            "finalizing_at": null,
            "in_progress_at": 1742047863,
            "metadata": null,
            "output_file_id": null,
            "request_counts": {
                "completed": 0,
                "failed": 0,
                "total": 21
            }
        },
        {
            "id": "batch_67d57a6eff048190aaea310b57e0f6c3",
            "completion_window": "

C:\Users\User-PC\AppData\Local\Temp\ipykernel_29976\1711904476.py:4: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  print(json.dumps(response.dict(), indent=4))


In [107]:
from datetime import datetime

# Example: Unix timestamp
timestamp = 1742043759  # Replace with your timestamp

# Convert to datetime
dt = datetime.fromtimestamp(timestamp)

print(dt)  # Output: 2024-03-15 12:00:00 (Example)

2025-03-15 21:02:39


In [108]:
# Example: Unix timestamp
timestamp = 1742046564  # Replace with your timestamp

# Convert to datetime
dt = datetime.fromtimestamp(timestamp)

print(dt)  # Output: 2024-03-15 12:00:00 (Example)

2025-03-15 21:49:24


In [36]:
type(client.batches.list(limit=10))

openai.pagination.SyncCursorPage[Batch]

In [57]:
result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content

result_file_name = "batch_job_results_transactions.jsonl"

with open(result_file_name, 'wb') as file:
    file.write(result)

In [58]:
# Loading data from saved file
results = []
with open(result_file_name, 'r') as file:
    for line in file:
        # Parsing the JSON string into a dict and appending to the list of results
        json_object = json.loads(line.strip())
        results.append(json_object)

In [59]:
results[0]

{'id': 'batch_req_67d53dcac934819098bacd38ebeda1a7',
 'custom_id': 'task-0',
 'response': {'status_code': 200,
  'request_id': '93556d252ce0198999773cebaad4a7d9',
  'body': {'id': 'chatcmpl-BBHPOhItFvTiKvSynVq7mbUR7Mlol',
   'object': 'chat.completion',
   'created': 1742028226,
   'model': 'gpt-4o-mini-2024-07-18',
   'choices': [{'index': 0,
     'message': {'role': 'assistant',
      'content': '{\n    "categories": "others"\n}',
      'refusal': None,
      'annotations': []},
     'logprobs': None,
     'finish_reason': 'stop'}],
   'usage': {'prompt_tokens': 80,
    'completion_tokens': 10,
    'total_tokens': 90,
    'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0},
    'completion_tokens_details': {'reasoning_tokens': 0,
     'audio_tokens': 0,
     'accepted_prediction_tokens': 0,
     'rejected_prediction_tokens': 0}},
   'service_tier': 'default',
   'system_fingerprint': 'fp_06737a9306'}},
 'error': None}

In [70]:
# Reading only the first results
for res in results[:5]:
    task_id = res['custom_id']
    # Getting index from task id
    index = task_id.split('-')[-1]
    result = eval(res['response']['body']['choices'][0]['message']['content'])
    transaction = df.iloc[int(index)]
    description = transaction['Client Reference']
    print(f"TRANSACTION: {description}\n\nCATEGORY: {result['categories']}")
    # print(type(result))
    print("\n\n----------------------------\n\n")

TRANSACTION: PayNow Transfer 9983265

CATEGORY: others


----------------------------


TRANSACTION: PayNow Transfer 9673376

CATEGORY: others


----------------------------


TRANSACTION: SHOPEE SINGAPORE MP    SI SGP 05MAR

CATEGORY: shopping


----------------------------


TRANSACTION: BUS/MRT 595432162      SI SGP 02MAR

CATEGORY: transportation


----------------------------


TRANSACTION: ENCIK TAN NTU NORTH SP SI SGP 04MAR

CATEGORY: food


----------------------------




# FAISS

In [None]:
# Load the dataset
df = pd.read_csv("2adc191f4000bdb85bf059aacb96386.P000000058345059.csv")

# Display first few rows
df.head()

Unnamed: 0,Transaction Date,Value Date,Statement Code,Reference,Debit Amount,Credit Amount,Client Reference,Additional Reference,Misc Reference
0,11-Mar-25,,ADV,ICT,6.0,,PayNow Transfer 9983265,To: samul,OTHR PayNow Transfer
1,9-Mar-25,9-Mar-25,ADV,ICT,47.34,,PayNow Transfer 9673376,To: jj,OTHR PayNow Transfer
2,8-Mar-25,,POS,BAT,49.73,,SHOPEE SINGAPORE MP SI SGP 05MAR,4628-4500-4902-5001,
3,7-Mar-25,,POS,BAT,2.59,,BUS/MRT 595432162 SI SGP 02MAR,4628-4500-4902-5001,
4,7-Mar-25,,POS,BAT,5.0,,ENCIK TAN NTU NORTH SP SI SGP 04MAR,4628-4500-4902-5001,


In [49]:
df.columns

Index(['Transaction Date', 'Value Date', 'Statement Code', 'Reference',
       'Debit Amount', 'Credit Amount', 'Client Reference',
       'Additional Reference', ' Misc Reference'],
      dtype='object')

In [52]:
# Initialize OpenAI Embeddings
embeddings_model = OpenAIEmbeddings()

# Generate embeddings for transaction descriptions
transaction_texts = df["Client Reference"].astype(str).tolist()
transaction_embeddings = embeddings_model.embed_documents(transaction_texts)

In [57]:
# Convert embeddings to a NumPy array
embedding_dim = len(transaction_embeddings[0])  # Get the embedding dimension
index = faiss.IndexFlatL2(embedding_dim)  # L2 (Euclidean) index

# Convert list to NumPy array and add to FAISS
transaction_embeddings_np = np.array(transaction_embeddings, dtype=np.float32)
index.add(transaction_embeddings_np)

print(f"FAISS index created with {index.ntotal} transactions.")

FAISS index created with 100 transactions.


In [58]:
faiss.write_index(index, "faiss_transaction_index.idx")
df.to_csv("transactions_with_embeddings.csv", index=False)  # Save transaction data

In [60]:
# Reload FAISS index
index = faiss.read_index("faiss_transaction_index.idx")
print(f"FAISS index loaded with {index.ntotal} transactions.")


FAISS index loaded with 100 transactions.


# Monthly Expenses Summariser

In [22]:
from myapp.blueprints.plan.schemas import FinancialReport

In [51]:
from openai import OpenAI
client = OpenAI()

summarise_system_prompt = '''
You are a financial assistant specializing in analyzing financial behavior. Your goal is to summarize a user's spending habits based on the financial data provided by the user. Answer like you are talking to the user. Use the provided numbers and statistics in your answer.
Return your answer according to the following JSON structure and definitions:
{
    'summary': str, //A short summary to summarize how the user spent their money
    'top_3_categories': List(str), //A list of strings containing 3 elements. Each str represents a comment on each of the top 3 categories that the user spent their money on.
    'recommendations':[
        {
            "header": str, // short title for the reccomendation
            "description": str, // reccomendation for the user to save money
        }
    ] //List of 3 objects
}

'''
user_data='''
#Expenses breakdown
[
{"category":"others","dollar_amount":103.34, "percentage":60},
{"category":"shopping","dollar_amount":49.73, "percentage":30},
{"category":"food","dollar_amount":12.9, "percentage":8},
{"category":"transportation","dollar_amount":2.59, "percentage":2}
]

#Monthly overview
{
"income":0,
"expenses":169,
}
'''

def get_financial_report(user_data):
    completion = client.beta.chat.completions.parse(
        model="gpt-4o",
        temperature=0,
        messages=[
            {"role": "system", "content": summarise_system_prompt},
            {"role": "user", "content": user_data},
        ],
        response_format=FinancialReport,
    )

    return completion.choices[0].message

In [52]:
result = get_financial_report(user_data).parsed

In [53]:
result

FinancialReport(summary="This month, you spent a total of $169, with the majority of your expenses falling into the 'others' category, followed by shopping and food.", top_3_categories=["The 'others' category accounted for the largest portion of your spending at 60%, totaling $103.34. This suggests a significant amount of your expenses are not categorized, which might include miscellaneous or unexpected costs.", 'Shopping was your second largest expense, making up 30% of your total spending, amounting to $49.73. This indicates a notable portion of your budget is allocated to purchasing goods.', 'Food expenses were relatively low, comprising 8% of your total spending, which is $12.90. This suggests you are either managing your food costs well or possibly underreporting in this category.'], recommendations=[':{', 'description', 'header', 'description', 'header', 'description', 'header'])

In [86]:
from openai import OpenAI
client = OpenAI()

summarise_system_prompt = '''
You are a financial assistant specializing in analyzing financial behavior. Your goal is to summarize a user's spending habits based on the financial data provided by the user,
while providing helpful financial advice for the user to save money. Answer like you are talking to the user. Use the provided numbers and statistics in your answer.
Return your answer according to the following JSON structure and definitions:
{
'summary': str, //Comment on the user's spending habits in 2-3 sentences
'spending':[{'name': str, //name of category
'percentage': int, //percentage spent on the category
'description': str //analysis of spending behaviour
}] //List of top 3 categories that the user spent their money on.
'recommendations':[{'header': str, //short title for the reccomendation
'description': str, //reccomendation for the user to save money, in about 2-3 sentences
'icon': str //Bootstrap icon class that is relevent to the header }] //List of 3 objects
}
'''
user_data='''
#Expenses breakdown
[
{"category":"others","dollar_amount":103.34, "percentage":60},
{"category":"shopping","dollar_amount":49.73, "percentage":30},
{"category":"food","dollar_amount":12.9, "percentage":8},
{"category":"transportation","dollar_amount":2.59, "percentage":2}
]

#Monthly overview
{
"income":0,
"expenses":169,
}
'''

def get_summary(description):
    response = client.chat.completions.create(
    model="gpt-4.5-preview",
    temperature=0,
    # This is to enable JSON mode, making sure responses are valid json objects
    response_format={ 
        "type": "json_object"
    },
    messages=[
        {
            "role": "system",
            "content": summarise_system_prompt
        },
        {
            "role": "user",
            "content": description
        }
    ],
    )

    return response.choices[0].message.content

In [87]:
ans = get_summary(user_data)
eval(ans)

{'summary': "Your spending primarily goes towards miscellaneous items, shopping, and food, with a significant 60% spent on miscellaneous expenses. Given that your income is currently at $0, it's important to closely monitor and control your spending to avoid financial strain.",
 'spending': [{'name': 'others',
   'percentage': 60,
   'description': "The majority of your spending (60%) is categorized as 'others', indicating miscellaneous or unplanned expenses. This suggests a need for clearer categorization or budgeting to better understand and control these expenses."},
  {'name': 'shopping',
   'percentage': 30,
   'description': 'Shopping accounts for 30% of your expenses, which is relatively high considering your current lack of income. It would be beneficial to evaluate the necessity of these purchases and prioritize essential items.'},
  {'name': 'food',
   'percentage': 8,
   'description': 'Food expenses are relatively low at 8%, indicating you are managing this category well. C