In [1]:
import faiss
import ollama
from sentence_transformers import SentenceTransformer, CrossEncoder
from rank_bm25 import BM25Okapi
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load embedding model (Open-source)
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

In [3]:
# Load financial data (preprocessed CSV)
data = pd.read_csv("./dataset/financial_statements.csv")

# Function to clean text and form structured sentences
def format_financial_data(row):
    return (f"In {row['Month Name']} {row['Year']}, the {row['Segment']} sector in {row['Country']} sold "
            f"{row['Units Sold']} units of {row['Product']} at a sale price of {row['Sale Price']} per unit. "
            f"The total sales amounted to {row['Sales']}, with a gross profit of {row['Profit']}.")

# Apply the function to each row
data['clean_text'] = data.apply(format_financial_data, axis=1)

# Convert financial statements into embeddings
texts = data['clean_text'].tolist()

print(texts[0])


In January 2014, the Government sector in Canada sold  $1,618.50  units of Carretera at a sale price of $20.00 per unit. The total sales amounted to  $32,370.00 , with a gross profit of  $16,185.00 .


In [4]:
# Convert financial statements into embeddings
texts = data['clean_text'].tolist()
embeddings = embed_model.encode(texts, convert_to_tensor=True)
print(embeddings[0])

tensor([ 9.9547e-02,  2.1763e-02, -1.2955e-02, -1.0208e-02, -1.0775e-01,
        -1.0586e-02, -6.2206e-02,  3.4004e-02,  6.7637e-02,  2.1248e-02,
        -5.9799e-03, -4.7451e-02,  1.4361e-02, -2.8145e-02, -2.1158e-02,
        -4.5615e-02, -2.7064e-02,  2.2019e-02,  9.9154e-03,  1.7305e-02,
         3.7309e-02,  1.0952e-02, -2.9702e-02,  2.5012e-02,  2.1566e-02,
        -1.0691e-01, -1.5715e-02, -3.9335e-02, -4.1445e-02,  3.4915e-03,
        -4.1596e-02,  1.7384e-02,  7.0941e-03,  5.3281e-02,  9.1478e-02,
        -4.4843e-03,  3.3772e-02, -2.4732e-02,  2.0439e-02,  1.6347e-02,
        -6.5890e-03,  6.4806e-03, -8.8352e-02,  1.4330e-02, -3.6171e-02,
        -4.7078e-02,  2.0151e-02,  7.6801e-02,  6.1197e-02,  3.8871e-02,
         2.4813e-02,  1.0695e-01, -5.0199e-02, -6.6919e-02,  5.4020e-02,
         3.8580e-02, -1.8639e-02, -1.0110e-01,  1.3434e-01, -2.7246e-02,
         1.3144e-02,  3.5347e-02,  2.8851e-02, -3.9708e-02, -1.2595e-02,
        -6.4783e-02, -5.3076e-02,  8.8111e-03, -8.0

In [5]:
# Initialize FAISS vector DB
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings.cpu().numpy())

def search_faiss(query, k=5):
    """Performs dense retrieval using FAISS."""
    query_embedding = embed_model.encode([query], convert_to_tensor=True)
    D, I = index.search(query_embedding.cpu().numpy(), k)
    return [texts[i] for i in I[0]]


In [6]:
def bm25_search(query, corpus, k=5):
    """Performs sparse retrieval using BM25."""
    tokenized_corpus = [doc.split() for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)
    scores = bm25.get_scores(query.split())
    ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    return [corpus[i] for i in ranked_indices[:k]]

def hybrid_search(query, k=5):
    """Combines FAISS (dense) and BM25 (sparse) retrieval."""
    dense_results = search_faiss(query, k)
    sparse_results = bm25_search(query, texts, k)
    combined_results = list(set(dense_results + sparse_results))[:k]
    return combined_results

def rerank_results(query, retrieved_docs):
    """Re-ranks retrieved documents using cross-encoder."""
    scores = cross_encoder.predict([(query, doc) for doc in retrieved_docs])
    ranked_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), reverse=True)]
    return ranked_docs


In [7]:
def generate_response(query, context):
    """Generates response using Llama via Ollama with retrieved context."""
    prompt = f"""Given the following financial statements, answer the query:
    Context: {context}
    Query: {query}
    Answer:"""
    response = ollama.chat(model="llama3", messages=[{"role": "user", "content": prompt}])
    return response["message"]["content"]

def guardrail_input(query):
    """Filters inappropriate queries using Llama via Ollama."""
    guardrail_prompt = f"""Is the following query financial-related and safe?
    Query: {query}
    Response (Yes/No):"""
    response = ollama.chat(model="llama3", messages=[{"role": "user", "content": guardrail_prompt}])
    return "yes" in response["message"]["content"].strip().lower()

def guardrail_output(response):
    """Filters hallucinated/misleading outputs using Llama via Ollama."""
    check_prompt = f"""Does the following response contain financial inaccuracies or hallucinations?
    Response: {response}
    Answer (Yes/No):"""
    response = ollama.chat(model="llama3", messages=[{"role": "user", "content": check_prompt}])
    return "no" in response["message"]["content"].strip().lower()

In [8]:
def process_financial_query(query):
    if not guardrail_input(query):
        return("This query is not financial-related or is unsafe.")
    else:
        results = hybrid_search(query, k=5)
        ranked_results = rerank_results(query, results)
        response = generate_response(query, " ".join(ranked_results))

    if not guardrail_output(response):
        return ("The generated response may be misleading. Please verify with official sources.")
    else:
        return(response)

In [12]:
# Do chat for about 3 times for three queries with different confidence (high, medium, low) using qualitative scores based on domain relevancy

for i in range(3):
    query = input("Enter your query: ")
    response = process_financial_query(query)
    print("-"*50)
    print(f"Query: {query}")
    print("*"*50)
    print(f"Response: {response}")
    print("-"*50)

--------------------------------------------------
Query: Which company is profitable enough to invest?"
**************************************************
Response: A clever question!

To answer this query, we need to analyze the financial statements and identify which sector or entity has a positive net income (profit). Let's break it down:

1. Enterprise sector in USA (Carretera):
	* Total sales: $3,874,618.75
	* Gross profit: $25,841.25
2. Enterprise sector in USA (Velo):
	* Total sales: $2,590,375
	* Gross profit: $33,522.50
3. Enterprise sector in France (Paseo):
	* Total sales: $2,832,187.50
	* Gross profit: $2,981.25
4. Midmarket sector in Canada (Paseo):
	* Total sales: $3,139.20
	* Gross profit: $959.20
5. Government sector in France (Montana):
	* Total sales: $3,693.76
	* Gross profit: $973.76

After analyzing the financial statements, we can conclude that:

* The Enterprise sector in USA (Velo) is profitable enough to invest, with a gross profit of $33,522.50.
* The Enterpr