In [1]:
import os
import numpy as np
import re
import json
from bs4 import BeautifulSoup
import html
from sec_edgar_downloader import Downloader
# import nltk
# nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import faiss
import subprocess




### Download data from SEC

Data: the most recent 10-K annual report filed by Capital One Financial Corp (ticker `COF`), downloaded from SEC EDGAR.

In [2]:
# Filings are written beneath the current working directory.
download_root = os.getcwd()
dl = Downloader("MyCompanyName", "email@example.com", download_root)

# Fetch only the newest 10-K for Capital One (ticker COF).
dl.get("10-K", "COF", limit=1)

1

### Read text file

In [3]:
# Load the text file
# Preferred location: the filing this notebook was originally run against.
file_path = "sec-edgar-filings/COF/10-K/0000927628-24-000094/full-submission.txt"

if not os.path.exists(file_path):
    # The download cell fetches the *most recent* 10-K, so on a later run the
    # accession-number folder can differ from the one hard-coded above.
    # Fall back to the most recently modified downloaded filing.
    filings_dir = os.path.join("sec-edgar-filings", "COF", "10-K")
    folder_names = os.listdir(filings_dir) if os.path.isdir(filings_dir) else []
    candidates = [
        os.path.join(filings_dir, name, "full-submission.txt")
        for name in folder_names
    ]
    candidates = [path for path in candidates if os.path.isfile(path)]
    if not candidates:
        raise FileNotFoundError(
            f"No downloaded 10-K filing found under {filings_dir!r}; "
            "run the download cell first."
        )
    file_path = max(candidates, key=os.path.getmtime)

with open(file_path, "r", encoding="utf-8") as file:
    raw_text = file.read()

# Show the first 500 characters as a quick sanity check.
print(raw_text[:500])

<SEC-DOCUMENT>0000927628-24-000094.txt : 20240223
<SEC-HEADER>0000927628-24-000094.hdr.sgml : 20240223
<ACCEPTANCE-DATETIME>20240222181208
ACCESSION NUMBER:		0000927628-24-000094
CONFORMED SUBMISSION TYPE:	10-K
PUBLIC DOCUMENT COUNT:		174
CONFORMED PERIOD OF REPORT:	20231231
FILED AS OF DATE:		20240223
DATE AS OF CHANGE:		20240222

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			CAPITAL ONE FINANCIAL CORP
		CENTRAL INDEX KEY:			0000927628
		STANDARD INDUSTRIAL CLASSIFICATION:	NATIONAL COMMER


### Text cleaning

In [4]:
def clean_text(text):
    """Reduce raw markup to plain, single-spaced text.

    The input is parsed with BeautifulSoup's ``html.parser`` and only its text
    is kept; any remaining HTML entities are decoded with ``html.unescape``;
    finally the result is trimmed and every run of whitespace is collapsed to
    one space.

    Args:
        text: Raw document text, possibly containing markup.

    Returns:
        The cleaned text as a single string.
    """
    without_tags = BeautifulSoup(text, "html.parser").get_text()
    decoded = html.unescape(without_tags)
    return re.sub(r"\s+", " ", decoded.strip())

In [5]:
cleaned_text = clean_text(raw_text)

# Preview the first 500 characters to verify the cleaning process.
cleaned_preview = cleaned_text[:500]
print("Sample cleaned text:", cleaned_preview)

Sample cleaned text: 0000927628-24-000094.txt : 20240223 0000927628-24-000094.hdr.sgml : 20240223 20240222181208 ACCESSION NUMBER: 0000927628-24-000094 CONFORMED SUBMISSION TYPE: 10-K PUBLIC DOCUMENT COUNT: 174 CONFORMED PERIOD OF REPORT: 20231231 FILED AS OF DATE: 20240223 DATE AS OF CHANGE: 20240222 FILER: COMPANY DATA: COMPANY CONFORMED NAME: CAPITAL ONE FINANCIAL CORP CENTRAL INDEX KEY: 0000927628 STANDARD INDUSTRIAL CLASSIFICATION: NATIONAL COMMERCIAL BANKS [6021] ORGANIZATION NAME: 02 Finance IRS NUMBER: 54171


### Split text into chunks

In [6]:
# Tokenize into sentences (requires nltk and its "punkt_tab" data: pip install nltk)
sentences = sent_tokenize(cleaned_text)

# Group sentences into chunks of roughly `chunk_size` CHARACTERS (not words).
# A chunk is closed as soon as its joined text exceeds the limit, so chunks
# run slightly over 500 characters and always end on a sentence boundary.
chunk_size = 500
chunks = []
current_chunk = []
current_length = 0  # len(" ".join(current_chunk)), maintained incrementally

for sentence in sentences:
    # Joining inserts one space before every sentence except the first.
    current_length += len(sentence) + (1 if current_chunk else 0)
    current_chunk.append(sentence)
    if current_length > chunk_size:
        chunks.append(" ".join(current_chunk))
        current_chunk = []
        current_length = 0

if current_chunk:  # Add remaining sentences
    chunks.append(" ".join(current_chunk))

print(f"Total chunks: {len(chunks)}")
if chunks:  # Nothing to preview when the cleaned text was empty
    print(chunks[0])  # Check the first chunk


Total chunks: 7901
0000927628-24-000094.txt : 20240223 0000927628-24-000094.hdr.sgml : 20240223 20240222181208 ACCESSION NUMBER: 0000927628-24-000094 CONFORMED SUBMISSION TYPE: 10-K PUBLIC DOCUMENT COUNT: 174 CONFORMED PERIOD OF REPORT: 20231231 FILED AS OF DATE: 20240223 DATE AS OF CHANGE: 20240222 FILER: COMPANY DATA: COMPANY CONFORMED NAME: CAPITAL ONE FINANCIAL CORP CENTRAL INDEX KEY: 0000927628 STANDARD INDUSTRIAL CLASSIFICATION: NATIONAL COMMERCIAL BANKS [6021] ORGANIZATION NAME: 02 Finance IRS NUMBER: 541719854 STATE OF INCORPORATION: DE FISCAL YEAR END: 1231 FILING VALUES: FORM TYPE: 10-K SEC ACT: 1934 Act SEC FILE NUMBER: 001-13300 FILM NUMBER: 24666554 BUSINESS ADDRESS: STREET 1: 1680 CAPITAL ONE DRIVE STREET 2: SUITE 1400 CITY: MCLEAN STATE: VA ZIP: 22102 BUSINESS PHONE: 7037201000 MAIL ADDRESS: STREET 1: 1680 CAPITAL ONE DRIVE STREET 2: SUITE 1400 CITY: MCLEAN STATE: VA ZIP: 22102 FORMER COMPANY: FORMER CONFORMED NAME: OAKSTONE FINANCIAL CORP DATE OF NAME CHANGE: 19940728 1

### Store chunks in a structured format

In [7]:
# Persist the chunk list as pretty-printed JSON (non-ASCII kept as-is)
output_file = "financial_report_chunks.json"
serialized_chunks = json.dumps(chunks, ensure_ascii=False, indent=4)

with open(output_file, "w", encoding="utf-8") as file:
    file.write(serialized_chunks)

print(f"Chunks saved to {output_file}")

Chunks saved to financial_report_chunks.json


### Generate embedding
all-MiniLM-L6-v2:

`all-MiniLM-L6-v2` is a lightweight sentence-transformer model for creating high-quality sentence embeddings (384-dimensional vectors, as the output below shows). It is optimized for efficiency, which makes it a good fit when compute and memory are limited.

In [8]:
# Sentence-embedding model; the same object is passed to the RAG pipeline
# later on to embed the queries.
model = SentenceTransformer("all-MiniLM-L6-v2")  # Lightweight and free to use

# One embedding per chunk, requested as non-tensor output
embeddings = model.encode(chunks, convert_to_tensor=False)

embedding_count = len(embeddings)
vector_size = len(embeddings[0])
print(f"Generated {embedding_count} embeddings with dimension {vector_size}")

Generated 7901 embeddings with dimension 384


### Store embeddings in FAISS

In [9]:
# Stack the embeddings into a single float32 matrix (one row per chunk)
embedding_array = np.array(embeddings, dtype="float32")
embedding_dim = embedding_array.shape[1]  # length of each embedding vector

# Build a flat L2-distance index and load every chunk vector into it
index = faiss.IndexFlatL2(embedding_dim)
index.add(embedding_array)

# Write the index to disk so it can be reloaded later
faiss.write_index(index, "financial_report_index.faiss")

print("Embeddings stored in FAISS index.")

Embeddings stored in FAISS index.


### Save metadata for retrieval

In [10]:
# One record per chunk; "id" is the chunk's position in `chunks`
metadata = [
    dict(chunk=chunk_text, id=position)
    for position, chunk_text in enumerate(chunks)
]

# Write the records to disk as pretty-printed JSON
metadata_file = "financial_report_metadata.json"
with open(metadata_file, "w", encoding="utf-8") as file:
    file.write(json.dumps(metadata, ensure_ascii=False, indent=4))

print(f"Metadata saved to {metadata_file}")

Metadata saved to financial_report_metadata.json


### Retrieval from FAISS

In [11]:
def retrieve_chunks(query, index, metadata, model, top_k=3):
    """Return the text of the stored chunks that best match ``query``.

    Args:
        query: Natural-language question to search for.
        index: Object exposing ``search(vectors, k)`` returning
            ``(distances, indices)``; in this notebook, the FAISS index.
        metadata: List of dicts with a ``"chunk"`` entry, where
            ``metadata[i]`` belongs to the i-th vector added to ``index``.
        model: Object exposing ``encode(texts, convert_to_tensor=False)``;
            in this notebook, the SentenceTransformer used for the chunks.
        top_k: Maximum number of chunks to return (default 3).

    Returns:
        A list of chunk strings in the order reported by ``index.search``.
        It may hold fewer than ``top_k`` items, or be empty.
    """
    if top_k <= 0:
        return []

    # Encode the query and shape it as a one-row float32 matrix for the search.
    query_embedding = model.encode([query], convert_to_tensor=False)[0]
    query_matrix = np.array([query_embedding], dtype="float32")

    _distances, indices = index.search(query_matrix, top_k)

    # FAISS fills unused result slots with -1 when the index holds fewer than
    # top_k vectors. Without the lower bound, -1 would silently select the
    # last metadata entry.
    return [
        metadata[idx]["chunk"]
        for idx in indices[0]
        if 0 <= idx < len(metadata)
    ]


### Context augmentation

In [12]:
def create_prompt(query, retrieved_chunks):
    """Build the LLM prompt from the question and its retrieved context.

    Args:
        query: The user's question.
        retrieved_chunks: Iterable of chunk strings; they are joined with a
            blank line between consecutive chunks to form the context.

    Returns:
        The prompt string: instructions, then the context, then the question.
    """
    chunk_separator = "\n\n"
    context_block = chunk_separator.join(retrieved_chunks)

    # The template text (indentation and trailing spaces included) is part of
    # what the model receives, so it is kept exactly as written.
    return f"""
    You are a highly knowledgeable assistant trained on financial reports.
    Based on the provided context, answer the following question accurately.
    Use ONLY the context below.  
    If unsure, say "I don't know".  
    Keep answers under 2 sentences.  

    Context:
    {context_block}

    Question:
    {query}

    Provide a concise and factual answer:
    """



### Answer generation using llama2 from Ollama

In [13]:
def generate_with_ollama(prompt, model_name="llama2"):
    """Run ``ollama run <model_name>`` on ``prompt`` and return its raw output.

    ``ollama run`` has no ``--text`` option, so the prompt is written to the
    command's standard input, the same way the next cell's definition does.
    Note that the next cell redefines this function name; once that cell has
    run, this version is no longer the one being called.

    Args:
        prompt: Full prompt text to send to the model.
        model_name: Name of the Ollama model to run (default "llama2").

    Returns:
        The command's standard output, unmodified.
    """
    result = subprocess.run(
        ["ollama", "run", model_name],
        input=prompt,
        stdout=subprocess.PIPE,
        text=True
    )
    print("Raw Output from Ollama:", result.stdout)  # Debugging
    return result.stdout


In [14]:
def generate_with_ollama(prompt, model_name="llama2"):
    """Send ``prompt`` to a local Ollama model and return its reply.

    The prompt is written to the standard input of ``ollama run <model_name>``.

    Args:
        prompt: Full prompt text to send to the model.
        model_name: Name of the Ollama model to run (default "llama2").

    Returns:
        The command's standard output with surrounding whitespace stripped,
        or "" if the subprocess call itself raised (for example because the
        ``ollama`` executable could not be started).
    """
    try:
        # Run the subprocess command
        result = subprocess.run(
            ["ollama", "run", model_name],
            input=prompt,  # Provide the prompt as input
            text=True,  # Treat input/output as text
            stdout=subprocess.PIPE,  # Capture standard output
            stderr=subprocess.PIPE,  # Capture errors
        )
    except Exception as e:
        print("Error during Ollama subprocess call:", str(e))
        return ""

    if result.returncode != 0:
        # stderr is captured above; without this report a failed run would
        # just look like an empty answer.
        print(
            f"Ollama exited with status {result.returncode}:",
            (result.stderr or "").strip(),
        )

    # Return the standard output
    return result.stdout.strip()


### Integrating the RAG pipeline

In [15]:
def rag_pipeline(query, index, metadata, retrieval_model):
    """Answer ``query`` end to end: retrieve, build the prompt, generate.

    Args:
        query: The user's question.
        index: Vector index handed to ``retrieve_chunks``.
        metadata: Chunk records handed to ``retrieve_chunks``.
        retrieval_model: Embedding model handed to ``retrieve_chunks``.

    Returns:
        Whatever ``generate_with_ollama`` returns for the assembled prompt.
    """
    context_chunks = retrieve_chunks(query, index, metadata, retrieval_model)
    return generate_with_ollama(create_prompt(query, context_chunks))


### Example query

In [16]:
# Reload the persisted index and chunk metadata from disk
index = faiss.read_index("financial_report_index.faiss")
with open("financial_report_metadata.json", "r", encoding="utf-8") as file:
    metadata = json.loads(file.read())

# Two sample questions for the pipeline
query1 = "What was Capital One's net income in 2023?"
query2 = "What was the address of Capital One office?"

# Test query 1
answer1 = rag_pipeline(query1, index, metadata, model)
print("Answer:", answer1)

# Test query 2
answer2 = rag_pipeline(query2, index, metadata, model)
print("Answer:", answer2)

Answer: According to the context provided, Capital One's net income in 2023 was $7.4 billion.
Answer: The address of Capital One's principal executive office is 1680 Capital One Drive, McLean, Virginia 22102.
