**Install required libraries**

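Packages installed by the cell below: `sec-edgar-downloader`, `sec-api`, `beautifulsoup4`, `langchain-core`, `langchain-text-splitters`, `langchain-community`, `qdrant-client`, `pandas`, `sentence-transformers`, `transformers`, `torch`, `accelerate`.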

In [None]:
pip install sec-edgar-downloader sec-api beautifulsoup4 langchain-core langchain-text-splitters langchain-community qdrant-client pandas sentence-transformers transformers torch accelerate

**Create a directory to store the downloaded filings**

In [None]:
import os

# Local folder (relative to the current working directory) for the downloads.
folder_name = "sec_filings"

# Create it on first use; an already existing directory is left untouched.
os.makedirs(name=folder_name, exist_ok=True)

**Download all the required data**

Format link: https://sec-api.io/resources/extract-googles-revenue-metrics-from-10-k-filings-with-python

In [None]:
from sec_edgar_downloader import Downloader

# Replace these placeholders with your actual name and e-mail address: the SEC
# requires a user-agent string that identifies the requester.
USER_AGENT_COMPANY_NAME = "XYZ_COMPANY_NAME"
USER_AGENT_EMAIL = "work.email@mail.com"

# How many of the most recent 10-K filings to fetch per ticker (usually 1 per year).
FILINGS_PER_TICKER = 4

# Initialize the downloader
dl = Downloader(USER_AGENT_COMPANY_NAME, USER_AGENT_EMAIL, download_folder="sec_filings")

# Ticker -> short company name (sample tickers provided — replace or expand).
# The short names are used further down to name the per-company CSV files.
tickers = {"AAPL" : "Apple", "MSFT" : "Microsoft", "AMZN" : "Amazon", "GOOGL" : "Google",
           "META" : "Meta", "BRK-B" : "Berkshire", "JNJ" : "Johnson", "V" : "Visa",
           "PG" : "ProcterGamble", "TSLA" : "Tesla"}

# Download ticker by ticker. A failure for one ticker must not abort the others,
# so failures are collected and reported at the end instead of being followed by
# an unconditional success message.
failed_tickers = []
for ticker in tickers:
    print(f"Downloading 10-K filings for {ticker}...")
    try:
        dl.get("10-K", ticker, limit=FILINGS_PER_TICKER)
    except Exception as e:
        failed_tickers.append(ticker)
        print(f"Error downloading {ticker}: {e}")

if failed_tickers:
    print(f"Download completed with errors for: {', '.join(failed_tickers)}")
else:
    print("Download completed.")

**Change the extension of txt files to html**

In [None]:
import os
def rename_txt_to_html(base_folder):
    """Rename every ``.txt`` file below ``base_folder`` to ``.html``.

    The directory tree is walked recursively and *every* file of every
    directory is checked (the extension is matched case-insensitively), not
    just the first entry of each directory listing.

    Args:
        base_folder: Root directory to search.

    Returns:
        A ``(renamed, failed)`` tuple: the number of files renamed and the
        number of renames that raised an ``OSError``.
    """
    renamed = 0
    failed = 0
    for root, _, files in os.walk(base_folder):
        for file_name in files:
            if not file_name.lower().endswith(".txt"):
                continue
            old_path = os.path.join(root, file_name)
            new_path = os.path.join(root, os.path.splitext(file_name)[0] + ".html")
            try:
                os.rename(old_path, new_path)
                renamed += 1
            except OSError as e:
                # Keep going, but say which folder failed and why.
                failed += 1
                print(f"Error caught in {root}: {e}")
    return renamed, failed


base_folder = r"/content/sec_filings"
renamed_count, failed_count = rename_txt_to_html(base_folder)

# Only claim success when no rename failed.
if failed_count:
    print(f"Renamed {renamed_count} file(s); {failed_count} rename(s) failed.")
else:
    print(f"File extensions were changed successfully! ({renamed_count} file(s) renamed)")

**Get the financial data and store it in csv files**

In [None]:
import pandas as pd
from sec_api import XbrlApi
from google.colab import userdata

# Read the sec-api key from the Colab user secret named 'SEC_API_KEY'.
SEC_API_KEY = userdata.get('SEC_API_KEY')
# Client used further down to convert a filing's XBRL data to JSON.
xbrlApi = XbrlApi(SEC_API_KEY)

def get_income_statement(xbrl_json):
    """Build an income-statement table from a filing's XBRL-JSON.

    Args:
        xbrl_json: Mapping whose ``'StatementsOfIncome'`` entry maps each
            US GAAP item name to a list of fact dicts. A fact is used when it
            has no ``'segment'`` key, has a ``'value'`` and has a ``'period'``
            with both ``'startDate'`` and ``'endDate'``.

    Returns:
        pandas.DataFrame with one row per US GAAP item and one column per
        period, labelled ``"<startDate>-<endDate>"``. The frame is empty when
        ``'StatementsOfIncome'`` is missing or empty (instead of raising
        ``KeyError``).
    """
    income_statement_store = {}

    # Tolerate filings whose JSON has no (or an empty) income statement.
    statements = xbrl_json.get('StatementsOfIncome') or {}

    # iterate over each US GAAP item in the income statement
    for us_gaap_item, facts in statements.items():
        # period label -> value; a dict keeps insertion order and makes the
        # duplicate check O(1) instead of scanning a list.
        period_values = {}

        for fact in facts:
            # only consider items without segment. not required for our analysis.
            if 'segment' in fact:
                continue

            period = fact.get('period') or {}
            start_date = period.get('startDate')
            end_date = period.get('endDate')
            # Skip facts that cannot be placed in a "<start>-<end>" column
            # or that carry no value, rather than failing on a KeyError.
            if start_date is None or end_date is None or 'value' not in fact:
                continue

            # ensure no index duplicates are created: the first fact for a period wins
            period_values.setdefault(start_date + '-' + end_date, fact['value'])

        income_statement_store[us_gaap_item] = pd.Series(
            list(period_values.values()), index=list(period_values.keys())
        )

    income_statement = pd.DataFrame(income_statement_store)
    # switch columns and rows so that US GAAP items are rows and each column header represents a date range
    return income_statement.T

# Root of the downloaded filings. The ticker is the first folder level below it
# and the accession-number folders sit one level deeper
# (presumably <ticker>/<form type>/<accession number> — confirm against the downloader's layout).
FILINGS_ROOT = '/content/sec_filings/sec-edgar-filings'
# Write the CSV files to the same absolute folder the vector-upload cell reads
# them from, so the result does not depend on the current working directory.
OUTPUT_DIR = '/content/sec_filings'

# NOTE(review): not used in this cell; kept in case other cells reference it.
file_list = []

failed_companies = []
for root, dirs, _ in os.walk(FILINGS_ROOT):
    # Only act on the "<ticker>/<form>" level, whose sub-folders are named
    # after accession numbers.
    rel_parts = os.path.relpath(root, FILINGS_ROOT).split(os.sep)
    if len(rel_parts) != 2:
        continue

    accession_numbers = sorted(dirs)
    dirs[:] = []  # nothing to parse below the accession folders; stop descending
    if not accession_numbers:
        continue

    ticker = rel_parts[0]
    # Fall back to the raw ticker when it has no short name in `tickers`.
    stock_name = tickers.get(ticker, ticker)

    try:
        # One income statement per filing, however many filings were downloaded.
        statements = [
            get_income_statement(xbrlApi.xbrl_to_json(accession_no=accession_no))
            for accession_no in accession_numbers
        ]

        income_statements_merged = pd.concat(statements, axis=1, sort=False)

        # The same period can come from more than one filing: keep the first
        # occurrence of every column, then order the columns by period label.
        income_statements_merged = income_statements_merged.loc[
            :, ~income_statements_merged.columns.duplicated()
        ]
        income_statements = income_statements_merged.reindex(
            sorted(income_statements_merged.columns), axis=1
        )

        income_statements.to_csv(os.path.join(OUTPUT_DIR, f'{stock_name}_income_statements.csv'))
        print(f"Parsed income statement for {stock_name}")
    except Exception as e:
        # A failing API call or an unparsable filing should not stop the
        # remaining companies from being processed.
        failed_companies.append(stock_name)
        print(f"Error parsing income statements for {stock_name}: {e}")

if failed_companies:
    print(f"Finished with errors for: {', '.join(failed_companies)}")
else:
    print("All statements were successfully parsed")


**Delete existing qdrant collection(s) if required**

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import CollectionDescription
from google.colab import userdata

# Qdrant Cloud endpoint and API key, taken from the Colab user secrets
QDRANT_URL = userdata.get('QDRANT_URL')
API_KEY = userdata.get('QDRANT_API_KEY')

# Open the connection to Qdrant Cloud
client = QdrantClient(url=QDRANT_URL, api_key=API_KEY)

# Descriptions of every collection that currently exists on the cluster
collections = client.get_collections().collections  # List[CollectionDescription]

# WARNING: this removes ALL collections, one after the other
for name in (entry.name for entry in collections):
    print(f"Deleting collection: {name}")
    client.delete_collection(collection_name=name)

print("✅ All collections deleted.")


**Create vectors from the downloaded data and upload them to Qdrant**

In [None]:
import os
import re
import sys
import pandas as pd
from google.colab import userdata
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer
from qdrant_client.models import PointStruct, VectorParams, Distance, PayloadSchemaType#, PayloadIndexParams

# --- Qdrant config ---
# Endpoint and API key are read from the Colab user secrets.
QDRANT_URL = userdata.get('QDRANT_URL')
QDRANT_API_KEY = userdata.get('QDRANT_API_KEY')
# Name of the collection that this cell (re)creates and fills.
COLLECTION_NAME = "Company_Finances"

# 🔹 Metric descriptions dictionary
# Maps a US GAAP item name (looked up as a row label in the per-company CSV
# files) to a short plain-English description. The descriptions are the texts
# that get embedded; the keys decide which CSV rows are uploaded.
metric_descriptions = {
    "RevenueFromContractWithCustomerExcludingAssessedTax": "Revenue from contracts with customers",
    "CostOfRevenue": "Cost of producing and delivering products",
    "ResearchAndDevelopmentExpense": "Expenses for research and development",
    "SellingAndMarketingExpense": "Expenses related to selling and marketing",
    "GeneralAndAdministrativeExpense": "General and administrative overhead",
    "CostsAndExpenses": "Total operating costs and expenses",
    "OperatingIncomeLoss": "Net operating income or loss",
    "NonoperatingIncomeExpense": "Non-operating income or expense",
    "IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest": "Income before taxes from continuing operations",
    "IncomeTaxExpenseBenefit": "Taxes paid or refunded",
    "NetIncomeLoss": "Net income after taxes",
    "EarningsPerShareBasic": "Earnings per share (basic)",
    "EarningsPerShareDiluted": "Earnings per share (diluted)"
}

# 🔹 Load the BGE Large embedding model
# (the collection below is created with 1024-dimensional vectors to hold its output)
model = SentenceTransformer("BAAI/bge-large-en")

# 🔹 Connect to Qdrant
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# 🔹 Embed with instruction prompt
def embed(texts):
    """Encode one text or a list of texts with the loaded embedding model.

    Each text is prefixed with the retrieval instruction before it is passed
    to ``model.encode``, which is asked to normalize the embeddings.

    Args:
        texts: A single string, or a list of strings.

    Returns:
        The result of ``model.encode`` for the prefixed texts (one embedding
        per input text, in input order).
    """
    batch = [texts] if isinstance(texts, str) else texts
    prompt_texts = []
    for t in batch:
        prompt_texts.append(f"Represent this sentence for searching relevant passages: {t}")
    return model.encode(prompt_texts, normalize_embeddings=True)

# 🔹 Create (or recreate) the Qdrant collection
# Start from a clean slate: drop a previous collection of the same name, if any.
collection_already_there = client.collection_exists(collection_name=COLLECTION_NAME)
if collection_already_there:
    try:
        client.delete_collection(collection_name=COLLECTION_NAME)
    except Exception as e:
        # Without a successful delete the collection cannot be recreated: stop here.
        print(f"Error deleting collection: {e}")
        sys.exit("Execution stopped: Vector deletion failed")

# 1024-dimensional vectors compared by cosine distance.
vector_settings = VectorParams(size=1024, distance=Distance.COSINE)
client.create_collection(collection_name=COLLECTION_NAME, vectors_config=vector_settings)

# ✅ Keyword index on the "company" payload field, used for filtering
client.create_payload_index(
    collection_name=COLLECTION_NAME,
    field_name="company",
    field_schema=PayloadSchemaType.KEYWORD,
)

print(f"Collection '{COLLECTION_NAME}' created.")

# 🔹 Embed descriptions and upload
import uuid  # stdlib; imported here so this block is self-contained

# Folder holding the per-company "<Company>_income_statements.csv" files.
CSV_DIR = '/content/sec_filings'

# One embedding per metric description, in the iteration order of metric_descriptions.
# NOTE(review): embed() prepends the retrieval instruction to these passages too;
# the BGE model card describes the instruction as a query-side prefix — confirm
# whether prefixing the indexed descriptions is intended.
descriptions = list(metric_descriptions.values())
embeddings = embed(descriptions)

income_statement = None
for file in sorted(os.listdir(CSV_DIR)):
    if not file.endswith('.csv'):
        continue

    # File names look like "<Company>_income_statements.csv".
    company = file.split('_')[0]

    # The first CSV column holds the US GAAP item names: use it as the row index.
    income_statement = pd.read_csv(os.path.join(CSV_DIR, file), index_col=0)

    for col in income_statement.columns:
        # Strip "$" and thousands separators (raw string: "\$" is not a valid
        # escape in a normal string literal), then convert to numbers.
        cleaned = income_statement[col].astype(str).str.replace(r'[\$,]', '', regex=True)
        # Cells that are still not numeric become NaN explicitly (instead of a
        # silently swallowed exception) and are left out of the payload below.
        income_statement[col] = pd.to_numeric(cleaned, errors='coerce')

    points = []
    for i, (metric, desc) in enumerate(metric_descriptions.items()):
        if metric not in income_statement.index:
            continue

        # Built-in hash() of a str is salted per Python process, so it yields a
        # different id on every kernel restart; a name-based UUID is stable and
        # does not squeeze the ids into a 10**8 range.
        point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{company}_{metric}"))

        # Periods without a value are omitted rather than stored as NaN,
        # which is not valid JSON.
        values = income_statement.loc[metric].dropna().to_dict()

        points.append(
            PointStruct(
                id=point_id,
                # Plain Python floats, whatever array type embed() returned.
                vector=[float(x) for x in embeddings[i]],
                payload={
                    "company": company,
                    "metric": metric,
                    "description": desc,
                    "values": values
                }
            )
        )

    if points:
        client.upsert(collection_name=COLLECTION_NAME, points=points)
        print(f"✅ Uploaded {len(points)} metrics to Qdrant for {company}.")
    else:
        print(f"No points to upload for {file}")

print("All dataframes were successfully parsed")

**Search the vector collection with a natural-language query**

In [None]:
import re
from google.colab import userdata
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue

# === Qdrant Configuration ===
# Collection queried by the search below
COLLECTION_NAME = "Company_Finances"

# Connection details come from the Colab user secrets
QDRANT_URL = userdata.get('QDRANT_URL')
QDRANT_API_KEY = userdata.get('QDRANT_API_KEY')

# === Load embedding model ===
model = SentenceTransformer("BAAI/bge-large-en")

# === Initialize Qdrant Client ===
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

def embed(texts):
    """Return normalized embeddings for a query string or a list of strings.

    A lone string is treated as a one-element list. Every text gets the
    retrieval instruction prepended before ``model.encode`` is called.

    Args:
        texts: A single string, or a list of strings.

    Returns:
        The result of ``model.encode`` (one embedding per input text).
    """
    items = texts if not isinstance(texts, str) else [texts]
    prompts = list(
        map(lambda t: f"Represent this sentence for searching relevant passages: {t}", items)
    )
    return model.encode(prompts, normalize_embeddings=True)

# === Extract years from query (e.g., 2021, 2022) ===
def extract_years_from_query(query):
    """Return the four-digit years 2010–2029 mentioned in ``query``.

    Only stand-alone numbers are matched (word boundaries on both sides), so
    ``"12021"`` yields nothing.

    Args:
        query: Free-text query string.

    Returns:
        List of year strings in order of appearance (repeats are kept).
    """
    # 20[12][0-9] already covers 2020-2025, so the former extra alternative
    # "202[0-5]" could never be the one that matched and has been dropped.
    return re.findall(r"\b(20[12][0-9])\b", query)

# === Extract company from query based on known list ===
def extract_company_from_query(query, company_list):
    """Return the first company from ``company_list`` named in ``query``.

    Matching is case-insensitive and on whole words only, so "Meta" is not
    found inside "metal" and "Visa" is not found inside "advisable".

    Args:
        query: Free-text query string.
        company_list: Iterable of company names, checked in order.

    Returns:
        The matching name exactly as it appears in ``company_list``, or
        ``None`` when no company is mentioned.
    """
    for company in company_list:
        if not company:
            continue  # an empty name would match every query
        # Look-arounds instead of \b so names that start or end with a
        # non-word character still work; re.escape keeps the name literal.
        pattern = rf"(?<!\w){re.escape(company)}(?!\w)"
        if re.search(pattern, query, flags=re.IGNORECASE):
            return company
    return None

# === Main function to search Qdrant ===
def _period_matches_years(period, years):
    """Tell whether a payload period key belongs to one of ``years``.

    Keys of the form "YYYY-MM-DD-YYYY-MM-DD" (start date, end date) are
    matched on the year of the END date, so a period that merely starts in a
    requested year is not reported for it. Any other key falls back to a plain
    substring test.
    """
    period = str(period)
    match = re.fullmatch(r"\d{4}-\d{2}-\d{2}-(\d{4})-\d{2}-\d{2}", period)
    if match:
        return match.group(1) in years
    return any(y in period for y in years)


def search_financials(query, known_companies=None):
    """Search the Qdrant collection for metrics relevant to ``query`` and print them.

    The query is embedded, optionally restricted to a company named in the
    query, and the five best-scoring points are printed. When the query
    mentions years, only the periods ending in those years are listed.

    Args:
        query: Natural-language question.
        known_companies: Optional list of company names to look for in the
            query; a hit is applied as a filter on the "company" payload field.

    Returns:
        None. Results are printed.
    """
    query_vector = embed(query)[0]
    filter_years = extract_years_from_query(query)
    filter_company = extract_company_from_query(query, known_companies or [])

    search_filter = None
    if filter_company:
        search_filter = Filter(
            must=[FieldCondition(key="company", match=MatchValue(value=filter_company))]
        )

    # QdrantClient.search is deprecated in favour of query_points; use the
    # replacement when the installed client has it, else fall back.
    if hasattr(client, "query_points"):
        response = client.query_points(
            collection_name=COLLECTION_NAME,
            query=[float(x) for x in query_vector],
            query_filter=search_filter,
            limit=5,
        ).points
    else:
        response = client.search(
            collection_name=COLLECTION_NAME,
            query_vector=query_vector,
            limit=5,
            query_filter=search_filter
        )

    print(f"\n🔍 Results for: '{query}'")
    if filter_company:
        print(f"🏢 Filtered by company: {filter_company}")
    print(f"📅 Showing: {', '.join(filter_years) if filter_years else 'All Years'}")

    for result in response:
        payload = result.payload
        metric = payload.get("metric", "N/A")
        description = payload.get("description", "N/A")
        values = payload.get("values", {})

        print(f"\n• {metric} — {description} (score: {result.score:.4f})")
        for period, val in values.items():
            if not filter_years or _period_matches_years(period, filter_years):
                formatted_val = f"${val:,.2f}" if isinstance(val, (int, float)) else val
                print(f"   - {period}: {formatted_val}")

# === Example ===
# Short company names, as stored in the "company" payload field
known_companies = [
    "Apple",
    "Microsoft",
    "Amazon",
    "Google",
    "Meta",
    "Berkshire",
    "Johnson",
    "Visa",
    "ProcterGamble",
    "Tesla",
]
example_query = "Which company had highest profit in 2021"
search_financials(query=example_query, known_companies=known_companies)
