# Kikiola 10-K Embedding with Hugging Face 🤗

## Installing Required Libraries

In [None]:
!pip install requests pydantic transformers torch

## Starting the Server

Before running the Kikiola 10-K Embedding code, start the server by running the following command in your terminal. The server handles the storage of the generated embeddings.

```sh
go run cmd/main.go
```

## Kikiola 10-K Embedding Code

In [None]:
import requests
import uuid
from typing import List, Optional
from pydantic import BaseModel, Field
from collections import defaultdict
from transformers import AutoTokenizer, AutoModel
import torch

class Kikiola10KEmbedding:
    """Fetch 10-K XBRL facts from SEC EDGAR, embed them, and store the vectors.

    The pipeline (see ``process_10k``) is:

    1. collect the accession numbers of the company's recent 10-K filings,
    2. download the company's XBRL "company facts" and group the USD facts
       that belong to those filings by period and concept,
    3. serialise statement containers into ``self.items_map``,
    4. embed every non-blank entry of ``self.items_map``,
    5. POST each embedding record to the local vector server.
    """

    # Seconds passed as ``timeout`` to every HTTP request so that a stalled
    # connection raises instead of blocking the notebook forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Initialise empty state and load the tokenizer and model."""
        self.embeddings = []
        self.items_map = {}
        # Initialised here so get_xbrl_data() can be called before
        # process_10k() without raising AttributeError.
        self.accession_numbers = set()
        # NOTE(review): SEC EDGAR asks automated clients to identify
        # themselves in the User-Agent header; confirm this value is acceptable.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:93.0) Gecko/20100101 Firefox/93.0'}
        self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

    @staticmethod
    def _select_10k_accessions(forms, accession_numbers):
        """Return the set of accession numbers whose parallel form is "10-K"."""
        # zip() stops at the shorter list, so mismatched array lengths cannot
        # raise IndexError.
        return {
            accession
            for form, accession in zip(forms, accession_numbers)
            if form == "10-K"
        }

    def _group_10k_facts(self, gaap_data):
        """Group USD facts from known 10-K filings as ``{period: {concept: [...]}}``."""
        financials_dict = defaultdict(lambda: defaultdict(list))

        for concept, concept_data in gaap_data.items():
            usd_facts = (concept_data.get("units") or {}).get("USD")
            if not usd_facts:
                continue

            for fact in usd_facts:
                if fact.get("form") != '10-K':
                    continue
                if fact.get("accn") not in self.accession_numbers:
                    continue

                fp = fact.get("fp")
                fy = fact.get("fy")
                if fp is None or fy is None:
                    # No usable period key can be built for this fact.
                    continue

                year = f"{fp}{fy}"
                financials_dict[year][concept].append(
                    {"value": fact.get("val"), "year": year}
                )

        return financials_dict

    def get_accession_numbers(self, cik: str):
        """Return the accession numbers of the company's recent 10-K filings.

        Args:
            cik: Company CIK, interpolated verbatim into the submissions URL.

        Returns:
            A set of accession-number strings (empty if none are listed).

        Raises:
            requests.exceptions.JSONDecodeError: If the response body is not
                JSON; diagnostics are printed before re-raising.
            requests.exceptions.RequestException: On connection failure or
                timeout.
        """
        submissions_url = f"https://data.sec.gov/submissions/CIK{cik}.json"
        submissions_response = requests.get(
            submissions_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )

        try:
            payload: dict = submissions_response.json()
        except requests.exceptions.JSONDecodeError:
            print(f"Error decoding JSON response from {submissions_url}:")
            print(f"Status code: {submissions_response.status_code}")
            print(f"Response content: {submissions_response.content}")
            raise

        recent_filings: dict = (payload.get("filings") or {}).get("recent") or {}
        forms: List[str] = recent_filings.get("form") or []
        accession_numbers: List[str] = recent_filings.get("accessionNumber") or []

        return self._select_10k_accessions(forms, accession_numbers)

    def get_xbrl_data(self, cik: str):
        """Download the company's XBRL facts and group the 10-K USD values.

        Only facts whose ``form`` is "10-K" and whose ``accn`` is in
        ``self.accession_numbers`` are kept. Facts without ``fp`` or ``fy``
        are skipped because no period key can be built for them.

        Args:
            cik: Company CIK, interpolated verbatim into the company-facts URL.

        Returns:
            A nested ``defaultdict`` mapping period (``fp`` + ``fy``) to
            concept name to a list of ``{"value": ..., "year": ...}`` dicts.
            Empty when the response carries no ``us-gaap`` facts.
        """
        xbrl = requests.get(
            f"https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json",
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        json_data = xbrl.json()
        # Tolerate responses where "facts" or "us-gaap" is absent or null.
        gaap_data = (json_data.get("facts") or {}).get("us-gaap") or {}

        return self._group_10k_facts(gaap_data)

    def extract_income_statements(self, financials_dict):
        """Store a JSON income-statement container in ``self.items_map``.

        ``financials_dict`` is accepted but not read yet, so the stored JSON
        always contains an empty ``income_statements`` list.
        """
        class IncomeStatement(BaseModel):
            # default=None makes the Optional fields genuinely optional;
            # without a default pydantic v2 treats them as required.
            period: Optional[str] = None
            revenue: Optional[float] = Field(default=None, description="Revenue")
            cost_of_revenue: Optional[float] = Field(default=None, description="Cost of revenue")
            general_and_administrative_expense: Optional[float] = Field(default=None, description="General and administrative expenses")
            research_and_development_expense: Optional[float] = Field(default=None, description="Research and development expenses")
            selling_and_marketing_expense: Optional[float] = Field(default=None, description="Selling and marketing expenses")
            operating_income_loss: Optional[float] = Field(default=None, description="Operating income loss")
            net_income_loss: Optional[float] = Field(default=None, description="Net income or loss")

        class IncomeStatements(BaseModel):
            income_statements: List[IncomeStatement]

        income_statements = IncomeStatements(income_statements=[])
        self.items_map["income_statements"] = income_statements.model_dump_json(indent=2)

    def extract_balance_sheets(self, financials_dict):
        """Store a JSON balance-sheet container in ``self.items_map``.

        ``financials_dict`` is accepted but not read yet, so the stored JSON
        always contains an empty ``balance_sheets`` list.
        """
        class BalanceSheet(BaseModel):
            # default=None makes the Optional fields genuinely optional;
            # without a default pydantic v2 treats them as required.
            period: Optional[str] = None
            cash_and_cash_equivalents: Optional[float] = Field(default=None, description="Cash and cash equivalents")
            short_term_investments: Optional[float] = Field(default=None, description="Short-term investments")
            total_current_assets: Optional[float] = Field(default=None, description="Total current assets")
            goodwill: Optional[float] = Field(default=None, description="Goodwill")
            total_assets: Optional[float] = Field(default=None, description="Assets")
            current_accrued_liabilities: Optional[float] = Field(default=None, description="Current accrued liabilities")
            current_accounts_payable: Optional[float] = Field(default=None, description="Current accounts payable")
            long_term_debt: Optional[float] = Field(default=None, description="Long term debt")
            operating_lease_liabilities: Optional[float] = Field(default=None, description="Operating lease liabilities")
            other_non_current_liabilities: Optional[float] = Field(default=None, description="Other non-current liabilities")
            total_liabilities: Optional[float] = Field(default=None, description="Liabilities")
            stockholders_equity: Optional[float] = Field(default=None, description="Stockholders equity")

        class BalanceSheets(BaseModel):
            balance_sheets: List[BalanceSheet]

        balance_sheets = BalanceSheets(balance_sheets=[])
        self.items_map["balance_sheets"] = balance_sheets.model_dump_json(indent=2)

    def extract_cash_flow_statements(self, financials_dict):
        """Store a JSON cash-flow-statement container in ``self.items_map``.

        ``financials_dict`` is accepted but not read yet, so the stored JSON
        always contains an empty ``cash_flow_statements`` list.
        """
        class CashFlowStatement(BaseModel):
            # default=None makes the Optional fields genuinely optional;
            # without a default pydantic v2 treats them as required.
            period: Optional[str] = None
            net_income: Optional[float] = Field(default=None, description="Net income")
            depreciation_and_amortization: Optional[float] = Field(default=None, description="Depreciation and amortization")
            shared_based_compensation: Optional[float] = Field(default=None, description="Shared or stock-based compensation")
            net_cash_from_operating_activities: Optional[float] = Field(default=None, description="Net cash provided by operating activities")
            net_cash_from_investing_activities: Optional[float] = Field(default=None, description="Net cash provided by investing activities")
            plant_property_and_equipment: Optional[float] = Field(default=None, description="Payments to acquire property plant and equipment")
            net_cash_from_financing_activities: Optional[float] = Field(default=None, description="Net cash provided by financing activities")

        class CashFlowStatements(BaseModel):
            cash_flow_statements: List[CashFlowStatement]

        cash_flow_statements = CashFlowStatements(cash_flow_statements=[])
        self.items_map["cash_flow_statements"] = cash_flow_statements.model_dump_json(indent=2)

    def generate_embeddings(self) -> None:
        """Embed every non-blank entry of ``self.items_map``.

        Replaces ``self.embeddings`` with one record per item. Each text is
        tokenized (truncated to 512 tokens), run through the model with
        gradients disabled, and reduced by averaging ``last_hidden_state``
        over dimension 1. All records of one call share a fresh UUID prefix
        in their ``ID``.
        """
        self.embeddings = []
        document_uuid = str(uuid.uuid4())

        for item_name, item_text in self.items_map.items():
            if not item_text.strip():
                continue

            encoded_input = self.tokenizer(item_text, return_tensors='pt', truncation=True, max_length=512, padding=True)
            with torch.no_grad():
                model_output = self.model(**encoded_input)
            embeddings = model_output.last_hidden_state.mean(dim=1).squeeze().tolist()

            self.embeddings.append({
                "ID": f"{document_uuid}_{item_name.replace(' ', '')}",
                "Embedding": embeddings,
                "Metadata": {
                    "name": "sec_filing",
                    "category": "securities"
                },
                "Text": item_text
            })

    def store_embeddings(self) -> None:
        """POST each record in ``self.embeddings`` to the local vector server.

        Failures (non-200 status or any ``requests`` exception, including a
        timeout) are printed and the loop moves on to the next record; nothing
        is raised.
        """
        server_url = "http://localhost:3400/vectors"
        for vector_data in self.embeddings:
            print(f"Vector data: {vector_data}")
            try:
                response = requests.post(server_url, json=vector_data, timeout=self.REQUEST_TIMEOUT)
                if response.status_code == 200:
                    print(f"Embeddings stored for {vector_data['ID']}. Status code: {response.status_code}")
                else:
                    print(f"Error storing embeddings for {vector_data['ID']}. Status code: {response.status_code}")
                    print(f"Error response: {response.text}")
            except requests.exceptions.RequestException as e:
                print(f"Error storing embeddings for {vector_data['ID']}: {e}")

    def process_10k(self, cik: str):
        """Run the whole pipeline for one company.

        Args:
            cik: Company CIK, passed unchanged to the SEC request helpers.
        """
        self.accession_numbers = self.get_accession_numbers(cik)
        financials_dict = self.get_xbrl_data(cik)

        self.extract_income_statements(financials_dict)
        self.extract_balance_sheets(financials_dict)
        self.extract_cash_flow_statements(financials_dict)

        self.generate_embeddings()
        self.store_embeddings()

# Build the pipeline object, run it end to end for CIK 0001559720, and report
# completion once process_10k() returns.
embedding = Kikiola10KEmbedding()
embedding.process_10k(cik="0001559720")
print("Kikiola 10-K Embeddings Completed.")