In [61]:
import os
from google import genai
from google.genai import types
import pathlib
import httpx
from dotenv import load_dotenv

# Load variables from a local .env file into the environment; the API keys
# used in this notebook are read afterwards with os.getenv().
load_dotenv()

# System instruction handed to Gemini by summarize_document(). It fixes the
# eight-section markdown layout every generated summary is asked to follow and
# tells the model to answer "Not disclosed." for anything missing.
system_prompt = """
  You are a seasoned financial analyst reviewing a newly submitted financial document.
  Your task is to summarize the submission clearly and concisely for stakeholders such as investors, executives, or credit officers.

  Analyze the document and produce a structured summary using the format below.
  If data is not available, say "Not disclosed." Use clean markdown formatting.

  ---

  ### 📄 1. Document Overview
  - **Document Type**:  
  - **Company Name & Ticker (if public)**:  
  - **Reporting Period**:  
  - **Date of Submission / Release**:  
  - **Auditor (if applicable)**:  
  - **Currency**:  

  ---

  ### 📊 2. Key Financial Metrics
  - **Revenue**:  
  - **Operating Profit / EBIT**:  
  - **Net Income**:  
  - **EPS (Basic & Diluted)**:  
  - **Free Cash Flow**:  
  - **Key Ratios**:
    - Gross Margin  
    - Operating Margin  
    - Net Margin  
    - ROE / ROA  
    - Debt-to-Equity  
    - Current Ratio  

  ---

  ### 🔍 3. Performance Highlights
  - Revenue and margin drivers  
  - Cost trends (COGS, SG&A, R&D)  
  - Operational efficiency comments  

  ---

  ### 🧾 4. Balance Sheet Snapshot
  - **Cash & Equivalents**  
  - **Total Assets**  
  - **Total Liabilities**  
  - **Shareholder Equity**  
  - Notable changes in structure or working capital

  ---

  ### 💵 5. Cash Flow Overview
  - **Operating Activities**:  
  - **Investing Activities**:  
  - **Financing Activities**:  
  - Major capital movements  

  ---

  ### 📈 6. Forward Guidance / Outlook
  - Management guidance (if available)  
  - Risks or opportunities  

  ---

  ### ⚠️ 7. Analyst Notes & Red Flags
  - Auditor or regulatory concerns  
  - Related party or insider issues  
  - Liquidity warnings or covenant risks  

  ---

  ### 🧩 8. Additional Context
  - Strategic initiatives (e.g., M&A, restructuring)  
  - ESG or sustainability disclosures  
  - Industry comparison if relevant
"""

In [18]:
def summarize_document(document_path):
    """
    Produce a structured summary of a financial document with the Gemini API.

    The document is uploaded through the File API and the returned handle is
    sent, together with the module-level ``system_prompt``, in a single
    generate-content request.

    Args:
        document_path (str): Path to the financial document.

    Returns:
        str: Text of the model's response.
    """
    gemini = genai.Client(api_key=os.getenv("SUMMARIZER_API_KEY"))

    # Upload first; the request below references the document by its handle.
    uploaded_doc = gemini.files.upload(file=document_path)

    # The shared system prompt defines the summary layout; temperature stays 0.
    generation_config = types.GenerateContentConfig(
        system_instruction=system_prompt,
        temperature=0,
    )

    result = gemini.models.generate_content(
        model="gemini-2.5-pro-exp-03-25",
        config=generation_config,
        contents=[uploaded_doc],
    )
    return result.text


# a function to write the summary to a file
def write_summary_to_file(summary, original_file_path):
    """
    Write a summary next to its source document, under a ``summaries`` folder.

    The output path is ``<dir of original>/summaries/<original stem>.txt``;
    the ``summaries`` directory is created when it does not exist yet.

    Args:
        summary (str): Summary of the document.
        original_file_path (str): Path to the original file.

    Returns:
        str: Path to the summary file.
    """
    # Same base name as the source document, with the extension swapped to .txt
    stem = os.path.splitext(os.path.basename(original_file_path))[0]
    summaries_dir = os.path.join(os.path.dirname(original_file_path), "summaries")

    # Create the summaries directory if it doesn't exist
    os.makedirs(summaries_dir, exist_ok=True)

    summary_file_path = os.path.join(summaries_dir, stem + ".txt")

    # Encode explicitly as UTF-8: the summaries contain emoji section headers,
    # which cannot be written when the platform's default text encoding is not
    # UTF-8 (open() otherwise falls back to the locale's encoding).
    with open(summary_file_path, "w", encoding="utf-8") as f:
        f.write(summary)

    return summary_file_path

#  function to summarize and write to file
def summarize_and_save(document_path):
    """
    Run the summarizer on one document and persist the result to disk.

    Args:
        document_path (str): Path to the financial document.

    Returns:
        str: Path to the summary file.
    """
    # The summary is generated first, then written alongside the source document.
    return write_summary_to_file(summarize_document(document_path), document_path)


In [15]:
# Summarize every PDF in the input folder.
# - sorted(): os.listdir() returns entries in arbitrary order, so sorting keeps
#   the processing order (and the printed log) reproducible between runs.
# - lower(): the suffix check also accepts files named "*.PDF".
for pdf_name in sorted(os.listdir("pdfs")):
    if pdf_name.lower().endswith(".pdf"):
        print(pdf_name)
        summarize_and_save(os.path.join("pdfs", pdf_name))
        print(f"Summarized {pdf_name}")
        print("--------------------------------")


EduFocal-Limited-2023-Financial-Statements.pdf
Summarized EduFocal-Limited-2023-Financial-Statements.pdf
--------------------------------
ONE-Audited-Financial-Statement-Year-Ended-31-August-2024-1 (1).pdf
Summarized ONE-Audited-Financial-Statement-Year-Ended-31-August-2024-1 (1).pdf
--------------------------------
Sagicor-Group-Jamaica-Limited-SJ-Audited-Financial-Statements-for-the-year-ended-December-31-2024.pdf
Summarized Sagicor-Group-Jamaica-Limited-SJ-Audited-Financial-Statements-for-the-year-ended-December-31-2024.pdf
--------------------------------
Sagicor-Group-Jamaica-Limited-SJ-Audited-Financial-Statements-for-the-Year-Ended-December-31-2023.pdf
Summarized Sagicor-Group-Jamaica-Limited-SJ-Audited-Financial-Statements-for-the-Year-Ended-December-31-2023.pdf
--------------------------------
NCB-Financial-Group-Limited-NCBFG-Audited-Financial-Statements-for-Year-Ended-September-30-2024-3.pdf
Summarized NCB-Financial-Group-Limited-NCBFG-Audited-Financial-Statements-for-Year-E

# Create Vector Store

In [22]:
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from vertexai.language_models import TextEmbeddingModel
import re

# Embedding function Chroma uses to embed the documents added to the collection.
google_ef  = embedding_functions.GoogleGenerativeAiEmbeddingFunction(api_key=os.getenv("SUMMARIZER_API_KEY"))

# Initialize the Chroma client
client = chromadb.PersistentClient()

# Create a collection for financial document summaries.
# get_or_create_collection is used instead of create_collection so that this
# cell can be re-run: the persistent client keeps the collection on disk, and
# create_collection raises once a collection with this name already exists.
collection = client.get_or_create_collection(
    name="fin_doc_summaries",
    embedding_function=google_ef,
    metadata={
        "hnsw:space": "cosine",
        "hnsw:search_ef": 100
    }
)

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
# Read the summaries and build ids, documents and metadata in ONE pass over the
# directory, so the three lists handed to collection.add() are guaranteed to
# line up entry by entry (a second os.listdir() call gives no such guarantee).
summaries_dir = "pdfs/summaries"
summary_ids = []
summaries = []
metadata = []
for file in sorted(os.listdir(summaries_dir)):
    summary_path = os.path.join(summaries_dir, file)
    if not os.path.isfile(summary_path):
        continue  # ignore sub-directories; only files can be read as summaries

    # The summaries contain emoji headers, so read them explicitly as UTF-8.
    with open(summary_path, "r", encoding="utf-8") as f:
        summaries.append(f.read())

    # splitext removes only the final extension, so a dot elsewhere in the
    # file name cannot truncate the id and make two files collide.
    summary_ids.append(os.path.splitext(file)[0])

    # Company name is taken as the text before the first "-" in the file name.
    company_name = file.split("-")[0]

    # Year = first 4-digit run in the file name, accepted only when it falls
    # in a plausible range (1900-2100); otherwise stored as "Unknown".
    year_match = re.search(r'\d{4}', file)
    year = "Unknown"
    if year_match and 1900 <= int(year_match.group()) <= 2100:
        year = year_match.group()

    metadata.append({
        "company_name": company_name,
        "year": year
    })

# Add the summaries to the collection
collection.add(
    ids=summary_ids,
    documents=summaries,
    metadatas=metadata
)

## Example Use

In [48]:
import os
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from google import genai
from google.genai import types
from vertexai.language_models import TextEmbeddingModel
import re
import dotenv

# Load variables from a .env file so the API key below can be read with os.getenv().
dotenv.load_dotenv()

# Embedding function handed to get_collection() below.
google_ef  = embedding_functions.GoogleGenerativeAiEmbeddingFunction(api_key=os.getenv("SUMMARIZER_API_KEY"))

# Initialize the Chroma client
client = chromadb.PersistentClient()

# Get the collection (looked up by name; this cell does not create it)
collection = client.get_collection(name="fin_doc_summaries", embedding_function=google_ef)

In [49]:
# Sample user question; the example cells below pass it to the retrieval and QA functions.
query = "How did EduFocal do in 2023?"

In [55]:
import json
from rapidfuzz import process, fuzz

json_file = "companies.json"
# JSON is read explicitly as UTF-8 rather than with the platform's default encoding.
with open(json_file, "r", encoding="utf-8") as f:
    companies = json.load(f)

# Create a lookup dictionary: every alias of a company (security name, short
# name, ticker symbol), lower-cased, maps to that company's short name.
lookup = {}
for company in companies:
    for name in [company["security_name"], company["short_name"], company["ticker_symbol"]]:
        if not name:
            continue  # a null/empty alias cannot be matched and would crash on .lower()
        lookup[name.lower()] = company["short_name"]  # Normalize case
        
def fuzzy_match_company(query):
    """Return the distinct short names whose aliases fuzzy-match the query.

    Every key of the module-level ``lookup`` is scored against the lower-cased
    query with ``fuzz.partial_ratio``; each alias scoring above 80 contributes
    the short name it maps to. Duplicates are collapsed through a set, so the
    order of the returned list is not meaningful.
    """
    return list({
        lookup[alias]
        for alias in lookup
        if fuzz.partial_ratio(query.lower(), alias) > 80  # Tune threshold
    })

def extract_companies_with_llm(query):
    """Fallback method: Use an LLM to extract company names.

    Args:
        query (str): The user's question.

    Returns:
        list[str]: The names found in the model's comma-separated reply, each
        with surrounding whitespace removed and empty entries dropped. An
        empty list is returned when the reply carries no text.
    """
    client = genai.Client(api_key=os.getenv("CHATBOT_API_KEY"))

    response = client.models.generate_content(
        model="gemini-2.0-flash-lite-001",
        config=types.GenerateContentConfig(
            system_instruction="Your task is to retrun a comma separated list of the companies mentioned in the user's prompt.",
            temperature=0,
        ),
        contents=[query]
    )

    # The reply may have no text at all; treat that as "no companies found"
    # instead of failing on None.split().
    raw_names = response.text or ""

    # Split on bare commas and trim each piece: the model is free to answer
    # "A,B", "A, B" or to end with a newline, and untrimmed names would never
    # match the keys of the lookup table downstream.
    return [name.strip() for name in raw_names.split(",") if name.strip()]

def get_companies_from_query(query):
    """Hybrid approach: Fuzzy matching first, then fallback to LLM.

    Args:
        query (str): The user's question.

    Returns:
        list: Short names (values of the module-level ``lookup``) of the
        companies recognised in the query, without duplicates. Empty when
        neither the fuzzy matcher nor the LLM fallback finds a known company.
    """
    companies_found = fuzzy_match_company(query)

    if not companies_found:  # If no match, try LLM
        print("No fuzzy match found, trying LLM")
        companies_found = []
        for name in extract_companies_with_llm(query):
            # Trim before the lookup: names coming back from the LLM may carry
            # stray spaces or a trailing newline and would otherwise be missed.
            short_name = lookup.get(name.strip().lower())
            # Several aliases can map to the same company; keep each one once,
            # like the fuzzy path does.
            if short_name is not None and short_name not in companies_found:
                companies_found.append(short_name)

    return companies_found

# Resolve the companies mentioned in the sample query. Note: query_chromadb_sorted()
# below performs this same lookup itself rather than reading this variable.
company_matches = get_companies_from_query(query)

def _year_sort_key(pair):
    """Sort key for a (metadata, document) pair: the year as an int, or -1 when it is missing or not numeric."""
    try:
        return int((pair[0] or {}).get("year"))
    except (TypeError, ValueError):
        return -1


def query_chromadb_sorted(collection, query, n_results=5):
    """
    Queries ChromaDB, retrieves matching documents, sorts them by year (most recent first),
    and formats the results into a context string for an LLM.

    When companies are recognised in the query, the search is restricted to
    documents whose ``company_name`` metadata is one of them; when none is
    recognised, the search runs over the whole collection. Documents whose
    ``year`` metadata is missing or not a number (e.g. "Unknown") are placed
    after all dated documents.

    Args:
        collection: ChromaDB collection object.
        query (str): User's query.
        n_results (int): Number of results to retrieve.

    Returns:
        Tuple[List[Tuple[Dict, str]], str]: 
            - Sorted list of (metadata, document) tuples.
            - A formatted context string with sorted document summaries.
    """

    # Step 1: Get Company Matches
    company_matches = get_companies_from_query(query)

    # Step 2: Query ChromaDB
    query_kwargs = {"query_texts": [query], "n_results": n_results}
    if company_matches:
        # Only add the filter when there is something to filter on: Chroma's
        # where-filter validation expects a non-empty list for "$in".
        query_kwargs["where"] = {"company_name": {"$in": company_matches}}
    results = collection.query(**query_kwargs)

    # Step 3: Extract Metadata & Documents (a field may be absent or None)
    metadata_results = results.get('metadatas') or []
    document_results = results.get('documents') or []

    # Step 4: Flatten Nested Metadata & Document Lists (one sub-list per query text)
    flattened_metadata = [item for sublist in metadata_results for item in sublist]
    flattened_documents = [item for sublist in document_results for item in sublist]

    # Step 5: Sort Metadata & Documents by Year (Descending). The key maps
    # non-numeric years to -1 so they sort last instead of raising ValueError.
    sorted_results = sorted(
        zip(flattened_metadata, flattened_documents),
        key=_year_sort_key,
        reverse=True
    )

    # Step 6: Generate Context String from Sorted Documents
    context = "\n\n".join([doc for _, doc in sorted_results])

    return sorted_results, context  # Returns both structured data & formatted context

def qa_bot(query: str, contexts: str):
    """
    Answer a question using only the supplied financial document summaries.

    The question and the summaries are combined into one user message and sent
    to Gemini together with a system prompt that restricts the model to the
    provided material.

    Args:
        query (str): The question to answer.
        contexts (str): The contexts of the financial document summaries.

    Returns:
        str: The answer to the question.
    """
    chat_client = genai.Client(api_key=os.getenv("CHATBOT_API_KEY"))

    # QA System Prompt
    qa_system_prompt = """
        You are an experienced financial analyst. Your primary task is to answer user questions about financial topics based *solely* on the content of the provided financial document summaries. Your goal is to provide not just factually accurate but also insightful responses that directly address the user's query by synthesizing information and identifying key relationships within the provided documents.

        **Strict Guidelines:**

        * **Insightful Analysis Based on Facts:** You must base your insights and analysis *exclusively* on the information explicitly stated or logically implied within the provided summaries. Aim to connect different pieces of information, identify trends, and explain the significance of the data in relation to the user's question.
        * **No Fabrication or External Information:** Under no circumstances should you make up information, invent scenarios, or bring in knowledge from outside the provided financial document summaries.
        * **Handling Questions Beyond the Summaries:**
            * If the answer to the user's question requires information or analysis not explicitly present or logically derivable from the provided summaries, respond with: "Based on the provided document summaries, I cannot offer a more detailed or insightful analysis on this specific aspect." Then guide the user on how to find the answer.
            * If the question is unrelated to the financial document summaries, respond with: "This question falls outside the scope of the provided financial document summaries, and therefore I cannot offer an insightful response."
        * **Handling Unclear Questions:** If the user's question is ambiguous or lacks sufficient detail to provide an insightful response, politely ask for clarification. For example: "To provide a more insightful analysis, could you please specify which aspect of [topic] you are most interested in?" or "Could you please provide more context regarding [specific element] so I can offer a more insightful perspective based on the documents?"

        **Focus:** Provide concise yet comprehensive answers that directly address the user's query with insights derived solely from the provided financial document summaries. Aim to explain the "why" behind the numbers and trends where the information allows, without making assumptions or introducing external data.
    """

    # Single user message: the question followed by the retrieved summaries.
    user_message = f"Question: {query}\nContext: {contexts}"

    generation_config = types.GenerateContentConfig(
        system_instruction=qa_system_prompt,
        temperature=0,
    )
    reply = chat_client.models.generate_content(
        model="gemini-2.5-pro-exp-03-25",
        config=generation_config,
        contents=[user_message],
    )
    return reply.text

In [56]:
# Example Usage: retrieve the summaries matching the sample query. The call
# returns the (metadata, document) pairs sorted by year, newest first, and the
# same documents joined into one context string.
sorted_results, context = query_chromadb_sorted(collection, query)

# Optional: uncomment to inspect each retrieved summary with its metadata
# for meta, doc in sorted_results:
#     print(f"Year: {meta['year']} | Company: {meta['company_name']} | Summary: {doc}")
#     print("--------------------------------")

# Print the context string that is later passed to the QA model
print("\n--- Context for LLM ---\n")
print(context)


--- Context for LLM ---

Okay, here is the structured summary of the EduFocal Limited Consolidated Financial Statements for the year ended December 31, 2023.

---

### 📄 1. Document Overview
- **Document Type**: Consolidated Financial Statements (Audited)
- **Company Name & Ticker (if public)**: EduFocal Limited (JSE: LEARN)
- **Reporting Period**: Year Ended December 31, 2023
- **Date of Submission / Release**: June 19, 2024 (Based on Auditor's Report date and Board Approval date)
- **Auditor (if applicable)**: Baker Tilly
- **Currency**: Jamaican Dollars (JMD / $)

---

### 📊 2. Key Financial Metrics
- **Revenue**: $263,542,463 (up 40.6% from $187,436,765 in 2022)
- **Operating Profit / EBIT**: ($35,156,692) Operating Loss (vs $3,726,385 Operating Profit in 2022)
- **Net Income**: ($79,480,386) Net Loss (vs ($178,809,735) Net Loss in 2022)
- **EPS (Basic & Diluted)**: Basic: ($0.12) (vs ($0.29) in 2022). Diluted: Not disclosed.
- **Free Cash Flow**: ($4,045,123) (Calculated as Opera

In [57]:
# Ask the QA model the sample question, passing the retrieved context string.
answer = qa_bot(query, context)
print(answer)

Based on the provided financial statement summaries for 2023 and 2022, here's an analysis of EduFocal Limited's performance in 2023:

EduFocal's performance in 2023 presented a mixed picture characterized by strong revenue growth offset by significant operational challenges and deteriorating financial health.

1.  **Revenue Growth:** The company achieved substantial top-line growth, with revenue increasing by 40.6% to $263.5 million from $187.4 million in 2022. This indicates successful market penetration or expansion of services.

2.  **Profitability Challenges:**
    *   **Operating Loss:** Despite higher revenue, EduFocal swung from an operating profit of $3.7 million in 2022 to a significant operating loss of $35.2 million in 2023. This was driven by a sharp 62.6% increase in Administrative & Operating Expenses, which outpaced revenue growth. Key contributors to this expense surge were a very large increase in Bad Debt expense ($74.7M vs $9.0M) and higher Staff Costs ($98.3M vs $63

# Using BigQuery as the Vector Store

In [16]:
import os
from google.cloud import aiplatform
from google.cloud.aiplatform.gapic.schema import predict

# --- Configuration ---
# These four values are the arguments passed to generate_embedding_from_file()
# in the execution cell below.
PROJECT_ID = "jse-datasphere"  # Your Google Cloud project ID
LOCATION = "us-central1"      # The region for Vertex AI endpoint.
MODEL_NAME = "text-embedding-004" # Or "textembedding-gecko@003", "text-multilingual-embedding-002", etc.
TEXT_FILE_PATH = "pdfs/summaries/ONE-2023-Audited-Financial-Statement-Year-Ended-31-August-2023.txt" # Path to your local text file
# --- End Configuration ---

def generate_embedding_from_file(project_id: str, location: str, model_name: str, file_path: str):
    """Generates text embeddings from a local text file using Vertex AI.

    Args:
        project_id: Google Cloud project passed to ``aiplatform.init``.
        location: Region passed to ``aiplatform.init``.
        model_name: Name given to ``TextEmbeddingModel.from_pretrained``.
        file_path: Path of the UTF-8 text file whose whole content is embedded.

    Returns:
        The ``values`` of the first embedding returned by the model, or None
        when the file is missing, is empty/whitespace-only, the model returns
        no embedding, or any other exception is raised inside the try block
        (each case prints a message).
    """
    print(f"Initializing Vertex AI client for project {project_id} in {location}...")
    aiplatform.init(project=project_id, location=location)

    try:
        print(f"Reading text content from: {file_path}")
        with open(file_path, 'r', encoding='utf-8') as source_file:
            document_text = source_file.read()

        # Nothing to embed: bail out before the model is loaded.
        if document_text.strip() == "":
            print("Error: Text file is empty or contains only whitespace.")
            return None

        print(f"Loading embedding model: {model_name}")
        # Imported here, after the input checks, so the model class is only
        # loaded when there is text to embed.
        from vertexai.language_models import TextEmbeddingModel

        embedding_model = TextEmbeddingModel.from_pretrained(model_name)

        print("Generating embedding...")
        # get_embeddings takes a list of texts; a single-item list is sent.
        returned_embeddings = embedding_model.get_embeddings([document_text])

        if not returned_embeddings or len(returned_embeddings) <= 0:
            print("Error: No embedding vector received.")
            return None

        # One input text was sent, so the first entry is the one of interest.
        vector = returned_embeddings[0].values
        print(f"Successfully generated embedding vector (length: {len(vector)})")
        return vector

    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except Exception as exc:
        print(f"An error occurred: {exc}")
        return None

# --- Main execution --

In [None]:
# Embed the configured summary file. `vector` is None when the function could
# not produce an embedding (it prints the reason in that case).
vector = generate_embedding_from_file(PROJECT_ID, LOCATION, MODEL_NAME, TEXT_FILE_PATH)

https://cloud.google.com/bigquery/docs/vector-index-text-search-tutorial