In [11]:
import os
from google import genai
from google.genai import types
import pathlib
import httpx
from dotenv import load_dotenv

load_dotenv()

True

In [13]:
# Debug environment variables
print("Environment variables loaded:")
print(f"GOOGLE_APPLICATION_CREDENTIALS: {os.getenv('GOOGLE_APPLICATION_CREDENTIALS')}")
print(f"Current working directory: {os.getcwd()}")
print(f"Files in current directory: {os.listdir('.')}")

# Check if .env file exists
env_path = os.path.join(os.getcwd(), '.env')
print(f"\n.env file exists: {os.path.exists(env_path)}")
if os.path.exists(env_path):
    print("Contents of .env file:")
    with open(env_path, 'r') as f:
        print(f.read())

Environment variables loaded:
GOOGLE_APPLICATION_CREDENTIALS: /app/credentials.json
Current working directory: /Users/galbraithelroy/Documents/jse-datasphere-chatbot
Files in current directory: ['.cursor', 'chroma', '.DS_Store', 'requirements.txt', 'Dockerfile', 'frontend.py', 'summarizer.ipynb', 'utils', '__pycache__', 'README.md', 'api.py', '.dockerignore', '.gitignore', '.env', 'companies.json', 'service-account', 'app.py', 'pdfs', 'docker-compose.yml', 'example_performance.pdf', 'conversation_history.txt', 'venv', '.env.example', '.git', 'conversation_history (1).txt', 'create_docker_files.bat', 'src']

.env file exists: True
Contents of .env file:
AWS_ACCESS_KEY_ID="AKIA5OYA47HDS66YLUM6"
AWS_SECRET_ACCESS_KEY="tNzXvW/Gf+Vewhsneo6td13aKbhgM1HS0YH8coRQ"
AWS_DEFAULT_REGION="us-east-1"
GOOGLE_APPLICATION_CREDENTIALS="service-account/credentials.json"
SUMMARIZER_API_KEY="AIzaSyDGYwVPjy4kLsgn-AencoKOoCwRKD3bL-8"
CHATBOT_API_KEY="AIzaSyB-i3zVwwPSo86VpZ2GGZ2RFsd73-tqsrQ"
METADATA_FILE_PAT

## Financial Document Prompt

In [2]:
system_prompt = """
  You are a seasoned financial analyst reviewing a newly submitted financial document.
  Your task is to summarize the submission clearly and concisely for stakeholders such as investors, executives, or credit officers.

  Analyze the document and produce a structured summary using the format below.
  If data is not available, say "Not disclosed." Use clean markdown formatting.

  ---

  ### 📄 1. Document Overview
  - **Document Type**:  
  - **Company Name & Ticker (if public)**:  
  - **Reporting Period**:  
  - **Date of Submission / Release**:  
  - **Auditor (if applicable)**:  
  - **Currency**:  

  ---

  ### 📊 2. Key Financial Metrics
  - **Revenue**:  
  - **Operating Profit / EBIT**:  
  - **Net Income**:  
  - **EPS (Basic & Diluted)**:  
  - **Free Cash Flow**:  
  - **Key Ratios**:
    - Gross Margin  
    - Operating Margin  
    - Net Margin  
    - ROE / ROA  
    - Debt-to-Equity  
    - Current Ratio  

  ---

  ### 🔍 3. Performance Highlights
  - Revenue and margin drivers  
  - Cost trends (COGS, SG&A, R&D)  
  - Operational efficiency comments  

  ---

  ### 🧾 4. Balance Sheet Snapshot
  - **Cash & Equivalents**  
  - **Total Assets**  
  - **Total Liabilities**  
  - **Shareholder Equity**  
  - Notable changes in structure or working capital

  ---

  ### 💵 5. Cash Flow Overview
  - **Operating Activities**:  
  - **Investing Activities**:  
  - **Financing Activities**:  
  - Major capital movements  

  ---

  ### 📈 6. Forward Guidance / Outlook
  - Management guidance (if available)  
  - Risks or opportunities  

  ---

  ### ⚠️ 7. Analyst Notes & Red Flags
  - Auditor or regulatory concerns  
  - Related party or insider issues  
  - Liquidity warnings or covenant risks  

  ---

  ### 🧩 8. Additional Context
  - Strategic initiatives (e.g., M&A, restructuring)  
  - ESG or sustainability disclosures  
  - Industry comparison if relevant
"""

## Non-financial Document Prompt

In [3]:
non_findoc_sys_prompt ="""
    You are a corporate analyst reviewing a non-financial document submitted by a company.  
    Your job is to extract and summarize the most relevant qualitative insights that provide context for investors, executives, or decision-makers.  
    These documents may include investor presentations, management letters, ESG disclosures, strategic plans, or earnings call transcripts.

    Analyze the document and generate a structured summary using the format below.  
    If any information is not provided, state "Not disclosed." Use clean markdown formatting.

    ---

    ### 📄 0. Document Overview
    - **Document Type**:  
    - **Company Name & Ticker (if public)**:  
    - **Date of Submission / Release**:  
    - **Author / Division**:  
    - **Purpose of Document**:  

    ---
    
    ### 🏢 1. Company Overview
    - **Company Name**:  
    - **Headquarters / Region**:  
    - **CEO / Key Executives**:  
    - **Board of Directors (if disclosed)**:  
    - **Business Segments / Focus Areas**:  
 
    ---

    ### 🧭 2. Strategic Themes & Objectives
    - **Primary goals or initiatives discussed**:  
    - **Target markets, segments, or geographies**:  
    - **Key operational or structural changes (e.g., M&A, partnerships)**:  

    ---

    ### 📊 3. Business Drivers & Risks
    - **Growth strategies or opportunity areas**:  
    - **Competitive positioning / market commentary**:  
    - **Risks, headwinds, or concerns raised**:  

    ---

    ### 🌿 4. ESG & Governance (if applicable)
    - **Environmental goals or initiatives**:  
    - **Social impact or workforce developments**:  
    - **Governance or compliance updates**:  

    ---

    ### 🔮 5. Forward Outlook & Implications
    - **Management tone or sentiment**:  
    - **Implications for upcoming financial performance**:  
    - **Signals for strategic or operational shifts**:  

    ---

    ### 📌 6. Analyst Notes & Takeaways
    - Alignment with prior financial results or guidance  
    - Notable changes in strategy, tone, or risk profile  
    - Items to monitor in future disclosures  

    ---
"""

## Summarizer Functions

In [4]:
def summarize_document(document_path, system_prompt):
    """
    Summarize a financial document using Gemini API.
    
    Args:
        document_path (str): Path to the financial document.
        system_prompt (str): The system prompt for the Gemini API.

    Returns:
        str: Summary of the document.
    """
    client = genai.Client(api_key=os.getenv("SUMMARIZER_API_KEY"))

    # Upload the PDF using the File API
    file = client.files.upload(
        file=document_path,
    )

    response = client.models.generate_content(
        model="gemini-2.5-pro-exp-03-25",
        config=types.GenerateContentConfig(
            system_instruction=system_prompt,
            temperature=0,
        ),
        contents=[file]
    )

    return response.text


# a function to write the summary to a file
def write_summary_to_file(summary, original_file_path):
    """
    Write a summary to a file
    
    Args:
        summary (str): Summary of the document.
        original_file_path (str): Path to the original file.
        
    Returns:
        str: Path to the summary file.
    """
    
    # get the file name from the original file path and change extension to .txt
    file_name = os.path.splitext(os.path.basename(original_file_path))[0] + '.txt'
    # get the directory from the original file path
    file_dir = os.path.dirname(original_file_path)
    # create the summaries directory path
    summaries_dir = os.path.join(file_dir, "summaries")
    
    # Create the summaries directory if it doesn't exist
    os.makedirs(summaries_dir, exist_ok=True)
    
    # create the summary file path
    summary_file_path = os.path.join(summaries_dir, file_name)
    
    with open(summary_file_path, "w") as f:
        f.write(summary)
        
    return summary_file_path

#  function to summarize and write to file
def summarize_and_save(document_path, system_prompt):
    """
    Summarize a financial document and write the summary to a file.
    
    Args:
        document_path (str): Path to the financial document.
        system_prompt (str): The system prompt for the Gemini API.
        
    Returns:
        str: Path to the summary file.
    """
    summary = summarize_document(document_path, system_prompt)
    return write_summary_to_file(summary, document_path)


In [18]:
for file in os.listdir("pdfs"):
    if file.endswith(".pdf"):
        print(file)
        # Check if the file name contains "financial"
        if "financial" in file.lower():
            print("Summarizing financial document")
            summarize_and_save(os.path.join("pdfs", file), system_prompt)
        else:
            print("Summarizing non-financial document")
            summarize_and_save(os.path.join("pdfs", file), non_findoc_sys_prompt)
        print(f"Summarized {file}")
        print("--------------------------------")


EduFocal-Limited-2023-Financial-Statements.pdf
Summarizing financial document
Summarized EduFocal-Limited-2023-Financial-Statements.pdf
--------------------------------
one_on_one_educational_services_limited-one-director_appointments_resignations_and_retirements-2023-08-20.pdf
Summarizing non-financial document
Summarized one_on_one_educational_services_limited-one-director_appointments_resignations_and_retirements-2023-08-20.pdf
--------------------------------
ONE-Audited-Financial-Statement-Year-Ended-31-August-2024-1 (1).pdf
Summarizing financial document
Summarized ONE-Audited-Financial-Statement-Year-Ended-31-August-2024-1 (1).pdf
--------------------------------
edufocal_limited-learn-annual_report-2022.pdf
Summarizing non-financial document
Summarized edufocal_limited-learn-annual_report-2022.pdf
--------------------------------
edufocal_limited-learn-management_appointments_resignations_and_retirements-2023-04-28.pdf
Summarizing non-financial document
Summarized edufocal_limi

# Create Vector Store

In [19]:
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from vertexai.language_models import TextEmbeddingModel
import re

google_ef  = embedding_functions.GoogleGenerativeAiEmbeddingFunction(api_key=os.getenv("SUMMARIZER_API_KEY"))

# Initialize the Chroma client
client = chromadb.PersistentClient()

# Delete the collection
client.delete_collection(name="doc_summaries")

# Create a collection for financial document summaries
collection = client.get_or_create_collection(
    name="doc_summaries",
    embedding_function=google_ef,
    metadata={
        "hnsw:space": "cosine",
        "hnsw:search_ef": 100
    }
)

In [20]:
import json
from rapidfuzz import fuzz
import re

json_file = "companies.json"
with open(json_file, "r") as f:
    companies = json.load(f)
    
# Create a lookup dictionary
lookup = {}
for company in companies:
    for name in [company["security_name"], company["short_name"], company["ticker_symbol"]]:
        lookup[name.lower()] = company["short_name"]  # Normalize case
        
def normalize_company_name(query):
    """Normalize company names using fuzzy matching."""
    for name in lookup.keys():
        score = fuzz.partial_ratio(query.lower(), name)
        if score > 80:
            return lookup[name]
    return None
    
# Read the summaries into a list
summaries = []
metadata = []
for file in os.listdir("pdfs/summaries"):
    with open(os.path.join("pdfs/summaries", file), "r") as f:
        # read the file and add the summary to the summaries list
        summaries.append(f.read())
        
        company_name = file.split("-")[0]
        company_name = normalize_company_name(company_name)
        
        # Extract year from filename with validation
        year_match = re.search(r'\d{4}', file)
        year = year_match.group() if year_match else "Unknown"
        if year != "Unknown":
            try:
                year_int = int(year)
                # Validate year is within reasonable range (e.g., between 1900 and 2100)
                if year_int < 1900 or year_int > 2100:
                    year = "Unknown"
            except ValueError:
                year = "Unknown"
        
        metadata.append({
            "company_name": company_name,
            "year": year,
            "file_type": "financial" if "financial" in file.lower() else "non-financial"
        })
        
# Add the summaries to the collection
collection.add(
    ids=[file.split(".")[0] for file in os.listdir("pdfs/summaries")],
    documents=summaries,
    metadatas=metadata
)

## Example Use

In [21]:
import os
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from google import genai
from google.genai import types
import dotenv

dotenv.load_dotenv()

google_ef  = embedding_functions.GoogleGenerativeAiEmbeddingFunction(api_key=os.getenv("SUMMARIZER_API_KEY"))

# Initialize the Chroma client
client = chromadb.PersistentClient()

# Get the collection
fin_collection = client.get_collection(name="doc_summaries", embedding_function=google_ef)

In [22]:
import json
from rapidfuzz import fuzz

json_file = "companies.json"
with open(json_file, "r") as f:
    companies = json.load(f)
    
# Create a lookup dictionary
lookup = {}
for company in companies:
    for name in [company["security_name"], company["short_name"], company["ticker_symbol"]]:
        lookup[name.lower()] = company["short_name"]  # Normalize case
        
def fuzzy_match_company(query):
    """Try to match company names using fuzzy matching."""
    matches = {}
    for name in lookup.keys():
        score = fuzz.partial_ratio(query.lower(), name)
        if score > 80:  # Tune threshold
            matches[name] = lookup[name]
    
    return list(set(matches.values()))

def extract_companies_with_llm(query):
    """Fallback method: Use an LLM to extract company names."""
    client = genai.Client(api_key=os.getenv("CHATBOT_API_KEY"))
    
    response = client.models.generate_content(
        model="gemini-2.0-flash-lite-001",
        config=types.GenerateContentConfig(
            system_instruction="Your task is to retrun a comma separated list of the companies mentioned in the user's prompt.",
            temperature=0,
        ),
        contents=[query]
    )
    extracted_names = response.text
    
    # convert the extracted names to a list
    extracted_names = extracted_names.split(", ")
    return extracted_names

def get_companies_from_query(query):
    """Hybrid approach: Fuzzy matching first, then fallback to LLM."""
    companies_found = fuzzy_match_company(query)

    if not companies_found:  # If no match, try LLM
        print("No fuzzy match found, trying LLM")
        llm_extracted = extract_companies_with_llm(query)
        companies_found = [lookup.get(name.lower()) for name in llm_extracted if name.lower() in lookup]

    return companies_found

def get_doctype_from_query(query):
    """Get the document type from the query.
    Args:
        query (str): The query to get the document type from.

    Returns:
        str: The document type.
    """
    
    client = genai.Client(api_key=os.getenv("CHATBOT_API_KEY"))
    response = client.models.generate_content(
        model="gemini-2.0-flash-lite-001",
        config=types.GenerateContentConfig(
            system_instruction="""
                You are a document classification assistant helping to identify the most relevant source type for answering user queries: financial documents, non-financial documents, or both.

                Your task is to analyze the user's query and determine **which type of document(s)** would best provide a useful and accurate response, in order to minimize irrelevant or noisy context.

                Respond in the following format:

                    Justification: <your justification>
                    Label: <financial | non-financial | both>

                Where:
                - Your **justification** explains your reasoning clearly and concisely.
                - Your **response** must be one of the following (exact match):  
                - 'financial' → for queries that rely on metrics, earnings, ratios, cash flows, or balance sheet information.  
                - 'non-financial' → for queries that depend on business strategy, ESG initiatives, leadership tone, risks, or narrative insights.  
                - 'both' → for queries that require a combination of numeric financial data *and* strategic or qualitative context.

                Think carefully and respond accurately to ensure the correct documents are used to answer the query.
                
                For example:
                Query: "What is the revenue growth of Company X in 2023?"
                Justification: "The query is about financial metrics and data."
                Label: "financial"
                
                Query: "What is the strategic direction of Company Y?"
                Justification: "The query is about business strategy and non-financial information."
                Label: "non-financial"
                
                Query: "Summarize Company Z's performance in 2023."
                Justification: "Summarizing performance typically includes both financial results (e.g., revenue, profit) and qualitative drivers (e.g., market conditions, management commentary)."
                Label: "both"
                """,
            temperature=0,
        ),
        contents=[query]
    )
    last_line = response.text.lower().strip().split("\n")[-1].strip('"')
    doctype_map = {
        "label: both": ["financial", "non-financial"],
        "label: non-financial": ["non-financial"],
        "label: financial": ["financial"]
    }
    return doctype_map.get(last_line, ["unknown"])

def query_chromadb_sorted(collection, query, n_results=5):
    """
    Queries ChromaDB, retrieves matching documents, sorts them by year (most recent first),
    and formats the results into a context string for an LLM.

    Args:
        collection: ChromaDB collection object.
        query (str): User's query.
        n_results (int): Number of results to retrieve.

    Returns:
        Tuple[List[Tuple[Dict, str]], str]: 
            - Sorted list of (metadata, document) tuples.
            - A formatted context string with sorted document summaries.
    """

    # Step 1: Get Company Matches
    company_matches = get_companies_from_query(query)
    doctype = get_doctype_from_query(query)

    # Step 2: Query ChromaDB
    results = collection.query(
        query_texts=[query],
        n_results=n_results,
        where={
            "$and": [
                {"company_name": {"$in": company_matches}},
                {"file_type": {"$in": doctype}}
            ]
        }
    )

    # Step 2: Extract Metadata & Documents
    metadata_results = results.get('metadatas', [])
    document_results = results.get('documents', [])

    # Step 3: Flatten Nested Metadata & Document Lists
    flattened_metadata = [item for sublist in metadata_results for item in sublist]
    flattened_documents = [item for sublist in document_results for item in sublist]

    # Step 4: Sort Metadata & Documents by Year (Descending)
    sorted_results = sorted(
        zip(flattened_metadata, flattened_documents),
        key=lambda pair: int(pair[0]['year']),  # Convert 'year' to int for correct sorting
        reverse=True
    )

    # Step 5: Generate Context String from Sorted Documents
    context = "\n\n".join([doc for _, doc in sorted_results])

    return sorted_results, context  # Returns both structured data & formatted context

def qa_bot(query: str, contexts: str):
    """
    A function to answer questions about the financial document summaries.
    
    Args:
        query (str): The question to answer.
        contexts (str): The contexts of the financial document summaries.

    Returns:
        str: The answer to the question.
    """
    client = genai.Client(api_key=os.getenv("CHATBOT_API_KEY"))
    
    # QA System Prompt
    qa_system_prompt = """
        You are an experienced financial analyst. Your primary task is to answer user questions about financial topics based *solely* on the content of the provided financial document summaries. Your goal is to provide not just factually accurate but also insightful responses that directly address the user's query by synthesizing information and identifying key relationships within the provided documents.

        **Strict Guidelines:**

        * **Insightful Analysis Based on Facts:** You must base your insights and analysis *exclusively* on the information explicitly stated or logically implied within the provided summaries. Aim to connect different pieces of information, identify trends, and explain the significance of the data in relation to the user's question.
        * **No Fabrication or External Information:** Under no circumstances should you make up information, invent scenarios, or bring in knowledge from outside the provided financial document summaries.
        * **Handling Questions Beyond the Summaries:**
            * If the answer to the user's question requires information or analysis not explicitly present or logically derivable from the provided summaries, respond with: "Based on the provided document summaries, I cannot offer a more detailed or insightful analysis on this specific aspect." Then guide the user on how to find the answer.
            * If the question is unrelated to the financial document summaries, respond with: "This question falls outside the scope of the provided financial document summaries, and therefore I cannot offer an insightful response."
        * **Handling Unclear Questions:** If the user's question is ambiguous or lacks sufficient detail to provide an insightful response, politely ask for clarification. For example: "To provide a more insightful analysis, could you please specify which aspect of [topic] you are most interested in?" or "Could you please provide more context regarding [specific element] so I can offer a more insightful perspective based on the documents?"

        **Focus:** Provide concise yet comprehensive answers that directly address the user's query with insights derived solely from the provided financial document summaries. Aim to explain the "why" behind the numbers and trends where the information allows, without making assumptions or introducing external data.
    """

    # Create a prompt for the question and context
    prompt = f"Question: {query}\nContext: {contexts}"

    response = client.models.generate_content(
        model="gemini-2.5-pro-exp-03-25",
        config=types.GenerateContentConfig(
            system_instruction=qa_system_prompt,
            temperature=0,
        ),
        contents=[prompt]
    )

    return response.text

In [23]:
query = "Who is the CEO of EduFocal?"

In [24]:
get_doctype_from_query("Who is the CEO of EduFocal?")

['non-financial']

In [25]:
get_companies_from_query("Who is the CEO of EduFocal?")

['EduFocal']

In [26]:
# Example Usage
sorted_results, context = query_chromadb_sorted(collection, query)

# Print formatted context
print("\n--- Context for LLM ---\n")
print(context)


--- Context for LLM ---

```markdown
### 📄 0. Document Overview
- **Document Type**: Press Release / Personnel Announcement
- **Company Name & Ticker (if public)**: EduFocal Limited (Ticker: LEARN)
- **Date of Submission / Release**: Effective April 24, 2023 (Implied release date around or after this)
- **Author / Division**: EduFocal Limited
- **Purpose of Document**: To announce the appointment of Mark Green as the new Chief Operations Officer (COO).

---

### 🏢 1. Company Overview
- **Company Name**: EduFocal Limited
- **Headquarters / Region**: Kingston 6, Jamaica
- **CEO / Key Executives**: Gordon Swaby (CEO), Mark Green (COO)
- **Board of Directors (if disclosed)**: Gordon Swaby, Peter Levy (Chairman), Lloyd Swaby, Kevin Donaldson, Grace Lindo, Shauna-Gaye Fuller
- **Business Segments / Focus Areas**: Education Technology ("making learning FUN"), cleverschoolteacher.com (EduFocal, LLC)

---

### 🧭 2. Strategic Themes & Objectives
- **Primary goals or initiatives discussed**: Enh

In [27]:
answer = qa_bot(query, context)
print(answer)

Based on the provided document summaries:

The CEO of EduFocal Limited is **Gordon Swaby**.

This information is explicitly stated in the "Company Overview" section of both the Press Release summary and the 2022 Annual Report summary. The Annual Report summary also notes that he is a Co-founder of the company.


In [29]:
def answer_found(answer, query):
    """
    Check if the answer to the query was found.
    
    Args:
        answer (str): The answer to the query.
        query (str): The query to answer.

    Returns:
        bool: True if the answer was found, False otherwise.
    """
    
    client = genai.Client(api_key=os.getenv("CHATBOT_API_KEY"))
    response = client.models.generate_content(
        model="gemini-2.0-flash-lite-001",
        config=types.GenerateContentConfig(
            system_instruction="""
                Your job is to check if the answer to the query was found in the context.
                You will be given the query and the answer.
                Analyze the answer and the query and respond with "True" if the answer was found in the context, and "False" otherwise.
                
                Example:
                Query: "Who is the CEO of EduFocal?"
                Answer: "The CEO of EduFocal is John Doe."
                Response: "True"
                
                Example:
                Query: "Who is the CEO of EduFocal?"
                Answer: "Based on the provided document summaries, I cannot offer a more detailed or insightful analysis on this specific aspect."
                Response: "False"
                """,
            temperature=0,
        ),
        contents=[query, answer]
    )
    return response.text

answer_found(answer, query)

'True\n'

# Using BigQuery as the Vectorstore

In [20]:
# TODO : Set values as per your requirements

# set the environment variables
# After load_dotenv()
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "service-account/credentials.json"

# Project and Storage Constants
PROJECT_ID = "jse-datasphere"
REGION = "us-central1"
BUCKET = "jse-findoc-bucket"
BUCKET_URI = f"gs://{BUCKET}"

# The number of dimensions for the textembedding-gecko@003 is 768
# If other embedder is used, the dimensions would probably need to change.
DIMENSIONS = 768

# Index Constants
DISPLAY_NAME = "jse-findoc-index"
DEPLOYED_INDEX_ID = "JseFindocIndex"

In [9]:
# Create a bucket.
! gsutil mb -l $REGION -p $PROJECT_ID $BUCKET_URI

Creating gs://jse-findoc-bucket/...


In [15]:
from google.oauth2 import service_account
from google.cloud import aiplatform

print(os.getenv("GOOGLE_APPLICATION_CREDENTIALS"))

# Load the credentials
credentials = service_account.Credentials.from_service_account_file(os.getenv("GOOGLE_APPLICATION_CREDENTIALS"))

aiplatform.init(
    project=PROJECT_ID, 
    location=REGION, 
    staging_bucket=BUCKET_URI, 
    credentials=credentials)

service-account/credentials.json


In [24]:

# NOTE : This operation can take upto 30 seconds
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=DISPLAY_NAME,
    dimensions=DIMENSIONS,
    approximate_neighbors_count=150,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
    index_update_method="STREAM_UPDATE",  # allowed values BATCH_UPDATE , STREAM_UPDATE
    project=PROJECT_ID,
    location=REGION,
    credentials=credentials,
    shard_size="SHARD_SIZE_SMALL"
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/1069196447797/locations/us-central1/indexes/1623620233434497024/operations/8115320055589240832
MatchingEngineIndex created. Resource name: projects/1069196447797/locations/us-central1/indexes/1623620233434497024
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/1069196447797/locations/us-central1/indexes/1623620233434497024')


In [25]:
# Create an endpoint
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=f"{DISPLAY_NAME}-endpoint", public_endpoint_enabled=True
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/1069196447797/locations/us-central1/indexEndpoints/8587797141395603456/operations/5102834117343444992
MatchingEngineIndexEndpoint created. Resource name: projects/1069196447797/locations/us-central1/indexEndpoints/8587797141395603456
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/1069196447797/locations/us-central1/indexEndpoints/8587797141395603456')


In [27]:
# NOTE : This operation can take upto 20 minutes
my_index_endpoint = my_index_endpoint.deploy_index(
    index=my_index, 
    deployed_index_id=DEPLOYED_INDEX_ID, 
    min_replica_count=1,
)

my_index_endpoint.deployed_indexes

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/1069196447797/locations/us-central1/indexEndpoints/8587797141395603456
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/1069196447797/locations/us-central1/indexEndpoints/8587797141395603456/operations/65557934129545216
MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/1069196447797/locations/us-central1/indexEndpoints/8587797141395603456


[id: "JseFindocIndex"
index: "projects/1069196447797/locations/us-central1/indexes/1623620233434497024"
create_time {
  seconds: 1744669366
  nanos: 443538000
}
index_sync_time {
  seconds: 1744670911
  nanos: 58090000
}
automatic_resources {
  min_replica_count: 1
  max_replica_count: 1
}
deployment_group: "default"
]

In [51]:
!python populate_vertex_search.py

Starting to populate Vertex AI Search...
Error uploading document edufocal_limited-learn-management_appointments_resignations_and_retirements-2023-04-28: 400 Request contains an invalid argument.
Error uploading document EduFocal-Limited-2023-Financial-Statements: 400 Request contains an invalid argument.
^C
Traceback (most recent call last):
  File "/Users/galbraithelroy/Documents/jse-datasphere-chatbot/populate_vertex_search.py", line 30, in <module>
    populate_vertex_search(
  File "/Users/galbraithelroy/Documents/jse-datasphere-chatbot/vertex_search_utils.py", line 135, in populate_vertex_search
    upload_to_vertex_search(
  File "/Users/galbraithelroy/Documents/jse-datasphere-chatbot/vertex_search_utils.py", line 57, in upload_to_vertex_search
    embedding = embed_document(content)
  File "/Users/galbraithelroy/Documents/jse-datasphere-chatbot/vertex_search_utils.py", line 22, in embed_document
    model = TextEmbeddingModel.from_pretrained(model_name)
  File "/Users/galbraith

In [37]:
from google import genai
from google.genai.types import GenerateContentConfig, HttpOptions
from google.cloud import discoveryengine_v1beta
from google.cloud.discoveryengine_v1beta import SearchRequest
import os

# Initialize the Gemini client
client = genai.Client(http_options=HttpOptions(api_version="v1"), api_key=os.getenv("CHATBOT_API_KEY"))

# Load Data Store ID from Vertex AI Search
DATASTORE_ID = "jse-findoc-summaries_1744670608035"
datastore = f"projects/{PROJECT_ID}/locations/global/collections/default_collection/dataStores/{DATASTORE_ID}"

# Initialize the Vertex AI Search client
search_client = discoveryengine_v1beta.SearchServiceClient()

# Create a search request
request = SearchRequest(
    serving_config=f"{datastore}/servingConfigs/default_config",
    query="Who is the CEO of EduFocal?",
    page_size=5
)

# Execute the search
search_response = search_client.search(request)

# Extract the search results
search_results = []
for result in search_response.results:
    search_results.append(result.document.struct_data)

# Format the search results as context
context = "\n\n".join([str(result) for result in search_results])

# Use Gemini to generate a response based on the search results
response = client.models.generate_content(
    model="gemini-2.0-flash-001",
    contents=f"Based on the following context, answer the question: Who is the CEO of EduFocal?\n\nContext:\n{context}",
    config=GenerateContentConfig(
        temperature=0,
    ),
)

print(response.text)

Without any context, I cannot determine who the CEO of EduFocal is.



In [13]:
from google import genai
model_name = "text-embedding-005"
client = genai.Client(api_key=os.getenv("CHATBOT_API_KEY"))

In [None]:
from google.genai.types import EmbedContentConfig

client = genai.Client()
response = client.models.embed_content(
    model="text-embedding-005",
    contents=[
        "How do I get a driver's license/learner's permit?",
        "How do I renew my driver's license?",
        "How do I change my address on my driver's license?",
    ],
    config=EmbedContentConfig(
        task_type="RETRIEVAL_DOCUMENT",  # Optional
        output_dimensionality=768,  # Optional
        title="Driver's License",  # Optional
    ),
)

In [52]:
import os
from google.cloud import aiplatform
from google.cloud.aiplatform.gapic.schema import predict

# --- Configuration ---
PROJECT_ID = "jse-datasphere"  # Your Google Cloud project ID
LOCATION = "us-central1"      # The region for Vertex AI endpoint.
MODEL_NAME = "text-embedding-005" # Or "textembedding-gecko@003", "text-multilingual-embedding-002", etc.
TEXT_FILE_PATH = "pdfs/summaries/ONE-2023-Audited-Financial-Statement-Year-Ended-31-August-2023.txt" # Path to your local text file
# --- End Configuration ---

def generate_embedding_from_file(project_id: str, location: str, model_name: str, file_path: str):
    """Generates text embeddings from a local text file using Vertex AI."""

    print(f"Initializing Vertex AI client for project {project_id} in {location}...")
    aiplatform.init(project=project_id, location=location)

    try:
        print(f"Reading text content from: {file_path}")
        with open(file_path, 'r', encoding='utf-8') as f:
            text_content = f.read()

        # Check if the text content is empty
        if not text_content.strip():
             print("Error: Text file is empty or contains only whitespace.")
             return None

        print(f"Loading embedding model: {model_name}")
        # Use the specific model class if available and preferred
        # from vertexai.language_models import TextEmbeddingModel
        # model = TextEmbeddingModel.from_pretrained(model_name)
        # embeddings = model.get_embeddings([text_content])

        # Or use the general Endpoint approach (good for different model types)
        # Note: Endpoint approach might require finding the specific endpoint ID
        # Let's use the TextEmbeddingModel class for clarity with foundation models
        from vertexai.language_models import TextEmbeddingModel

        model = TextEmbeddingModel.from_pretrained(model_name)

        print("Generating embedding...")
        # The get_embeddings method expects a list of texts
        embeddings_response = model.get_embeddings([text_content])

        # Check for errors (though SDK might raise exceptions)
        # Handle potential errors if the response structure includes status
        # For TextEmbeddingModel, errors usually raise exceptions handled below.

        # Extract the embedding vector
        # The response is a list of TextEmbedding objects
        if embeddings_response and len(embeddings_response) > 0:
            embedding_vector = embeddings_response[0].values
            print(f"Successfully generated embedding vector (length: {len(embedding_vector)})")
            # print("Embedding vector (first 10 values):", embedding_vector[:10]) # Optionally print part of it
            return embedding_vector
        else:
            print("Error: No embedding vector received.")
            return None

    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# --- Main execution --

In [53]:
vector = generate_embedding_from_file(PROJECT_ID, LOCATION, MODEL_NAME, TEXT_FILE_PATH)

Initializing Vertex AI client for project jse-datasphere in us-central1...
Reading text content from: pdfs/summaries/ONE-2023-Audited-Financial-Statement-Year-Ended-31-August-2023.txt
Loading embedding model: text-embedding-005
Generating embedding...
Successfully generated embedding vector (length: 768)


https://cloud.google.com/bigquery/docs/vector-index-text-search-tutorial