# 0. Preamble

## 0.1. Connect to the Google shared folder "Capstone"

In [None]:
# Mount Google Drive inside the Colab VM so the shared project folder is
# reachable under /content/drive. force_remount=True requests a fresh mount
# even when the drive is already mounted in this session.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Make the project folder the working directory; later cells use paths relative to it
# (e.g. credentials/, sec_filings/, news/, datasets/).
# NOTE(review): the saved output of this cell shows /content/drive/My Drive/Capstone,
# which is not the folder targeted here — confirm which directory is the intended one.
cd '/content/drive/My Drive/Capstone/FinSightAI_Final'

/content/drive/My Drive/Capstone


In [None]:
# List the working directory to confirm the expected project folders are present.
ls

 [0m[01;34mchroma[0m/         [01;34meval[0m/                       [01;34m{persist_base_directory}vector_store_rcts_1500_small[0m/
 [01;34mchroma_v2[0m/      final_eval_details.csv      [01;34mpickle_files[0m/
 [01;34mcredentials[0m/    final_eval_details.gsheet   response_eval_accuracy.csv
 [01;34mdatasets[0m/      [01;34m'GitHub related'[0m/            response_eval_details.csv
 [01;34mDeliverables[0m/   [01;34mNotebooks[0m/


## 0.2. Install Dependencies

Install the libraries that LangChain requires (reference documentation: https://python.langchain.com/docs/how_to/installation/).
<br> You will also need an OpenAI API key: https://platform.openai.com/api-keys


In [None]:
!pip install langchain
!pip install langchain-community
!pip install langchain-experimental
!pip install pypdf
!pip install unstructured
!pip install lxml html5lib
!pip install langchain-openai
!pip install chromadb
!pip install openai
!pip install -U langchain-chroma
!pip install colbert
!pip install groq
!pip install --upgrade langchain openai
!pip install transformers -U



# ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
# tensorflow 2.17.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 5.29.0 which is incompatible.
# tensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.3, but you have protobuf 5.29.0 which is incompatible.

# Dependency Conflict: tensorflow and tensorflow-metadata require a specific version of protobuf (<5 and >=3.20.3), but the version installed (protobuf 5.29.0) does not meet this requirement.
# Successful Installation: Despite the warning, chromadb and its dependencies were installed successfully.

Collecting langchain-community
  Downloading langchain_community-0.3.10-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.10 (from langchain-community)
  Downloading langchain-0.3.10-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.22 (from langchain-community)
  Downloading langchain_core-0.3.22-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

## 0.3. Imports

In [None]:
from dotenv import load_dotenv
import os
import time
import re
import math
import pickle
import requests
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import gspread
import shutil
from sklearn.preprocessing import normalize

import torch
import transformers
from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForCausalLM, BitsAndBytesConfig

import openai

from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document
from langchain.embeddings.base import Embeddings
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma

from langchain.schema import SystemMessage, HumanMessage

import groq
from groq import Client, Groq

# from google.colab import userdata

## 0.4. API Keys

<font color=red>Delete the cell below before sharing this notebook — it holds your API keys.</font>

In [None]:

# Write the API keys entered below to a local .env file (loaded by the next cell).
# Never share or commit this cell or the generated file once the keys are filled in.
env_path = 'credentials/claire.env'   # replace it with your env filename

# replace with your credentials
env_content = """
GROQ_API_KEY=
OPENAI_API_KEY=
HF_TOKEN=
"""

# True when at least one KEY=value line actually carries a value.
env_has_values = any(
    line.partition('=')[2].strip()
    for line in env_content.splitlines()
    if '=' in line
)

# Re-running this cell with the blank template must not wipe keys saved earlier,
# so the file is only (over)written when keys were entered or no file exists yet.
if env_has_values or not os.path.exists(env_path):
    env_dir = os.path.dirname(env_path)
    if env_dir:
        # open(..., 'w') does not create missing parent folders.
        os.makedirs(env_dir, exist_ok=True)
    with open(env_path, 'w') as f:
        f.write(env_content)
else:
    print(f"{env_path} already exists and no keys were entered above; leaving it unchanged.")

In [None]:
# Load the API keys from the .env file into the process environment.
# override=True lets values from the file replace variables that are already set.
load_dotenv('credentials/claire.env', override=True)  # replace it with your env filename

groq_api_key = os.getenv('GROQ_API_KEY')
huggingface_key = os.getenv('HF_TOKEN')
openai_api_key = os.getenv('OPENAI_API_KEY')

# Surface missing/empty keys here (names only, never the values) instead of
# letting them show up later as opaque authentication errors.
missing_keys = [
    name
    for name, value in (
        ('GROQ_API_KEY', groq_api_key),
        ('HF_TOKEN', huggingface_key),
        ('OPENAI_API_KEY', openai_api_key),
    )
    if not value
]
if missing_keys:
    print(f"Warning: missing or empty API keys: {', '.join(missing_keys)}. "
          "Fill them in the env file before calling the corresponding services.")

<br>

# 1. Loading Documents


### <font color='red'> For evaluation or deployment, SKIP this section to avoid repeating loading, chunking and embedding steps.</font>

## 1.1. Load Documents

#### <font color='purple'> To save time, skip this sub-section and load the pickled all_documents directly (section 1.2).</font>

In [None]:
# Root folders that are searched recursively for input files.
# NOTE: expanduser only rewrites a leading "~", so both paths stay relative to
# the current working directory.
html_base_folder = os.path.expanduser("sec_filings")
pdf_base_folder = os.path.expanduser("news")

# Function to extract a date from a file name
def extract_date_from_filename(file_name):
    """Return the first valid calendar date embedded in ``file_name``.

    The name is scanned for ``YYYY-M-D`` / ``YYYY-MM-DD`` patterns. Candidates
    that are not real calendar dates (e.g. ``2023-13-45`` or ``2023-02-30``)
    are skipped, so a later valid date in the same name can still be found.

    Args:
        file_name: File name (or path) to scan.

    Returns:
        The date as a zero-padded ``"YYYY-MM-DD"`` string, or ``None`` when the
        name contains no valid date.
    """
    date_pattern = re.compile(r'(\d{4})-(\d{1,2})-(\d{1,2})')  # YYYY-MM-DD format
    for match in date_pattern.finditer(file_name):
        year, month, day = (int(part) for part in match.groups())
        try:
            # The constructor raises ValueError for impossible dates.
            datetime(year, month, day)
        except ValueError:
            continue
        return f"{year:04d}-{month:02d}-{day:02d}"  # Ensure zero-padded month and day
    return None

# Function to extract document type based on file extension and name patterns
def extract_doc_type(file_name, file_type):
    """Classify a file into a document type from its name.

    Args:
        file_name: Name of the file being classified.
        file_type: ``"html"`` or ``"pdf"``; selects which rule set is applied.

    Returns:
        The label of the first matching rule, or ``"Unknown"`` when no rule
        matches or ``file_type`` is neither ``"html"`` nor ``"pdf"``.
    """
    # HTML files are recognised by the way the file name ends.
    html_suffix_labels = (
        ("10-Q.html", "10-Q"),
        ("10-K.html", "10-K"),
        ("8-K.html", "8-K"),
    )
    # PDF files are recognised by a case-sensitive keyword anywhere in the name.
    # Rules are tried top to bottom and the first hit wins.
    # NOTE(review): "News" is tested before "Company news", so a name containing
    # both is labelled "News" — confirm this precedence is intended.
    pdf_keyword_labels = (
        ("News", "News"),
        ("Company news", "Company News"),
        ("annual report", "Annual Report"),
        ("Press release", "Press Release"),
        ("Company presentation", "Company Presentation"),
    )

    if file_type == "html":
        for suffix, label in html_suffix_labels:
            if file_name.endswith(suffix):
                return label
    elif file_type == "pdf":
        for keyword, label in pdf_keyword_labels:
            if keyword in file_name:
                return label
    return "Unknown"

# Unified function to load documents from a folder
def load_documents(base_folder, file_extension, loader_class, file_type):
    """Load every matching file below ``base_folder`` and tag it with metadata.

    The folder tree is walked in sorted order so the returned list has the same
    order on every run and filesystem (plain ``os.walk`` order is arbitrary,
    which would make positional indices into the result unreliable).

    Args:
        base_folder: Root folder to search recursively. A folder that does not
            exist simply yields no files.
        file_extension: Extension to match, without the leading dot
            (compared case-sensitively against the end of the file name).
        loader_class: Callable invoked as ``loader_class(file_path)``; the
            returned object's ``load()`` must return a list of documents that
            each expose a ``metadata`` dict.
        file_type: Passed to ``extract_doc_type`` to choose the naming rules.

    Returns:
        A list with the documents of all matching files. Each document's
        metadata gets ``"source"`` (file path), ``"date"`` (date parsed from the
        file name, or ``"Unknown"``) and ``"document_type"``.
    """
    documents = []
    suffix = f".{file_extension}"
    for root, dirs, files in os.walk(base_folder):
        # Sorting dirs in place also fixes the order in which os.walk descends.
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith(suffix):
                continue
            file_path = os.path.join(root, file_name)
            loader = loader_class(file_path)
            data = loader.load()

            # Add metadata to each document
            document_date = extract_date_from_filename(file_name)
            doc_type = extract_doc_type(file_name, file_type)
            for doc in data:
                doc.metadata["source"] = file_path
                doc.metadata["date"] = document_date if document_date else "Unknown"
                doc.metadata["document_type"] = doc_type

            documents.extend(data)
    return documents

# Load documents from HTML and PDF folders
# (HTML files are read with UnstructuredHTMLLoader, PDF files with PyPDFLoader)
html_documents = load_documents(html_base_folder, "html", UnstructuredHTMLLoader, "html")
pdf_documents = load_documents(pdf_base_folder, "pdf", PyPDFLoader, "pdf")

# Combine all documents (HTML documents first, then PDF documents)
all_documents = html_documents + pdf_documents

In [None]:
# Metadata verification: print the metadata of a few documents to spot-check
# that source, document type and date were attached correctly.

sample = [115, 128, 140]  # Example indices for three documents

for i, index in enumerate(sample, start=1):
    # The indices are hard-coded, so report (rather than crash on) any index
    # that does not exist in a smaller corpus.
    if index >= len(all_documents):
        print(f"Sample Document Metadata #{i}: index {index} is out of range "
              f"({len(all_documents)} documents loaded)")
        continue
    sample_doc = all_documents[index]
    print(f"Sample Document Metadata #{i}:")
    print(f"  Metadata: {sample_doc.metadata}")
    print(f"  Source: {sample_doc.metadata.get('source', 'Unknown')}")
    print(f"  Document Type: {sample_doc.metadata.get('document_type', 'Unknown')}")
    print(f"  Date: {sample_doc.metadata.get('date', 'Unknown')}")

Sample Document Metadata #1:
  Metadata: {'source': 'sec_filings/microsoft_filings/MSFT_789019_2023-07-03_8-K.html', 'date': '2023-07-03', 'document_type': '8-K'}
  Source: sec_filings/microsoft_filings/MSFT_789019_2023-07-03_8-K.html
  Document Type: 8-K
  Date: 2023-07-03
Sample Document Metadata #2:
  Metadata: {'source': 'sec_filings/tesla_filings/TSLA_1318605_2024-07-23_8-K.html', 'date': '2024-07-23', 'document_type': '8-K'}
  Source: sec_filings/tesla_filings/TSLA_1318605_2024-07-23_8-K.html
  Document Type: 8-K
  Date: 2024-07-23
Sample Document Metadata #3:
  Metadata: {'source': 'sec_filings/tesla_filings/TSLA_1318605_2023-10-02_8-K.html', 'date': '2023-10-02', 'document_type': '8-K'}
  Source: sec_filings/tesla_filings/TSLA_1318605_2023-10-02_8-K.html
  Document Type: 8-K
  Date: 2023-10-02


In [None]:
# Save all_documents to a pickle file so it can be reloaded without re-parsing the sources
pickle_file_path = "datasets/pickle_files/all_documents.pkl"
# open(..., "wb") does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(pickle_file_path), exist_ok=True)
with open(pickle_file_path, "wb") as f:
    pickle.dump(all_documents, f)

## 1.2. Load all_documents from pickle

In [None]:
# Load all_documents from the pickle file
# SECURITY: unpickling can execute arbitrary code — only load a pickle file that
# was produced by this notebook or comes from a source you trust.
pickle_file_path = "datasets/pickle_files/all_documents.pkl"
try:
    with open(pickle_file_path, "rb") as f:
        all_documents = pickle.load(f)
    print(f"Loaded {len(all_documents)} documents from {pickle_file_path}")
except FileNotFoundError:
    # NOTE(review): only a message is printed here; all_documents is not assigned
    # in this branch, so it is either undefined or left over from an earlier cell.
    print(f"Pickle file {pickle_file_path} not found. Run the document loading process first.")


Loaded 1782 documents from datasets/pickle_files/all_documents.pkl


In [None]:
# Check loaded documents: report the corpus size and preview the first document

print(f"Total documents loaded: {len(all_documents)}")
if all_documents:  # skip the preview when nothing was loaded
    print("Sample document metadata:", all_documents[0].metadata)
    # Only the first 500 characters are shown to keep the output short
    print("Sample document content:", all_documents[0].page_content[:500])

Total documents loaded: 1782
Sample document metadata: {'source': 'sec_filings/meta_filings/META_1326801_2024-10-31_10-Q.html', 'date': '2024-10-31', 'document_type': '10-Q'}
Sample document content: UNITED STATES

SECURITIES AND EXCHANGE COMMISSION

Washington, D.C. 20549

____________________________________________

FORM

____________________________________________

(Mark One)

QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934

For the quarterly period ended

or

TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934

For the transition period from to

Commission File Number:

________________________________________
