In [1]:
import os
from getpass import getpass
import openai

# Make sure an OpenAI API key is available, prompting for one when the
# environment does not already provide it.
if "OPENAI_API_KEY" not in os.environ:
    # Inside VS Code the getpass prompt appears at the top of the window,
    # so point the user at it before blocking on input.
    if any("VSCODE" in env_name for env_name in os.environ):
        print('Please enter password in the VS Code prompt at the top of your VS Code window!')
    os.environ["OPENAI_API_KEY"] = getpass(
        "Paste your OpenAI key from: https://platform.openai.com/account/api-keys\n"
    )
    openai.api_key = os.environ["OPENAI_API_KEY"]

# Sanity check on the key's prefix before any API call is attempted.
assert os.environ.get("OPENAI_API_KEY", "").startswith("sk-"), "This doesn't look like a valid OpenAI API key"
print("OpenAI API key configured")

Please enter password in the VS Code prompt at the top of your VS Code window!
OpenAI API key configured


In [2]:
from pydantic import BaseModel, Field

# Minimal person record: a name and an integer age.
# A `#` comment is used here rather than a docstring because a model's
# docstring ends up in its JSON schema as "description" (see PersonAddress).
class Person(BaseModel):
    name: str
    age: int

# Postal address model. Only `address` carries a Field description, which
# shows up as that property's "description" in the generated JSON schema.
class Address(BaseModel):
    address: str = Field(description="Full street address")
    city: str
    state: str


class PersonAddress(Person):
    """A Person with an address"""

    # Nested model: inherits `name` and `age` from Person and adds an Address,
    # which the JSON schema represents as a `$ref` into `$defs`.
    address: Address


# Display the JSON schema generated for the combined model; the class
# docstring above becomes the schema's top-level "description".
PersonAddress.model_json_schema()

{'$defs': {'Address': {'properties': {'address': {'description': 'Full street address',
     'title': 'Address',
     'type': 'string'},
    'city': {'title': 'City', 'type': 'string'},
    'state': {'title': 'State', 'type': 'string'}},
   'required': ['address', 'city', 'state'],
   'title': 'Address',
   'type': 'object'}},
 'description': 'A Person with an address',
 'properties': {'name': {'title': 'Name', 'type': 'string'},
  'age': {'title': 'Age', 'type': 'integer'},
  'address': {'$ref': '#/$defs/Address'}},
 'required': ['name', 'age', 'address'],
 'title': 'PersonAddress',
 'type': 'object'}

## AIM OF THE PROJECT
1. Given a question, figure out which ticker(s), year, quarter, and form type it refers to
2. Collect this metadata and use it to search the vector database for the relevant ticker(s)
3. Build a RAG application on top of the retrieved text
4. Add an LLM validator so that only finance-related questions are accepted

In [3]:
from src.vectorDatabase import create_database

# qdrant_client,speakers_list_1,speakers_list_2,speakers_list_3,speakers_list_4,sec_form_names,earnings_call_quarter_vals, = create_database("AAPL",2023)

[nltk_data] Downloading package punkt to /home/athekunal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
import instructor

from openai import OpenAI
from typing import List
from pydantic import BaseModel, Field

# Wrap the OpenAI client with instructor so chat completion calls can take a
# `response_model=` argument (used by `expand_query` with the `Query` model).
client = instructor.patch(OpenAI())

In [5]:
from typing import List, Literal
from enum import Enum

class Source(Enum):
    # NOTE(review): this enum is not referenced by any of the visible cells.
    # Its single member holds the form label "10-Q1".
    sec_quarter_1 = "10-Q1"

class TickerYearQuarter(BaseModel):
    # Metadata extracted from a user question: tickers, years, quarters and
    # which data source (earnings calls or SEC filings) is being asked about.
    # The Field descriptions presumably reach the LLM through the response
    # schema, so they act as prompt text -- do not reword them casually.
    # No class docstring on purpose: it would be added to the JSON schema.
    chain_of_thought: str = Field(
        description="Think step by step to output what is the ticker symbols, NOT THE COMPANY NAME, quarter, year and data source the question is talking about",
    )
    ticker: List[str]
    year: List[str] = Field(
        description="The year that the question is talking about",
    )
    quarter: List[str] = Field(
        description="The quarter number that the question is talking about. Make sure that it starts with Q, for example Quarter 4 is Q4",
    )
    data_source: Literal["CALLS", "SEC"] = Field(
        description="If the question is talking about SEC filings then output SEC, else if the question is talking about Earning calls transcript then output CALLS",
    )

class Query(BaseModel):
    # Top-level response model: a cleaned-up query string plus the metadata
    # extracted from the original question.
    # No class docstring on purpose: it would be added to the JSON schema.
    rewritten_query: str = Field(
        description="Rewrite the query and DON'T include the company name, years, quarters and data sources",
    )
    question_ticker_quarter_year: TickerYearQuarter

In [6]:
from datetime import datetime
import pandas as pd

def expand_query(q) -> Query:
    """Turn a free-form question into a structured ``Query`` via the LLM.

    The system prompt is told the current calendar year and quarter, both
    derived from today's date, and the question is sent as the user message.

    Args:
        q: The user's question; interpolated into the user message.

    Returns:
        The result of ``client.chat.completions.create`` called with
        ``response_model=Query``.
    """
    # Normalise "now" to a date-only timestamp, then read year/quarter off it.
    today = pd.Timestamp(datetime.today().strftime("%Y-%m-%d"))
    year, quarter = today.year, today.quarter

    system_message = {
        "role": "system",
        "content": f"You're a query understanding system for SEC Filings and Earnings Call. The current year is {year} and quarter {quarter}. Here are some tips: ...",
    }
    user_message = {"role": "user", "content": f"query: {q}"}

    return client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        temperature=0.0,
        response_model=Query,
        messages=[system_message, user_message],
    )


# Example questions. The commented-out one spans several quarters; the active
# one asks about two companies for a single year and quarter of earnings calls.
# query = expand_query("What did Apple, Nvidia do for Generative AI in 2023 quarter 3, 2 and 1 from earnings call")
query = expand_query("What did Apple and Amazon do for Generative AI in 2023 in quarter 4 from earnings call")

In [7]:
# Inspect the structured result returned by expand_query.
query

Query(rewritten_query='Generative AI initiatives in Q4 2023 for Apple and Amazon from earnings call', question_ticker_quarter_year=TickerYearQuarter(chain_of_thought='Generative AI initiatives in Q4 2023', ticker=['AAPL', 'AMZN'], year=['2023'], quarter=['Q4'], data_source='CALLS'))

In [8]:
# Year(s) extracted from the question, as a list of strings.
query.question_ticker_quarter_year.year

['2023']

In [9]:
# Quarter(s) extracted from the question, as a list of strings such as "Q4".
query.question_ticker_quarter_year.quarter

['Q4']

In [10]:
# Pull the extracted metadata lists out of the parsed query for later cells.
# Note that `quarter` is a list of quarter labels despite its singular name.
tickers, years, quarter = (
    query.question_ticker_quarter_year.ticker,
    query.question_ticker_quarter_year.year,
    query.question_ticker_quarter_year.quarter,
)


In [11]:
# tic = 'AAPL'
# var_name = f"qdrant_client_{tic}"
# globals()[var_name] = qdrant_client

In [12]:
# qdrant_client_AAPL

In [13]:
# Build a vector database for every (ticker, year) pair and keep everything
# needed for querying in one flat dict keyed by "<name>_<ticker>".
# NOTE(review): the keys contain only the ticker, so if `years` has more than
# one entry, the later year overwrites the earlier one for the same ticker.
tic_yr_dict = {}
for tic in tickers:
    for yr in years:
        print(f"Building vector database for {tic} and year {yr}")
        (
            qdrant_client,
            speakers_list_1,
            speakers_list_2,
            speakers_list_3,
            speakers_list_4,
            sec_form_names,
            earnings_call_quarter_vals,
        ) = create_database(tic, yr)
        tic_yr_dict.update(
            {
                f"qdrant_client_{tic}": qdrant_client,
                f"speakers_list_1_{tic}": speakers_list_1,
                f"speakers_list_2_{tic}": speakers_list_2,
                f"speakers_list_3_{tic}": speakers_list_3,
                f"speakers_list_4_{tic}": speakers_list_4,
                f"sec_form_names_{tic}": sec_form_names,
                # Store the returned values here, not the key-name string.
                f"earnings_call_quarter_vals_{tic}": earnings_call_quarter_vals,
            }
        )
        # Reported per (ticker, year) so it pairs with the "Building" message
        # and never touches `yr` when `years` is empty.
        print(f"Done for {tic} and year {yr}")

Building vector database for AAPL and year 2023


  return torch._C._cuda_getDeviceCount() > 0


Done for AAPL and year 2023
Building vector database for AMZN and year 2023
Done for AMZN and year 2023


In [14]:
from sentence_transformers import SentenceTransformer
from src.config import *
import torch

# Run the encoder on the GPU when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# ENCODER_NAME is presumably supplied by the `src.config` star import -- confirm.
# NOTE(review): trust_remote_code=True is passed through to the model loader;
# confirm the configured model repository is trusted.
encoder = SentenceTransformer(ENCODER_NAME, device=device, trust_remote_code=True)

In [15]:
# Re-display the parsed query before it is used for retrieval.
query 

Query(rewritten_query='Generative AI initiatives in Q4 2023 for Apple and Amazon from earnings call', question_ticker_quarter_year=TickerYearQuarter(chain_of_thought='Generative AI initiatives in Q4 2023', ticker=['AAPL', 'AMZN'], year=['2023'], quarter=['Q4'], data_source='CALLS'))

In [27]:
from src.queryDatabase import query_database_earnings_call, query_database_sec

# Retrieve context for the rewritten question from each ticker's vector
# database, concatenating the results into `relevant_text`.
question = query.rewritten_query
data_source = query.question_ticker_quarter_year.data_source
relevant_text = ""
for tic in tickers:
    if data_source == "CALLS":
        for q in quarter:
            # Fail loudly on an unexpected label instead of reusing the
            # speakers list left over from a previous iteration.
            if q not in ("Q1", "Q2", "Q3", "Q4"):
                raise ValueError(
                    f"Unsupported quarter {q!r} for earnings calls; expected one of Q1, Q2, Q3, Q4"
                )
            # "Q3" -> key "speakers_list_3_<ticker>".
            speakers_list = tic_yr_dict[f"speakers_list_{q[1]}_{tic}"]

            relevant_text += f"For {tic} and Quarter {q}\n"
            relevant_text += query_database_earnings_call(question,q,tic_yr_dict[f"qdrant_client_{tic}"],encoder,speakers_list)
    elif data_source == "SEC":
        # Blank entries mean "no quarter given"; with none left, search the
        # annual 10-K rather than building an invalid "10-" form label.
        requested_quarters = [q for q in quarter if q]
        if not requested_quarters:
            search_form = "10-K"
            relevant_text += query_database_sec(question, tic_yr_dict[f"qdrant_client_{tic}"], encoder, search_form)
        else:
            for q in requested_quarters:
                # Form labels are built as "10-" + quarter, e.g. "10-Q1".
                search_form = "10-"+q
                relevant_text += f"For ticker {tic} and Quarter {q}\n"
                relevant_text += query_database_sec(question, tic_yr_dict[f"qdrant_client_{tic}"], encoder, search_form)
            