In [1]:
import os
from getpass import getpass
import openai

# Make sure an OpenAI API key is present in the environment before any client is built.
if "OPENAI_API_KEY" not in os.environ:
    # In VS Code the getpass prompt appears at the top of the window and is easy to miss.
    if any('VSCODE' in env_name for env_name in os.environ):
        print('Please enter password in the VS Code prompt at the top of your VS Code window!')
    os.environ["OPENAI_API_KEY"] = getpass("Paste your OpenAI key from: https://platform.openai.com/account/api-keys\n")
    openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Prefix-only sanity check; nothing here contacts the API.
assert os.environ.get("OPENAI_API_KEY", "").startswith("sk-"), "This doesn't look like a valid OpenAI API key"
print("OpenAI API key configured")

In [2]:
from pydantic import BaseModel, Field

# Minimal demo model. Both fields have no default, and the schema printed below
# lists them under 'required'.
class Person(BaseModel):
    name: str
    age: int

# Demo model showing per-field metadata: the Field(description=...) text shows up
# as the 'description' of the 'address' property in the schema printed below.
class Address(BaseModel):
    address: str = Field(description="Full street address")
    city: str
    state: str


# Extends Person with a nested Address model. In the schema printed below the
# class docstring becomes the top-level 'description' and the nested model is
# referenced through '$defs'.
class PersonAddress(Person):
    """A Person with an address"""

    address: Address


# Display the generated JSON schema as the cell output.
PersonAddress.model_json_schema()

{'$defs': {'Address': {'properties': {'address': {'description': 'Full street address',
     'title': 'Address',
     'type': 'string'},
    'city': {'title': 'City', 'type': 'string'},
    'state': {'title': 'State', 'type': 'string'}},
   'required': ['address', 'city', 'state'],
   'title': 'Address',
   'type': 'object'}},
 'description': 'A Person with an address',
 'properties': {'name': {'title': 'Name', 'type': 'string'},
  'age': {'title': 'Age', 'type': 'integer'},
  'address': {'$ref': '#/$defs/Address'}},
 'required': ['name', 'age', 'address'],
 'title': 'PersonAddress',
 'type': 'object'}

In [3]:
from src.vectorDatabase import create_database

# qdrant_client,speakers_list_1,speakers_list_2,speakers_list_3,speakers_list_4,sec_form_names,earnings_call_quarter_vals, = create_database("AAPL",2023)

In [4]:
import instructor

from openai import OpenAI
from typing import List
from pydantic import BaseModel, Field

# Synchronous client used by the blocking `expand_query` below, which passes
# `response_model=` to `client.chat.completions.create`.
# NOTE(review): presumably instructor.patch is what adds `response_model` support — confirm.
client = instructor.patch(OpenAI())

In [5]:
from typing import List, Literal
from enum import Enum

# Enum with a single SEC form label. It is not referenced by any other cell
# visible in this notebook.
class Source(Enum):
    sec_quarter_1 = "10-Q1"

# Structured-extraction target: the metadata to pull out of a user question.
# NOTE(review): `#` comments are used instead of docstrings on purpose. The Field
# descriptions presumably reach the model as part of the response schema, so
# treat them as prompt text — confirm before rewording.
class TickerYearQuarter(BaseModel):
    # Free-text reasoning field, declared ahead of the structured fields.
    chain_of_thought: str = Field(
        description="Think step by step to output what is the ticker symbols, NOT THE COMPANY NAME, quarter, year and data source the question is talking about"
    )
    # One entry per company mentioned in the question.
    ticker:List[str] 
    # Years are kept as strings.
    year: List[str] = Field(description="The year that the question is talking about")
    # Quarter labels; the retrieval cells in this notebook compare them against "Q1".."Q4" and [""].
    quarter: List[str] = Field(description="The quarter number that the question is talking about. Make sure that it starts with Q, for example Quarter 4 is Q4")
    # Selects the retrieval path: "CALLS" (earnings calls) or "SEC" (filings).
    data_source: Literal["CALLS","SEC"] = Field(description="If the question is talking about SEC filings then output SEC, else if the question is talking about Earning calls transcript then output CALLS")

# Top-level response model: passed as `response_model` to the chat-completion call.
class Query(BaseModel):
    # Search text with company names, years, quarters and data sources removed (per the description).
    rewritten_query: str = Field(description="Rewrite the query and DON'T include the company name, years, quarters and data sources")
    # Extracted tickers / years / quarters / data source for the same question.
    question_ticker_quarter_year: TickerYearQuarter

In [6]:
from datetime import datetime
import pandas as pd

def expand_query(q) -> Query:
    """Turn a free-text question into a structured `Query` with one blocking LLM call.

    The current calendar year and quarter are injected into the system prompt.

    Args:
        q: The user's question.

    Returns:
        Whatever the patched client produces for `response_model=Query`.
    """
    today = pd.Timestamp(datetime.today().strftime("%Y-%m-%d"))
    quarter, year = today.quarter, today.year

    system_message = {
        "role": "system",
        "content": f"You're a query understanding system for SEC Filings and Earnings Call. The current year is {year} and quarter {quarter}. Here are some tips: ...",
    }
    user_message = {"role": "user", "content": f"query: {q}"}

    return client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        temperature=0.0,
        response_model=Query,
        messages=[system_message, user_message],
    )


# Example question; the parsed result is stored in the global `query`, which the
# retrieval cells below read.
# query = expand_query("What did Apple, Nvidia do for Generative AI in 2023 quarter 3, 2 and 1 from earnings call")
query = expand_query("What did Apple and Amazon do for Generative AI in 2023 in quarter 4 from earnings call")

In [7]:
query

Query(rewritten_query='Generative AI initiatives in Q4 2023 for Apple and Amazon from earnings call', question_ticker_quarter_year=TickerYearQuarter(chain_of_thought='Generative AI initiatives in Q4 2023', ticker=['AAPL', 'AMZN'], year=['2023'], quarter=['Q4'], data_source='CALLS'))

In [8]:
query.question_ticker_quarter_year.year

['2023']

In [9]:
query.question_ticker_quarter_year.quarter

['Q4']

In [10]:
# Unpack the extracted metadata into plain lists for the cells below.
tickers = query.question_ticker_quarter_year.ticker
years = query.question_ticker_quarter_year.year
# NOTE: despite the singular name, `quarter` is a list of labels (e.g. ['Q4']).
quarter = query.question_ticker_quarter_year.quarter

In [11]:
# tic = 'AAPL'
# var_name = f"qdrant_client_{tic}"
# globals()[var_name] = qdrant_client

In [12]:
# qdrant_client_AAPL

In [13]:
# Build a vector database for every (ticker, year) pair and keep each returned
# artefact in one flat dict, keyed "<artefact>_<ticker>".
# NOTE: keys carry only the ticker, so when `years` has several entries the later
# year overwrites the earlier one for the same ticker. The key format is kept
# because the retrieval cells look entries up as f"..._{tic}".
tic_yr_dict = {}
for tic in tickers:
    for yr in years:
        print(f"Building vector database for {tic} and year {yr}")
        (
            qdrant_client,
            speakers_list_1,
            speakers_list_2,
            speakers_list_3,
            speakers_list_4,
            sec_form_names,
            earnings_call_quarter_vals,
        ) = create_database(tic, yr)
        tic_yr_dict.update(
            {
                f"qdrant_client_{tic}": qdrant_client,
                f"speakers_list_1_{tic}": speakers_list_1,
                f"speakers_list_2_{tic}": speakers_list_2,
                f"speakers_list_3_{tic}": speakers_list_3,
                f"speakers_list_4_{tic}": speakers_list_4,
                f"sec_form_names_{tic}": sec_form_names,
                # Store the value itself; the key-name string used to be stored here by mistake.
                f"earnings_call_quarter_vals_{tic}": earnings_call_quarter_vals,
            }
        )
        # Reported once per (ticker, year), inside the loop where `yr` is always bound.
        print(f"Done for {tic} and year {yr}")

In [14]:
from sentence_transformers import SentenceTransformer
from src.config import *
import torch

# Use the GPU when PyTorch reports one, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# ENCODER_NAME is not defined in this notebook — presumably it comes from the
# `from src.config import *` line above; verify against src/config.
# NOTE(review): trust_remote_code=True is passed through to the model loader;
# confirm the configured model source is trusted.
encoder = SentenceTransformer(
        ENCODER_NAME, device=device, trust_remote_code=True
    )

In [15]:
query 

Query(rewritten_query='Generative AI initiatives in Q4 2023 for Apple and Amazon from earnings call', question_ticker_quarter_year=TickerYearQuarter(chain_of_thought='Generative AI initiatives in Q4 2023', ticker=['AAPL', 'AMZN'], year=['2023'], quarter=['Q4'], data_source='CALLS'))

In [16]:
from src.queryDatabase import query_database_earnings_call, query_database_sec

# Collect the passages relevant to `question` for every requested ticker.
# Earnings-call questions are answered per quarter. SEC questions use the
# annual form when no quarter was extracted, otherwise one form per quarter.
question = query.rewritten_query
relevant_text = ""
for tic in tickers:
    if query.question_ticker_quarter_year.data_source == "CALLS":
        for q in quarter:
            if q not in ("Q1", "Q2", "Q3", "Q4"):
                raise ValueError(f"Unexpected quarter label {q!r}; expected 'Q1'..'Q4'")
            # q[1] is the quarter digit used in the speakers_list_<n>_<ticker> keys.
            speakers_list = tic_yr_dict[f"speakers_list_{q[1]}_{tic}"]
            relevant_text += f"For {tic} and Quarter {q}\n"
            relevant_text += query_database_earnings_call(question,q,tic_yr_dict[f"qdrant_client_{tic}"],encoder,speakers_list)
    if query.question_ticker_quarter_year.data_source == "SEC":
        if quarter == [""]:
            # No quarter in the question: search the annual report.
            search_form = "10-K"
            relevant_text += query_database_sec(question, tic_yr_dict[f"qdrant_client_{tic}"], encoder, search_form)
        else:
            for q in quarter:
                # Rebuilt for every quarter; appending would yield names like "10-Q110-Q2".
                search_form = "10-"+q
                relevant_text += f"For ticker {tic} and Quarter {q}\n"
                relevant_text += query_database_sec(question, tic_yr_dict[f"qdrant_client_{tic}"], encoder, search_form)

In [17]:
from src.queryDatabase import query_database_earnings_call, query_database_sec

# Collect the passages relevant to `question` for every requested ticker.
question = query.rewritten_query
relevant_text = ""
for tic in tickers:
    if query.question_ticker_quarter_year.data_source == "CALLS":
        for q in quarter:
            if q not in ("Q1", "Q2", "Q3", "Q4"):
                raise ValueError(f"Unexpected quarter label {q!r}; expected 'Q1'..'Q4'")
            # q[1] is the quarter digit used in the speakers_list_<n>_<ticker> keys.
            speakers_list = tic_yr_dict[f"speakers_list_{q[1]}_{tic}"]
            relevant_text += f"For {tic} and Quarter {q}\n"
            relevant_text += query_database_earnings_call(question,q,tic_yr_dict[f"qdrant_client_{tic}"],encoder,speakers_list)
    if query.question_ticker_quarter_year.data_source == "SEC":
        if quarter == [""]:
            # No quarter in the question: search the annual report.
            search_form = "10-K"
            relevant_text += query_database_sec(question, tic_yr_dict[f"qdrant_client_{tic}"], encoder, search_form)
        else:
            for q in quarter:
                # Assigned fresh per quarter so form names never accumulate.
                search_form = "10-"+q
                relevant_text += f"For ticker {tic} and Quarter {q}\n"
                relevant_text += query_database_sec(question, tic_yr_dict[f"qdrant_client_{tic}"], encoder, search_form)
            

In [18]:
from src.queryDatabase import query_database_earnings_call, query_database_sec

# Collect the passages relevant to `question` for every requested ticker.
question = query.rewritten_query
relevant_text = ""
for tic in tickers:
    if query.question_ticker_quarter_year.data_source == "CALLS":
        for q in quarter:
            if q not in ("Q1", "Q2", "Q3", "Q4"):
                raise ValueError(f"Unexpected quarter label {q!r}; expected 'Q1'..'Q4'")
            # q[1] is the quarter digit used in the speakers_list_<n>_<ticker> keys.
            speakers_list = tic_yr_dict[f"speakers_list_{q[1]}_{tic}"]
            relevant_text += f"For {tic} and Quarter {q}\n"
            # Pass the list selected above (the call used to reference an undefined `speaker_list`).
            relevant_text += query_database_earnings_call(question,q,tic_yr_dict[f"qdrant_client_{tic}"],encoder,speakers_list)
    if query.question_ticker_quarter_year.data_source == "SEC":
        if quarter == [""]:
            # No quarter in the question: search the annual report.
            search_form = "10-K"
            relevant_text += query_database_sec(question, tic_yr_dict[f"qdrant_client_{tic}"], encoder, search_form)
        else:
            for q in quarter:
                # Assigned fresh per quarter so form names never accumulate.
                search_form = "10-"+q
                relevant_text += f"For ticker {tic} and Quarter {q}\n"
                relevant_text += query_database_sec(question, tic_yr_dict[f"qdrant_client_{tic}"], encoder, search_form)
            

In [19]:
from src.queryDatabase import query_database_earnings_call, query_database_sec

# Collect the passages relevant to `question` for every requested ticker,
# echoing each quarter as it is processed.
question = query.rewritten_query
relevant_text = ""
for tic in tickers:
    if query.question_ticker_quarter_year.data_source == "CALLS":
        for q in quarter:
            print(q)
            if q not in ("Q1", "Q2", "Q3", "Q4"):
                raise ValueError(f"Unexpected quarter label {q!r}; expected 'Q1'..'Q4'")
            # q[1] is the quarter digit used in the speakers_list_<n>_<ticker> keys.
            speakers_list = tic_yr_dict[f"speakers_list_{q[1]}_{tic}"]
            relevant_text += f"For {tic} and Quarter {q}\n"
            # Pass the list selected above (the call used to reference an undefined `speaker_list`).
            relevant_text += query_database_earnings_call(question,q,tic_yr_dict[f"qdrant_client_{tic}"],encoder,speakers_list)
    if query.question_ticker_quarter_year.data_source == "SEC":
        if quarter == [""]:
            # No quarter in the question: search the annual report.
            search_form = "10-K"
            relevant_text += query_database_sec(question, tic_yr_dict[f"qdrant_client_{tic}"], encoder, search_form)
        else:
            for q in quarter:
                # Assigned fresh per quarter so form names never accumulate.
                search_form = "10-"+q
                relevant_text += f"For ticker {tic} and Quarter {q}\n"
                relevant_text += query_database_sec(question, tic_yr_dict[f"qdrant_client_{tic}"], encoder, search_form)
            

In [20]:
tic_yr_dict

{'qdrant_client_AAPL': <qdrant_client.qdrant_client.QdrantClient at 0x7fba8daae8d0>,
 'speakers_list_1_AAPL': ['Tejas Gala',
  'Timothy Cook',
  'Luca Maestri',
  'Tejas Gala',
  'Operator',
  'David Vogt',
  'Timothy Cook',
  'Operator',
  'Shannon Cross',
  'Luca Maestri',
  'Shannon Cross',
  'Timothy Cook',
  'Operator',
  'Erik Woodring',
  'Timothy Cook',
  'Erik Woodring',
  'Timothy Cook',
  'Operator',
  'Aaron Rakers',
  'Luca Maestri',
  'Aaron Rakers',
  'Timothy Cook',
  'Operator',
  'Amit Daryanani',
  'Timothy Cook',
  'Amit Daryanani',
  'Luca Maestri',
  'Operator',
  'Harsh Kumar',
  'Timothy Cook',
  'Harsh Kumar',
  'Timothy Cook',
  'Operator',
  'Wamsi Mohan',
  'Timothy Cook',
  'Wamsi Mohan',
  'Timothy Cook',
  'Wamsi Mohan',
  'Luca Maestri',
  'Operator',
  'Tejas Gala',
  'Operator',
  'James Suva',
  'Timothy Cook',
  'James Suva',
  'Luca Maestri',
  'Operator',
  'Krish Sankar',
  'Timothy Cook',
  'Krish Sankar',
  'Timothy Cook',
  'Tejas Gala',
  'Ope

In [21]:
tickers

['AAPL', 'AMZN']

In [22]:
tic_yr_dict[f"speakers_list_4_AAPL"]

['Suhasini Chandramouli',
 'Tim Cook',
 'Luca Maestri',
 'Suhasini Chandramouli',
 'Operator',
 'Michael Ng',
 'Tim Cook',
 'Michael Ng',
 'Tim Cook',
 'Michael Ng',
 'Tim Cook',
 'Suhasini Chandramouli',
 'Operator',
 'Aaron Rakers',
 'Tim Cook',
 'Aaron Rakers',
 'Luca Maestri',
 'Aaron Rakers',
 'Suhasini Chandramouli',
 'Operator',
 'Erik Woodring',
 'Luca Maestri',
 'Erik Woodring',
 'Luca Maestri',
 'Erik Woodring',
 'Suhasini Chandramouli',
 'Operator',
 'David Vogt',
 'Tim Cook',
 'Luca Maestri',
 'David Vogt',
 'Luca Maestri',
 'David Vogt',
 'Luca Maestri',
 'David Vogt',
 'Suhasini Chandramouli',
 'Operator',
 'Amit Daryanani',
 'Luca Maestri',
 'Amit Daryanani',
 'Tim Cook',
 'Amit Daryanani',
 'Suhasini Chandramouli',
 'Operator',
 'Harsh Kumar',
 'Tim Cook',
 'Harsh Kumar',
 'Tim Cook',
 'Harsh Kumar',
 'Tim Cook',
 'Suhasini Chandramouli',
 'Operator',
 'Wamsi Mohan',
 'Tim Cook',
 'Wamsi Mohan',
 'Luca Maestri',
 'Wamsi Mohan',
 'Suhasini Chandramouli',
 'Operator',
 'K

In [23]:
from src.queryDatabase import query_database_earnings_call, query_database_sec

# Collect the passages relevant to `question` for every requested ticker,
# echoing each quarter as it is processed.
question = query.rewritten_query
relevant_text = ""
for tic in tickers:
    if query.question_ticker_quarter_year.data_source == "CALLS":
        for q in quarter:
            print(q)
            # Fail loudly on an unexpected label rather than reusing the speaker
            # list left over from a previous iteration (or hitting a NameError).
            if q not in ("Q1", "Q2", "Q3", "Q4"):
                raise ValueError(f"Unexpected quarter label {q!r}; expected 'Q1'..'Q4'")
            # q[1] is the quarter digit used in the speakers_list_<n>_<ticker> keys.
            speakers_list = tic_yr_dict[f"speakers_list_{q[1]}_{tic}"]

            relevant_text += f"For {tic} and Quarter {q}\n"
            relevant_text += query_database_earnings_call(question,q,tic_yr_dict[f"qdrant_client_{tic}"],encoder,speakers_list)
    if query.question_ticker_quarter_year.data_source == "SEC":
        if quarter == [""]:
            # No quarter in the question: search the annual report.
            search_form = "10-K"
            relevant_text += query_database_sec(question, tic_yr_dict[f"qdrant_client_{tic}"], encoder, search_form)
        else:
            for q in quarter:
                # Assigned fresh per quarter; `+=` would fail on an unbound name
                # or concatenate form names across iterations.
                search_form = "10-"+q
                relevant_text += f"For ticker {tic} and Quarter {q}\n"
                relevant_text += query_database_sec(question, tic_yr_dict[f"qdrant_client_{tic}"], encoder, search_form)
            

In [24]:
relevant_text

"For AAPL and Quarter Q4\nTim Cook: we can make for our customers and keenly determined to push the limits of technology even further. And that's why I'm so confident that Apple's future is bright. With that, I'll turn it over to Luca.phone like fall detection, crash detection, ECG on the watch. These would not be possible without AI. And so, we don't label them as such, if you will. We label them as to what their consumer benefit is. But at the fundamental technology behind it is AI and machine learning. In terms of generative AI, we have -- obviously, we have work going on. I'm not going to get into details about what it is, because -- as you know, we don't -- we really don't do that. But you can bet that we're investing, we're\n\nSuhasini Chandramouli: Thank you. Good afternoon, and thank you for joining us. Speaking first today is Apple's CEO, Tim Cook. And he'll be followed by CFO, Luca Maestri. After that, we'll open the call to questions from analysts. Please note that some of t

In [25]:
print(relevant_text)

In [26]:
from src.queryDatabase import query_database_earnings_call, query_database_sec

# SEC-only retrieval experiment: the earnings-call branch is disabled in this
# cell and the SEC path runs for every ticker regardless of `data_source`.
question = query.rewritten_query
relevant_text = ""
for tic in tickers:
    if quarter == [""]:
        # No quarter in the question: search the annual report.
        search_form = "10-K"
        relevant_text += query_database_sec(question, tic_yr_dict[f"qdrant_client_{tic}"], encoder, search_form)
    else:
        for q in quarter:
            # Assigned fresh per quarter; `+=` would fail on an unbound name or
            # concatenate form names across iterations.
            search_form = "10-"+q
            relevant_text += f"For ticker {tic} and Quarter {q}\n"
            relevant_text += query_database_sec(question, tic_yr_dict[f"qdrant_client_{tic}"], encoder, search_form)
            

In [27]:
from src.queryDatabase import query_database_earnings_call, query_database_sec

# SEC-only retrieval: the earnings-call branch is intentionally skipped in this
# cell, so the SEC lookup runs for every ticker whatever `data_source` says.
question = query.rewritten_query
relevant_text = ""
for tic in tickers:
    if quarter != [""]:
        # One quarterly form per requested quarter, each preceded by a header line.
        for q in quarter:
            search_form = "10-"+q
            relevant_text += f"For ticker {tic} and Quarter {q}\n"
            relevant_text += query_database_sec(question, tic_yr_dict[f"qdrant_client_{tic}"], encoder, search_form)
    else:
        # No quarter extracted: search the annual report instead.
        search_form = "10-K"
        relevant_text += query_database_sec(question, tic_yr_dict[f"qdrant_client_{tic}"], encoder, search_form)
            

In [28]:
print(relevant_text)

In [29]:
# AsyncOpenAI must be imported before it is used here.
from openai import AsyncOpenAI

# We'll use a different client for async calls
# To highlight the difference and how we can use both
aclient = instructor.patch(AsyncOpenAI())


async def expand_query(
    q, *, model: str = "gpt-3.5-turbo", temp: float = 0
) -> Query:
    """Asynchronously parse a question into a structured `Query`.

    Args:
        q: The user's question.
        model: Chat model name forwarded to the API.
        temp: Sampling temperature forwarded to the API.

    Returns:
        The awaited result of the patched async client for `response_model=Query`.
    """
    # Imported here because the notebook only imports the `datetime` class, so
    # `date.today()` below would otherwise raise NameError.
    from datetime import date

    return await aclient.chat.completions.create(
        model=model,
        temperature=temp,
        response_model=Query,
        messages=[
            {
                "role": "system",
                "content": f"You're a query understanding system for the Metafor Systems search engine. Today is {date.today()}. Here are some tips: ...",
            },
            {"role": "user", "content": f"query: {q}"},
        ],
    )

In [30]:
from openai import AsyncOpenAI

# We'll use a different client for async calls
# To highlight the difference and how we can use both
# Calls made through `aclient` are awaited (see `expand_query` in this cell).
aclient = instructor.patch(AsyncOpenAI())


async def expand_query(
    q, *, model: str = "gpt-3.5-turbo", temp: float = 0
) -> Query:
    """Asynchronously parse a question into a structured `Query`.

    Args:
        q: The user's question.
        model: Chat model name forwarded to the API.
        temp: Sampling temperature forwarded to the API.

    Returns:
        The awaited result of the patched async client for `response_model=Query`.
    """
    # Function-scope import: `date` is not imported anywhere else in the
    # notebook, so the f-string below would otherwise raise NameError.
    from datetime import date

    return await aclient.chat.completions.create(
        model=model,
        temperature=temp,
        response_model=Query,
        messages=[
            {
                "role": "system",
                "content": f"You're a query understanding system for the Metafor Systems search engine. Today is {date.today()}. Here are some tips: ...",
            },
            {"role": "user", "content": f"query: {q}"},
        ],
    )

In [31]:
from openai import AsyncOpenAI
from typing import List, Literal
from enum import Enum
from datetime import datetime
import pandas as pd
# We'll use a different client for async calls
# To highlight the difference and how we can use both
# Kept separate from the synchronous `client` created earlier in the notebook.
aclient = instructor.patch(AsyncOpenAI())

# Structured-extraction target: the metadata to pull out of a user question.
# NOTE(review): `#` comments are used instead of docstrings on purpose. The Field
# descriptions presumably reach the model as part of the response schema, so
# treat them as prompt text — confirm before rewording.
class TickerYearQuarter(BaseModel):
    # Free-text reasoning field, declared ahead of the structured fields.
    chain_of_thought: str = Field(
        description="Think step by step to output what is the ticker symbols, NOT THE COMPANY NAME, quarter, year and data source the question is talking about"
    )
    # One entry per company mentioned in the question.
    ticker:List[str] 
    # Years are kept as strings.
    year: List[str] = Field(description="The year that the question is talking about")
    # Quarter labels; the retrieval cells in this notebook compare them against "Q1".."Q4" and [""].
    quarter: List[str] = Field(description="The quarter number that the question is talking about. Make sure that it starts with Q, for example Quarter 4 is Q4")
    # Selects the retrieval path: "CALLS" (earnings calls) or "SEC" (filings).
    data_source: Literal["CALLS","SEC"] = Field(description="If the question is talking about SEC filings then output SEC, else if the question is talking about Earning calls transcript then output CALLS")

# Top-level response model: passed as `response_model` to the chat-completion call.
class Query(BaseModel):
    # Search text with company names, years, quarters and data sources removed (per the description).
    rewritten_query: str = Field(description="Rewrite the query and DON'T include the company name, years, quarters and data sources")
    # Extracted tickers / years / quarters / data source for the same question.
    question_ticker_quarter_year: TickerYearQuarter

    def report(self):
        """Return the parsed fields as a dict plus a "usage" entry.

        The benchmarking cells call `q.report()` on every result, so the method
        has to exist on this definition of `Query` as well.
        NOTE(review): `_raw_response` is presumably attached by instructor to
        the returned model — confirm.
        """
        dct = self.model_dump()
        dct["usage"] = self._raw_response.usage.model_dump()
        return dct

async def expand_query(q) -> Query:
    """Asynchronously parse a question into a structured `Query`.

    The current calendar year and quarter are injected into the system prompt.
    The request goes through the async client and is awaited, so several calls
    can overlap under `asyncio.gather` without blocking the event loop.

    Args:
        q: The user's question.

    Returns:
        The awaited result of the patched async client for `response_model=Query`.
    """
    today = pd.Timestamp(datetime.today().strftime("%Y-%m-%d"))
    quarter = today.quarter
    year = today.year
    return await aclient.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        temperature=0.0,
        response_model=Query,
        messages=[
            {
                "role": "system",
                "content": f"You're a query understanding system for SEC Filings and Earnings Call. The current year is {year} and quarter {quarter}. Here are some tips: ...",
            },
            {"role": "user", "content": f"query: {q}"},
        ],
    )

In [32]:
import asyncio
import time
import pandas as pd
import wandb
from src.helpers import *

model = "gpt-3.5-turbo"
temp = 0

run = wandb.init(
    project="query",
    config={"model": model, "temp": temp},
)

test_queries = [
    "What did Apple and Amazon do for Generative AI in 2023 in quarter 4 from earnings call?",
    "What did Apple and Amazon do for Generative AI in 2023 in quarter 4 and quarter 3?",
    "Compare the total revenue generated by Apple and Amazon for the year 2023 and quarter 4",
]
start = time.perf_counter()
queries = await asyncio.gather(
    *[expand_query(q, model=model, temp=temp) for q in test_queries]
)
duration = time.perf_counter() - start

with open("schema.json", "w+") as f:
    schema = Query.model_json_schema()
    json.dump(schema, f, indent=2)

with open("results.jsonlines", "w+") as f:
    for query in queries:
        f.write(query.model_dump_json() + "\n")

df = dicts_to_df([q.report() for q in queries])
df["input"] = test_queries
df.to_csv("results.csv")


run.log({"schema": wandb.Table(dataframe=pd.DataFrame([{"schema": schema}]))})

run.log(
    {
        "usage_total_tokens": df["usage_total_tokens"].sum(),
        "usage_completion_tokens": df["usage_completion_tokens"].sum(),
        "usage_prompt_tokens": df["usage_prompt_tokens"].sum(),
        "duration (s)": duration,
        "average duration (s)": duration / len(queries),
        "n_queries": len(queries),
    }
)


run.log(
    {
        "results": wandb.Table(dataframe=df),
    }
)

files = wandb.Artifact("data", type="dataset")

files.add_file("schema.json")
files.add_file("results.jsonlines")
files.add_file("results.csv")


run.log_artifact(files)
run.finish()

In [33]:
import asyncio
import time
import pandas as pd
import wandb
from src.helpers import *

model = "gpt-3.5-turbo"
temp = 0

run = wandb.init(
    project="query",
    config={"model": model, "temp": temp},
)

test_queries = [
    "What did Apple and Amazon do for Generative AI in 2023 in quarter 4 from earnings call?",
    "What did Apple and Amazon do for Generative AI in 2023 in quarter 4 and quarter 3?",
    "Compare the total revenue generated by Apple and Amazon for the year 2023 and quarter 4",
]
start = time.perf_counter()
queries = await asyncio.gather(
    *[expand_query(q) for q in test_queries]
)
duration = time.perf_counter() - start

with open("schema.json", "w+") as f:
    schema = Query.model_json_schema()
    json.dump(schema, f, indent=2)

with open("results.jsonlines", "w+") as f:
    for query in queries:
        f.write(query.model_dump_json() + "\n")

df = dicts_to_df([q.report() for q in queries])
df["input"] = test_queries
df.to_csv("results.csv")


run.log({"schema": wandb.Table(dataframe=pd.DataFrame([{"schema": schema}]))})

run.log(
    {
        "usage_total_tokens": df["usage_total_tokens"].sum(),
        "usage_completion_tokens": df["usage_completion_tokens"].sum(),
        "usage_prompt_tokens": df["usage_prompt_tokens"].sum(),
        "duration (s)": duration,
        "average duration (s)": duration / len(queries),
        "n_queries": len(queries),
    }
)


run.log(
    {
        "results": wandb.Table(dataframe=df),
    }
)

files = wandb.Artifact("data", type="dataset")

files.add_file("schema.json")
files.add_file("results.jsonlines")
files.add_file("results.csv")


run.log_artifact(files)
run.finish()

In [34]:
import asyncio
import time
import pandas as pd
import wandb
from src.helpers import *
import json
# Benchmark run: expand a few test questions concurrently, then log timings,
# token usage and result files to Weights & Biases.
# NOTE(review): `model` / `temp` are only recorded in the wandb config and are
# not passed to `expand_query`.
model = "gpt-3.5-turbo"
temp = 0

run = wandb.init(
    project="query",
    config={"model": model, "temp": temp},
)

test_queries = [
    "What did Apple and Amazon do for Generative AI in 2023 in quarter 4 from earnings call?",
    "What did Apple and Amazon do for Generative AI in 2023 in quarter 4 and quarter 3?",
    "Compare the total revenue generated by Apple and Amazon for the year 2023 and quarter 4",
]
# Time the whole batch; top-level `await` relies on the notebook's running event loop.
start = time.perf_counter()
queries = await asyncio.gather(
    *[expand_query(q) for q in test_queries]
)
duration = time.perf_counter() - start

# Persist the response schema next to the results so the run is self-describing.
with open("schema.json", "w+") as f:
    schema = Query.model_json_schema()
    json.dump(schema, f, indent=2)

# One JSON document per line. NOTE: this loop rebinds the notebook-level name `query`.
with open("results.jsonlines", "w+") as f:
    for query in queries:
        f.write(query.model_dump_json() + "\n")

# Requires `report()` on the `Query` class in effect when this cell runs.
# NOTE(review): `dicts_to_df` presumably comes from src.helpers and flattens the
# nested "usage" dict into the usage_* columns read below — confirm.
df = dicts_to_df([q.report() for q in queries])
df["input"] = test_queries
df.to_csv("results.csv")


run.log({"schema": wandb.Table(dataframe=pd.DataFrame([{"schema": schema}]))})

# Aggregate token usage and timing for the whole batch.
run.log(
    {
        "usage_total_tokens": df["usage_total_tokens"].sum(),
        "usage_completion_tokens": df["usage_completion_tokens"].sum(),
        "usage_prompt_tokens": df["usage_prompt_tokens"].sum(),
        "duration (s)": duration,
        "average duration (s)": duration / len(queries),
        "n_queries": len(queries),
    }
)


run.log(
    {
        "results": wandb.Table(dataframe=df),
    }
)

# Bundle the three files written above into a single dataset artifact.
files = wandb.Artifact("data", type="dataset")

files.add_file("schema.json")
files.add_file("results.jsonlines")
files.add_file("results.csv")


run.log_artifact(files)
run.finish()

VBox(children=(Label(value='0.014 MB of 0.014 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111358295555773, max=1.0)…

In [35]:
from openai import AsyncOpenAI
from typing import List, Literal
from enum import Enum
from datetime import datetime
import pandas as pd
# We'll use a different client for async calls
# To highlight the difference and how we can use both
# Kept separate from the synchronous `client` created earlier in the notebook.
aclient = instructor.patch(AsyncOpenAI())

# Structured-extraction target: the metadata to pull out of a user question.
# NOTE(review): `#` comments are used instead of docstrings on purpose. The Field
# descriptions presumably reach the model as part of the response schema, so
# treat them as prompt text — confirm before rewording.
class TickerYearQuarter(BaseModel):
    # Free-text reasoning field, declared ahead of the structured fields.
    chain_of_thought: str = Field(
        description="Think step by step to output what is the ticker symbols, NOT THE COMPANY NAME, quarter, year and data source the question is talking about"
    )
    # One entry per company mentioned in the question.
    ticker:List[str] 
    # Years are kept as strings.
    year: List[str] = Field(description="The year that the question is talking about")
    # Quarter labels; the retrieval cells in this notebook compare them against "Q1".."Q4" and [""].
    quarter: List[str] = Field(description="The quarter number that the question is talking about. Make sure that it starts with Q, for example Quarter 4 is Q4")
    # Selects the retrieval path: "CALLS" (earnings calls) or "SEC" (filings).
    data_source: Literal["CALLS","SEC"] = Field(description="If the question is talking about SEC filings then output SEC, else if the question is talking about Earning calls transcript then output CALLS")

# Top-level response model: passed as `response_model` to the chat-completion call.
class Query(BaseModel):
    # Search text with company names, years, quarters and data sources removed (per the description).
    rewritten_query: str = Field(description="Rewrite the query and DON'T include the company name, years, quarters and data sources")
    # Extracted tickers / years / quarters / data source for the same question.
    question_ticker_quarter_year: TickerYearQuarter
    def report(self):
        """Return the parsed fields as a dict plus a "usage" entry.

        NOTE(review): `_raw_response` is presumably attached by instructor to
        the returned model — confirm.
        """
        dct = self.model_dump()
        dct["usage"] = self._raw_response.usage.model_dump()
        return dct

async def expand_query(q) -> Query:
    """Asynchronously parse a question into a structured `Query`.

    The current calendar year and quarter are injected into the system prompt.
    The request goes through the async client and is awaited, so several calls
    can overlap under `asyncio.gather` without blocking the event loop.

    Args:
        q: The user's question.

    Returns:
        The awaited result of the patched async client for `response_model=Query`.
    """
    today = pd.Timestamp(datetime.today().strftime("%Y-%m-%d"))
    quarter = today.quarter
    year = today.year
    return await aclient.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        temperature=0.0,
        response_model=Query,
        messages=[
            {
                "role": "system",
                "content": f"You're a query understanding system for SEC Filings and Earnings Call. The current year is {year} and quarter {quarter}. Here are some tips: ...",
            },
            {"role": "user", "content": f"query: {q}"},
        ],
    )

In [36]:
from openai import AsyncOpenAI
from typing import List, Literal
from enum import Enum
from datetime import datetime
import pandas as pd
# We'll use a different client for async calls
# To highlight the difference and how we can use both
# Kept separate from the synchronous `client` created earlier in the notebook.
aclient = instructor.patch(AsyncOpenAI())

# Structured-extraction target: the metadata to pull out of a user question.
# NOTE(review): `#` comments are used instead of docstrings on purpose. The Field
# descriptions presumably reach the model as part of the response schema, so
# treat them as prompt text — confirm before rewording.
class TickerYearQuarter(BaseModel):
    # Free-text reasoning field, declared ahead of the structured fields.
    chain_of_thought: str = Field(
        description="Think step by step to output what is the ticker symbols, NOT THE COMPANY NAME, quarter, year and data source the question is talking about"
    )
    # One entry per company mentioned in the question.
    ticker:List[str] 
    # Years are kept as strings.
    year: List[str] = Field(description="The year that the question is talking about")
    # Quarter labels; the retrieval cells in this notebook compare them against "Q1".."Q4" and [""].
    quarter: List[str] = Field(description="The quarter number that the question is talking about. Make sure that it starts with Q, for example Quarter 4 is Q4")
    # Selects the retrieval path: "CALLS" (earnings calls) or "SEC" (filings).
    data_source: Literal["CALLS","SEC"] = Field(description="If the question is talking about SEC filings then output SEC, else if the question is talking about Earning calls transcript then output CALLS")

# Top-level response model: passed as `response_model` to the chat-completion call.
class Query(BaseModel):
    # Search text with company names, years, quarters and data sources removed (per the description).
    rewritten_query: str = Field(description="Rewrite the query and DON'T include the company name, years, quarters and data sources")
    # Extracted tickers / years / quarters / data source for the same question.
    question_ticker_quarter_year: TickerYearQuarter
    def report(self):
        """Return the parsed fields as a dict plus a "usage" entry.

        NOTE(review): `_raw_response` is presumably attached by instructor to
        the returned model — confirm.
        """
        dct = self.model_dump()
        dct["usage"] = self._raw_response.usage.model_dump()
        return dct

async def expand_query(q) -> Query:
    """Asynchronously parse a question into a structured `Query`.

    The current calendar year and quarter are injected into the system prompt.
    The request goes through the async client and is awaited, so several calls
    can overlap under `asyncio.gather` without blocking the event loop.

    Args:
        q: The user's question.

    Returns:
        The awaited result of the patched async client for `response_model=Query`.
    """
    today = pd.Timestamp(datetime.today().strftime("%Y-%m-%d"))
    quarter = today.quarter
    year = today.year
    return await aclient.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        temperature=0.0,
        response_model=Query,
        messages=[
            {
                "role": "system",
                "content": f"You're a query understanding system for SEC Filings and Earnings Call. The current year is {year} and quarter {quarter}. Here are some tips: ...",
            },
            {"role": "user", "content": f"query: {q}"},
        ],
    )

In [37]:
import asyncio
import time
import pandas as pd
import wandb
from src.helpers import *
import json
model = "gpt-3.5-turbo"
temp = 0

run = wandb.init(
    project="query",
    config={"model": model, "temp": temp},
)

test_queries = [
    "What did Apple and Amazon do for Generative AI in 2023 in quarter 4 from earnings call?",
    "What did Apple and Amazon do for Generative AI in 2023 in quarter 4 and quarter 3?",
    "Compare the total revenue generated by Apple and Amazon for the year 2023 and quarter 4 from earnings call",
    "How did AWS from Amazon perform for the year 2023 and quarter 4?"
    "Compare the iPhone sales for Apple from 2023 quarter 3 to quarter 4"
]
start = time.perf_counter()
queries = await asyncio.gather(
    *[expand_query(q) for q in test_queries]
)
duration = time.perf_counter() - start

with open("schema.json", "w+") as f:
    schema = Query.model_json_schema()
    json.dump(schema, f, indent=2)

with open("results.jsonlines", "w+") as f:
    for query in queries:
        f.write(query.model_dump_json() + "\n")

df = dicts_to_df([q.report() for q in queries])
df["input"] = test_queries
df.to_csv("results.csv")


run.log({"schema": wandb.Table(dataframe=pd.DataFrame([{"schema": schema}]))})

run.log(
    {
        "usage_total_tokens": df["usage_total_tokens"].sum(),
        "usage_completion_tokens": df["usage_completion_tokens"].sum(),
        "usage_prompt_tokens": df["usage_prompt_tokens"].sum(),
        "duration (s)": duration,
        "average duration (s)": duration / len(queries),
        "n_queries": len(queries),
    }
)


run.log(
    {
        "results": wandb.Table(dataframe=df),
    }
)

files = wandb.Artifact("data", type="dataset")

files.add_file("schema.json")
files.add_file("results.jsonlines")
files.add_file("results.csv")


run.log_artifact(files)
run.finish()