In [1]:
from langchain_openai import ChatOpenAI
from typing import Annotated

from typing_extensions import TypedDict

from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from IPython.display import Image, display


from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters import HTMLSectionSplitter


# Optional in-notebook override. Leave it empty and provide the key through the
# OPENAI_API_KEY environment variable (or the prompt below) so that no secret is
# ever saved inside the notebook file.
OPENAI_API_KEY = ''

import getpass
import os

if not os.environ.get("OPENAI_API_KEY"):
    # Ask interactively (input is not echoed) instead of exporting an empty
    # string, which would only postpone the failure to the first API call.
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY or getpass.getpass("OpenAI API key: ")

In [2]:
# Directory holding the AMZN earnings-call transcript files (relative to the notebook).
documents_path = '../data/financial_reports/earning_call_scripts/Transcripts/AMZN'

# Empty list; none of the cells visible in this notebook fill or read it.
docs = list()

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splits a transcript into chunks of at most 2000 characters (length_function=len)
# with 100 characters of overlap between neighbouring chunks.
#
# Separators are tried in list order and the first one that occurs in the text is
# used, so they are ordered from the coarsest boundary to the finest. The empty
# string always matches and must therefore stay last as the character-level
# fallback: any separator listed after it is never reached.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=100,
    length_function=len,
    separators=[
        # Dashed rule; presumably the section divider used in the transcript
        # files (verify against the data). Tried first so sections stay together.
        "--------------------------------------------------------------------------------",
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",  # Character-level fallback; keep last
    ],
)

In [4]:
import string

def has_high_percentage_of_punctuation(input_string, threshold=0.2):
    """
    Tell whether punctuation makes up more than ``threshold`` of a string.

    Only the characters in ``string.punctuation`` count as punctuation, while
    every character of the input (whitespace included) counts toward the total.

    Parameters:
    - input_string (str): The text to inspect.
    - threshold (float): Fraction that must be strictly exceeded (e.g., 0.2 for 20%).

    Returns:
    - bool: True if the punctuation share is strictly greater than the threshold;
      False otherwise, and always False for an empty string.
    """
    length = len(input_string)
    if not length:
        # Nothing to measure; also keeps the division below safe.
        return False

    punctuation = string.punctuation
    hits = sum(symbol in punctuation for symbol in input_string)
    return hits / length > threshold



In [5]:
from langchain_core.documents import Document
from datetime import datetime
import re


# Three parallel lists, one entry per chunk that is kept:
#   list_of_documents - LangChain Document objects (text + metadata)
#   metadata          - the metadata dicts on their own
#   just_documents    - the cleaned chunk text on its own
list_of_documents = []
metadata = []
just_documents = []

# os.listdir() returns entries in arbitrary order; sorting makes runs reproducible.
for doc in sorted(os.listdir(documents_path)):
    # The context manager closes the file handle as soon as the text is read.
    # NOTE(review): no encoding is given, so the platform default is used —
    # confirm the transcripts are stored in that encoding.
    with open(os.path.join(documents_path, doc), 'r') as transcript_file:
        document = transcript_file.read()

    # Keep chunks with at least 30 whitespace-separated words that are not
    # dominated by punctuation, and replace runs of three or more dashes
    # with a single space.
    cleaned_chunks = [
        re.sub(r'-{3,}', ' ', s)
        for s in text_splitter.split_text(document)
        if len(s.split()) >= 30 and not has_high_percentage_of_punctuation(s)
    ]
    if not cleaned_chunks:
        continue

    # The date belongs to the file, so it is parsed once per file rather than
    # once per chunk. Everything before the last '-' in the file name is read
    # as "<year> <month abbreviation> <day>" and stored as whole epoch seconds.
    date_parts = doc.split('-')[:-1]
    date = int(datetime.strptime(' '.join(date_parts), '%Y %b %d').timestamp())

    for cleaned in cleaned_chunks:
        lang_doc = Document(
            page_content=cleaned,
            metadata={'company': 'AMZN', 'date': date},
        )
        # A separate dict per list, so editing one entry cannot affect another.
        metadata.append({'company': 'AMZN', 'date': date})
        list_of_documents.append(lang_doc)
        just_documents.append(cleaned)

In [6]:
from uuid import uuid4

# One random UUID string per document, in the same order as list_of_documents.
uuids = [str(uuid4()) for _ in list_of_documents]

In [7]:
# Embedding client handed to the LangChain Chroma store below. No model name is
# passed, so whatever default the library ships with is used.
embeddings = OpenAIEmbeddings()

In [8]:
from langchain_chroma import Chroma

# LangChain wrapper around the "transcripts" Chroma collection, stored under
# ./rag; texts are embedded with `embeddings`.
vector_store = Chroma(
    persist_directory="./rag",
    collection_name="transcripts",
    embedding_function=embeddings,
)

In [9]:
# Embed every chunk and store it under the generated ids.
# NOTE(review): the ids are fresh random UUIDs on each run, so re-running this
# cell against the same ./rag directory presumably adds duplicate chunks — confirm.
vector_store.add_documents(documents=list_of_documents, ids=uuids)


['fd48d4a2-01a9-4ebd-9b34-3e779540f4c4',
 '49967722-a443-4662-a222-8bf9e78c00d7',
 'e75e05ca-6f3b-4c5f-8b39-d7bff252db9b',
 '91694418-5ff7-42ff-a018-2e8b98f5ff87',
 '76314e33-5013-4c92-aa15-65b38f6d81ad',
 'fe6d49a6-3d78-4e74-9e78-5187c448f5b7',
 '83c84f99-1570-4a1d-b1f5-ede15ddbb9c7',
 '49b58786-f511-4387-b3e0-49cba8d79b62',
 'ff5af806-846d-47ec-9303-e1050d8c6d67',
 '54faed4f-5f87-4f77-b68a-bccb4fd245e4',
 '02eb820e-8d91-4562-8350-50d91151dec8',
 'b46251d1-5cea-43ef-9e88-c25170a1a98f',
 '47eba3ba-0ef5-4392-af4c-c185c1f622a6',
 'da3ac3d3-ba7e-4278-84de-044851daefc2',
 'b66955b2-f018-4a19-b26c-5920fce8a032',
 'e12b9945-a144-48d2-9d8c-611db0d5f97a',
 'f9685869-fec8-4171-b23c-d189469439e1',
 'b2e5b6c4-b5ba-4c00-a14a-4e66fdba9890',
 'c3cb4dfd-1ba3-4bed-86fb-f55c19210bff',
 '275c5ba5-f896-47b1-9ae6-e117ed120bd9',
 'f28497e1-a57d-4a26-9163-b28c3cc1f4a9',
 'afa741e9-ee2e-4ff8-b5eb-1fd000185e0a',
 '22c229f2-8ece-4ab0-8a1a-c2d0a169cbef',
 'df38f866-0b09-4027-8580-7e2061c7ad2a',
 'e16a9bd5-5c4f-

In [10]:
# Define the retrieval function
def retrieve_documents(query, k=5):
    """Return the ``k`` chunks in ``vector_store`` most similar to ``query``.

    Parameters:
    - query (str): Free-text search query.
    - k (int): Number of chunks to return; defaults to 5.

    Returns:
    - Whatever ``vector_store.similarity_search`` returns for these arguments.
    """
    return vector_store.similarity_search(query, k=k)

# Retriever view over the same vector store, created without arguments (library defaults).
# Not referenced by any of the cells visible in this notebook.
retriever = vector_store.as_retriever()

In [12]:
# Define the date range for filtering
start_date = "2016-01-01"
end_date = "2016-08-31"


def _to_epoch_seconds(date_string):
    """Convert a 'YYYY-MM-DD' string to whole epoch seconds.

    The parsed datetime is naive, so ``timestamp()`` interprets it as midnight
    in the local time zone of the machine running the notebook.
    """
    return int(datetime.strptime(date_string, '%Y-%m-%d').timestamp())


# Metadata filter: keep entries whose `date` lies within
# [start_date, end_date], both bounds inclusive.
where_document = {
    "$and": [
        {"date": {"$gte": _to_epoch_seconds(start_date)}},
        {"date": {"$lte": _to_epoch_seconds(end_date)}},
    ]
}

In [13]:
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

# Name of the Chroma collection holding the transcript chunks
# (spelling corrected from "transcrips").
collection_name = "transcripts"

# Chroma-side embedding function; the API key is read from the environment.
embedding_function = OpenAIEmbeddingFunction(api_key=os.environ.get('OPENAI_API_KEY'), model_name='text-embedding-3-small')


# Created without arguments, so the client's default settings apply.
# NOTE(review): the default client presumably keeps data in memory only — confirm.
client = chromadb.Client()  # Or your specific client configuration
collection = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)

In [14]:
# Load the cleaned chunk texts into the collection, each with its metadata
# dict and id; the three lists are aligned by position.
collection.add(
    ids=uuids,
    documents=just_documents,
    metadatas=metadata,
)

In [16]:
# Ask the collection for the five chunks closest to the question, limited to
# entries that satisfy the `where_document` date filter.
query = 'What is Amazon revenue?'
results_with_similarity = collection.query(query_texts=query, where=where_document, n_results=5)

In [17]:
# Display the raw query result returned by the collection.
results_with_similarity

{'ids': [['0a0bb2f8-a4a7-4e95-a10f-1c3d3012a376',
   '75b26d12-0797-4441-afeb-ba0a4fc77580',
   'bea4f731-b50d-4bee-887a-2af01df5f5a5',
   '47584c8d-ac6f-43fc-8fff-1be88b82905e',
   '83cfa95b-2a48-4c69-8840-0c22dbfa1351']],
 'embeddings': None,
 'documents': [["In the North America segment, revenue grew 27% to $17 billion. Media revenue grew 8% to $3.2 billion. EGM revenue grew 32% to $13.5 billion. North America segment operating income including stock-based compensation and other was $588 million, a 3.5% operating margin compared with $254 million in the prior year. This includes $5 million of favorable impact from foreign exchange. North America's segment operating income before stock-based compensation and other was $924 million, a 5.4% operating margin compared with $517 million in the prior year.\nIn the international segment, revenue grew 24% to $9.6 billion. Excluding the $177 million year-over-year unfavorable foreign exchange impact, revenue growth was 26%. Media revenue incr

### Define retriever tool

In [18]:
from langchain.tools import BaseTool, StructuredTool, tool


class Retriever:
    """Small wrapper around a Chroma collection with optional date filtering.

    Scratch/prototype class.
    """

    def __init__(self, db, collection_name):
        """Open (or create) ``collection_name`` on ``db``.

        Parameters:
        - db: Client object exposing ``get_or_create_collection``.
        - collection_name (str): Name of the collection to query.
        """
        # Uses the module-level `embedding_function` defined in an earlier cell.
        self.embedding_function = embedding_function
        self.collection = db.get_or_create_collection(name=collection_name, embedding_function=self.embedding_function)

    @staticmethod
    def _date_filter(start_date, end_date):
        """Build the metadata filter for the given 'YYYY-MM-DD' bounds.

        Either bound may be empty/None. Returns None when neither is given.
        """
        conditions = []
        if start_date:
            conditions.append({"date": {"$gte": int(datetime.strptime(start_date, '%Y-%m-%d').timestamp())}})
        if end_date:
            conditions.append({"date": {"$lte": int(datetime.strptime(end_date, '%Y-%m-%d').timestamp())}})

        if not conditions:
            return None
        if len(conditions) == 1:
            # A lone condition is passed as-is; "$and" is meant for combining several.
            return conditions[0]
        return {"$and": conditions}

    def retrive(self, query, company, start_date, end_date, n=5):
        """Return the texts of the ``n`` chunks most similar to ``query``.

        Parameters:
        - query (str): Free-text search query.
        - company (str): Accepted for interface compatibility but not used for
          filtering. NOTE(review): stored metadata uses the ticker 'AMZN'
          while callers may pass a company name, so no filter is applied here.
        - start_date (str | None): Inclusive lower bound, 'YYYY-MM-DD'.
        - end_date (str | None): Inclusive upper bound, 'YYYY-MM-DD'.
        - n (int): Number of chunks to return.

        Returns:
        - list: The documents of the first (only) query in the result.

        Raises:
        - ValueError: If a supplied date does not match 'YYYY-MM-DD'.
        """
        where_document = self._date_filter(start_date, end_date)
        # Query the collection owned by this instance, not a module-level global.
        results_with_similarity = self.collection.query(
            query_texts=query,
            where=where_document,
            n_results=n,
        )
        return results_with_similarity['documents'][0]
            

In [19]:
def retriever_tool(query, company='AMZN', start_date=None, end_date=None):
    """Fetch chunks matching ``query``; this is the function behind the agent's retriever tool.

    A new ``Retriever`` bound to the module-level ``client`` and
    ``collection_name`` is built on every call, and the arguments are passed
    straight through to its ``retrive`` method.
    """
    return Retriever(client, collection_name).retrive(query, company, start_date, end_date)

In [20]:
from langchain.tools import Tool
from langchain.tools.base import StructuredTool

from pydantic import BaseModel, Field

from typing import Optional


class RetrieverInput(BaseModel):
    """Argument schema for the retriever tool.

    Only ``query`` is required. The other fields carry the same defaults as
    ``retriever_tool`` so the model may leave them out, which is what the
    "Optional" wording in the date descriptions promises.
    """

    query: str = Field(description="Query for the vector database")
    company: str = Field(default='AMZN', description="Which company is mentioned in the query?")
    start_date: Optional[str] = Field(default=None, description='Optional. Start date for search documents in the db.')
    end_date: Optional[str] = Field(default=None, description='Optional. End date for search documents in the db.')


# Multi-argument tool that lets the agent call `retriever_tool` with the
# fields declared in `RetrieverInput`.
retriver_tool = StructuredTool(
    func=retriever_tool,
    name="Retriever",
    description="Use this tool to get relevant documents for Amazon from transcripts. When dates are specified in the query, convert them into format like 2016-01-10.",
    args_schema=RetrieverInput,
)

### Define sentiment tool

In [None]:
from setfit import AbsaModel


# Cached model instance, so the weights are loaded once per kernel rather than per call.
_ABSA_MODEL = None


def _get_absa_model():
    """Load the SetFit aspect/polarity model on first use and reuse it afterwards."""
    global _ABSA_MODEL
    if _ABSA_MODEL is None:
        _ABSA_MODEL = AbsaModel.from_pretrained(
            'Askinkaty/setfit-finance-aspect',
            'Askinkaty/setfit-finance-polarity',
            spacy_model="en_core_web_sm",
        )
    return _ABSA_MODEL


def analyze_sentiment(docs: list, batch_size: int = 32) -> list:
    """Run aspect-based sentiment prediction over ``docs`` in batches.

    Parameters:
    - docs (list | str): Texts to analyse. A single string is treated as one
      document; without this, slicing it below would cut it into
      ``batch_size``-character fragments. (A single-input tool wrapper may well
      pass a plain string.)
    - batch_size (int): Number of texts sent to the model per call.

    Returns:
    - list: The model's predictions for all batches, concatenated in input order.
    """
    if isinstance(docs, str):
        docs = [docs]
    if not docs:
        # Nothing to predict; skip loading the model altogether.
        return []

    model = _get_absa_model()
    results = []
    for start in range(0, len(docs), batch_size):
        results.extend(model.predict(docs[start:start + batch_size]))
    return results


# Single-input tool exposing `analyze_sentiment` to the agent.
sentiment_tool = Tool(
    func=analyze_sentiment,
    name='Sentiment_analyzer',
    description="Use this tool to get sentiment of the document related to some particular companies/products/entities. It will return negative/positive/neutral label.",
)

### Simple agent

In [None]:
from langchain.agents import initialize_agent, AgentType


# Tools the agent may call: transcript retrieval and sentiment analysis.
tools = [retriver_tool, sentiment_tool]

# Chat model that drives the agent.
llm = ChatOpenAI(model="gpt-4")

# Structured-chat zero-shot ReAct agent; verbose=True prints each step it takes.
agent = initialize_agent(
    llm=llm,
    tools=tools,
    verbose=True,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
)

In [24]:
# Run the agent on a sample question and print its response. The commented-out
# `input=` lines are alternative prompts; swap one in to try a different query.
response = agent.invoke(
    #input="Tell about how popular is Alexa? How many home devices were sold? Use only information from documents starting from the beginning of 2016 until the end of 2016."
    #input="Tell about the revenue of AWS. Use only information from documents starting from the beginning of 2016 until the end of 2016."
    input='Tell what is the general sentiment of discussions about AWS in April 2016.'
)
print(response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: To find the sentiment about AWS in April 2016, I first need to retrieve the relevant documents from that time period. 

Action:
```
{
  "action": "Retriever",
  "action_input": {
    "query": "AWS",
    "company": "Amazon",
    "start_date": "2016-04-01",
    "end_date": "2016-04-30"
  }
}
```[0m2016-04-01 2016-04-30
WHERE 2016-04-01 2016-04-30 {'$and': [{'date': {'$gte': 1459458000}}, {'date': {'$lte': 1461963600}}]}

Observation: [36;1m[1;3m["Thanks. Your capital lease-driven property and equipment acquisitions is down again year-over-year. So, will you help tie this to perhaps the overall usage growth at AWS? Or, maybe the changing nature of how your enterprise customers may be using the platform to be more compute versus storage or database-heavy? I think historically on the e-commerce side, you have been price followers as opposed to price leaders. AWS, you have been price leaders for the most part for active