In [1]:
import os
import json
from dotenv import load_dotenv

In [2]:
# Load variables from a .env file into the process environment; the value
# displayed under this cell is load_dotenv()'s return value.
# Presumably this is where OPENAI_API_KEY comes from for the OpenAI calls
# further down — confirm against the project's .env file.
load_dotenv()

True

In [3]:
# Debug aid only: running the line below echoes the secret key into the saved cell output.
#os.getenv('OPENAI_API_KEY')

### Load Data

In [7]:
from langchain.document_loaders import DirectoryLoader, TextLoader

# Read the .html files under data/articles as plain text. TextLoader does not
# parse HTML, so tags and entities stay in the page content (they are visible
# in the retrieval output further down).
loader = DirectoryLoader(
    path="data/articles",
    glob="./*.html",
    loader_cls=TextLoader,
)
documents = loader.load()
len(documents)

274

### Split Data

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunks are capped at 1000 characters, with 200 characters of overlap
# between neighbouring chunks.
# NOTE: `documents` is rebound to the chunk list here, so re-running this cell
# without first re-running the load cell splits the chunks a second time.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
documents = text_splitter.split_documents(documents)
len(documents)

1551

### Create the Chroma Vector Database

In [9]:
from langchain import embeddings
from langchain_openai import OpenAIEmbeddings

# Folder that holds the persisted Chroma index.
db_directory = 'db'

# Embedding model used both by the (disabled) index build below and by the
# cell that reopens the persisted store.
embedding = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

# Index build, left disabled — presumably so the corpus is not re-embedded on
# every run. `Chroma` is not imported in this cell; import it before
# re-enabling the call.
# vector_db = Chroma.from_documents(
#    documents=documents,
#    embedding=embedding,
#    persist_directory=db_directory
# )

### Load Vector Data for Usage

In [10]:
from langchain.vectorstores import Chroma

# Chroma quietly starts a brand-new, empty collection when the persist
# directory does not exist, which would make every later query come back
# with no documents. Fail loudly instead of running on an empty index.
if not os.path.isdir(db_directory):
    raise FileNotFoundError(
        f"Chroma persist directory {db_directory!r} not found; "
        "build the index first with Chroma.from_documents(...)."
    )

# Reopen the persisted index, embedding queries with the same model object
# that is passed to the index build.
vector_db = Chroma(persist_directory=db_directory, embedding_function=embedding)

### Data Retrieval For Similarity Search

In [11]:
# Wrap the vector store as a retriever, keeping its default search settings.
retriever = vector_db.as_retriever()
# Smoke test: display the documents retrieved for a sample question.
retriever.invoke("How much microsoft raised?")

[Document(page_content='Stifel  analyst Brad Reback   maintains Microsoft (NASDAQ:<a class="ticker" href="https://www.benzinga.com/stock/MSFT#NASDAQ">MSFT</a>) with a Buy and raises the price target from $290 to $310.<p>Copyright &#169; Benzinga. All rights reserved. Write to editorial@benzinga.com with any questions about this content. Benzinga does not provide investment advice.</p>', metadata={'source': 'articles_data/2023-04-14-MSFT-BZ$143319c3.html'}),
 Document(page_content='BMO Capital  analyst Keith Bachman   maintains Microsoft (NASDAQ:<a class="ticker" href="https://www.benzinga.com/stock/MSFT#NASDAQ">MSFT</a>) with a Market Perform and raises the price target from $305 to $310.<p>Copyright &#169; Benzinga. All rights reserved. Write to editorial@benzinga.com with any questions about this content. Benzinga does not provide investment advice.</p>', metadata={'source': 'articles_data/2023-04-14-MSFT-BZ$14331e25.html'}),
 Document(page_content='total cost received by the writing

### Feed Retrieved Documents to LLM to get Exact Answer

In [12]:
from langchain_openai import OpenAI
from langchain.chains import RetrievalQA

# Retrieval-QA chain: documents fetched by `retriever` are handed to the LLM,
# which writes the answer. Only the answer is returned
# (return_source_documents=False).
# temperature=0 replaces the sampling default so that re-running the notebook
# gives answers that are as repeatable as the API allows.
chain = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=False
)
chain.invoke("what is the news about microsoft?")

{'query': 'what is the news about microsoft?',
 'result': " The news is about a leaked video showing Microsoft working on a Windows handheld gaming mode that would allow gamers to play their favorite games on the go. Microsoft has also signed a 10-year partnership with BT Group's EE to expand their cloud gaming offerings. Additionally, the U.K.'s Competition and Markets Authority has provided a provisional update indicating that their inquiry does not anticipate any significant anticompetitive effects resulting from Microsoft's acquisition of Activision Blizzard."}

In [13]:
# Same QA chain, second question; the dict form names the chain's "query"
# input explicitly instead of passing a bare string.
chain.invoke({"query": "what is the news about apple?"})

{'query': 'what is the news about apple?',
 'result': " The news is about Apple's upcoming Worldwide Developers Conference, where they are expected to unveil a new mixed-reality headset and update their products, including MacBooks and the Apple Watch. There may also be changes to iOS, potentially allowing for sideloading of apps, but this is not a major focus for this year's updates."}