# Initialization

## Load libraries

In [1]:
import os
from dotenv import load_dotenv

from langchain_chroma.vectorstores import Chroma # type: ignore
from langchain_openai.embeddings import OpenAIEmbeddings # type: ignore
from langchain_openai.llms import OpenAI # type: ignore
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import embeddings
from langchain.chains import RetrievalQA

## Set constants

In [2]:
# Directory (relative to the notebook's working directory) that is scanned for
# article files when the corpus is loaded.
ARTICLES_PATH: str = "articles"

# Directory in which the Chroma vector store keeps its on-disk data.
PERSIST_DIRECTORY: str = "db"

## Load environment variables

In [3]:
# Load variables from a local .env file (if there is one) into the process
# environment so that later cells can read OPENAI_API_KEY via os.getenv.
# The return value is bound to `_` purely so the cell renders no output.
_ = load_dotenv()

# Manage articles

## Load articles

In [4]:
# Loader over the articles folder; the glob selects the .txt files sitting
# directly in ARTICLES_PATH.
loader = DirectoryLoader(path=ARTICLES_PATH, glob="./*.txt")

In [5]:
# Read every file matched by the loader into a list of LangChain documents.
data = loader.load()

# Fail fast with a clear message: an empty corpus would otherwise only surface
# further down as an opaque error (or an empty index) when the vector store is
# built from the chunks.
if not data:
    raise ValueError(
        f"No articles were loaded from {ARTICLES_PATH!r}; "
        "add article files there before running the rest of the notebook."
    )

## Split articles

In [6]:
# Break each article into overlapping pieces: chunks are capped at a size of
# 1000 and consecutive chunks share an overlap of 200, so text cut at a chunk
# boundary still appears intact in one of its neighbours.
recursive_character_text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)

# One flat list of chunk documents covering all loaded articles.
data_chunks = recursive_character_text_splitter.split_documents(documents=data)

## Set up embedding model

In [7]:
# Embedding model client used both to index the chunks and to embed queries.
# The API key is taken from the environment rather than hardcoded.
embedding = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)

## Create local Chroma vector DB and upload the chunked, embedded articles

In [8]:
# Give every chunk a stable, deterministic ID of the form "<source>#<n>", where
# n is the chunk's position within its source article. Without explicit IDs the
# store assigns fresh random ones, so re-running this cell (e.g. Restart & Run
# All) appends another copy of every chunk to the persisted collection and the
# retriever starts returning duplicate passages. With stable IDs a re-run
# overwrites the existing entries instead.
# NOTE: chunks left over from an article that has since shrunk or been removed
# are not deleted; clear PERSIST_DIRECTORY to rebuild the index from scratch.
chunk_ids = []
chunks_per_source = {}
for chunk in data_chunks:
    source = chunk.metadata.get("source", "")
    position = chunks_per_source.get(source, 0)
    chunks_per_source[source] = position + 1
    chunk_ids.append(f"{source}#{position}")

# Embed the chunks and write them to the on-disk Chroma collection. The result
# is bound to `_` only to keep the cell output quiet; the store is reopened
# from PERSIST_DIRECTORY in a later cell.
_ = Chroma.from_documents(
    documents=data_chunks,
    embedding=embedding,
    ids=chunk_ids,
    persist_directory=PERSIST_DIRECTORY,
)

# Use the Vector DB for Retrieval-Augmented Generation (RAG)

## Connect to Vector DB

In [9]:
# Open the persisted Chroma store from disk. The same embedding object used for
# indexing is supplied so that queries are embedded consistently.
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=PERSIST_DIRECTORY,
)

## Use OpenAI's API to connect to the LLM

In [10]:
# OpenAI LLM client that writes the final answer. The API key is read from the
# environment; the sampling temperature is set to 0.5.
llm = OpenAI(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    temperature=0.5,
)

## Set up retriever and question-answering chain

In [11]:
# Retriever view over the vector store; each query fetches the 3 best-matching
# chunks (k=3).
retriever = vectordb.as_retriever(
    search_kwargs=dict(k=3),
)

In [12]:
# Retrieval-augmented QA chain using the "stuff" chain type. The retrieved
# source documents are returned alongside the answer so it can be checked
# against the underlying articles.
qa_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

In [13]:
# Ask a sample question. The chain's input key is "query"; the returned mapping
# holds the query, the generated result and the source documents.
qa_chain.invoke({"query": "How much money did Microsoft raise?"})

{'query': 'How much money did Microsoft raise?',
 'result': ' $10 billion',
 'source_documents': [Document(metadata={'source': 'articles/05-03-chatgpt-everything-you-need-to-know-about-the-ai-powered-chatbot.txt'}, page_content='April 28, 2023\n\nVC firms including Sequoia Capital, Andreessen Horowitz, Thrive and K2 Global are picking up new shares, according to documents seen by TechCrunch. A source tells us Founders Fund is also investing. Altogether the VCs have put in just over $300 million at a valuation of $27 billion to $29 billion. This is separate to a big investment from Microsoft announced earlier this year, a person familiar with the development told TechCrunch, which closed in January. The size of Microsoft’s investment is believed to be around $10 billion, a figure we confirmed with our source.\n\nApril 25, 2023\n\nCalled ChatGPT Business, OpenAI describes the forthcoming offering as “for professionals who need more control over their data as well as enterprises seeking t