In [1]:
# imports
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): the OpenAI embeddings client created later presumably reads its
# API key from the environment populated here — confirm the .env defines it.
load_dotenv()

True

In [2]:
# Read in State of the Union Address File
# The encoding is pinned so decoding does not depend on the platform default
# (assumes both files are saved as UTF-8 — confirm if they were exported elsewhere).
with open("RAG_Docs/2024_state_of_the_union.txt", encoding="utf-8") as f:
    state_of_the_union = f.read()

# Read in Mangione CBS News Article
with open("RAG_Docs/2024_12_10_Mangione_CBS_Article.txt", encoding="utf-8") as f:
    mangione_cbs_article = f.read()

In [3]:
# Imports
from langchain_text_splitters import CharacterTextSplitter

# Chunking settings: target chunk length and overlap, both measured with len()
splitter_settings = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "length_function": len,
}

# Build the splitter from the settings above
text_splitter = CharacterTextSplitter(**splitter_settings)

# Split each raw text into its own list of chunk documents
state_of_the_union_texts = text_splitter.create_documents(
    texts=[state_of_the_union]
)
mangione_cbs_article_texts = text_splitter.create_documents(
    texts=[mangione_cbs_article]
)


In [6]:
# Add MetaData For Filter By File Name

def tag_chunks_with_source(docs, filename):
    """Label every chunk with its source file name and 1-based position.

    Args:
        docs: Iterable of objects with a writable ``metadata`` attribute
            (here, the chunk documents produced by the text splitter).
        filename: Name of the file the chunks were cut from; stored under
            the ``'filename'`` key so results can be filtered by file.

    Returns:
        None. Each element's ``metadata`` is replaced (not merged) with
        ``{'filename': filename, 'chunk': position}``. Nothing is returned so
        that calling this as the last statement of a cell prints no output.
    """
    for position, doc in enumerate(docs, start=1):
        doc.metadata = {
            'filename': filename,
            'chunk': position,
        }


# 2024_state_of_the_union.txt
tag_chunks_with_source(state_of_the_union_texts, '2024_state_of_the_union.txt')

# 2024_12_10_Mangione_CBS_Article.txt
tag_chunks_with_source(mangione_cbs_article_texts, '2024_12_10_Mangione_CBS_Article.txt')

In [7]:
# Create ChromaDB Vector Store

# Imports
import chromadb
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

# Single source for the collection name, so the raw collection handle and the
# LangChain wrapper below both refer to the same Chroma collection.
collection_name = "test_collection"

# Chroma client, plus a handle to the collection (created if it does not exist)
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(collection_name)

# Embedding model used to vectorize documents and queries
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# LangChain vector store backed by the client and collection above
vector_store = Chroma(
    embedding_function=embeddings,
    collection_name=collection_name,
    client=chroma_client,
)

In [10]:
# Add Documents to Vector Store

# Concatenate Docs List
documents = state_of_the_union_texts + mangione_cbs_article_texts

# Build a stable ID per chunk from its metadata: "<filename>:<chunk>".
# Without explicit IDs every run of this cell is assigned fresh random IDs,
# so re-running it stores each chunk again and searches return duplicates.
# With stable IDs a re-run overwrites the existing entries instead.
document_ids = [
    f"{doc.metadata['filename']}:{doc.metadata['chunk']}" for doc in documents
]

# Add Documents to Vector Store (returns the IDs that were stored)
ids = vector_store.add_documents(documents=documents, ids=document_ids)

In [15]:
# TEST SIMILARITY SEARCH
# Fetch the 3 chunks closest to the question. The surname is spelled
# "Mangione", matching the article, so the query is not skewed by a typo.
test_search_result = vector_store.similarity_search(
    "Who is Luigi Mangione?",
    k=3
)

# Print Results
print(test_search_result)


# TEST FILTER SEARCH BY FILE NAME
# Metadata filter only (no similarity ranking): select entries whose
# 'filename' equals the CBS article and show their 'documents' field.
vector_store.get(
    where={'filename': "2024_12_10_Mangione_CBS_Article.txt"}
)['documents']



[Document(metadata={'chunk': 14, 'filename': '2024_12_10_Mangione_CBS_Article.txt'}, page_content='Mangione\'s paternal grandparents, Nicholas and Mary Mangione, were real estate developers who purchased the Turf Valley Country Club in 1978 and Hayfields Country Club in Hunt Valley in 1986. \n\nThey founded Lorien Health Systems in 1977, and operated WCBM, a Baltimore radio station. Luigi Mangione volunteered at Lorien Health Systems in 2014 while in high school, according to his LinkedIn.\n\nMangione\'s family said Monday in a statement, "Unfortunately, we cannot comment on news reports regarding Luigi Mangione. We only know what we have read in the media. Our family is shocked and devastated by Luigi\'s arrest. We offer our prayers to the family of Brian Thompson and we ask people to pray for all involved. We are devastated by this news."\n\n### He was valedictorian at the Gilman School in Baltimore'), Document(metadata={'chunk': 1, 'filename': '2024_12_10_Mangione_CBS_Article.txt'},

["# What we know about Luigi Mangione, suspect charged in UnitedHealthcare CEO's killing\n\nBy Alex Sundby, Layla Ferris, Laura Doan, Emma Li, John Doyle\nUpdated on: December 10, 2024 / 8:36 PM EST / CBS News\n\nLuigi Mangione has been charged with murder in last week's deadly shooting of UnitedHealthcare CEO Brian Thompson, according to court documents filed Monday night. The 26-year-old, who was identified earlier as a person of interest, was arrested on firearms and other charges in Pennsylvania after being spotted at a McDonald's in Altoona amid a massive manhunt for the shooter.\n\n### Here's what we know about Mangione:\n\nLuigi Mangione identified as suspect in CEO shooting\nPolice said Tuesday it appears Mangione went to Pittsburgh and then Altoona after leaving New York and that he was dodging surveillance using a signal-blocking bag.",
 '"You can put your phone in there so we can\'t track your phone," Joseph Kenny, chief of detectives for the New York City Police Department,