In [1]:
pip show chromadb

Name: chromadb
Version: 1.0.16
Summary: Chroma.
Home-page: https://github.com/chroma-core/chroma
Author: 
Author-email: Jeff Huber <jeff@trychroma.com>, Anton Troynikov <anton@trychroma.com>
License: 
Location: E:\DataScience\anaconda\envs\asif\Lib\site-packages
Requires: bcrypt, build, grpcio, httpx, importlib-resources, jsonschema, kubernetes, mmh3, numpy, onnxruntime, opentelemetry-api, opentelemetry-exporter-otlp-proto-grpc, opentelemetry-sdk, orjson, overrides, posthog, pybase64, pydantic, pypika, pyyaml, rich, tenacity, tokenizers, tqdm, typer, typing-extensions, uvicorn
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [4]:
import zipfile
import os

zip_path = "new_articles.zip"
extract_path = "new_articles"

os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("✅ Extraction complete!")


✅ Extraction complete!


In [None]:
os.environ["OPENAI_API_KEY"]=""

In [44]:
from langchain.vectorstores import Chroma # chroma is local db using sqlite3 server and stores embedding in binary form
from langchain.embeddings import OpenAIEmbeddings 
from langchain.llms import OpenAI
from langchain.document_loaders import DirectoryLoader, TextLoader

In [16]:
loader=DirectoryLoader("C:\\Users\\ASIF\\Gen AI\\ChromaDB\\new_articles", 
                       glob="./*.txt", 
                       loader_cls=TextLoader,
                       loader_kwargs={"encoding": "utf-8"}  # Force UTF-8
                      )

In [18]:
document=loader.load()

In [24]:
len(document)

21

In [25]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [77]:
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [78]:
text_chunks=text_splitter.split_documents(document)

In [79]:
text_chunks[0].page_content

'Signaling that investments in the supply chain sector remain robust, Pando, a startup developing fulfillment management technologies, today announced that it raised $30 million in a Series B round, bringing its total raised to $45 million.\n\nIron Pillar and Uncorrelated Ventures led the round, with participation from existing investors Nexus Venture Partners, Chiratae Ventures and Next47. CEO and founder Nitin Jayakrishnan says that the new capital will be put toward expanding Pando’s global sales, marketing and delivery capabilities.\n\n“We will not expand into new industries or adjacent product areas,” he told TechCrunch in an email interview. “Great talent is the foundation of the business — we will continue to augment our teams at all levels of the organization. Pando is also open to exploring strategic partnerships and acquisitions with this round of funding.”'

In [80]:
len(text_chunks[0].page_content)

874

In [81]:
len(text_chunks)

233

In [82]:
from langchain import embeddings

In [83]:
persist_directory="db"

embedding=OpenAIEmbeddings()

In [84]:
vectordb=Chroma.from_documents(documents=text_chunks,
                              embedding=embedding,
                              persist_directory=persist_directory)

In [85]:
vectordb

<langchain_community.vectorstores.chroma.Chroma at 0x156d1a1f3b0>

In [86]:
## Persist the db to disk
vectordb.persist()

In [87]:
vectordb=None

In [None]:
## We can load persisted database from disk and use it as normal one

In [88]:
vectordb=Chroma(persist_directory=persist_directory, embedding_function=embedding)

In [89]:
vectordb

<langchain_community.vectorstores.chroma.Chroma at 0x156d19872f0>

## Making a retriever

In [90]:
retriever=vectordb.as_retriever()

In [111]:
docs=retriever.get_relevant_documents("will ai replace tv writers?")

In [112]:
docs[0].page_content

'As the Writers Guild of America strikes for the first time since its historic 100-day action in 2007, Conover said he thinks the debate over AI technology is a “red herring.” With generative AI in such a rudimentary stage, writers are more immediately concerned with dismal streaming residuals and understaffed writing teams. Yet studios’ pushback on the union’s AI-related requests only further reinforces the core issue: The people who power Hollywood aren’t being paid their fair share.\n\n“I’m not worried about the technology,” Conover said. “I’m worried about the companies using technology, that is not in fact very good, to undermine our working conditions.”'

In [94]:
len(docs)

4

In [95]:
# To get only 2 relevant docs

retriever=vectordb.as_retriever(search_kwargs={"k":2})

In [113]:
docs2=retriever.get_relevant_documents("will ai replace tv writers?")

In [114]:
len(docs2)

2

# Making a chain

In [115]:
from langchain.chains import RetrievalQA

In [99]:
llm=OpenAI()

In [116]:
qa_chain=RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

In [117]:
def process_llm_response(llm_response):
    print(llm_response["result"])
    print("\n\nSourses : ")
    for source in llm_response["source_documents"]:
        print(source.metadata["source"])

In [118]:
query="will ai replace tv writers?"

In [119]:
llm_response=qa_chain(query)

In [120]:
llm_response

{'query': 'will ai replace tv writers?',
 'result': ' No, according to Conover, the debate over AI technology is a "red herring" and the current focus is on issues such as low pay and understaffing for TV writers. AI technology is still in a rudimentary stage and is not seen as a threat to replacing TV writers at this time.',
 'source_documents': [Document(metadata={'source': 'C:\\Users\\ASIF\\Gen AI\\ChromaDB\\new_articles\\05-03-ai-replace-tv-writers-strike.txt'}, page_content='As the Writers Guild of America strikes for the first time since its historic 100-day action in 2007, Conover said he thinks the debate over AI technology is a “red herring.” With generative AI in such a rudimentary stage, writers are more immediately concerned with dismal streaming residuals and understaffed writing teams. Yet studios’ pushback on the union’s AI-related requests only further reinforces the core issue: The people who power Hollywood aren’t being paid their fair share.\n\n“I’m not worried about

In [121]:
process_llm_response(llm_response)

 No, according to Conover, the debate over AI technology is a "red herring" and the current focus is on issues such as low pay and understaffing for TV writers. AI technology is still in a rudimentary stage and is not seen as a threat to replacing TV writers at this time.


Sourses : 
C:\Users\ASIF\Gen AI\ChromaDB\new_articles\05-03-ai-replace-tv-writers-strike.txt
C:\Users\ASIF\Gen AI\ChromaDB\new_articles\05-03-ai-replace-tv-writers-strike.txt


## To delete the Chroma DB

In [None]:
!zip -r db.zip ./db

In [None]:
## To Cleanup, we can delete the collection

vectordb.delete_collection()
vectordb.persist()

In [None]:
## To delete the directory

!rm -rf db/