# Data Ingestion

In [None]:
# Fetch the sample news-article archive into the working directory
# (-q silences wget's progress output).
!wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip

In [None]:
!unzip -q new_articles.zip -d new_articles

In [None]:
# Install LangChain plus the OpenAI and Chroma integrations, chromadb itself,
# and langchain-community (the document loaders imported below come from it).
!pip install langchain langchain-openai langchain-chroma chromadb langchain-community

In [None]:
# Print version and metadata of each package installed above.
!pip show langchain langchain-openai langchain-chroma chromadb langchain-community

In [None]:
# Snapshot every package installed in this environment (not only the ones
# installed above) into requirements.txt.
!pip freeze > requirements.txt

In [None]:
# Print the generated requirements file for inspection.
!cat requirements.txt

In [None]:
# Colab-only helper: hands requirements.txt to the browser as a download.
# This import fails outside Google Colab.
from google.colab import files
files.download('requirements.txt')

In [None]:
# !pip install langchain-community

## Load data

In [None]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

In [None]:
# Load documents
# Reads the .txt files sitting directly in /content/new_articles/ (the glob has
# no recursive "**" part) and opens each one with TextLoader.
# NOTE(review): the absolute /content path assumes the working directory used by
# the unzip step is /content (the Colab default) — confirm before running elsewhere.
loader = DirectoryLoader("/content/new_articles/", glob="./*.txt", loader_cls=TextLoader)
documents = loader.load()
# Bare last expression: the notebook displays the loaded documents.
documents

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Split documents
# Chunks are capped at 1000 units with 200 units shared between neighbouring
# chunks (units are presumably characters — confirm against the splitter's
# length function).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)
# Bare last expression: the notebook displays the resulting chunks.
texts

In [None]:
# Number of chunks produced by the splitter.
len(texts)

In [None]:
# Inspect the first chunk.
texts[0]

In [None]:
# Inspect the second chunk (compare with the first to see the overlap setting at work).
texts[1]

## Creating DB

In [None]:
# !pip install langchain langchain-openai langchain-chroma chromadb

In [None]:
# Read the OpenAI key from Colab's user data under the name 'OPENAI_API_KEY'
# instead of hard-coding it in the notebook. Colab-only import.
from google.colab import userdata
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

In [None]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma

# 1) Create the embedding model
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", api_key=OPENAI_API_KEY)

# 2) Persist directory for storing the vector DB
persist_directory = "db"

# 3) Open the collection backed by the on-disk store (created if it is missing)
vectordb = Chroma(
    collection_name="my_collection",
    embedding_function=embeddings,
    persist_directory=persist_directory,  # enables persistence
)

# Add documents to the vector DB
# `texts` is the list of Document chunks produced by the splitter.
# Explicit, deterministic ids make this cell safe to re-run: without them each
# run stores a fresh copy of every chunk, so the collection fills up with
# duplicates and the retriever hands back the same passage several times.
# The position in `texts` keeps the ids unique within the batch.
chunk_ids = [
    f"{chunk.metadata.get('source', 'unknown')}#{position}"
    for position, chunk in enumerate(texts)
]
vectordb.add_documents(documents=texts, ids=chunk_ids)

In [None]:
# Nothing is saved explicitly here; the former `vectordb.persist()` call is disabled.
# NOTE(review): presumably writes already land in `persist_directory` on their
# own — confirm for the installed langchain-chroma / chromadb versions.

# Drop the in-memory reference so the next cell has to reopen the store from disk.
vectordb = None

In [None]:
# 4) Load the persisted DB back from disk
# Uses the same collection name, embedding model and persist_directory as the
# cell that created the store; no documents are added here.
vectordb = Chroma(
    collection_name="my_collection",               # same collection name
    embedding_function=embeddings,
    persist_directory=persist_directory
)

## Make a retriever

In [None]:
# 5) Create a retriever
# No search options are passed, so the vector store's defaults apply.
retriever = vectordb.as_retriever()

# 6) Get relevant docs for a query
# invoke() takes the raw query string; the result is inspected in the next cells.
docs = retriever.invoke("How much money did Microsoft raise?")

In [None]:
# How many documents the default retriever returned for the query.
len(docs)

In [None]:
# Display the retrieved documents.
docs

In [None]:
# 7) Example of customizing search parameters
# Rebinds `retriever` with k=2 (presumably the number of documents returned per
# query). The RAG chain built later in the notebook picks up this instance.
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

# You can inspect these values
print(retriever.search_type)
print(retriever.search_kwargs)

In [None]:
# Search strategy configured on the current retriever, shown as the cell result.
retriever.search_type

In [None]:
# Search options configured on the current retriever, shown as the cell result.
retriever.search_kwargs

## Make a chain

In [None]:
# !pip show langchain langchain-openai langchain-chroma chromadb

In [None]:
# !pip install langchain

In [None]:
# !pip install langchain-community


In [None]:
# !pip show langchain langchain-openai langchain-chroma chromadb langchain-community

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter

In [None]:
# 1) LLM
# temperature=0 keeps the answers as repeatable as the model allows.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, api_key=OPENAI_API_KEY)

# 2) Prompt template
# Expects two variables: {context} (text to answer from) and {question}.
prompt = ChatPromptTemplate.from_template(
    """
    Answer the question using ONLY the context below.
    If the answer is not in the context, say "I don't know".

    Context:
    {context}

    Question:
    {question}
    """
)


def _format_docs(docs):
    """Join the text of the retrieved documents into one prompt-ready string."""
    return "\n\n".join(doc.page_content for doc in docs)


# 3) LCEL RAG chain
# Input:  {"question": <str>}
# Output: {"answer": <str>, "context": <retrieved Document objects>}
full_rag_chain = (
    # Fetch the documents for the question and keep them next to it.
    RunnablePassthrough.assign(context=itemgetter("question") | retriever)
    | {
        # The prompt receives the documents' plain text. Interpolating the raw
        # list would insert its Python repr (metadata, escaped newlines) into
        # the prompt, which wastes tokens and muddies the context.
        "answer": (
            RunnablePassthrough.assign(
                context=lambda inputs: _format_docs(inputs["context"])
            )
            | prompt
            | llm
            | StrOutputParser()
        ),
        # The untouched Document objects are returned so callers can list sources.
        "context": itemgetter("context"),
    }
)

In [None]:
# 4) Helper function
def process_llm_response(response):
    """Print a RAG response: the answer first, then one line per context document.

    Args:
        response: Mapping with an "answer" entry (printed as-is) and a
            "context" entry, an iterable whose items expose a ``metadata``
            mapping.

    A document whose metadata has no "source" key is listed as
    "Unknown source". Nothing is returned.
    """
    print("Answer:\n")
    print(response["answer"])
    print("\nSources:")
    # Lazy generator: each source is looked up right before its line is printed.
    origins = (
        chunk.metadata.get("source", "Unknown source")
        for chunk in response["context"]
    )
    for origin in origins:
        print(f"- {origin}")

In [None]:
# Full example
# The chain takes a dict with a "question" key; the helper prints the answer
# followed by the source of each retrieved document.
query = "How much money did Microsoft raise?"
response = full_rag_chain.invoke({"question": query})
process_llm_response(response)

In [None]:
# Another query
# Reuses `query` and `response`, so the previous result is overwritten.
query = "What is the news about Pando?"
response = full_rag_chain.invoke({"question": query})
process_llm_response(response)

## Deleting the DB


In [None]:
# Archive the on-disk vector store as db.zip before it is deleted below, so it
# can be restored later.
!zip -r db.zip ./db

In [None]:
import os
import shutil

# 1. Drop the collection through the vector store's public API. This avoids
#    reaching into the private `_client` attribute and repeating the
#    collection name by hand.
try:
    vectordb.delete_collection()
    print("Collection deleted successfully.")
except Exception as e:
    # Deliberately broad: this is best-effort cleanup, and a missing collection
    # or an already-cleared `vectordb` must not stop the directory removal below.
    print(f"Collection might not exist: {e}")

# 2. Manual directory cleanup (optional but recommended for a total reset)
db_path = "db"
if os.path.exists(db_path):
    shutil.rmtree(db_path)
    print("Database directory removed.")

## Starting again: reloading the DB

In [None]:
!unzip db.zip