In [None]:
!pip install -U langchain langchain-openai langchain-community langchain-huggingface chromadb

In [None]:
import getpass
import os
# Prompt for the OpenAI key only when it is not already present in the
# environment, so re-running this cell (or Run All with a pre-configured
# environment) neither asks again nor overwrites an existing key.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

In [None]:
# LangSmith tracing configuration, exposed to LangChain through environment variables.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# Ask for the LangSmith key only when it is not already set, so re-running the
# cell does not prompt again or overwrite a key supplied by the environment.
if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("Enter your LangSmith API key: ")
os.environ["LANGCHAIN_PROJECT"] = "practice-RAG"

In [None]:
import logging
from dataclasses import dataclass
import datasets

In [None]:
# LangChain
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS, Chroma
from langchain_openai import ChatOpenAI
from langchain import hub
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

In [None]:
# ----------------------------------------------------------------------
# Logging Setup
# ----------------------------------------------------------------------
# Root logging: INFO and above, each record shown with timestamp, level and logger name.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
    level=logging.INFO,
)

# Named logger used by the cells below.
logger = logging.getLogger("RAG-LangChain-Example")

In [None]:
# ----------------------------------------------------------------------
# Configuration
# ----------------------------------------------------------------------
@dataclass
class RAGConfig:
    """Central place for every tunable setting of the RAG pipeline.

    All fields carry defaults, so the notebook reads them directly from the
    class (e.g. ``RAGConfig.chunk_size``) without creating an instance.
    """

    # --- Data source ---
    dataset_name: str = "ag_news"  # Hugging Face dataset to load
    dataset_split: str = "train[:1000]"  # only a slice is loaded for demonstration

    # --- Text splitting ---
    chunk_size: int = 512  # passed to the text splitter as chunk_size
    chunk_overlap: int = 50  # passed to the text splitter as chunk_overlap

    # --- Embeddings / prompt / vector store ---
    hf_embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    retrieval_qa_chat_prompt: str = "langchain-ai/retrieval-qa-chat"  # prompt pulled from the hub
    vectorstore_persist_dir: str = "./chroma_data"

    # --- Generator LLM ---
    openai_model_name: str = "gpt-4o-mini"  # or "gpt-4o"
    openai_temperature: float = 0.0

    # --- Retrieval ---
    top_k: int = 3  # number of chunks requested from the retriever
    
@dataclass
class NewsDataItem:
    """One news record: its text together with the label stored as metadata."""

    text: str  # text of the news item
    label: int  # integer label attached to the item

In [None]:
# Load the dataset
logger.info(f"Loading dataset: {RAGConfig.dataset_name} [{RAGConfig.dataset_split}]")
ds = datasets.load_dataset(RAGConfig.dataset_name, split=RAGConfig.dataset_split)

# Wrap every row in a NewsDataItem so later cells work with plain Python objects.
data_items = [
    NewsDataItem(text=row["text"], label=row["label"])
    for row in ds
]
logger.info(f"Loaded {len(data_items)} news items.")

In [None]:
# Create Documents
# Import Document from langchain_core (its canonical home) instead of the
# legacy `langchain.schema` re-export path.
from langchain_core.documents import Document

# Character-based splitter (length_function=len); separators are treated as
# literal strings, not regular expressions.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=RAGConfig.chunk_size,
    chunk_overlap=RAGConfig.chunk_overlap,
    length_function=len,
    is_separator_regex=False
)

# Split each NewsDataItem into chunks and wrap every chunk in a Document.
# Each Document gets its own metadata dict carrying the label of its source item.
docs = [
    Document(page_content=chunk, metadata={"label": item.label})
    for item in data_items
    for chunk in text_splitter.split_text(item.text)
]

logger.info(f"Total chunked documents: {len(docs)}")

In [None]:
# Spot-check a single chunked Document (page_content plus its "label" metadata).
# NOTE(review): index 52 is presumably arbitrary — it only requires len(docs) > 52.
docs[52]

In [None]:
# ChromaDB setup

from chromadb.config import Settings
logger.info("Initializing HuggingFace embeddings...")
embedding_fn = HuggingFaceEmbeddings(model_name=RAGConfig.hf_embedding_model)

logger.info("Building Chroma vector store...")

# Persisted vector store.
# - ids: deterministic, position-based ids. Without explicit ids the store
#   generates random UUIDs, so every re-run of this cell would append another
#   full copy of the chunks to "news_collection" and retrieval would return
#   duplicates. With stable ids a re-run overwrites the same records instead.
# - is_persistent=True: NOTE(review) when a custom Settings object is passed,
#   persist_directory alone does not appear to switch chromadb (>= 0.4) to an
#   on-disk client, so persistence is requested explicitly — confirm against
#   the installed chromadb version.
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embedding_fn,
    ids=[f"news-chunk-{i}" for i in range(len(docs))],
    collection_name="news_collection",
    persist_directory=RAGConfig.vectorstore_persist_dir,
    client_settings=Settings(anonymized_telemetry=False, is_persistent=True),
)

In [None]:
# Peek at one stored record. A hard-coded UUID from a previous run is not
# reproducible (ids are not guaranteed to match on a fresh run), so fetch the
# first record by limit instead of by id.
vectorstore.get(limit=1)

In [None]:
"""
Builds a RetrievalQA chain with an OpenAI LLM as generator.
"""
logger.info("Initializing OpenAI LLM and retrieval chain...")
# Generator: OpenAI chat model configured from RAGConfig.
_llm = ChatOpenAI(
    temperature=RAGConfig.openai_temperature,
    model_name=RAGConfig.openai_model_name,
)

# Retriever over the vector store, asking for the top-k chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": RAGConfig.top_k})

# Fetch the QA chat prompt from the hub.
logger.info("Pull QA Chat Prompt from hub...")
_qa_prompt = hub.pull(RAGConfig.retrieval_qa_chat_prompt)

# Assemble the chain: a documents chain (LLM + prompt) wrapped by the retriever.
logger.info("Creating RAG Chain...")
combine_docs_chain = create_stuff_documents_chain(_llm, _qa_prompt)
_qa_chain = create_retrieval_chain(retriever, combine_docs_chain)

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Use the LCEL interface
def _format_docs(documents):
    """Join the page_content of the retrieved documents into one context string.

    Without this step the raw list of Document objects is interpolated into the
    prompt as its Python repr (metadata included), unlike the
    create_stuff_documents_chain pipeline, which stuffs document text.
    """
    return "\n\n".join(doc.page_content for doc in documents)


# The question string is sent both to the retriever (to build "context") and,
# unchanged, to the prompt's "input" variable.
_qa_chain_lcel = (
    {"context": retriever | _format_docs, "input": RunnablePassthrough()}
    | _qa_prompt
    | _llm
    | StrOutputParser()
)

In [None]:
# Ask a sample question through the LCEL chain; the input is a plain string.
_qa_chain_lcel.invoke("When does google open their auction?")

In [None]:
# Ask the same question through the create_retrieval_chain pipeline, which
# takes a dict with the question under the "input" key.
_qa_chain.invoke({"input": "When does google open their auction?"})