<a href="https://colab.research.google.com/github/427paul/ai_agent/blob/main/ai_agent_02_Retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U "langchain==0.3.*" "langchain-core==0.3.*" "langchain-community==0.3.*" "langgraph==0.3.*" "langchain-huggingface" "huggingface_hub" "sentence-transformers" wikipedia -q

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
def load_api_keys(filepath="api_key.txt"):
    """Export ``KEY=VALUE`` pairs from a text file into ``os.environ``.

    Every line is stripped first. Blank lines, lines starting with ``#`` and
    lines without ``=`` are ignored. Only the first ``=`` splits a line, so a
    value may itself contain ``=``. Lines with an empty key (``=value``) are
    skipped because an empty name is not a legal environment variable.

    Args:
        filepath: Path of the key file to read.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
    """
    # utf-8-sig also accepts plain UTF-8 and drops a leading BOM, which would
    # otherwise become part of the first key name.
    with open(filepath, "r", encoding="utf-8-sig") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, value = (part.strip() for part in line.split("=", 1))
            if not key:
                continue
            os.environ[key] = value

# Drive folder that holds the credentials file for this notebook.
path = '/content/drive/MyDrive/LangGraph/'

# Load the API keys and export them as environment variables.
load_api_keys(f"{path}api_key.txt")

# Document Loader

## html loader

In [None]:
!pip install unstructured

In [None]:
!pip install beautifulsoup4 lxml

In [None]:
# Note: this example was first written on 02/27/2024 against langchain==0.1.9 and
# langchain-openai==0.0.8. The install cell at the top of this notebook now pins
# langchain 0.3.x, so do not reinstall those older versions here.
from langchain_community.document_loaders import BSHTMLLoader, UnstructuredHTMLLoader

# UnstructuredHTMLLoader requires `pip install unstructured`.
loader = UnstructuredHTMLLoader("sample.html")
data = loader.load()
print(data)
"""
[Document(page_content="Welcome to My Web Page\n\nThis is a simple HTML page. It's a great starting point for learning HTML.\n\nClick here to visit Example.com", metadata={'source': 'sample.html'})]
"""

# BSHTMLLoader requires `pip install beautifulsoup4 lxml`.
# It is imported from langchain_community like the other loaders in this
# notebook; the legacy `langchain.document_loaders` path is deprecated.
loader = BSHTMLLoader("sample.html")
data = loader.load()
print(data)
"""
[Document(page_content="\n\nMy Simple Web Page\n\n\nWelcome to My Web Page\nThis is a simple HTML page. It's a great starting point for learning HTML.\nClick here to visit Example.com\n\n\n", metadata={'source': 'sample.html', 'title': 'My Simple Web Page'})]
"""

# Show the text of the single document returned by BSHTMLLoader.
print(data[0].page_content)
"""


My Simple Web Page


Welcome to My Web Page
This is a simple HTML page. It's a great starting point for learning HTML.
Click here to visit Example.com



"""

## json loader

In [None]:
!pip install jq

In [None]:
# Note: this example was first written on 02/27/2024 against langchain==0.1.9 and
# langchain-openai==0.0.8. The install cell at the top of this notebook now pins
# langchain 0.3.x, so do not reinstall those older versions here.
from langchain_community.document_loaders import JSONLoader
import json
from pathlib import Path
from pprint import pprint

# JSONLoader (used further down in this cell) requires `pip install jq`.
file_path='./facebook_chat.json'
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
data = json.loads(Path(file_path).read_text(encoding="utf-8"))
pprint(data)
"""
{'image': {'creation_timestamp': 1675549016, 'uri': 'image_of_the_chat.jpg'},
 'is_still_participant': True,
 'joinable_mode': {'link': '', 'mode': 1},
 'magic_words': [],
 'messages': [{'content': 'Bye!',
               'sender_name': 'User 2',
               'timestamp_ms': 1675597571851},
              {'content': 'Oh no worries! Bye',
               'sender_name': 'User 1',
               'timestamp_ms': 1675597435669},
              {'content': 'No Im sorry it was my mistake, the blue one is not '
                          'for sale',
               'sender_name': 'User 2',
               'timestamp_ms': 1675596277579},
              {'content': 'I thought you were selling the blue one!',
               'sender_name': 'User 1',
               'timestamp_ms': 1675595140251},
              {'content': 'Im not interested in this bag. Im interested in the '
                          'blue one!',
               'sender_name': 'User 1',
               'timestamp_ms': 1675595109305},
              {'content': 'Here is $129',
               'sender_name': 'User 2',
               'timestamp_ms': 1675595068468},
              {'photos': [{'creation_timestamp': 1675595059,
                           'uri': 'url_of_some_picture.jpg'}],
               'sender_name': 'User 2',
               'timestamp_ms': 1675595060730},
              {'content': 'Online is at least $100',
               'sender_name': 'User 2',
               'timestamp_ms': 1675595045152},
              {'content': 'How much do you want?',
               'sender_name': 'User 1',
               'timestamp_ms': 1675594799696},
              {'content': 'Goodmorning! $50 is too low.',
               'sender_name': 'User 2',
               'timestamp_ms': 1675577876645},
              {'content': 'Hi! Im interested in your bag. Im offering $50. Let '
                          'me know if you are interested. Thanks!',
               'sender_name': 'User 1',
               'timestamp_ms': 1675549022673}],
 'participants': [{'name': 'User 1'}, {'name': 'User 2'}],
 'thread_path': 'inbox/User 1 and User 2 chat',
 'title': 'User 1 and User 2 chat'}
"""

# Build one Document per message from each message's `content` field.
# NOTE(review): text_content=False presumably lets the loader accept values that
# are not strings (one message above has photos but no `content`) — confirm.
chat_loader_kwargs = dict(
    file_path='./facebook_chat.json',
    jq_schema='.messages[].content',
    text_content=False,
)
loader = JSONLoader(**chat_loader_kwargs)

data = loader.load()
pprint(data)
"""
[Document(page_content='Bye!', metadata={'source': '/Users/seungjoonlee/git/learn-langchain/document_loader/facebook_chat.json', 'seq_num': 1}),
 Document(page_content='Oh no worries! Bye', metadata={'source': '/Users/seungjoonlee/git/learn-langchain/document_loader/facebook_chat.json', 'seq_num': 2}),
 Document(page_content='No Im sorry it was my mistake, the blue one is not for sale', metadata={'source': '/Users/seungjoonlee/git/learn-langchain/document_loader/facebook_chat.json', 'seq_num': 3}),
 Document(page_content='I thought you were selling the blue one!', metadata={'source': '/Users/seungjoonlee/git/learn-langchain/document_loader/facebook_chat.json', 'seq_num': 4}),
 Document(page_content='Im not interested in this bag. Im interested in the blue one!', metadata={'source': '/Users/seungjoonlee/git/learn-langchain/document_loader/facebook_chat.json', 'seq_num': 5}),
 Document(page_content='Here is $129', metadata={'source': '/Users/seungjoonlee/git/learn-langchain/document_loader/facebook_chat.json', 'seq_num': 6}),
 Document(page_content='', metadata={'source': '/Users/seungjoonlee/git/learn-langchain/document_loader/facebook_chat.json', 'seq_num': 7}),
 Document(page_content='Online is at least $100', metadata={'source': '/Users/seungjoonlee/git/learn-langchain/document_loader/facebook_chat.json', 'seq_num': 8}),
 Document(page_content='How much do you want?', metadata={'source': '/Users/seungjoonlee/git/learn-langchain/document_loader/facebook_chat.json', 'seq_num': 9}),
 Document(page_content='Goodmorning! $50 is too low.', metadata={'source': '/Users/seungjoonlee/git/learn-langchain/document_loader/facebook_chat.json', 'seq_num': 10}),
 Document(page_content='Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks!', metadata={'source': '/Users/seungjoonlee/git/learn-langchain/document_loader/facebook_chat.json', 'seq_num': 11})]
"""

## pdf loader

In [None]:
# Note: this example was first written on 02/27/2024 against langchain==0.1.9 and
# langchain-openai==0.0.8. The install cell at the top of this notebook now pins
# langchain 0.3.x, so do not reinstall those older versions here.
from langchain_community.document_loaders import PyPDFLoader

# PyPDFLoader requires `pip install pypdf`.
loader = PyPDFLoader("csv_sample.pdf")
# Load the PDF and split it into Documents in one call.
pages = loader.load_and_split()

first_chunk = pages[0]
print(first_chunk.page_content)
"""
csv_sample
Page 1nameagecountry
Neville Hardy 56Niue
Dacia Cohen 74Falkland Islands (Malvinas)
Kathey Daniel 10Slovenia
Mallie Welch 12Equatorial Guinea
Katia Bryant 14Ghana
Laurice Robertson 53Saudi Arabia
Minh Barrett 27French Southern Territories
Latashia Perez 52Finland
Elvina Ross 68New Zealand
"""

# Document Transformer

## text splitter

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


# The speech contains non-ASCII punctuation, so read it as UTF-8 explicitly
# rather than with the platform's default encoding.
with open('./state_of_the_union.txt', encoding='utf-8') as f:
    state_of_the_union = f.read()

# By default the text is split on ["\n\n", "\n", " ", ""].
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 100,
    chunk_overlap  = 20, # max overlap
    length_function = len,
    add_start_index = True, # adds 'start_index' to each chunk's metadata (see output below)
)

texts = text_splitter.create_documents([state_of_the_union])
print(texts[0])
"""
page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and' metadata={'start_index': 0}
"""
print(texts[1])
"""
page_content='of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.' metadata={'start_index': 82}
"""

## code splitter

In [None]:
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    Language,
)

# Every language value defined by the Language enum.
supported_languages = [language.value for language in Language]
print(supported_languages)
"""
['cpp', 'go', 'java', 'kotlin', 'js', 'ts', 'php', 'proto', 'python', 'rst', 'ruby', 'rust', 'scala', 'swift', 'markdown', 'latex', 'html', 'sol', 'csharp', 'cobol']
"""

# Separators the splitter uses for Python source.
separators = RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON)
print(separators)
"""
['\nclass ', '\ndef ', '\n\tdef ', '\n\n', '\n', ' ', '']
"""

PYTHON_CODE = """
def hello_world():
    print("Hello, World!")

# Call the function
hello_world()
"""
# A small chunk size forces the sample above to be split into two documents.
splitter_options = {"chunk_size": 50, "chunk_overlap": 0}
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, **splitter_options
)
python_docs = python_splitter.create_documents([PYTHON_CODE])
print(python_docs)
"""
[Document(page_content='def hello_world():\n    print("Hello, World!")'), Document(page_content='# Call the function\nhello_world()')]
"""

## split by token

In [None]:
from langchain.text_splitter import CharacterTextSplitter


# from_tiktoken_encoder requires `pip install tiktoken`.
# Read the speech as UTF-8 explicitly instead of relying on the platform default.
with open('./state_of_the_union.txt', encoding='utf-8') as f:
    state_of_the_union = f.read()

# Split by token count: the splitter is built from a tiktoken encoder.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=100, chunk_overlap=0
)
texts = text_splitter.split_text(state_of_the_union)
print(len(texts))
print(texts[0])

# Text Embedding

In [None]:
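# Note: this example was first written on 02/27/2024 against langchain==0.1.9 and
# langchain-openai==0.0.8. The install cell at the top of this notebook now pins
# langchain 0.3.x but does not install langchain-openai: install a release that is
# compatible with langchain-core 0.3.x before running this cell.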
from langchain_openai import OpenAIEmbeddings

# requires `pip install openai`
embeddings_model = OpenAIEmbeddings()

# Embed a small batch of texts with a single call.
sample_texts = [
    "Hi there!",
    "Oh, hello!",
    "What's your name?",
    "My friends call me World",
    "Hello World!"
]
embeddings = embeddings_model.embed_documents(sample_texts)
print(len(embeddings))
"""
5
"""

# length of 1536
for vector in embeddings[:2]:
    print(len(vector))
"""
[... 0.022085255981691917, 0.025720015991018582, 0.008734743689027272, -0.006709843137048811, -0.022764415491192392, -0.00257671800269355, 0.010677894145694868, 0.0001446357869742665, -0.02568228625240111, -0.010438930752548039, -0.002831402818756228, -0.012992066737283132, 0.0015925658455746433, -0.021569597594135712, 0.011853846242120273, 0.015771589535625893, 0.006238204640524732, 0.02429881221167677, 0.014086268272736402, -0.024575506274763608, -0.021129402409104603, 0.007653119664435697, 0.006021250727232698, -0.02475158583889211, -0.012853719705739713, 0.018048030525951612, -0.0018441062839218978, -0.008445472115078739, -0.006885921304193508, 0.00240850043059146, 0.00827568270336489, -0.008030431020448483, -0.004181860777053302, 0.0010344603379206113, 0.007552503768493557, 0.01879007479579295, 0.008451761336170855, -0.014249769394680672, -0.03264995904888929, 0.004728961544779937, -0.0020343339179553677, -0.024927663540375542, -0.006565207350074544, -0.014765427782236877 ...]
"""

# Embed the rows of a CSV file: each row is loaded as one Document.
from langchain_community.document_loaders import CSVLoader
loader = CSVLoader(file_path='./csv_sample.csv')
data = loader.load()
print(data)
row_texts = [doc.page_content for doc in data]
embeddings = embeddings_model.embed_documents(row_texts)
print(len(embeddings))
"""
9
"""

# Embed a single query string so it can be compared against the embedded documents.
query_text = "What was the name mentioned in the conversation?"
embedded_query = embeddings_model.embed_query(query_text)
print(embedded_query[:5])
"""
[0.005354681365594307, -0.0005715346531097274, 0.03887590993433691, -0.0029596003572924623, -0.00896628532870428]
"""

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings # 최신 라이브러리 권장
from langchain_community.document_loaders import CSVLoader

# 1. Model setup (free embedding model that runs locally).
# sentence-transformers/all-MiniLM-L6-v2 is small and fast and produces 384-dimensional vectors.
# For text that includes Korean, a model such as "jhgan/ko-sroberta-multitask" is recommended.
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'}, # switch to 'cuda' if a GPU is available
    encode_kwargs={'normalize_embeddings': True} # normalization is recommended for similarity search
)

# 2. Embed a list of texts.
texts = [
    "Hi there!",
    "Oh, hello!",
    "What's your name?",
    "My friends call me World",
    "Hello World!"
]
embeddings = embeddings_model.embed_documents(texts)

print(f"임베딩된 문서 수: {len(embeddings)}")
print(f"벡터의 차원 수: {len(embeddings[0])}") # all-MiniLM-L6-v2 outputs 384

# 3. CSV loader example (needs ./csv_sample.csv to exist).
try:
    loader = CSVLoader(file_path='./csv_sample.csv')
    data = loader.load()

    # Embed only the page_content of each loaded row.
    csv_embeddings = embeddings_model.embed_documents([doc.page_content for doc in data])
    print(f"CSV 임베딩 완료: {len(csv_embeddings)}개 문서")
except (FileNotFoundError, RuntimeError) as load_error:
    # CSVLoader re-raises read failures as RuntimeError with the original
    # exception attached as __cause__, so a handler for FileNotFoundError alone
    # never fires for a missing file. Any other failure is re-raised unchanged.
    missing_file = isinstance(load_error, FileNotFoundError) or isinstance(
        load_error.__cause__, FileNotFoundError
    )
    if not missing_file:
        raise
    print("CSV 파일이 없습니다. 경로를 확인해주세요.")

# 4. Embed a single query.
# Used as the comparison vector when searching or asking a question.
embedded_query = embeddings_model.embed_query("What was the name mentioned in the conversation?")
print(f"쿼리 임베딩 결과(앞 5개): {embedded_query[:5]}")

# Vector Store

In [None]:
pip install chromadb

In [None]:
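# Note: this example was first written on 02/27/2024 against langchain==0.1.9 and
# langchain-openai==0.0.8. The install cell at the top of this notebook now pins
# langchain 0.3.x but does not install langchain-openai: install a release that is
# compatible with langchain-core 0.3.x before running this cell.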
from langchain_community.document_loaders import CSVLoader
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores.chroma import Chroma

# requires `pip install chromadb`
loader = CSVLoader(file_path='./fortune_500_2020.csv')
raw_documents = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)
openai_embedding = OpenAIEmbeddings()
# persist_directory makes Chroma store the collection on disk. The explicit
# db.persist() call was removed: it is deprecated, and Chroma >= 0.4 persists
# automatically.
db = Chroma.from_documents(documents, openai_embedding, persist_directory="./fortune_500_db")

# Re-open the stored collection and run a similarity search against it.
db_conn = Chroma(persist_directory="./fortune_500_db", embedding_function=openai_embedding)
query = "What is JPMorgan Revenue?"
docs = db_conn.similarity_search(query)
print(docs)
print(docs[0].page_content)

# # retriever
# db_conn = Chroma(persist_directory="./fortune_500_db", embedding_function=openai_embedding)
# retriever = db_conn.as_retriever()
# result = retriever.get_relevant_documents('walmart')
# print(result[0].page_content)
# """
# rank: 1
# company: Walmart
# no_of_employees: 2,200,000.00
# rank_change: None
# revenues: 523,964.00
# revenue_change: 0.02
# profits: 14,881.00
# profit_change: 1.23
# assets: 236,495.00
# market_value: 321,803.30
# """

In [None]:
from langchain_community.document_loaders import CSVLoader
# OpenAIEmbeddings 대신 HuggingFaceEmbeddings를 사용합니다.
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores.chroma import Chroma
import os

# 1. Load the documents (the CSV file must exist locally).
loader = CSVLoader(file_path='./fortune_500_2020.csv')
raw_documents = loader.load()

# 2. Split the text into chunks.
text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)

# 3. Configure the Hugging Face embedding model.
# This model is recommended for mixed Korean/English text; for English only,
# "sentence-transformers/all-MiniLM-L6-v2" can be used instead.
model_name = "jhgan/ko-sroberta-multitask"
hf_embedding = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cpu'}, # switch to 'cuda' if a GPU is available
    encode_kwargs={'normalize_embeddings': True}
)

# 4. Create the Chroma vector DB and store it on disk.
# Use a directory of its own: the OpenAI variant of this example writes to
# "./fortune_500_db", and one Chroma collection cannot hold vectors of two
# different dimensionalities, so sharing that directory fails once both
# variants have been run.
hf_persist_directory = "./fortune_500_db_hf"
db = Chroma.from_documents(
    documents,
    hf_embedding,
    persist_directory=hf_persist_directory
)

# Chroma >= 0.4 persists automatically, so db.persist() is not called here.

# 5. Re-open the stored DB and search it.
db_conn = Chroma(
    persist_directory=hf_persist_directory,
    embedding_function=hf_embedding
)

query = "What is JPMorgan Revenue?"
# Run a similarity search for the three closest chunks.
docs = db_conn.similarity_search(query, k=3)

print("--- 검색 결과 ---")
print(docs[0].page_content)

# 6. Retriever example.
retriever = db_conn.as_retriever(search_kwargs={"k": 1})
result = retriever.invoke('walmart') # invoke is preferred over get_relevant_documents
print("\n--- 리트리버 검색 결과 (Walmart) ---")
print(result[0].page_content)

# Retriever Object

In [None]:
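# Note: this example was first written on 02/27/2024 against langchain==0.1.9 and
# langchain-openai==0.0.8. The install cell at the top of this notebook now pins
# langchain 0.3.x but does not install langchain-openai: install a release that is
# compatible with langchain-core 0.3.x before running this cell.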

# Build a sample vectorDB
from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.chroma import Chroma

# Load blog post
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
data = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
splits = text_splitter.split_documents(data)

# VectorDB
# A dedicated collection name keeps these chunks apart from the other in-memory
# Chroma stores built in this notebook, which would otherwise all land in the
# same default collection of the running kernel.
embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(
    documents=splits, embedding=embedding, collection_name="agent_blog_openai")

from langchain_openai import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever

question = "What are the approaches to Task Decomposition?"
llm = ChatOpenAI(temperature=0)
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectordb.as_retriever(), llm=llm)

# Set logging for the queries
import logging

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

# invoke() replaces the deprecated get_relevant_documents().
unique_docs = retriever_from_llm.invoke(question)
"""
INFO:langchain.retrievers.multi_query:Generated queries: [
    '1. How can Task Decomposition be achieved through different methods?',
    '2. What strategies are commonly used for breaking down tasks in Task Decomposition?',
    '3. What are the various techniques employed in Task Decomposition to simplify complex tasks?']
"""
print(len(unique_docs))
"""
6
"""

In [None]:
# Note: this example was first written on 02/27/2024 against langchain==0.1.9 and
# langchain-openai==0.0.8. The install cell at the top of this notebook now pins
# langchain 0.3.x, so do not reinstall those older versions here.

# Build a sample vectorDB
from langchain_community.document_loaders import WebBaseLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.chroma import Chroma

# --- 1. Load and preprocess the document ---
# Fetch a technical blog post from the web.
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
data = loader.load()

# Split the long document into chunks of up to 500 characters
# (the recursive splitter is used to keep related text together).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
splits = text_splitter.split_documents(data)

# --- 2. Build the vector DB (Hugging Face embedding model) ---
# The embedding model runs on local resources. Its name is spelled out here so
# the cell does not depend on a `model_name` variable left by an earlier cell.
embedding = HuggingFaceEmbeddings(
    model_name="jhgan/ko-sroberta-multitask",
    model_kwargs={'device': 'cpu'}, # switch to 'cuda' if a GPU is available
    encode_kwargs={'normalize_embeddings': True}
)
# Store the chunks in Chroma. A dedicated collection name keeps them apart from
# the other in-memory Chroma stores built in this notebook, which would
# otherwise all land in the same default collection of the running kernel.
vectordb = Chroma.from_documents(
    documents=splits, embedding=embedding, collection_name="agent_blog_hf")

from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain.retrievers.multi_query import MultiQueryRetriever

# --- 3. Configure MultiQueryRetriever ---
# LLM that expands the question (gpt-oss-20b).
llm_ep = HuggingFaceEndpoint(repo_id="openai/gpt-oss-20b", task="text-generation")
llm = ChatHuggingFace(llm=llm_ep)

# [Key point] The LLM rephrases the user's question into about three similar
# questions, so relevant documents are still found when the wording differs.
# The MultiQueryRetriever prompt already includes the instruction to generate
# three questions.
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectordb.as_retriever(), llm=llm)

# --- 4. Logging (to see the questions the LLM generated) ---
import logging

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

# --- 5. Run the question and collect the results ---
question = "What are the approaches to Task Decomposition?"

# invoke() replaces the deprecated get_relevant_documents().
unique_docs = retriever_from_llm.invoke(question)
"""
INFO:langchain.retrievers.multi_query:Generated queries: [
    '1. How can Task Decomposition be achieved through different methods?',
    '2. What strategies are commonly used for breaking down tasks in Task Decomposition?',
    '3. What are the various techniques employed in Task Decomposition to simplify complex tasks?']
"""
print(len(unique_docs))
"""
6
"""

# Context Compression

In [None]:
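# Note: this example was first written on 02/27/2024 against langchain==0.1.9 and
# langchain-openai==0.0.8. The install cell at the top of this notebook now pins
# langchain 0.3.x but does not install langchain-openai: install a release that is
# compatible with langchain-core 0.3.x before running this cell.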
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_community.vectorstores.chroma import Chroma

# Load the speech as a single Document.
data = TextLoader('./state_of_the_union.txt').load()

# Split
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
splits = text_splitter.split_documents(data)

# VectorDB
# A dedicated collection name keeps these chunks apart from the other in-memory
# Chroma stores built in this notebook, which would otherwise all land in the
# same default collection of the running kernel.
embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(
    documents=splits, embedding=embedding, collection_name="state_of_the_union_openai")

# Helper function for printing docs
def pretty_print_docs(docs):
    """Print every document's text under a "Document N:" heading.

    Consecutive documents are separated by a line of 100 dashes.

    Args:
        docs: Iterable of objects that expose a ``page_content`` string.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(divider.join(sections))

# Plain retrieval first, as the baseline to compare the compressed result against.
# invoke() replaces the deprecated get_relevant_documents().
retriever = vectordb.as_retriever()
docs = retriever.invoke("What did the president say about Ketanji Brown Jackson")
# pretty_print_docs(docs)
"""
Document 1:

Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections.

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service.

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.
...
"""

# Wrap the base retriever with an LLM-backed extractor; the expected output
# shown after this cell contains only the passage about the nomination.
llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
                            base_compressor=compressor,
                            base_retriever=retriever)

compressed_docs = compression_retriever.invoke(
    "What did the president say about Ketanji Brown Jackson")
pretty_print_docs(compressed_docs)
"""
Document 1:

"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence."
"""

In [None]:
from langchain_community.document_loaders import TextLoader
# Hugging Face용 임베딩과 모델 임포트
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint, ChatHuggingFace
from langchain.retrievers import ContextualCompressionRetriever
from langchain.text_splitter import CharacterTextSplitter
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_community.vectorstores.chroma import Chroma
import os

# 1. Load the document (requires state_of_the_union.txt).
# Create the file beforehand or check the path if it is missing.
loader = TextLoader('./state_of_the_union.txt')
data = loader.load()

# 2. Split the document.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
splits = text_splitter.split_documents(data)

# 3. Configure the local embedding model.
embedding = HuggingFaceEmbeddings(
    model_name="jhgan/ko-sroberta-multitask", # general-purpose Korean/English model
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

# 4. Build the vector DB (Chroma).
# A dedicated collection name keeps the speech apart from the other in-memory
# Chroma stores built in this notebook. With the shared default collection, the
# blog-post chunks embedded earlier with this same model could be returned here.
vectordb = Chroma.from_documents(
    documents=splits, embedding=embedding, collection_name="state_of_the_union_hf")

# 5. Configure the Hugging Face LLM that performs the compression.
llm_ep = HuggingFaceEndpoint(
    repo_id="HuggingFaceH4/zephyr-7b-beta", # recommended for summarization/extraction
    task="text-generation",
    max_new_tokens=512
)
llm = ChatHuggingFace(llm=llm_ep)

# 6. Configure the contextual compression retriever.
# LLMChainExtractor keeps only the parts of each retrieved document that are
# relevant to the question.
compressor = LLMChainExtractor.from_llm(llm)

base_retriever = vectordb.as_retriever()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=base_retriever
)

# 7. Run the query and print the results.
query = "What did the president say about Ketanji Brown Jackson"

# Plain retrieval result.
print("\n[일반 검색 결과 개수]:", len(base_retriever.invoke(query)))

# Compressed retrieval result (only the key passages are extracted).
compressed_docs = compression_retriever.invoke(query)

def pretty_print_docs(docs):
    """Print the documents in order, each headed by "Document N:".

    A rule of 100 dashes on its own line is placed between documents.

    Args:
        docs: Iterable of objects that expose a ``page_content`` string.
    """
    rule = f"\n{'-' * 100}\n"
    blocks = (
        f"Document {number}:\n\n" + doc.page_content
        for number, doc in enumerate(docs, 1)
    )
    print(rule.join(blocks))

# Print the documents returned by the compression retriever.
pretty_print_docs(compressed_docs)