## 환경 설정

### Modules

In [31]:
import os
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain_experimental.tools import PythonAstREPLTool
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.runnables import RunnablePassthrough
from Modules import logging
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
import pandas as pd
from langchain_community.document_loaders.csv_loader import CSVLoader
from Modules.messages import display_message_tree

### DATA_PATH

In [2]:
DATA_PATH = "data/"

### Logging

In [3]:
logging.langsmith("Search for documents")

LangSmith 추적을 시작합니다.
[프로젝트명]
Search for documents


## 1. Lead_Data

In [4]:
loader = CSVLoader(
    file_path = DATA_PATH + "qna.csv", 
    encoding="utf-8",
    csv_args={
        "delimiter": ",",
        "quotechar": '"',
        "fieldnames":[
            "question_title",
            "accepted_answer_body",
        ],
    },
)

In [5]:
docs = loader.load()

In [None]:
# print(data[1].page_content)

In [None]:
for doc in docs:
    print(doc.metadata)

In [9]:
type(docs)

list

## 2. Text Split

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=80)


In [None]:
split_docs= text_splitter.split_documents(docs)

## 3. Vector DB

In [18]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()  # 사용 중인 임베딩 모델에 따라 설정
vector_store = FAISS.from_documents(split_docs, embeddings)


  embeddings = OpenAIEmbeddings()  # 사용 중인 임베딩 모델에 따라 설정


In [None]:
vector_store.docstore.__dict__

{'_dict': {'36a71fcd-c1db-4b4a-89cf-f7dbbe2329df': Document(metadata={'source': 'data/qna.csv', 'row': 0}, page_content='question_title: question_title\naccepted_answer_body: accepted_answer_body'),
  '33db2ce7-ca36-4327-af2c-a270a996c601': Document(metadata={'source': 'data/qna.csv', 'row': 1}, page_content='question_title: @roo_validator error when importing langchain.text_splitter Python\naccepted_answer_body: <p>I was using LangChain 0.0.20 and I got the same issue. Upgrading to Python 3.9 and LangChain 0.0.224 fixed this issue for me.</p>'),
  '62c8cbcb-cd83-4a4f-a5db-7e6805cc7069': Document(metadata={'source': 'data/qna.csv', 'row': 2}, page_content='question_title: Any way to check if a number is already within a 3x3 grid in a 9x9 grid of nested lists?\naccepted_answer_body: <p>First you just need a given cell (row,col) which is the top left corner (the start) of the 3x3 subgrid you are trying to check:</p>\n<pre><code>start_row = (row // 3) * 3'),
  '11554a89-53d4-41fd-a500-4ab

In [23]:
vector_store.similarity_search("error when importing langchain.text_splitter Python", k = 2)

[Document(metadata={'source': 'data/qna.csv', 'row': 1}, page_content='question_title: @roo_validator error when importing langchain.text_splitter Python\naccepted_answer_body: <p>I was using LangChain 0.0.20 and I got the same issue. Upgrading to Python 3.9 and LangChain 0.0.224 fixed this issue for me.</p>'),
 Document(metadata={'source': 'data/qna.csv', 'row': 913}, page_content='<pre><code>from langchain.llms.openai import OpenAI\n</code></pre>\n<p>Maybe your python version installed an early verison of langchain due to dependency requirements</p>')]

In [22]:
vector_store.save_local(folder_path="data", index_name="faiss_index")

In [26]:
vector_store.index_to_docstore_id

{0: '36a71fcd-c1db-4b4a-89cf-f7dbbe2329df',
 1: '33db2ce7-ca36-4327-af2c-a270a996c601',
 2: '62c8cbcb-cd83-4a4f-a5db-7e6805cc7069',
 3: '11554a89-53d4-41fd-a500-4ab251a384fd',
 4: '4a88a6d8-fe65-40b7-a38d-31ce63c63b9e',
 5: '0b154506-6614-49ba-a92a-4227c57ca9be',
 6: 'fc8df0ae-b616-451d-a816-0c2d3a53e47d',
 7: 'fd08ee6e-5fbd-4fcb-8bbb-ba34d2f3c729',
 8: '4c5906f0-c7fa-4222-a704-788a3ad7822b',
 9: 'eea4fd52-61ad-42ea-ba9c-319f9cfd2a4f',
 10: '0c6fbcb5-176a-4f06-bd69-f15e0cd81878',
 11: '4f9c827c-2dc5-4bb8-9306-f32d80d2b037',
 12: 'b41a94cd-23f3-49f6-befc-4bd5c119f826',
 13: '51372f72-0681-426c-91ea-38a2ffe613ef',
 14: 'a0a55c59-2f09-4c6a-ae0b-b4e9acfe9a8e',
 15: 'c9bdf2e7-5744-48ce-9492-03b37e62bcce',
 16: '85d1158d-a67b-4333-9b2b-6735a2cc424e',
 17: '30932a60-7958-489a-af02-0bac8e566d40',
 18: 'f63fe398-dd4d-4a2f-8168-1577216fc0a9',
 19: 'bb35783c-6560-47d0-b13a-51818b492135',
 20: 'e6282f1a-efc4-49b8-980c-e2d95b7940c2',
 21: '291d4e6f-5654-4ab9-89f2-c9596fa23b1e',
 22: '8ce0fbde-888d-

## 4. Retriver

In [None]:
# retriever = vector_store.as_retriever(
#     search_type="mmr",
#     search_kwargs={"k": 2, "fetch_k": 10, "lambda_mult": 0.6}
# )

### 압축

In [46]:
from langchain_openai import OpenAI

# model="gpt-4o-mini"를 사용하면 오류 발생
llm = OpenAI(temperature=0)

In [47]:
from langchain.retrievers.document_compressors import LLMChainExtractor

compressor = LLMChainExtractor.from_llm(llm)

### Retrievers

In [48]:
from langchain.retrievers import ContextualCompressionRetriever

compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=vector_store.as_retriever(
        search_type="mmr",
        search_kwargs={"k": 2, "fetch_k": 10, "lambda_mult": 0.6}
    )
)

In [50]:
# 관련 문서를 검색합니다.
docs = compression_retriever.invoke("langchain.text_splitter Python")

# 관련 문서를 검색
for doc in docs:
    print(doc.page_content)
    print("=========================================================")

LangChain 0.0.20, Python 3.9, LangChain 0.0.224
from itertools import chain


## Retriever 변환

In [None]:
from langchain.tools.retriever import create_retriever_tool

In [None]:
# retriever_tool = create_retriever_tool(
#     compression_retriever,
#     name="vector_search",  # 도구의 이름을 입력합니다.
#     description="use this tool to search information from the vector document",  # 도구에 대한 설명을 자세히 기입해야 합니다!!
# )

In [None]:
# tools = [retriever_tool]

## Agent

In [52]:
llm = ChatOpenAI(model="gpt-4o", temperature=0)

from langchain_core.prompts import PromptTemplate

# 프롬프트 템플릿을 정의합니다.(5개의 질문을 생성하도록 프롬프트를 작성하였습니다)
prompt = PromptTemplate.from_template(
    """
    You are an AI language model assistant.
    Your task is to generate five different versions of the given user question to retrieve relevant documents from a vector database.
    By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of the distance-based similarity search.
    Your response should be a list of values separated by new lines, eg: `foo\nbar\nbaz\n`

    #ORIGINAL QUESTION:
    {question}

    #Answer in Korean:
    """
)

In [53]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# LLMChain을 생성합니다.
custom_multiquery_chain = (
    {"question": RunnablePassthrough()} | prompt | llm | StrOutputParser()
)

In [54]:
# 질문을 정의합니다.
question = "langchain.text_splitter Python에 대해 알려줘."

# 체인을 실행하여 생성된 다중 쿼리를 확인합니다.
multi_queries = custom_multiquery_chain.invoke(question)

# 결과를 확인합니다.(5개 질문 생성)
multi_queries

'Langchain의 텍스트 분할기 기능에 대해 설명해줘.\nLangchain에서 Python을 사용한 텍스트 분할 방법은?\nPython으로 Langchain의 텍스트 분할기를 사용하는 방법은 무엇인가요?\nLangchain의 텍스트 분할기를 Python으로 구현하는 방법에 대해 알려주세요.\nPython에서 Langchain의 텍스트 분할 기능을 활용하는 방법은?'

In [58]:
from langchain.retrievers.multi_query import MultiQueryRetriever

In [59]:
multiquery_retriever = MultiQueryRetriever.from_llm(
    llm=custom_multiquery_chain, retriever=compression_retriever
)

In [None]:
# 결과
relevant_docs = multiquery_retriever.invoke(question)

# 검색된 고유한 문서의 개수를 반환합니다.




검색된 문서 개수: 3
Python


In [67]:
print(
    f"===============\n검색된 문서 개수: {len(relevant_docs)}",
    end="\n===============\n",
)
# 검색된 문서의 내용을 출력합니다.
print(relevant_docs)

검색된 문서 개수: 3
[Document(metadata={'source': 'data/qna.csv', 'row': 126}, page_content='Python'), Document(metadata={'source': 'data/qna.csv', 'row': 1}, page_content='LangChain 0.0.20, Python 3.9, LangChain 0.0.224'), Document(metadata={'source': 'data/qna.csv', 'row': 1242}, page_content='<p>Works out-of-the-box with any Python version 3.7-3.11.</p>\n<p>Fully multi-platform, and uses the OS support to load the dynamic libraries, thus ensuring full compatibility.</p>')]


In [61]:
display_message_tree(relevant_docs)

[93mroot[0][0m:
    [94mroot[0][0m: page_content='Python' metadata={'source': 'data/qna.csv', 'row': 126}
[93mroot[1][0m:
    [94mroot[1][0m: page_content='LangChain 0.0.20, Python 3.9, LangChain 0.0.224' metadata={'source': 'data/qna.csv', 'row': 1}
[93mroot[2][0m:
    [94mroot[2][0m: page_content='<p>Works out-of-the-box with any Python version 3.7-3.11.</p>
<p>Fully multi-platform, and uses the OS support to load the dynamic libraries, thus ensuring full compatibility.</p>' metadata={'source': 'data/qna.csv', 'row': 1242}
