실습 #1: LangChain을 이용하여 사용자가 업로드한 PDF 파일을 근거로 답변하는 ChatGPT 애플리케이션을 작성하시오.

In [1]:
pip install langchain langchain-openai openai faiss-cpu pypdf tiktoken

Defaulting to user installation because normal site-packages is not writeable
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 31.3/31.3 MB 58.9 MB/s eta 0:00:00
Collecting pypdf
  Downloading pypdf-5.8.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.7/309.7 KB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdf, faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1 pypdf-5.8.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI



# Load the PDF document
def load_pdf(file_path):
    """Load the PDF at *file_path* and return its documents.

    Args:
        file_path: Path of the PDF file, passed straight to ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader(file_path).load()`` produces (a list of
        LangChain documents).
    """
    return PyPDFLoader(file_path).load()

# Split the documents into chunks
def split_documents(documents):
    """Break *documents* into smaller, overlapping chunks for embedding.

    The splitter is configured with ``chunk_size=500`` and
    ``chunk_overlap=50``; the overlap keeps some shared context between
    neighbouring chunks.

    Args:
        documents: Documents to split, e.g. the output of ``load_pdf``.

    Returns:
        The chunked documents returned by the splitter.
    """
    chunker = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = chunker.split_documents(documents)
    return chunks

# Build the vector store
def create_vectorstore(splits):
    """Embed *splits* with OpenAI embeddings and index them in FAISS.

    Args:
        splits: Non-empty sequence of document chunks, e.g. the output of
            ``split_documents``.

    Returns:
        A FAISS vector store containing one entry per chunk.

    Raises:
        ValueError: If *splits* is empty (for example, a scanned PDF with no
            extractable text produced no chunks).
    """
    # Fail early with an actionable message: an index cannot be built from
    # zero chunks, and letting the empty list reach the embedding/index code
    # only surfaces as an obscure error far from the real cause.
    if not splits:
        raise ValueError(
            "Cannot build a vector store from an empty document list; "
            "the PDF may contain no extractable text."
        )

    embeddings = OpenAIEmbeddings()
    return FAISS.from_documents(splits, embeddings)

# Create the QA chain
def create_qa_chain(vectorstore, k=3, model="gpt-4o-mini"):
    """Create a retrieval-augmented QA chain on top of *vectorstore*.

    Args:
        vectorstore: Vector store exposing ``as_retriever`` (e.g. the FAISS
            store built by ``create_vectorstore``).
        k: Number of chunks to retrieve per question. Defaults to 3.
        model: Name of the OpenAI chat model used to answer. Defaults to
            ``"gpt-4o-mini"``.

    Returns:
        A ``RetrievalQA`` chain created with ``return_source_documents=True``,
        i.e. asked to include the retrieved source documents in its output.
    """
    # The result count must be passed through ``search_kwargs``; a top-level
    # ``k=...`` keyword is not a search parameter and does not limit the
    # number of retrieved chunks.
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k},
    )
    llm = ChatOpenAI(model=model)
    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,
    )

# End-to-end usage example
def chat_with_pdf(pdf_path, question):
    """Answer *question* using the PDF at *pdf_path* as the knowledge source.

    Every call runs the full pipeline from scratch: load the PDF, split it
    into chunks, build a vector store, create the QA chain, and query it.

    Args:
        pdf_path: Path of the PDF file to consult.
        question: Question to ask, sent to the chain under the ``"query"`` key.

    Returns:
        The value stored under ``"result"`` in the chain's output.
    """
    chunks = split_documents(load_pdf(pdf_path))
    chain = create_qa_chain(create_vectorstore(chunks))
    response = chain.invoke({"query": question})
    return response["result"]

# Example run
if __name__ == "__main__":
    # Set pdf_path to the location of your own PDF file.
    pdf_path, question = "sample.pdf", "이 문서의 핵심 주제는 무엇인가요?"
    answer = chat_with_pdf(pdf_path=pdf_path, question=question)
    print("답변:", answer)