In [None]:
import fitz
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output; an empty
        string when the document has no pages.
    """
    # The context manager closes the document (and its file handle) even if
    # extracting a page raises.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# Imported here so this script section is self-contained.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Path of the PDF file to index
pdf_path = "/home/ubuntu/work/kosa-chatgpt-2025-1st/src/exercise/kwakbh/day03/소나기.pdf"

pdf_text = extract_text_from_pdf(pdf_path)

# Split the text into overlapping chunks so that similarity search can return
# the passages relevant to the question. Indexing the whole PDF as one text
# yields a single vector, which makes k=5 meaningless and sends the entire
# document to the model.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
pdf_chunks = text_splitter.split_text(pdf_text)
if not pdf_chunks:
    raise ValueError(f"No extractable text found in PDF: {pdf_path}")

# Embed the chunks and persist the index.
embeddings = OpenAIEmbeddings()
vector_db_path = "/home/ubuntu/work/kosa-chatgpt-2025-1st/src/exercise/kwakbh/day03/sonagi/"
vector_db = FAISS.from_texts(pdf_chunks, embeddings)
vector_db.save_local(vector_db_path)

user_input = input("질문할 내용을 입력하세요: ")

# Retrieve the five chunks most similar to the question.
docs = vector_db.similarity_search(user_input, k=5)

prompt_template = """
다음은 PDF 문서에서 검색된 관련 내용입니다:
{documents_text}

이 정보를 기반으로 사용자의 질문에 답변해주세요: {user_input}
"""

prompt = PromptTemplate(input_variables=["documents_text", "user_input"], template=prompt_template)

# Initialise the LLM and wire the custom prompt into the chain. Previously the
# prompt was built but never passed, so the chain's default prompt was used.
# document_variable_name tells the "stuff" chain which template variable
# receives the retrieved documents.
chat = ChatOpenAI(model_name='gpt-4o-mini', temperature=0.9)
qa_chain = load_qa_chain(
    chat,
    chain_type="stuff",
    prompt=prompt,
    document_variable_name="documents_text",
)

# The remaining template variable (user_input) is supplied as a keyword input.
response = qa_chain.run(input_documents=docs, user_input=user_input)

print("\n답변:", response)


In [None]:
#실습 #2: 만든 ChatGPT 어플리케이션을 gradio 인터페이스로 수정하시오. 
# 파일을 VectorDB로 변환하는 ingest.py와 해당 VectorDB를 사용하는 ask_pdf.py 두개 파일로 생성하시오. 
# 스트리밍을 사용하시오. (시간이 된다면) 파일 업로드 기능을 추가한다.

In [None]:
import fitz
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
import gradio as gr
import os

# PDF에서 텍스트 추출 함수
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output; an empty
        string when the document has no pages.
    """
    # The context manager closes the document (and its file handle) even if
    # extracting a page raises.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# PDF 파일을 처리하고 질문에 답변하는 함수
def answer_question(pdf_file, user_input):
    """Answer a question about an uploaded PDF using retrieval plus an LLM.

    The PDF text is split into chunks, embedded into a FAISS index (also saved
    under ``/tmp/vector_db``), and the chunks most similar to the question are
    passed to the chat model through the custom prompt.

    Args:
        pdf_file: The uploaded file. Either an object exposing the path as
            ``.name`` or a plain path string; ``None`` when nothing has been
            uploaded yet.
        user_input: The user's question.

    Returns:
        The model's answer as a string, or a short notice when no file was
        uploaded or the PDF contains no extractable text.
    """
    # Guard first: without a file there is nothing to index, and accessing
    # ``.name`` on None would raise AttributeError.
    if pdf_file is None:
        return "PDF 파일을 먼저 업로드하세요."

    # Imported locally so this function is self-contained.
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    pdf_path = getattr(pdf_file, "name", pdf_file)
    pdf_text = extract_text_from_pdf(pdf_path)

    # Chunk the text so retrieval returns relevant passages rather than the
    # whole document as a single vector.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    pdf_chunks = text_splitter.split_text(pdf_text)
    if not pdf_chunks:
        return "PDF에서 텍스트를 추출하지 못했습니다."

    # Embed the chunks and store them in the vector DB.
    embeddings = OpenAIEmbeddings()
    vector_db_path = "/tmp/vector_db"  # temporary directory
    vector_db = FAISS.from_texts(pdf_chunks, embeddings)
    vector_db.save_local(vector_db_path)

    # Retrieve the five chunks most similar to the question.
    docs = vector_db.similarity_search(user_input, k=5)

    prompt_template = """
    다음은 PDF 문서에서 검색된 관련 내용입니다:
    {documents_text}

    이 정보를 기반으로 사용자의 질문에 답변해주세요: {user_input}
    """

    prompt = PromptTemplate(input_variables=["documents_text", "user_input"], template=prompt_template)

    chat = ChatOpenAI(model_name='gpt-4o-mini', temperature=0.9)
    # Pass the custom prompt to the chain (it was previously built but unused).
    # document_variable_name names the template variable that receives the
    # retrieved documents.
    qa_chain = load_qa_chain(
        chat,
        chain_type="stuff",
        prompt=prompt,
        document_variable_name="documents_text",
    )

    return qa_chain.run(input_documents=docs, user_input=user_input)

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="질문을 입력하세요.", label="질문")
    pdf_file = gr.File(label="PDF 파일 업로드")
    clear = gr.ClearButton([msg, chatbot])

    def chat_function(history, pdf_file, user_input):
        """Handle one submitted question.

        Args:
            history: Current chatbot value, a list of (question, answer) pairs
                (may be empty or None).
            pdf_file: Value of the file-upload component.
            user_input: Text typed into the question box.

        Returns:
            A pair ``(new_history, "")``: the updated chat history and an empty
            string that clears the question box.
        """
        current_history = list(history or [])
        # Ignore blank submissions instead of sending an empty query to the model.
        if not (user_input or "").strip():
            return current_history, ""
        response = answer_question(pdf_file, user_input)
        # Build a new list rather than mutating the component's value in place.
        return current_history + [(user_input, response)], ""

    # Outputs: the chatbot gets the new history, the textbox is cleared.
    msg.submit(chat_function, [chatbot, pdf_file, msg], [chatbot, msg])

demo.launch()


In [11]:
!pip install streamlit

Defaulting to user installation because normal site-packages is not writeable
Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting cachetools<6,>=4.0
  Downloading cachetools-5.5.1-py3-none-any.whl (9.5 kB)
Collecting pyarrow>=7.0
  Downloading pyarrow-19.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (42.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting altair<6,>=4.0
  Downloading altair-5.5.0-py3-none-any.whl (731 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.2/731.2 KB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
Collecting toml<2,>=0.10.1
  Downloading toml-0.10.2-py2.py3-none-any.whl (16 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7
  Downloading GitPython-3.1.44-py3-none-any.whl 

In [None]:
import openai
import os
import streamlit as st

# Read the API key from the environment so it is never hard-coded in the
# notebook; os.getenv returns None when OPENAI_API_KEY is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")

def chatgpt_respond(message, chat_history):
    """Stream a chat-completion reply to ``message`` given the prior history.

    This is a generator. While the reply streams in it yields
    ``(message, history + [(message, partial_reply)])`` after each received
    text fragment; once the stream ends it yields
    ``("", history + [(message, full_reply)])``.

    The caller's ``chat_history`` list is never modified; every yielded history
    is a new list.

    Args:
        message: The new user message.
        chat_history: Iterable of ``(user_msg, bot_msg)`` pairs from earlier
            turns; ``bot_msg`` may be None for a turn without a reply. None is
            treated as an empty history.

    Yields:
        Tuples ``(textbox_value, updated_history)`` as described above.
    """
    # Work on a copy so the caller's list is left untouched.
    history = list(chat_history or [])

    messages = [{"role": "system", "content": "넌 불친절한 챗봇이야"}]
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if bot_msg is not None:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        stream=True,
    )

    bot_message = ""
    for gen in response:
        # Skip chunks that carry no choices instead of failing on choices[0].
        if not gen.choices:
            continue
        # Some chunks (e.g. role-only or final ones) have no "content" field.
        delta = getattr(gen.choices[0].delta, 'content', None)
        if delta:
            bot_message += delta
            yield message, history + [(message, bot_message)]

    # Final state: complete reply recorded, input box value cleared.
    yield "", history + [(message, bot_message)]

# NOTE: this section is a Streamlit app and must be started with `streamlit run`;
# session state does not function when the script is executed any other way.


def _format_history(history):
    """Render (user, bot) pairs as the plain-text transcript shown in the UI."""
    return "\n".join([f"User: {m}\nBot: {b}" for m, b in history])


# Streamlit re-executes the script from the top on every interaction, so the
# conversation is kept in session_state; a plain module-level list would be
# reset to empty on each rerun.
if "chat_history" not in st.session_state:
    st.session_state["chat_history"] = []
chat_history = st.session_state["chat_history"]

st.title("ChatGPT with Streamlit")

# clear_on_submit empties the input box after sending. Creating a second
# text_input with the same key to clear it would be a duplicate widget key.
with st.form("chat_form", clear_on_submit=True):
    message = st.text_input("You:", key="user_message")
    send_clicked = st.form_submit_button("Send")

# Placeholder for the transcript. A display-only element is used because it is
# rewritten many times during streaming; re-creating an input widget such as
# text_area repeatedly within one run risks duplicate widget IDs.
chat_response = st.empty()
if chat_history:
    chat_response.text(_format_history(chat_history))

if send_clicked and message:
    for _, hist in chatgpt_respond(message, chat_history):
        chat_history = hist
        chat_response.text(_format_history(chat_history))
    # Persist the finished conversation for the next rerun.
    st.session_state["chat_history"] = chat_history

if st.button("Clear Chat"):
    st.session_state["chat_history"] = []
    # st.rerun replaces the deprecated st.experimental_rerun; fall back to the
    # old name only on Streamlit versions that lack st.rerun.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()


2025-01-22 15:16:32.275 
  command:

    streamlit run /home/ubuntu/.local/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]
2025-01-22 15:16:32.281 Session state does not function when running a script without `streamlit run`
