In [None]:
from langchain_openai import ChatOpenAI
import os, openai, getpass, tiktoken
import pandas as pd
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
#==========================================================================================#
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever


In [None]:
# Shared tiktoken encoder, created once so every length measurement reuses it.
tokenizer = tiktoken.get_encoding("cl100k_base")


def tiktoken_len(text):
    """Return the number of tokens `text` encodes to with the module-level `tokenizer`."""
    return len(tokenizer.encode(text))

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# Read ../data/dur.csv through LangChain's CSVLoader; `data` holds the loaded documents.
# NOTE(review): presumably one document per CSV row — confirm against the loader docs.
loader = CSVLoader(file_path='../data/dur.csv')
data = loader.load()

In [None]:
# Split the loaded documents into overlapping chunks. Sizes are measured with
# tiktoken_len (tokens), not characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=250,
    chunk_overlap=100,
    length_function=tiktoken_len,
)
texts = text_splitter.split_documents(data)

In [None]:
import torch

# Sentence-embedding model used for the vector store.
model_name = "jhgan/ko-sbert-nli"
# Prefer the GPU, but fall back to CPU so this cell still runs on machines
# without CUDA instead of failing when the model is moved to the device.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
# Ask the encoder to normalize the embeddings it returns.
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [None]:
#save to disk
#emed_db = Chroma.from_documents(texts, hf,persist_directory="../db/dur_jhgan_250")

In [None]:
# Persist the vector DB to disk (persistent storage) instead of keeping it only in memory
#emed_db.persist()
#emed_db = None

In [None]:
# Load the Chroma store from disk, using `hf` as its embedding function.
# NOTE(review): the commented-out `Chroma.from_documents(...)` cell is what builds
# this directory; presumably it must have been run once beforehand — confirm.
emed_db = Chroma(
    persist_directory="../db/dur_jhgan_250",
    embedding_function=hf,
)

In [None]:
# Chat model shared by the question-rewriting and answer-generation steps.
# NOTE(review): this rebinds `openai`, hiding the `openai` module imported in the
# first cell; the name is kept because later cells reference it.
openai = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# Dense retriever: top-3 similarity search over the persisted Chroma store.
vector_retriever = emed_db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

# Sparse retriever: BM25, also returning 3 documents.
# NOTE(review): BM25 indexes `data` (the unsplit loader output) while the vector
# store was built from the split `texts` — confirm this asymmetry is intended.
bm25_retriever = BM25Retriever.from_documents(data)
bm25_retriever.k = 3

# Combine both retrievers, weighting BM25 (0.7) above the vector search (0.3).
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, vector_retriever],
    weights=[0.7, 0.3],
)

In [None]:
# ensemble_retriever.invoke("타세놀")

In [None]:
# query = "타세놀 노인이 먹어도 돼?"
# docs = ensemble_retriever.get_relevant_documents(query)
# docs

In [None]:
def format_docs(docs):
    """Concatenate each document's `page_content`, separated by a blank line."""
    return "\n\n".join(doc.page_content for doc in docs)

# System prompt for the question-rewriting step: the model is told to turn the
# latest user question into a standalone question and explicitly NOT to answer it.
# The trailing backslashes are line continuations, so the text is one paragraph.
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

# Message layout: system instructions, then the prior turns supplied under
# "chat_history", then the new user message supplied under "input".
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human","{input}"),
    ]
)

# Combine the chat model, the ensemble retriever and the rewriting prompt into a
# retriever that takes the conversation history into account.
history_aware_retriever = create_history_aware_retriever(
    openai, ensemble_retriever, contextualize_q_prompt
)

# Answer generation
# System prompt for the answering step: restrict the model to the retrieved
# context, admit when the answer is unknown, and stay within three sentences.
# `{context}` is the template variable for the retrieved text
# (presumably filled in by the stuff-documents chain below — confirm).
qa_system_prompt = """You are an assistant for question-answering tasks. \
ONLY USE the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""

# Same message layout as the rewriting prompt: system text, prior turns, new input.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
# Chain that feeds documents plus the question to the chat model using qa_prompt.
question_answer_chain = create_stuff_documents_chain(openai, qa_prompt)

# Full RAG pipeline: history-aware retrieval followed by answer generation.
rag_chain = create_retrieval_chain(history_aware_retriever,question_answer_chain)

# Chat history management: in-memory histories keyed by session id.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the history stored for `session_id`, creating an empty one on first use."""
    try:
        return store[session_id]
    except KeyError:
        # First time this session is seen: register a fresh history for it.
        history = store[session_id] = ChatMessageHistory()
        return history

# Wrap the RAG chain with per-session message history. Histories come from
# get_session_history; the key names say where the user input, the prior
# messages and the generated answer live in the chain's input/output.
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [None]:
# id = "0"
# query1 = "타세놀 노인이 복용해도 돼??"
# try: 
#     result = conversational_rag_chain.invoke(
#         {"input": query1},
#         config={
#             "configurable": {"session_id":id}
#         },
#     )#["answer"]
#     print(result)
# except Exception as e:
#     print("오류 발생\n사유 : ", e)

In [None]:
import functools
import time


def measure_time(func):
    """Decorator that prints how long each call to `func` takes.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all arguments to `func`, prints the elapsed
        seconds, and returns `func`'s result unchanged. If `func` raises, the
        exception propagates and nothing is printed.
    """

    @functools.wraps(func)  # keep func's __name__/__doc__ on the decorated function
    def wrapper(*args, **kwargs):
        # perf_counter is the clock intended for measuring intervals; unlike
        # time.time() it is not affected by system clock adjustments.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - start_time
        print(f"{func.__name__} 실행 시간: {elapsed_time} 초")
        return result

    return wrapper

@measure_time
def invoke(id, input):
    """Run one turn of the conversational RAG chain and return its response.

    `id` selects the chat session and `input` is the user's question.
    Both names shadow builtins; they are kept so existing callers keep working.
    """
    session_config = {"configurable": {"session_id": id}}
    return conversational_rag_chain.invoke({"input": input}, config=session_config)

In [None]:
test_data = pd.read_excel("../data/qa_dataset_drop_company.xlsx")

answer = []  # LLM answer per question (None when the call failed)
docs = []    # full chain response per question (None when the call failed)
for i, row in test_data.iterrows():
    try:
        # Each question runs in its own session (session id = row index).
        response = invoke(str(i), row.question)
        reply = response["answer"]
    except Exception as e:
        # Store placeholders so answer/docs stay aligned row-for-row with
        # test_data; skipping the append would shift every later row and make
        # the lists shorter than the question column when results are exported.
        response, reply = None, None
        print(i+1,'번째 답변 오류. 오류 사유 : ', e)
    else:
        # Print the fresh answer directly instead of indexing answer[i], which
        # goes out of range as soon as one earlier row has failed.
        print(i+1,'번째 답변 : ',reply)
    answer.append(reply)
    docs.append(response)

In [None]:
# Put each question next to its answer and raw chain response, then export to Excel.
question = test_data["question"]
test_output = pd.DataFrame(
    {"question": question, "answer": answer, "docs": docs}
)
test_output.to_excel("../result/dur_1차_jhgan.xlsx", index=False)